def convert(self, txrm_file, custom_reference=None, ignore_reference=False, annotate=False):
    """Convert a txrm file into an image stack stored on this instance.

    Reads all frames from the OLE container, optionally applies a
    reference (custom file or internal stream), stitches version-13 style
    mosaics, optionally extracts annotations, and builds OME metadata.

    :param txrm_file: path-like object of the .txrm file (must have
        ``.name``; presumably a ``pathlib.Path`` — TODO confirm)
    :param custom_reference: optional external reference file path
    :param ignore_reference: if True, skip the internal reference stream
    :param annotate: if True, try to extract drawn annotations

    Side effects: sets ``self.image_output``, ``self.annotator`` (only
    when ``annotate`` is True), and ``self.ome_metadata``.
    """
    with OleFileIO(str(txrm_file)) as ole:
        images = txrm_wrapper.extract_all_images(ole)
        reference = _get_reference(ole, txrm_file.name, custom_reference,
                                   ignore_reference)
        if reference is not None:
            self.image_output = _apply_reference(images, reference)
        else:
            # No reference: just round the raw frames.
            self.image_output = np.around(images)
        # "Mosiac" is the spelling used by the file format's stream names.
        if (len(self.image_output) > 1
                and ole.exists("ImageInfo/MosiacRows")
                and ole.exists("ImageInfo/MosiacColumns")):
            mosaic_rows = txrm_wrapper.read_imageinfo_as_int(
                ole, "MosiacRows")
            mosaic_cols = txrm_wrapper.read_imageinfo_as_int(
                ole, "MosiacColumns")
            if mosaic_rows != 0 and mosaic_cols != 0:
                # Version 13 style mosaic:
                self.image_output = _stitch_images(
                    self.image_output, (mosaic_cols, mosaic_rows), 1)
        if annotate:
            # Extract annotations
            annotator = Annotator(self.image_output[0].shape[::-1])
            if annotator.extract_annotations(
                    ole):  # True if any annotations were drawn
                self.annotator = annotator
            else:
                self.annotator = False
        # Create metadata
        self.ome_metadata = create_ome_metadata(ole, self.image_output)
def _get_reference(ole, txrm_name, custom_reference, ignore_reference):
    """Return the reference image to apply to *txrm_name*, or None.

    Priority: an explicit custom reference file (xrm/txrm OLE or
    tif/tiff), then the internal "ReferenceData/Image" stream unless
    *ignore_reference* is set.

    :param ole: opened OleFileIO of the txrm being converted
    :param txrm_name: file name, used only in log messages
    :param custom_reference: Path to an external reference file, or None
    :param ignore_reference: if True, never use the internal reference
    :raises IOError: if the custom reference has an unsupported format
    """
    if custom_reference is not None:
        logging.info("%s is being processed with file %s as a reference.",
                     txrm_name, custom_reference.name)
        reference_path = str(custom_reference)
        try:
            if isOleFile(reference_path):
                with OleFileIO(reference_path) as ref_ole:
                    # should be float for averaging & dividing
                    references = txrm_wrapper.extract_all_images(ref_ole)
            elif ".tif" in reference_path:  # also matches ".tiff"
                with tf.TiffFile(reference_path) as tif:
                    references = np.asarray(tif.pages[:])
            else:
                msg = f"Unable to open file '{reference_path}'. Only tif/tiff or xrm/txrm files are supported for custom references."
                logging.error(msg)
                raise IOError(msg)
        except Exception:
            # fix: was a bare `except:`, which also intercepted
            # SystemExit/KeyboardInterrupt before re-raising.
            logging.error("Error occurred reading custom reference",
                          exc_info=True)
            raise
        if len(references) > 1:
            # if reference file is an image stack take median of the images
            return _dynamic_despeckle_and_average_series(references)
        return references[0]
    elif ole.exists("ReferenceData/Image") and not ignore_reference:
        logging.info("Internal reference will be applied to %s", txrm_name)
        return txrm_wrapper.extract_reference_image(ole)
    logging.debug("%s is being processed without a reference.", txrm_name)
    return None
def extract_ole_metadata(self, file_path):
    """Extract OLE metadata from *file_path* if it is an OLE file.

    Fix: the file is now opened in binary mode (``'rb'``). OLE compound
    files are binary; opening in text mode (``'r'``) makes ``isOleFile``
    read decoded text and typically raises UnicodeDecodeError.
    """
    with open(file_path, 'rb') as fh:
        if not isOleFile(fh):
            return
        fh.seek(0)
        ole = OleFileIO(fh)
        self.extract_olefileio_metadata(ole)
def checkVersion(file):
    """Open *file* as OLE storage and return it if it is Inventor v14+.

    Scans the storage for the 'RSeDb' stream to read the version info.
    On an unsupported or undetectable version, shows an error dialog,
    logs, and returns None.

    Fixes: ``version`` was unbound (NameError) when no 'RSeDb' stream
    exists; the unused ``vrs`` local is removed; the storage is now
    closed on every failure path.
    """
    filename = os.path.abspath(file)
    ole = OleFileIO(filename)
    version = None
    elements = ole.listdir(streams=True, storages=False)
    for e in elements:
        if (e[-1] == 'RSeDb'):
            data = ole.openstream(e).read()
            version, i = getVersionInfo(data, 20)
            if (version.major >= 14):
                setDumpFolder(file)
                return ole
            # RSeDb found but version too old: stop scanning.
            break
    ole.close()  # fix: release the file on failure paths
    if (version):
        vrsName = version.major
        if (version.major >= 11):
            vrsName += 1996
        QMessageBox.critical(
            FreeCAD.ActiveDocument, 'FreeCAD: Inventor workbench...',
            'Can\'t load file created with Inventor v%d' % (vrsName))
        logError('Can\'t load file created with Inventor v%d' % (vrsName))
    else:
        QMessageBox.critical(
            FreeCAD.ActiveDocument, 'FreeCAD: Inventor workbench...',
            'Can\'t determine Inventor version file was created with')
        logError('Can\'t determine Inventor version file was created with!')
    return None
def is_encrypted(some_file):
    """
    Determine whether document contains encrypted content.

    Returns False for documents that are merely write-protected, signed
    or finalized; returns True if ANY content of the file is encrypted
    (including the standard-password write-protection trick, which is
    encryption at the file-structure level).

    Should not raise exceptions needlessly. The check is simple: streams
    with typical encryption names (c.f. [MS-OFFCRYPTO]) and, where the
    SummaryInformation stream is accessible, the "well-known property"
    PIDSI_DOC_SECURITY (c.f. [MS-OLEPS] 2.25.1).

    :param some_file: File name or an opened OleFileIO
    :type some_file: :py:class:`olefile.OleFileIO` or `str`
    :returns: True if (and only if) the file contains encrypted content
    """
    log.debug('is_encrypted')
    # Already-opened OLE storage: inspect it directly.
    if isinstance(some_file, OleFileIO):
        return is_encrypted_ole(some_file)
    # Zip container (OOXML): dedicated zip check.
    if zipfile.is_zipfile(some_file):
        return is_encrypted_zip(some_file)
    # Fallback: treat the argument as the name of an ole file.
    return is_encrypted_ole(OleFileIO(some_file))
def __init__(self, data: bytes):
    """Run the oletools OleID checks on *data* when it is an OLE file."""
    self.oid: Optional[oletools.oleid.OleID] = None
    if not isOleFile(data):
        # Not an OLE container: leave self.oid as None.
        return
    self.oid = oletools.oleid.OleID(OleFileIO(data))
    self.oid.check()
def setInventorFile(file):
    """Record *file* as the current Inventor file and return its OLE storage.

    Also refreshes the dump folder derived from the absolute path.
    """
    global _inventor_file, _dump_folder
    _inventor_file = os.path.abspath(file)
    setDumpFolder(_inventor_file)
    return OleFileIO(file)
def __init__(self, msg_file_path):
    """Parse the Outlook .msg file at *msg_file_path*.

    Raises a generic Exception when the path is not a valid MSG file.
    """
    self.msg_file_path = msg_file_path
    self.include_attachment_data = False
    if not self.is_valid_msg_file():
        raise Exception(
            "Invalid file provided, please provide valid Microsoft’s Outlook MSG file."
        )
    with OleFileIO(msg_file_path) as ole_file:
        # Build the message tree from the root directory entries.
        self._message = Message(ole_file.root.kids_dict)
        self._message_dict = self._message.as_dict()
        # Populate properties, recipients and attachments in turn.
        self._set_properties()
        self._set_recipients()
        self._set_attachments()
def __init__(self, olefile, path='', parent=None):
    """Wrap *olefile* (an opened storage or something openable) as a root item."""
    if not hasattr(olefile, 'openstream'):
        # Not an opened storage yet: import helpers lazily, validate, open.
        isOleFile = import_isOleFile()
        OleFileIO = import_OleFileIO()
        if not isOleFile(olefile):
            raise InvalidOleStorageError(
                'Not an OLE2 Compound Binary File.')
        olefile = OleFileIO(olefile)
    OleStorageItem.__init__(self, olefile, path, parent)
def get_general(self, data, f):
    ''' Extract general info '''
    # Copy every populated OLE metadata attribute into `data`,
    # decoding byte values to text.
    # Fixes: `v != None` -> `v is None` guard, `type(v) == bytes` ->
    # isinstance, empty-bytes check via truthiness.
    for k, v in OleFileIO(f).get_metadata().__dict__.items():
        if v is None:
            continue
        if isinstance(v, bytes):
            if v:
                data.update({k: v.decode("utf-8", errors="ignore")})
        else:
            data.update({k: v})
def __init__(self, filename): self.OleFile = OleFileIO(filename) # Components self.Components = self.parseComponents( self.readStream("Components6/Data")) manifest = getU32(self.readStream("Components6/Header")) counted = len(self.Components) if manifest != counted: print "Warning: Header disagrees about component count, says there are " + str( manifest) + ", but we counted " + str(counted) + "."
def get_general(self, data, temp_f):
    ''' Extract general info '''
    # Walk every OLE metadata attribute and record the populated ones,
    # decoding byte values to text.
    meta_attrs = OleFileIO(temp_f).get_metadata().__dict__
    for key, value in meta_attrs.items():
        if value is None:
            continue
        if isinstance(value, bytes):
            if len(value) > 0:
                data.update({key: value.decode("utf-8", errors="ignore")})
        else:
            data.update({key: value})
def extract_ole_metadata(self, file_path, entity):
    """Read OLE metadata from *file_path* into *entity*, logging failures."""
    with open(file_path, 'rb') as fh:
        if not isOleFile(fh):
            return
        fh.seek(0)
        try:
            self.extract_olefileio_metadata(OleFileIO(fh), entity)
        except (RuntimeError, IOError):
            # OLE reading can go fully recursive, at which point it's OK
            # to just eat this runtime error quietly.
            log.warning("Failed to read OLE data: %r", entity)
        except Exception:
            log.exception("Failed to read OLE data: %r", entity)
def get_streams(self, dump) -> (list, list):
    ''' get streams '''
    # For every stream in the OLE file, collect a printable name and a
    # copy of its content stripped to printable ASCII.
    parsed_entries = []
    raw_entries = []
    ole = OleFileIO(dump)
    for direntry in ole.listdir():
        name = re.sub(r'[^\x20-\x7f]', r'', " : ".join(direntry))
        cleaned = sub(br'[^\x20-\x7F]+', b'',
                      ole.openstream(direntry).getvalue())
        raw_entries.append(cleaned)
        parsed_entries.append({
            "Name": name,
            "Parsed": cleaned.decode("utf-8", errors="ignore")
        })
    return parsed_entries, raw_entries
def ReadFile(doc, readProperties):
    """Read the current Inventor OLE file into *doc*.

    Returns True on success, False if the file is not a valid OLE file.

    Fixes: the failure message referenced an undefined name ``infile``
    (NameError on the error path); the builtin ``list`` was shadowed;
    ``doc.Comment`` was overwritten right after a separator newline was
    appended, losing earlier comments — it is now appended; the unused
    ``counters`` dict is removed.
    """
    first = 0
    if (isOleFile(getInventorFile())):
        ole = OleFileIO(getInventorFile())
        setFileVersion(ole)
        elements = ole.listdir(streams=True, storages=False)
        folder = getInventorFile()[0:-4]
        if not os.path.exists(folder):
            os.makedirs(folder)
        counter = 1
        ordered = []
        for fname in elements:
            if (len(fname) == 1):
                ordered.append(fname)
            else:
                # Ensure that RSe* files will be parsed first
                if (fname[-1].startswith('RSe')):
                    # ensure RSeDb is the very first "file" to be parsed
                    ordered.insert(first, fname)
                    if (fname[-1] == 'RSeDb'):
                        first += 1
                elif (not fname[-1].startswith('B')):
                    ordered.append(fname)
        for fname in ordered:
            ReadElement(ole, fname, doc, counter, readProperties)
            counter += 1
        ole.close()
        now = datetime.datetime.now()
        if (len(doc.Comment) > 0):
            doc.Comment += '\n'
        # Append (not overwrite) so earlier comments survive.
        doc.Comment += '# %s: read from %s' % (
            now.strftime('%Y-%m-%d %H:%M:%S'), getInventorFile())
        logMessage("Dumped data to folder: '%s'" % (getInventorFile()[0:-4]),
                   LOG.LOG_INFO)
        return True
    logError("Error - '%s' is not a valid Autodesk Inventor file." %
             (getInventorFile()))
    return False
def __init__(self, filename): self.OleFile = OleFileIO(filename) # TOC = Table Of Contents # A list of the footprints contained in this PcbLib can be found here: #self.TOC = TOC( self.readStream("Library/ComponentParamsTOC/Data") ) # not always present # # Parse library parameters # Library/Data contains a list of parameters (string: "|"-separated key-value pairs) # followed by the count and names of footprints in the library # buffer = self.readStream("Library/Data") # Properties print "Library properties:" length = getU32(buffer[:4]) self.Properties = parseKeyValueString(buffer[4:4+length]) print self.Properties # Footprint list cursor = 4+length count = getU32(buffer[cursor:]) cursor += 4 print "Footprints in library: "+str(count) footprints = [] for i in range(count): subrecord = SubRecord(buffer[cursor:]) name = SubRecord_String(subrecord) print " * "+name footprints.append(name) cursor += subrecord.length # Parse all the footprints self.Footprints = [] for footprint in footprints: print "Parsing "+footprint+" ..." self.Footprints.append( Footprint(self.readStream(footprint+"/Data")) ) # Create a dictionary of footprints to access them by name self.FootprintsByName = {} for footprint in self.Footprints: self.FootprintsByName[footprint.name] = footprint
def ole_file_works(path):
    """Sanity-check a .txrm/.xrm file; True when it opens as an OLE file.

    Incomplete frame stacks and a missing internal reference image are
    logged as warnings, not failures.
    """
    # Guard clauses: wrong extension or unreadable OLE -> False.
    if path.suffix not in (".txrm", ".xrm"):
        logging.warning("%s not .txrm or .xrm", path)
        return False
    if not isOleFile(str(path)):
        logging.warning("Could not read ole file %s", path)
        return False
    with OleFileIO(str(path)) as ole_file:
        frames_taken = read_imageinfo_as_int(ole_file, "ImagesTaken")
        frames_expected = read_imageinfo_as_int(ole_file, "NoOfImages")
        # Returns true even if all frames aren't written, throwing warning.
        if frames_taken != frames_expected:
            logging.warning(
                "%s is an incomplete %s file: only %i out of %i frames have been written",
                path.name, path.suffix, frames_taken, frames_expected)
        # Check for reference frame:
        if not ole_file.exists("ReferenceData/Image"):
            logging.warning("No reference data found in file %s", path)
        return True
def filename_to_lines(filepath):
    """Return the textual content of *filepath* as a list of lines.

    Dispatches on the file extension: plain text, HWP (via the 'PrvText'
    OLE stream), doc/docx, pdf, or images (OCR via EasyOCR + Tesseract).

    :raises ValueError: for unknown extensions.

    Fix: the plain-text branch leaked an open file handle; it now uses a
    context manager.
    """
    filename = filepath.split('/')[-1]
    extension = filename.split('.')[-1]
    if '.' not in filename or extension in ['txt']:
        with open(filepath, 'r', encoding='utf-8') as fh:
            return fh.readlines()
    if extension in ['hwp']:
        # HWP keeps a UTF-16 plain-text preview in the 'PrvText' stream.
        return OleFileIO(filepath).openstream('PrvText').read().decode(
            'utf-16').split('\n')
    if extension in ['doc', 'docx']:
        return [p.text for p in docx.Document(filepath).paragraphs]
    if extension in ['pdf']:
        return parser.from_file(filepath)['content'].split('\n')
    if extension in ['jpg', 'png', 'jpeg', 'bmp', 'gif', 'tiff', 'jfif']:
        # Combine the output of both OCR engines.
        easyocr_terms = EASYOCR.readtext(filepath, detail=0)
        tesseract_terms = image_to_string(Image.open(filepath),
                                          lang='kor+eng').split('\n')
        return easyocr_terms + tesseract_terms
        # return EASYOCR.readtext(filepath, detail=0)
    else:
        raise ValueError('알려지지 않은 확장자')
def oleMetaData(file_path, save=True):
    """Print (and optionally save) the OLE summary metadata of *file_path*.

    Fixes: the report contained ``\\Total`` — an invalid escape printed
    literally — where ``\\nTotal`` (a newline) was clearly intended; the
    ``FileNotFoundError`` branch was unreachable because it is a subclass
    of ``OSError`` and was listed after it.
    """
    now = dt.now()
    file_name = getFileName(file_path)
    metadata = "Time: %d/%d/%d %d : %d : %d. Found the following metadata for file %s:\n\n" % (
        now.year, now.month, now.day, now.hour, now.minute, now.second,
        file_name[:-4])
    try:
        ole = OleFileIO(file_path)
        meta = ole.get_metadata()
        ole.close()
        # NOTE(review): assumes every property is present and
        # bytes-encoded; a file missing one would raise AttributeError.
        author = meta.author.decode("latin-1")
        creation_time = meta.create_time.ctime()
        last_author = meta.last_saved_by.decode("latin-1")
        last_edit_time = meta.last_saved_time.ctime()
        last_printed = meta.last_printed.ctime()
        revisions = meta.revision_number.decode("latin-1")
        company = meta.company.decode("latin-1")
        creating_app = meta.creating_application.decode("latin-1")
        metadata += "Original Author: %s\nCreation Time: %s\nLast Author: %s\n" % (author, creation_time, last_author) \
            + "Last Modification Time: %s\nLast Printed at: %s\nTotal Revisions: %s\n" % (last_edit_time, last_printed, revisions) \
            + "Created with: %s\nCompany: %s" % (creating_app, company)
        try:
            print(metadata)
        except UnicodeEncodeError:
            print(
                "Console encoding can't decode the result. Enter chcp 65001 in the console and rerun the script."
            )
        if save:
            tgt = file_name + ".txt"
            saveResult(tgt, metadata)
    except FileNotFoundError:
        # Must precede OSError: FileNotFoundError is its subclass.
        print("Specified file could not be found")
    except OSError as e1:
        print("File not supported: %s" % e1)
def read(self):
    """Walk the OLE directory entries, dispatching metadata/dialog handlers.

    Returns ``self.read_result`` (filled by the handlers).

    Fix: the error log used ``e.message``, which does not exist on
    Python 3 exceptions and would raise AttributeError inside the
    handler; the exception object itself is formatted instead (works on
    both Python 2 and 3).
    """
    mylog.info(u'Начинаю чтение %s' % self.filename)
    self.ole = OleFileIO(self.filename)
    oledirs = self.ole.listdir()
    mylog.debug('OLE_DIRS: %s' % oledirs)
    for entry in oledirs:
        entry_name = entry[0]
        mylog.debug(u'entry_name: %s' % entry_name)
        try:
            if entry_name == 'Metadata':
                if "Main MetaData Stream" in entry and self.parse_metadata:
                    self.handler_metadata(entry)
            if entry_name == 'Document':
                if "Dialog Stream" in entry and self.parse_dialog:
                    self.handler_dialog(entry)
                if "Container.Profile" in entry:
                    continue
                if "Container.Contents" in entry:
                    continue
        except Exception as e:
            mylog.exception(u'Ошибка при чтении конфигурации %s' % e)
    return self.read_result
if __name__ == '__main__':
    # Command-line entry point: first argument is the Inventor file;
    # an optional second argument selects a debug listing mode.
    # NOTE(review): `.decode(...)` on the argv string is Python-2-only.
    if (len(sys.argv) > 1):
        files = sys.argv[1:]
        filename = files[0].decode(
            sys.getfilesystemencoding())  # make it UNICODE!
        setInventorFile(filename)
        if (isOleFile(filename)):
            if (len(files) == 1):
                open(filename)
            else:
                # this is only for debugging purposes...
                docname = os.path.splitext(os.path.basename(filename))[0]
                docname = decode(docname, utf=True)
                doc = FreeCAD.newDocument(docname)
                ole = OleFileIO(filename)
                setFileVersion(ole)
                elements = ole.listdir(streams=True, storages=False)
                counter = 1
                if (files[1] == 'l'):
                    # 'l' mode: list every stream with its index.
                    for filename in elements:
                        ListElement(ole, filename, counter)
                        counter += 1
                else:
                    # Build index -> stream mappings.
                    # NOTE(review): `counter` is never advanced in this
                    # branch within the visible code — the script appears
                    # truncated here; confirm against the full source.
                    list = {}
                    counters = {}
                    for a in (elements):
                        path = PrintableName(a)
                        list['%s' % (counter)] = a
                        counters['%s' % (counter)] = counter
def is_encrypted(some_file):
    """
    Determine whether document contains encrypted content.

    This should return False for documents that are just write-protected or
    signed or finalized. It should return True if ANY content of the file
    is encrypted and can therefore not be analyzed by other oletools
    modules without given a password.

    Exception: there are way to write-protect an office document by
    embedding it as encrypted stream with hard-coded standard password into
    an otherwise empty OLE file. From an office user point of view, this is
    no encryption, but regarding file structure this is encryption, so we
    return `True` for these.

    This should not raise exceptions needlessly.

    This implementation is rather simple: it returns True if the file
    contains streams with typical encryption names (c.f. [MS-OFFCRYPTO]).
    It does not test whether these streams actually contain data or whether
    the ole file structure contains the necessary references to these. It
    also checks the "well-known property" PIDSI_DOC_SECURITY if the
    SummaryInformation stream is accessible (c.f. [MS-OLEPS] 2.25.1)

    :param some_file: File name or an opened OleFileIO
    :type some_file: :py:class:`olefile.OleFileIO` or `str`
    :returns: True if (and only if) the file contains encrypted content
    """
    log.debug('is_encrypted')

    # ask msoffcrypto if possible
    if check_msoffcrypto():
        log.debug('Checking for encryption using msoffcrypto')
        file_handle = None
        file_pos = None
        try:
            if isinstance(some_file, OleFileIO):
                # TODO: hacky, replace once msoffcrypto-tools accepts OleFileIO
                # Reuse the underlying file object, remembering its position
                # so we can restore it afterwards.
                file_handle = some_file.fp
                file_pos = file_handle.tell()
                file_handle.seek(0)
            else:
                file_handle = open(some_file, 'rb')
            return msoffcrypto.OfficeFile(file_handle).is_encrypted()
        except Exception as exc:
            # Fall through to the simpler check below.
            log.warning('msoffcrypto failed to interpret file {} or determine '
                        'whether it is encrypted: {}'
                        .format(file_handle.name, exc))
        finally:
            try:
                if file_pos is not None:    # input was OleFileIO
                    file_handle.seek(file_pos)
                else:                       # input was file name
                    file_handle.close()
            except Exception as exc:
                log.warning('Ignoring error during clean up: {}'.format(exc))

    # if that failed, try ourselves with older and less accurate code
    try:
        if isinstance(some_file, OleFileIO):
            return _is_encrypted_ole(some_file)
        if zipfile.is_zipfile(some_file):
            return _is_encrypted_zip(some_file)
        # otherwise assume it is the name of an ole file
        with OleFileIO(some_file) as ole:
            return _is_encrypted_ole(ole)
    except Exception as exc:
        log.warning('Failed to check {} for encryption ({}); assume it is not '
                    'encrypted.'.format(some_file, exc))
    return False
def parse_md(filename):
    # Walk every stream of the OLE container and build a dict with the
    # parsed metadata tree under 'dds'.
    # NOTE(review): several remnants here are Python-2-only
    # (`tx.encode('hex')`, bare `mylog.exception()` with no message) and
    # would fail at runtime on Python 3 if those paths execute — confirm
    # the target interpreter.
    mylog.info(u'Начинаю чтение %s' % filename)
    m = {'dds': []}
    ole = OleFileIO(filename)
    # mylog.debug('OLE_DIRS: %s' % ole.listdir())
    m['entry'] = {}
    for entry in ole.listdir():
        mylog.debug(entry[0])
        #with open("stream_%s" % entry[0],'w+') as f:
        #    f.write(repr(entry))
        if entry[0] == 'Document':
            #print entry
            if "Dialog Stream" in entry:
                continue
            # Probe the stream: open/close it, logging any failure.
            try:
                sz = ole.get_size(entry)
                f = ole.openstream(entry)
                #print f.read(sz)
                f.close()
            except Exception as e:
                mylog.exception(repr(e.args))
            if "Container.Profile" in entry:
                continue
            try:
                sz = ole.get_size(entry)
                f = ole.openstream(entry)
                #print f.read(sz)
                f.close()
            except:
                mylog.exception()
            if "Container.Contents" in entry:
                continue
            sz = ole.get_size(entry)
            f = ole.openstream(entry)
            #print f.read(sz)
            f.close()
            if "MD Programm text" in entry:
                continue
            '''
            Пока что не работает, работало в прежних версиях python
            try:
                sz= ole.get_size(entry)
                f=ole.openstream(entry)
                tx= f.read(sz)
                f.close()
                #print zlib.compress('test').encode('hex')
                zi=zlib.decompress((zlib_head+tx))
                print zi
            except Exception,e:
                mylog.exception('read MD Programm text')
                mylog.info(tx[:10].encode('hex'))
                #print e
            '''
        if entry[0] == 'Metadata':
            if "Main MetaData Stream" in entry:
                # The main metadata stream is cp1251 text parsed into a tree.
                try:
                    #sz= ole.get_size(entry)
                    f = ole.openstream(entry)
                    tx = f.read()
                    f.close()
                    #print zlib.compress('test').encode('hex')
                    #d=zlib.decompressobj()
                    #zi=zlib.decompress(zlib_head+tx)
                    #tx_fixed = utils.fixunicode(tx,'cp1251')
                    #mylog.debug(tx.decode('cp1251'))
                    m['dds'] = ParseTree(tx.decode('cp1251', errors='ignore'))
                except Exception as e:
                    mylog.exception('parse metadata error')
        #if entry[0] == 'Journal':
        #write dumps
        if DUMP_META:
            if "MD Programm text" in entry:
                sz = ole.get_size(entry)
                f = ole.openstream(entry)
                tx = f.read(sz)
                f.close()
                hx = tx.encode('hex')
                if ztest.find(hx) > 0:
                    #print entry
                    #print hx
                    pass
                # Best-effort decompression; failure is silently ignored.
                try:
                    #zlib.compress("//test").encode('hex')
                    #'789c d3d72f492d2e0100 0664021f'
                    # d3d72f492d2e0100
                    tx = zlib.decompress(zlib_head + tx)
                    #print "MODULE:", tx
                    pass
                except Exception as e:
                    #print "size MD text:", sz, e.message
                    pass
                dump_stream("entry-%s" % entry, zlib_head + tx)
            else:
                dump_stream("entry-%s" % entry, ole.openstream(entry).read())
    return m
def from_path(path: Path) -> Dict[str, Any]:
    """Return the OLE summary metadata of *path* as a plain dict.

    Fix: previously returned ``__dict__.items()`` — a ``dict_items``
    view — which contradicts the declared ``Dict[str, Any]`` return
    type; a real dict is now returned.
    """
    with OleFileIO(path) as ole_file_io:
        # Copy so the result does not alias the metadata object's __dict__.
        return dict(ole_file_io.get_metadata().__dict__)
def __init__(
    self,
    path,
    prefix="",
    ole=None,
    filename=None,
    encoding=None,
    lazy=False,
):
    """
    Parse an Outlook .msg file (or an embedded message within one).

    :param path: path to the msg file in the system or is the raw msg file.
    :param prefix: used for extracting embeded msg files inside the main
        one. Do not set manually unless you know what you are doing.
    :param ole: optional, an already-opened OleFileIO to reuse instead of
        opening *path*.
    :param filename: optional, the filename to be used by default when
        saving.
    :param encoding: optional, overrides the encoding detected from the
        message properties.
    :param lazy: continue with extraction even if an attachment fails
    """
    self.path = path
    self.filename = filename
    if ole is None:
        ole = OleFileIO(path)
    self.ole = ole
    self.lazy = lazy
    # Parse the main props
    self.prefix = prefix
    # An empty prefix means this is the top-level (non-embedded) message.
    prop_type = constants.TYPE_MESSAGE_EMBED
    if self.prefix == "":
        prop_type = constants.TYPE_MESSAGE
    propdata = self._getStream("__properties_version1.0")
    self.mainProperties = Properties(propdata, prop_type)

    # Determine if the message is unicode-style:
    # PidTagStoreSupportMask
    self.is_unicode = False
    if "340D0003" in self.mainProperties:
        value = self.mainProperties["340D0003"].value
        self.is_unicode = (value & 0x40000) != 0

    # Encoding resolution: explicit argument, then OLE metadata codepage,
    # then PidTagMessageCodepage, finally a heuristic guess.
    self.encoding = encoding
    # if "66C30003" in self.mainProperties:
    #     # PidTagCodePageId
    #     codepage = self.mainProperties["66C30003"].value
    #     self.encoding = get_encoding(codepage, self.encoding)
    if self.encoding is None:
        metadata = ole.get_metadata()
        self.encoding = get_encoding(metadata.codepage)
    if "3FFD0003" in self.mainProperties:
        # PidTagMessageCodepage
        codepage = self.mainProperties["3FFD0003"].value
        self.encoding = get_encoding(codepage, self.encoding)
    if self.encoding is None:
        self.encoding = self.guessEncoding()
    log.debug("Message encoding: %s", self.encoding)

    # 0x0037 = PidTagSubject
    self.subject = self.getStringField("0037")
    self.date = self.mainProperties.date