def process_file(filepath, field_filter_mode=None):
    """
    Select the process_* function that fits the file and return its result.

    Checks are made in this order: OLE container (xls, otherwise doc), RTF
    (recognised by its first 4 bytes), then the doctype reported by
    ooxml.get_type. If no doctype could be determined the file is handed to
    the csv parser; every doctype other than excel is treated as docx.

    field_filter_mode is only forwarded to process_rtf and process_docx.
    """
    if olefile.isOleFile(filepath):
        log.debug('Is OLE. Checking streams to see whether this is xls')
        if not xls_parser.is_xls(filepath):
            log.debug('Process file as word 2003 (doc)')
            return process_doc(filepath)
        log.debug('Process file as excel 2003 (xls)')
        return process_xls(filepath)

    with open(filepath, 'rb') as rtf_candidate:
        magic = rtf_candidate.read(4)
        if magic == RTF_START:
            log.debug('Process file as rtf')
            # parsed while the handle is still open
            return process_rtf(rtf_candidate, field_filter_mode)

    # any failure during detection just means "type unknown"
    try:
        doctype = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        log.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        log.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    if doctype is None:
        log.debug('Process file as csv')
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    log.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
def test_all_rough(self):
    """
    Run ooxml.get_type over the sample files and check the doctype.

    Files listed by name, files with an excluded extension and OLE files
    are skipped. For every other file get_type must not raise and must
    return the excel, word or powerpoint doctype.
    """
    acceptable = (ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD,
                  ooxml.DOCTYPE_POWERPOINT)

    # files that are neither OLE nor xml:
    except_files = ('empty', 'text')
    except_extns = ('.xml', '.rtf', '.csv')

    # analyse all files in data dir
    for base_dir, _, files in os.walk(DATA_BASE_DIR):
        for filename in files:
            full_name = join(base_dir, filename)

            # find out whether (and why) this sample is left out
            if filename in except_files:
                skip_note = 'skip file: '
            elif splitext(filename)[1] in except_extns:
                skip_note = 'skip extn: '
            elif isOleFile(full_name):
                skip_note = 'skip ole: '
            else:
                skip_note = None

            if skip_note is not None:
                if self.DO_DEBUG:
                    print(skip_note + filename)
                continue

            try:
                doctype = ooxml.get_type(full_name)
            except Exception:
                self.fail('Failed to get doctype of {0}'.format(filename))

            is_acceptable = doctype in acceptable
            self.assertTrue(
                is_acceptable,
                msg='Doctype "{0}" for {1} not acceptable'.format(
                    doctype, full_name))
            if self.DO_DEBUG:
                print('ok: {0} --> {1}'.format(filename, doctype))
def process_file(filepath, field_filter_mode=None):
    """
    Dispatch a file to process_xls, process_doc, process_rtf, process_xlsx
    or process_docx and return that function's result.

    OLE files go to process_xls or process_doc, files starting with the RTF
    marker go to process_rtf. Everything else is typed with ooxml.get_type;
    excel goes to process_xlsx, any other (or unknown) type to process_docx.
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        ole_parser = process_xls if xls_parser.is_xls(filepath) \
            else process_doc
        return ole_parser(filepath)

    with open(filepath, 'rb') as stream:
        starts_like_rtf = stream.read(4) == RTF_START
        if starts_like_rtf:
            # This is a RTF file; parse it from the still-open handle
            return process_rtf(stream, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)
        doctype = None

    if doctype:
        log.debug('Detected file type: {0}'.format(doctype))

    # docx is the default for every type that is not excel
    if doctype == ooxml.DOCTYPE_EXCEL:
        xml_parser = process_xlsx
    else:
        xml_parser = process_docx
    return xml_parser(filepath, field_filter_mode)
def test_all_rough(self):
    """
    Run ooxml.get_type over the sample files and check the doctype.

    Files listed by name, files with an excluded extension and OLE files
    are skipped. For every other file get_type must not raise and must
    return the excel, word or powerpoint doctype.
    """
    acceptable = (ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD,
                  ooxml.DOCTYPE_POWERPOINT)
    except_files = ('empty', 'text')
    except_extns = ('.xml', '.rtf')

    for base_dir, _, files in os.walk(DATA_BASE_DIR):
        for filename in files:
            # excluded by name or by extension
            if filename in except_files \
                    or splitext(filename)[1] in except_extns:
                continue

            full_name = join(base_dir, filename)
            if isOleFile(full_name):
                # ole files are not handled by ooxml
                continue

            try:
                doctype = ooxml.get_type(full_name)
            except Exception:
                self.fail('Failed to get doctype of {0}'.format(filename))

            is_acceptable = doctype in acceptable
            self.assertTrue(
                is_acceptable,
                msg='Doctype "{0}" for {1} not acceptable'.format(
                    doctype, full_name))
def test_rough_doctype(self):
    """
    Check that ooxml.get_type agrees with the extension of each sample.

    Files listed by name, rtf/csv files and OLE files are skipped. For all
    other files the extension is looked up in a table of expected doctypes
    and the result of get_type has to be one of them.
    """
    # map from extension (without dot) to expected doctype; a tuple means
    # that each of the contained doctypes is acceptable
    ext2doc = {'xml': (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_WORD_XML)}
    for word_extn in ('docx', 'docm', 'dotx', 'dotm'):
        ext2doc[word_extn] = ooxml.DOCTYPE_WORD
    for excel_extn in ('xlsx', 'xlsm', 'xlsb', 'xlam', 'xltx', 'xltm'):
        ext2doc[excel_extn] = ooxml.DOCTYPE_EXCEL
    for ppt_extn in ('pptx', 'pptm', 'ppsx', 'ppsm', 'potx', 'potm'):
        ext2doc[ppt_extn] = ooxml.DOCTYPE_POWERPOINT

    # files that are neither OLE nor xml:
    except_files = ('empty', 'text')
    except_extns = ('rtf', 'csv')

    # analyse all files in data dir
    for base_dir, _, files in os.walk(DATA_BASE_DIR):
        for filename in files:
            full_name = join(base_dir, filename)
            # extension without the dot; empty if there is none
            extn = splitext(filename)[1][1:]

            # find out whether (and why) this sample is left out
            if filename in except_files:
                skip_note = 'skip file: '
            elif extn in except_extns:
                skip_note = 'skip extn: '
            elif isOleFile(full_name):
                skip_note = 'skip ole: '
            else:
                skip_note = None

            if skip_note is not None:
                if self.DO_DEBUG:
                    print(skip_note + filename)
                continue

            # an extension missing from the table raises KeyError here
            acceptable = ext2doc[extn]
            if not isinstance(acceptable, tuple):
                acceptable = (acceptable, )

            try:
                doctype = ooxml.get_type(full_name)
            except Exception:
                self.fail('Failed to get doctype of {0}'.format(filename))

            is_acceptable = doctype in acceptable
            self.assertTrue(
                is_acceptable,
                msg='Doctype "{0}" for {1} not acceptable'.format(
                    doctype, full_name))
            if self.DO_DEBUG:
                print('ok: {0} --> {1}'.format(filename, doctype))
def test_rough_doctype(self):
    """
    Check that ooxml.get_type agrees with the extension of each sample.

    Files listed by name, rtf/csv files and OLE files are skipped. For all
    other files the extension is looked up in a table of expected doctypes
    and the result of get_type has to be one of them.
    """
    # map from extension (without dot) to expected doctype; a tuple means
    # that each of the contained doctypes is acceptable
    ext2doc = {'xml': (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_WORD_XML)}
    for word_extn in ('docx', 'docm', 'dotx', 'dotm'):
        ext2doc[word_extn] = ooxml.DOCTYPE_WORD
    for excel_extn in ('xlsx', 'xlsm', 'xlsb', 'xlam', 'xltx', 'xltm'):
        ext2doc[excel_extn] = ooxml.DOCTYPE_EXCEL
    for ppt_extn in ('pptx', 'pptm', 'ppsx', 'ppsm', 'potx', 'potm'):
        ext2doc[ppt_extn] = ooxml.DOCTYPE_POWERPOINT
    for open_doc_extn in ('ods', 'odt', 'odp'):
        ext2doc[open_doc_extn] = ooxml.DOCTYPE_NONE

    # files that are neither OLE nor xml:
    except_files = ('empty', 'text')
    except_extns = ('rtf', 'csv')

    # analyse all files in data dir
    for base_dir, _, files in os.walk(DATA_BASE_DIR):
        for filename in files:
            full_name = join(base_dir, filename)
            # extension without the dot; empty if there is none
            extn = splitext(filename)[1][1:]

            # find out whether (and why) this sample is left out
            if filename in except_files:
                skip_note = 'skip file: '
            elif extn in except_extns:
                skip_note = 'skip extn: '
            elif isOleFile(full_name):
                skip_note = 'skip ole: '
            else:
                skip_note = None

            if skip_note is not None:
                if self.DO_DEBUG:
                    print(skip_note + filename)
                continue

            # an extension missing from the table raises KeyError here
            acceptable = ext2doc[extn]
            if not isinstance(acceptable, tuple):
                acceptable = (acceptable, )

            try:
                doctype = ooxml.get_type(full_name)
            except Exception:
                self.fail('Failed to get doctype of {0}'.format(filename))

            is_acceptable = doctype in acceptable
            self.assertTrue(
                is_acceptable,
                msg='Doctype "{0}" for {1} not acceptable'.format(
                    doctype, full_name))
            if self.DO_DEBUG:
                print('ok: {0} --> {1}'.format(filename, doctype))
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of the process_* functions to call and return its result.

    Checks are made in this order: OLE container (xls, encrypted,
    powerpoint, otherwise doc), RTF (recognised by its first 4 bytes), then
    the doctype reported by ooxml.get_type. If no doctype could be
    determined the file is handed to the csv parser; every doctype without
    a dedicated branch is treated as docx.

    field_filter_mode is only forwarded to process_rtf and to the final
    process_docx call.

    Returns an empty unicode string for powerpoint OLE files.
    Raises FileIsEncryptedError for encrypted OLE files.
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)

        # encrypted files also look like ole, even if office 2007+ (xml-based)
        # so check for encryption, first
        ole = olefile.OleFileIO(filepath, path_encoding=None)
        try:
            oid = oleid.OleID(ole)
            if oid.check_encrypted().value:
                logger.debug('is encrypted - raise error')
                raise FileIsEncryptedError(filepath)
            if oid.check_powerpoint().value:
                logger.debug('is ppt - cannot have DDE')
                return u''
            logger.debug('Process file as word 2003 (doc)')
            return process_doc(ole)
        finally:
            # release the handle on every exit path, including the raise
            # and the early return above
            ole.close()

    with open(filepath, 'rb') as file_handle:
        if file_handle.read(4) == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    # any failure during detection just means "type unknown"
    try:
        doctype = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    elif doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    elif doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        return process_docx(filepath)
    elif doctype is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)
    else:
        # could be docx; if not: this is the old default code path
        logger.debug('Process file as word 2007+ (docx)')
        return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of process_doc/x or process_xls/x to call.

    OLE files go to process_xls or process_doc. Everything else is typed
    with ooxml.get_type: excel goes to process_xlsx, every other type to
    process_docx. If the type detection itself fails, the file is treated
    as docx.

    Returns the result of the selected process_* function. Exceptions
    raised by that function are passed on to the caller.
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    # Guard only the type detection. Were the parsers inside the try as
    # well, an error from process_xlsx/process_docx would be swallowed and
    # the file would silently be parsed a second time as docx.
    try:
        doctype = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)
        return process_docx(filepath, field_filter_mode)

    log.debug('Detected file type: {0}'.format(doctype))
    if doctype == ooxml.DOCTYPE_EXCEL:
        return process_xlsx(filepath, field_filter_mode)
    return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of the process_* functions to call and return its result.

    Checks are made in this order: OLE container (xls, powerpoint,
    otherwise doc), RTF (recognised by its first 4 bytes), then the doctype
    reported by ooxml.get_type. If no doctype could be determined the file
    is handed to the csv parser; every doctype without a dedicated branch
    is treated as docx.

    field_filter_mode is only forwarded to process_rtf and to the final
    process_docx call.

    Returns an empty unicode string for powerpoint OLE files.
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)

        ole = olefile.OleFileIO(filepath, path_encoding=None)
        try:
            if is_ppt(ole):
                logger.debug('is ppt - cannot have DDE')
                return u''
            logger.debug('Process file as word 2003 (doc)')
            return process_doc(ole)
        finally:
            # release the handle on every exit path, including the early
            # return for powerpoint files
            ole.close()

    with open(filepath, 'rb') as file_handle:
        if file_handle.read(4) == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    # any failure during detection just means "type unknown"
    try:
        doctype = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        return process_docx(filepath)
    if doctype is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    logger.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Select the process_* function that fits the file and return its result.

    Checks are made in this order: OLE container (xls, powerpoint,
    otherwise doc), RTF (recognised by its first 4 bytes), then the doctype
    reported by ooxml.get_type. If no doctype could be determined the file
    is handed to the csv parser; every doctype without a dedicated branch
    is treated as docx.

    field_filter_mode is only forwarded to process_rtf and to the final
    process_docx call. Powerpoint OLE files yield an empty unicode string.
    """
    if olefile.isOleFile(filepath):
        logger.debug("Is OLE. Checking streams to see whether this is xls")
        if xls_parser.is_xls(filepath):
            logger.debug("Process file as excel 2003 (xls)")
            return process_xls(filepath)
        elif is_ppt(filepath):
            logger.debug("is ppt - cannot have DDE")
            return u""
        else:
            logger.debug("Process file as word 2003 (doc)")
            with olefile.OleFileIO(filepath, path_encoding=None) as ole_file:
                return process_doc(ole_file)

    # TODO: here we should not assume this is a file on disk, filepath can
    #       be a file object
    with open(filepath, "rb") as stream:
        starts_like_rtf = stream.read(4) == RTF_START
        if starts_like_rtf:
            logger.debug("Process file as rtf")
            return process_rtf(stream, field_filter_mode)

    # any failure during detection just means "type unknown"
    try:
        doctype = ooxml.get_type(filepath)
        logger.debug("Detected file type: {0}".format(doctype))
    except Exception as err:
        logger.debug("Exception trying to xml-parse file: {0}".format(err))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug("Process file as excel 2007+ (xlsx)")
        return process_xlsx(filepath)
    elif doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug("Process file as xml from excel 2003/2007+")
        return process_excel_xml(filepath)
    elif doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug("Process file as xml from word 2003/2007+")
        return process_docx(filepath)
    elif doctype is None:
        logger.debug("Process file as csv")
        return process_csv(filepath)
    else:
        # could be docx; if not: this is the old default code path
        logger.debug("Process file as word 2007+ (docx)")
        return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of process_doc/x or process_xls/x to call.

    OLE files go to process_xls or process_doc, files starting with the RTF
    marker go to process_rtf. Everything else is typed with ooxml.get_type:
    excel goes to process_xlsx, every other type to process_docx. If the
    type detection itself fails, the file is treated as docx.

    Returns the result of the selected process_* function. Exceptions
    raised by that function are passed on to the caller.
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    # read the magic bytes inside a with-block so the handle gets closed
    with open(filepath, 'rb') as file_handle:
        is_rtf = file_handle.read(4) == b'{\\rt'
    if is_rtf:
        # This is a RTF file
        return process_rtf(filepath, field_filter_mode)

    # Guard only the type detection. Were the parsers inside the try as
    # well, an error from process_xlsx/process_docx would be logged as an
    # xml-parse problem and the file would be parsed a second time as docx.
    try:
        doctype = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)
        return process_docx(filepath, field_filter_mode)

    log.debug('Detected file type: {0}'.format(doctype))
    if doctype == ooxml.DOCTYPE_EXCEL:
        return process_xlsx(filepath, field_filter_mode)
    return process_docx(filepath, field_filter_mode)
def run(self):
    """
    Classify the file of the current session and run the requested action.

    Returns early (after logging an error where applicable) if no arguments
    were parsed, no session is open, the OLE dependency is missing or the
    file matches none of the recognised office formats.

    The export, meta and streams actions only act on OLE and zip files and
    do nothing for the other recognised formats; oleid requires an OLE file.
    """
    super(Office, self).run()
    if self.args is None:
        return

    if not __sessions__.is_set():
        self.log('error', "No open session. This command expects a file to be open.")
        return

    if not HAVE_OLE:
        self.log('error', "Missing dependency, install OleFileIO (`pip install olefile oletools`)")
        return

    file_data = __sessions__.current.file.data
    file_path = __sessions__.current.file.path

    is_old_xml = file_data.startswith(b'<?xml')
    # file_data is bytes, so the needle has to be bytes as well; testing a
    # str against bytes raises TypeError on Python 3
    is_mht = (file_data.startswith(b'MIME-Version:')
              and b'application/x-mso' in file_data)

    # Check for old office formats
    try:
        doctype = ooxml.get_type(file_path)
        is_ooxml = True
    except Exception:
        is_ooxml = False

    # set defaults
    is_xlsx = False
    is_excel_xml = False
    is_docx = False
    if is_ooxml:
        if doctype == ooxml.DOCTYPE_EXCEL:
            is_xlsx = True
        elif doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
            is_excel_xml = True
        elif doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
            is_docx = True

    # Tests to check for valid Office structures.
    is_ole = olefile.isOleFile(file_path)
    is_zip = zipfile.is_zipfile(file_path)

    if is_ole:
        ole = olefile.OleFileIO(file_path)
    elif is_zip:
        zip_xml = zipfile.ZipFile(file_path, 'r')
    elif not any((is_old_xml, is_mht, is_docx, is_excel_xml, is_xlsx)):
        self.log('error', "Not a valid office document")
        return

    if self.args.export is not None:
        if is_ole:
            self.export(ole, self.args.export)
        elif is_zip:
            self.xml_export(zip_xml, self.args.export)
    elif self.args.meta:
        if is_ole:
            self.metadata(ole)
        elif is_zip:
            self.xmlmeta(zip_xml)
    elif self.args.streams:
        if is_ole:
            self.metatimes(ole)
        elif is_zip:
            self.xmlstruct(zip_xml)
    elif self.args.oleid:
        if is_ole:
            self.oleid(ole)
        else:
            self.log('error', "Not an OLE file")
    elif self.args.vba or self.args.code:
        self.parse_vba(self.args.code)
    elif self.args.dde:
        self.get_dde(file_path)
    else:
        self.log('error', 'At least one of the parameters is required')
        self.usage()
def run(self):
    """
    Classify the file of the current session and run the requested action.

    Returns early (after logging an error where applicable) if no arguments
    were parsed, no session is open, the OLE dependency is missing or the
    file matches none of the recognised office formats.

    The export, meta and streams actions only act on OLE and zip files and
    do nothing for the other recognised formats; oleid requires an OLE file.
    """
    super(Office, self).run()
    if self.args is None:
        return

    if not __sessions__.is_set():
        self.log(
            'error',
            "No open session. This command expects a file to be open.")
        return

    if not HAVE_OLE:
        self.log(
            'error',
            "Missing dependency, install OleFileIO (`pip install olefile oletools`)"
        )
        return

    file_data = __sessions__.current.file.data
    file_path = __sessions__.current.file.path

    is_old_xml = file_data.startswith(b'<?xml')
    # file_data is bytes, so the needle has to be bytes as well; testing a
    # str against bytes raises TypeError on Python 3
    is_mht = (file_data.startswith(b'MIME-Version:')
              and b'application/x-mso' in file_data)

    # Check for old office formats
    try:
        doctype = ooxml.get_type(file_path)
        is_ooxml = True
    except Exception:
        is_ooxml = False

    # set defaults
    is_xlsx = False
    is_excel_xml = False
    is_docx = False
    if is_ooxml:
        if doctype == ooxml.DOCTYPE_EXCEL:
            is_xlsx = True
        elif doctype in (ooxml.DOCTYPE_EXCEL_XML,
                         ooxml.DOCTYPE_EXCEL_XML2003):
            is_excel_xml = True
        elif doctype in (ooxml.DOCTYPE_WORD_XML,
                         ooxml.DOCTYPE_WORD_XML2003):
            is_docx = True

    # Tests to check for valid Office structures.
    is_ole = olefile.isOleFile(file_path)
    is_zip = zipfile.is_zipfile(file_path)

    if is_ole:
        ole = olefile.OleFileIO(file_path)
    elif is_zip:
        zip_xml = zipfile.ZipFile(file_path, 'r')
    elif not any((is_old_xml, is_mht, is_docx, is_excel_xml, is_xlsx)):
        self.log('error', "Not a valid office document")
        return

    if self.args.export is not None:
        if is_ole:
            self.export(ole, self.args.export)
        elif is_zip:
            self.xml_export(zip_xml, self.args.export)
    elif self.args.meta:
        if is_ole:
            self.metadata(ole)
        elif is_zip:
            self.xmlmeta(zip_xml)
    elif self.args.streams:
        if is_ole:
            self.metatimes(ole)
        elif is_zip:
            self.xmlstruct(zip_xml)
    elif self.args.oleid:
        if is_ole:
            self.oleid(ole)
        else:
            self.log('error', "Not an OLE file")
    elif self.args.vba or self.args.code:
        self.parse_vba(self.args.code)
    elif self.args.dde:
        self.get_dde(file_path)
    else:
        self.log('error', 'At least one of the parameters is required')
        self.usage()