def test_is_ppt(self):
    """
    Run ppt_record_parser.is_ppt(filename) on every file below DATA_BASE_DIR.

    Files whose extension is .ppt, .pps or .pot must be recognized as ppt,
    all other files must not. File names listed in `skip_names` are not
    checked at all.
    """
    skip_names = ['encrypted.ppt', ]     # actually is ppt but embedded
    ppt_extensions = ('.ppt', '.pps', '.pot')

    for base_dir, _, files in os.walk(DATA_BASE_DIR):
        for filename in files:
            if filename in skip_names:
                continue
            full_name = join(base_dir, filename)
            should_be_ppt = splitext(filename)[1] in ppt_extensions
            recognized = ppt_record_parser.is_ppt(full_name)
            if should_be_ppt:
                self.assertTrue(
                    recognized,
                    msg='{0} not recognized as ppt file'.format(full_name))
            else:
                self.assertFalse(
                    recognized,
                    msg='{0} erroneously recognized as ppt'
                        .format(full_name))
def test_is_ppt(self):
    """
    Check ppt_record_parser.is_ppt(filename) against the test data files.

    Walks DATA_BASE_DIR and expects is_ppt to return a true value exactly
    for files with extension .ppt, .pps or .pot. File names added to
    `exceptions` are skipped (the list is currently empty).
    """
    exceptions = []
    for base_dir, _, files in os.walk(DATA_BASE_DIR):
        for filename in files:
            if filename in exceptions:
                continue
            full_name = join(base_dir, filename)
            # pick the assertion and its failure message from the extension
            if splitext(filename)[1] in ('.ppt', '.pps', '.pot'):
                check = self.assertTrue
                template = '{0} not recognized as ppt file'
            else:
                check = self.assertFalse
                template = '{0} erroneously recognized as ppt'
            check(ppt_record_parser.is_ppt(full_name),
                  msg=template.format(full_name))
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of the process_* functions to call and return its result.

    Checks are done in this order: OLE container (xls, then ppt, everything
    else is handled as doc), RTF (first 4 bytes equal RTF_START), then the
    type reported by ooxml.get_type(). If no type could be determined
    (get_type raised or returned None) the file is processed as csv; any
    type that is not handled explicitly is processed as docx.

    :param filepath: name of the file to analyze; it is opened from disk
    :param field_filter_mode: forwarded to process_rtf and to the final
                              process_docx fallback only
    :returns: result of the selected process_* function; an empty unicode
              string for ppt files
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)

        ole = olefile.OleFileIO(filepath, path_encoding=None)
        try:
            if is_ppt(ole):
                logger.debug('is ppt - cannot have DDE')
                return u''
            logger.debug('Process file as word 2003 (doc)')
            return process_doc(ole)
        finally:
            # release the file handle on every path (ppt, doc, exception);
            # it used to stay open until garbage collection
            ole.close()

    with open(filepath, 'rb') as file_handle:
        if file_handle.read(4) == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        # best effort: a file that cannot be xml-parsed is tried as csv below
        logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        # NOTE(review): field_filter_mode is not forwarded here, unlike in
        # the docx fallback below -- confirm this is intended
        return process_docx(filepath)
    if doctype is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    logger.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Select the process_* function that fits the file and return its result.

    Detection order: OLE container (xls, then ppt, everything else is
    handled as doc), RTF by its first 4 bytes, then the type reported by
    ooxml.get_type(). If no type could be determined (get_type raised or
    returned None) the file is processed as csv; any type that is not
    handled explicitly is processed as docx.

    :param filepath: name of the file to analyze
    :param field_filter_mode: forwarded to process_rtf and to the docx
                              fallback at the end
    :returns: whatever the chosen process_* function returns; an empty
              unicode string for ppt files
    """
    if olefile.isOleFile(filepath):
        logger.debug("Is OLE. Checking streams to see whether this is xls")
        if xls_parser.is_xls(filepath):
            logger.debug("Process file as excel 2003 (xls)")
            return process_xls(filepath)
        elif is_ppt(filepath):
            logger.debug("is ppt - cannot have DDE")
            return u""
        else:
            logger.debug("Process file as word 2003 (doc)")
            with olefile.OleFileIO(filepath, path_encoding=None) as ole:
                return process_doc(ole)

    # TODO: here we should not assume this is a file on disk, filepath can
    #       be a file object
    with open(filepath, "rb") as stream:
        magic = stream.read(4)
        if magic == RTF_START:
            logger.debug("Process file as rtf")
            return process_rtf(stream, field_filter_mode)

    try:
        detected = ooxml.get_type(filepath)
        logger.debug("Detected file type: {0}".format(detected))
    except Exception as err:
        logger.debug("Exception trying to xml-parse file: {0}".format(err))
        detected = None

    if detected == ooxml.DOCTYPE_EXCEL:
        logger.debug("Process file as excel 2007+ (xlsx)")
        return process_xlsx(filepath)
    elif detected in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug("Process file as xml from excel 2003/2007+")
        return process_excel_xml(filepath)
    elif detected in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug("Process file as xml from word 2003/2007+")
        return process_docx(filepath)
    elif detected is None:
        logger.debug("Process file as csv")
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    logger.debug("Process file as word 2007+ (docx)")
    return process_docx(filepath, field_filter_mode)
def find_ole(filename, data, xml_parser=None):
    """
    Try to open somehow as zip/ole/rtf/... ; yield None if fail.

    If data is given, filename is (mostly) ignored: it is then only used in
    log messages.

    Yields embedded ole streams in form of OleFileIO. An OleFileIO found
    inside a zip is closed as soon as the generator is resumed after
    yielding it; the OleFileIO for the input itself is closed when the
    generator ends. None is yielded for an encrypted zip entry, for input
    that is neither zip nor OLE and after an unexpected exception.

    :param filename: name of the file to inspect
    :param data: file contents or None; if given, it is used instead of
                 reading the file
    :param xml_parser: optional XmlParser for the input; if None, one is
                       created here and its xml parts are iterated once
    """
    if data is not None:
        # isOleFile and is_ppt can work on data directly but zip need file
        # --> wrap data in a file-like object without copying data
        log.debug('working on data, file is not touched below')
        arg_for_ole = data
        arg_for_zip = FakeFile(data)
    else:
        # we only have a file name
        log.debug('working on file by name')
        arg_for_ole = filename
        arg_for_zip = filename

    ole = None
    try:
        if olefile.isOleFile(arg_for_ole):
            if is_ppt(arg_for_ole):
                log.info('is ppt file: ' + filename)
                for ole in find_ole_in_ppt(arg_for_ole):
                    yield ole
                    ole = None   # is closed in find_ole_in_ppt
            # in any case: check for embedded stuff in non-sectored streams
            log.info('is ole file: ' + filename)
            ole = olefile.OleFileIO(arg_for_ole)
            yield ole
        elif xml_parser is not None or is_zipfile(arg_for_zip):
            # keep compatibility with 3rd-party code that calls this function
            # directly without providing an XmlParser instance
            if xml_parser is None:
                xml_parser = XmlParser(arg_for_zip)
                # force iteration so XmlParser.iter_non_xml() returns data;
                # drain the iterator without collecting the parsed items
                for _ in xml_parser.iter_xml():
                    pass

            log.info('is zip file: ' + filename)
            # we looped through the XML files before, now we can
            # iterate the non-XML files looking for ole objects
            for subfile, _, file_handle in xml_parser.iter_non_xml():
                try:
                    head = file_handle.read(len(olefile.MAGIC))
                except RuntimeError:
                    log.error('zip is encrypted: ' + filename)
                    yield None
                    continue

                if head == olefile.MAGIC:
                    file_handle.seek(0)
                    log.info(' unzipping ole: ' + subfile)
                    try:
                        ole = olefile.OleFileIO(file_handle)
                        yield ole
                    except IOError:
                        log.warning('Error reading data from {0}/{1} or '
                                    'interpreting it as OLE object'.format(
                                        filename, subfile))
                        log.debug('', exc_info=True)
                    finally:
                        # close before moving on to the next zip entry
                        if ole is not None:
                            ole.close()
                            ole = None
                else:
                    log.debug('unzip skip: ' + subfile)
        else:
            log.warning(
                'open failed: {0} (or its data) is neither zip nor OLE'.format(
                    filename))
            yield None
    except Exception:
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None
    finally:
        if ole is not None:
            ole.close()
def find_ole(filename, data):
    """
    Try to open somehow as zip/ole/rtf/... ; yield None if fail.

    If data is given, filename is (mostly) ignored: it is then only used in
    log messages.

    Yields embedded ole streams in form of OleFileIO. An OleFileIO found
    inside a zip is closed as soon as the generator is resumed after
    yielding it; the OleFileIO for the input itself is closed when the
    generator ends. None is yielded for an encrypted zip entry, for input
    that is neither zip nor OLE and after an unexpected exception.

    :param filename: name of the file to inspect
    :param data: file contents or None; if given, it is used instead of
                 reading the file
    """
    if data is not None:
        # isOleFile and is_ppt can work on data directly but zip need file
        # --> wrap data in a file-like object without copying data
        log.debug('working on data, file is not touched below')
        arg_for_ole = data
        arg_for_zip = FakeFile(data)
    else:
        # we only have a file name
        log.debug('working on file by name')
        arg_for_ole = filename
        arg_for_zip = filename

    ole = None
    try:
        if olefile.isOleFile(arg_for_ole):
            if is_ppt(arg_for_ole):
                log.info('is ppt file: ' + filename)
                for ole in find_ole_in_ppt(arg_for_ole):
                    yield ole
                    ole = None   # is closed in find_ole_in_ppt
            # in any case: check for embedded stuff in non-sectored streams
            log.info('is ole file: ' + filename)
            ole = olefile.OleFileIO(arg_for_ole)
            yield ole
        elif is_zipfile(arg_for_zip):
            log.info('is zip file: ' + filename)
            # context manager makes sure the archive is closed again when
            # the generator finishes, is closed or raises
            with ZipFile(arg_for_zip, 'r') as zipper:
                for subfile in zipper.namelist():
                    head = b''
                    try:
                        with zipper.open(subfile) as file_handle:
                            head = file_handle.read(len(olefile.MAGIC))
                    except RuntimeError:
                        log.error('zip is encrypted: ' + filename)
                        yield None
                        continue

                    if head == olefile.MAGIC:
                        log.info(' unzipping ole: ' + subfile)
                        with ZipSubFile(zipper, subfile) as file_handle:
                            try:
                                ole = olefile.OleFileIO(file_handle)
                                yield ole
                            except IOError:
                                log.warning(
                                    'Error reading data from {0}/{1} or '
                                    'interpreting it as OLE object'.format(
                                        filename, subfile))
                                log.debug('', exc_info=True)
                            finally:
                                # close before the sub file goes away
                                if ole is not None:
                                    ole.close()
                                    ole = None
                    else:
                        log.debug('unzip skip: ' + subfile)
        else:
            log.warning(
                'open failed: {0} (or its data) is neither zip nor OLE'.format(
                    filename))
            yield None
    except Exception:
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None
    finally:
        if ole is not None:
            ole.close()
def find_ole(filename, data, xml_parser=None):
    """
    Try to open somehow as zip/ole/rtf/... ; yield None if fail.

    If data is given, filename is (mostly) ignored: it is then only used in
    log messages.

    Yields embedded ole streams in form of OleFileIO. An OleFileIO found
    inside a zip is closed as soon as the generator is resumed after
    yielding it; the OleFileIO for the input itself is closed when the
    generator ends. None is yielded for an encrypted zip entry, for input
    that is neither zip nor OLE and after an unexpected exception.

    :param filename: name of the file to inspect
    :param data: file contents or None; if given, it is used instead of
                 reading the file
    :param xml_parser: optional XmlParser for the input; if None, one is
                       created here and its xml parts are iterated once
    """
    if data is not None:
        # isOleFile and is_ppt can work on data directly but zip need file
        # --> wrap data in a file-like object without copying data
        log.debug('working on data, file is not touched below')
        arg_for_ole = data
        arg_for_zip = FakeFile(data)
    else:
        # we only have a file name
        log.debug('working on file by name')
        arg_for_ole = filename
        arg_for_zip = filename

    ole = None
    try:
        if olefile.isOleFile(arg_for_ole):
            if is_ppt(arg_for_ole):
                log.info('is ppt file: ' + filename)
                for ole in find_ole_in_ppt(arg_for_ole):
                    yield ole
                    ole = None   # is closed in find_ole_in_ppt
            # in any case: check for embedded stuff in non-sectored streams
            log.info('is ole file: ' + filename)
            ole = olefile.OleFileIO(arg_for_ole)
            yield ole
        elif xml_parser is not None or is_zipfile(arg_for_zip):
            # keep compatibility with 3rd-party code that calls this function
            # directly without providing an XmlParser instance
            if xml_parser is None:
                xml_parser = XmlParser(arg_for_zip)
                # force iteration so XmlParser.iter_non_xml() returns data;
                # the parsed items themselves are not needed, so they are
                # discarded instead of being collected in a list
                for _ in xml_parser.iter_xml():
                    pass

            log.info('is zip file: ' + filename)
            # we looped through the XML files before, now we can
            # iterate the non-XML files looking for ole objects
            for subfile, _, file_handle in xml_parser.iter_non_xml():
                try:
                    head = file_handle.read(len(olefile.MAGIC))
                except RuntimeError:
                    log.error('zip is encrypted: ' + filename)
                    yield None
                    continue

                if head == olefile.MAGIC:
                    file_handle.seek(0)
                    log.info(' unzipping ole: ' + subfile)
                    try:
                        ole = olefile.OleFileIO(file_handle)
                        yield ole
                    except IOError:
                        log.warning('Error reading data from {0}/{1} or '
                                    'interpreting it as OLE object'
                                    .format(filename, subfile))
                        log.debug('', exc_info=True)
                    finally:
                        # close before moving on to the next zip entry
                        if ole is not None:
                            ole.close()
                            ole = None
                else:
                    log.debug('unzip skip: ' + subfile)
        else:
            log.warning('open failed: {0} (or its data) is neither zip nor OLE'
                        .format(filename))
            yield None
    except Exception:
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None
    finally:
        if ole is not None:
            ole.close()