def process_file(filepath, field_filter_mode=None):
    """
    Dispatch `filepath` to the process_* function matching its file type.

    Checks are tried in this order:

    1. OLE container: xls if xls_parser recognises it, otherwise doc.
    2. RTF: the first 4 bytes equal RTF_START.
    3. Type reported by ooxml.get_type: xlsx for DOCTYPE_EXCEL, csv when
       get_type raised or returned None, docx for everything else.

    `field_filter_mode` is forwarded to process_rtf and process_docx only.
    Returns whatever the selected process_* function returns.
    """
    if olefile.isOleFile(filepath):
        log.debug('Is OLE. Checking streams to see whether this is xls')
        if not xls_parser.is_xls(filepath):
            log.debug('Process file as word 2003 (doc)')
            return process_doc(filepath)
        log.debug('Process file as excel 2003 (xls)')
        return process_xls(filepath)

    with open(filepath, 'rb') as rtf_candidate:
        magic = rtf_candidate.read(4)
        if magic == RTF_START:
            log.debug('Process file as rtf')
            # the open handle (already advanced by 4 bytes) is handed over
            return process_rtf(rtf_candidate, field_filter_mode)

    try:
        detected = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(detected))
    except Exception as err:
        # any failure of the type detection means "type unknown"
        log.debug('Exception trying to xml-parse file: {0}'.format(err))
        detected = None

    if detected == ooxml.DOCTYPE_EXCEL:
        log.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    if detected is None:
        log.debug('Process file as csv')
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    log.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Pick the process_doc/x, process_xls/x or process_rtf handler for a file.

    OLE files go to process_xls or process_doc, files starting with
    RTF_START go to process_rtf, and everything else is handed to
    process_xlsx (if ooxml.get_type reports DOCTYPE_EXCEL) or process_docx.
    A failing ooxml.get_type is logged and treated like "no type detected",
    which ends up in process_docx.

    Returns whatever the selected handler returns.
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        ole_handler = process_xls if xls_parser.is_xls(filepath) else process_doc
        return ole_handler(filepath)

    with open(filepath, 'rb') as stream:
        looks_like_rtf = stream.read(4) == RTF_START
        if looks_like_rtf:
            # This is a RTF file; the open stream itself is passed on
            return process_rtf(stream, field_filter_mode)

    doctype = None
    try:
        doctype = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)

    if doctype:
        log.debug('Detected file type: {0}'.format(doctype))

    if doctype == ooxml.DOCTYPE_EXCEL:
        ooxml_handler = process_xlsx
    else:
        ooxml_handler = process_docx
    return ooxml_handler(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of the process_* functions to call for `filepath`.

    Checks are tried in this order:

    1. OLE container: xls, then encrypted (error), then ppt (empty result),
       otherwise doc.
    2. RTF: the first 4 bytes equal RTF_START.
    3. Type reported by ooxml.get_type: xlsx, excel xml, word xml, csv when
       get_type raised or returned None, docx for everything else.

    :param filepath: path of the file to inspect and process
    :param field_filter_mode: forwarded to process_rtf and to the final
                              process_docx fallback
    :returns: result of the selected process_* function; `u''` for ppt
    :raises FileIsEncryptedError: if the OLE file is flagged as encrypted
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)

        # encrypted files also look like ole, even if office 2007+ (xml-based)
        # so check for encryption, first
        ole = olefile.OleFileIO(filepath, path_encoding=None)
        try:
            oid = oleid.OleID(ole)
            if oid.check_encrypted().value:
                logger.debug('is encrypted - raise error')
                raise FileIsEncryptedError(filepath)
            if oid.check_powerpoint().value:
                logger.debug('is ppt - cannot have DDE')
                return u''
            logger.debug('Process file as word 2003 (doc)')
            return process_doc(ole)
        finally:
            # release the file handle on every exit path (raise or return)
            ole.close()

    with open(filepath, 'rb') as file_handle:
        if file_handle.read(4) == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        # any failure of the type detection means "type unknown"
        logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        # NOTE(review): field_filter_mode is not forwarded on this path --
        # confirm this is intended
        return process_docx(filepath)
    if doctype is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    logger.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of process_doc/x or process_xls/x to call for `filepath`.

    OLE files go to process_xls or process_doc. Everything else is handed
    to process_xlsx (if ooxml.get_type reports DOCTYPE_EXCEL) or
    process_docx. If type detection or that processing raises, the
    exception is logged and the file is (re-)processed with process_docx.

    :param filepath: path of the file to inspect and process
    :param field_filter_mode: forwarded to process_xlsx / process_docx
    :returns: result of the selected process_* function
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    try:
        doctype = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(doctype))
        if doctype == ooxml.DOCTYPE_EXCEL:
            return process_xlsx(filepath, field_filter_mode)
        return process_docx(filepath, field_filter_mode)
    except Exception:
        # best-effort fallback is kept, but the cause is no longer dropped
        # silently: record it (with traceback) before retrying as docx
        log.debug('Exception trying to xml-parse file', exc_info=True)
        return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of the process_* functions to call for `filepath`.

    Checks are tried in this order:

    1. OLE container: xls, then ppt (empty result), otherwise doc.
    2. RTF: the first 4 bytes equal RTF_START.
    3. Type reported by ooxml.get_type: xlsx, excel xml, word xml, csv when
       get_type raised or returned None, docx for everything else.

    :param filepath: path of the file to inspect and process
    :param field_filter_mode: forwarded to process_rtf and to the final
                              process_docx fallback
    :returns: result of the selected process_* function; `u''` for ppt
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)

        ole = olefile.OleFileIO(filepath, path_encoding=None)
        try:
            if is_ppt(ole):
                logger.debug('is ppt - cannot have DDE')
                return u''
            logger.debug('Process file as word 2003 (doc)')
            return process_doc(ole)
        finally:
            # release the file handle on every exit path
            ole.close()

    with open(filepath, 'rb') as file_handle:
        if file_handle.read(4) == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        # any failure of the type detection means "type unknown"
        logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        # NOTE(review): field_filter_mode is not forwarded on this path --
        # confirm this is intended
        return process_docx(filepath)
    if doctype is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    logger.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Route `filepath` to the matching process_* function.

    OLE files are handled first (xls, ppt yields an empty string, otherwise
    doc), then RTF (first 4 bytes equal RTF_START), then the type reported
    by ooxml.get_type decides between xlsx, excel xml, word xml, csv (type
    unknown) and the docx default.

    `field_filter_mode` is forwarded to process_rtf and to the final
    process_docx fallback only.
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)
        elif is_ppt(filepath):
            logger.debug('is ppt - cannot have DDE')
            return u''
        else:
            logger.debug('Process file as word 2003 (doc)')
            with olefile.OleFileIO(filepath, path_encoding=None) as ole_file:
                doc_result = process_doc(ole_file)
            return doc_result

    # TODO: here we should not assume this is a file on disk, filepath can
    #       be a file object
    with open(filepath, 'rb') as stream:
        magic = stream.read(4)
        if magic == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(stream, field_filter_mode)

    try:
        detected = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(detected))
    except Exception as err:
        # a failing type detection is treated as "type unknown"
        logger.debug('Exception trying to xml-parse file: {0}'.format(err))
        detected = None

    if detected == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    elif detected in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    elif detected in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        return process_docx(filepath)
    elif detected is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)
    else:
        # could be docx; if not: this is the old default code path
        logger.debug('Process file as word 2007+ (docx)')
        return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of process_doc/x, process_xls/x or process_rtf to call.

    OLE files go to process_xls or process_doc, files whose first 4 bytes
    are the RTF magic go to process_rtf, and everything else is handed to
    process_xlsx (if ooxml.get_type reports DOCTYPE_EXCEL) or process_docx.
    If type detection or that processing raises, the exception is logged
    and the file is (re-)processed with process_docx.

    :param filepath: path of the file to inspect and process
    :param field_filter_mode: forwarded to process_rtf / process_xlsx /
                              process_docx
    :returns: result of the selected process_* function
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    # read the magic bytes inside a context manager so the file handle is
    # closed right away instead of being left to the garbage collector
    with open(filepath, 'rb') as file_handle:
        is_rtf = file_handle.read(4) == b'{\\rt'
    if is_rtf:
        # This is a RTF file
        return process_rtf(filepath, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(doctype))
        if doctype == ooxml.DOCTYPE_EXCEL:
            return process_xlsx(filepath, field_filter_mode)
        return process_docx(filepath, field_filter_mode)
    except Exception:
        # best-effort: log the cause and fall back to the docx code path
        log.debug('Exception trying to xml-parse file', exc_info=True)
        return process_docx(filepath, field_filter_mode)