Esempio n. 1
0
def process_file(filepath, field_filter_mode=None):
    """Dispatch the file at `filepath` to the matching process_* function.

    The checks run in this order: OLE container (xls, otherwise doc), rtf
    start bytes, then the type reported by ooxml.get_type. If type detection
    raises, the file is handed to process_csv.

    :param filepath: path of the file to inspect; it is opened from disk
    :param field_filter_mode: forwarded to process_rtf and process_docx only
    :returns: whatever the selected process_* function returns
    """
    # OLE containers are either treated as excel 2003 or as word 2003
    if olefile.isOleFile(filepath):
        log.debug('Is OLE. Checking streams to see whether this is xls')
        if not xls_parser.is_xls(filepath):
            log.debug('Process file as word 2003 (doc)')
            return process_doc(filepath)
        log.debug('Process file as excel 2003 (xls)')
        return process_xls(filepath)

    # rtf is recognised by its first 4 bytes; the open handle is passed on
    with open(filepath, 'rb') as stream:
        magic = stream.read(4)
        if magic == RTF_START:
            log.debug('Process file as rtf')
            return process_rtf(stream, field_filter_mode)

    # any failure during type detection is logged and mapped to "unknown"
    try:
        detected = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(detected))
    except Exception as error:
        log.debug('Exception trying to xml-parse file: {0}'.format(error))
        detected = None

    if detected == ooxml.DOCTYPE_EXCEL:
        log.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    if detected is None:
        log.debug('Process file as csv')
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    log.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
Esempio n. 2
0
def process_file(filepath, field_filter_mode=None):
    """Choose between process_xls, process_doc, process_rtf, process_xlsx
    and process_docx for the file at `filepath` and return the result.

    :param filepath: path of the file to inspect; it is opened from disk
    :param field_filter_mode: forwarded to process_rtf, process_xlsx and
                              process_docx
    :returns: whatever the selected process_* function returns
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        ole_handler = process_xls if xls_parser.is_xls(filepath) \
            else process_doc
        return ole_handler(filepath)

    with open(filepath, 'rb') as stream:
        magic = stream.read(4)
        if magic == RTF_START:
            # rtf start bytes found; the open handle is passed on
            return process_rtf(stream, field_filter_mode)

    # a failing type detection is logged and treated like "no type"
    try:
        detected = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)
        detected = None

    if detected:
        log.debug('Detected file type: {0}'.format(detected))

    # everything that is not excel goes down the docx path
    if detected == ooxml.DOCTYPE_EXCEL:
        xml_handler = process_xlsx
    else:
        xml_handler = process_docx
    return xml_handler(filepath, field_filter_mode)
Esempio n. 3
0
def process_file(filepath, field_filter_mode=None):
    """ decides which of the process_* functions to call

    Checks run in this order: OLE container (xls, encrypted, ppt, otherwise
    doc), rtf start bytes, then the type reported by ooxml.get_type. If type
    detection raises, the file is handed to process_csv.

    :param filepath: path of the file to inspect; it is opened from disk
    :param field_filter_mode: forwarded to process_rtf and to process_docx on
                              the default (docx) path only
    :returns: result of the selected process_* function; u'' for powerpoint
    :raises FileIsEncryptedError: if the OLE file is reported as encrypted
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)

        # encrypted files also look like ole, even if office 2007+ (xml-based)
        # so check for encryption, first
        ole = olefile.OleFileIO(filepath, path_encoding=None)
        try:
            oid = oleid.OleID(ole)
            if oid.check_encrypted().value:
                logger.debug('is encrypted - raise error')
                raise FileIsEncryptedError(filepath)
            if oid.check_powerpoint().value:
                logger.debug('is ppt - cannot have DDE')
                return u''
            logger.debug('Process file as word 2003 (doc)')
            # NOTE(review): assumes process_doc is done with `ole` once it
            # returns, so closing right afterwards is safe - confirm
            return process_doc(ole)
        finally:
            # release the underlying file on every exit path, including
            # the encrypted-file error and the powerpoint early return
            ole.close()

    with open(filepath, 'rb') as file_handle:
        if file_handle.read(4) == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    # any failure during type detection is logged and mapped to "unknown"
    try:
        doctype = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    elif doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    elif doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        return process_docx(filepath)
    elif doctype is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)
    else:  # could be docx; if not: this is the old default code path
        logger.debug('Process file as word 2007+ (docx)')
        return process_docx(filepath, field_filter_mode)
Esempio n. 4
0
def process_file(filepath, field_filter_mode=None):
    """ decides which of the process_* functions to call

    Checks run in this order: OLE container (xls, encrypted, ppt, otherwise
    doc), rtf start bytes, then the type reported by ooxml.get_type. If type
    detection raises, the file is handed to process_csv.

    :param filepath: path of the file to inspect; it is opened from disk
    :param field_filter_mode: forwarded to process_rtf and to process_docx on
                              the default (docx) path only
    :returns: result of the selected process_* function; u'' for powerpoint
    :raises FileIsEncryptedError: if the OLE file is reported as encrypted
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)

        # encrypted files also look like ole, even if office 2007+ (xml-based)
        # so check for encryption, first
        ole = olefile.OleFileIO(filepath, path_encoding=None)
        try:
            oid = oleid.OleID(ole)
            if oid.check_encrypted().value:
                logger.debug('is encrypted - raise error')
                raise FileIsEncryptedError(filepath)
            if oid.check_powerpoint().value:
                logger.debug('is ppt - cannot have DDE')
                return u''
            logger.debug('Process file as word 2003 (doc)')
            # NOTE(review): assumes process_doc is done with `ole` once it
            # returns, so closing right afterwards is safe - confirm
            return process_doc(ole)
        finally:
            # release the underlying file on every exit path, including
            # the encrypted-file error and the powerpoint early return
            ole.close()

    with open(filepath, 'rb') as file_handle:
        if file_handle.read(4) == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    # any failure during type detection is logged and mapped to "unknown"
    try:
        doctype = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    elif doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    elif doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        return process_docx(filepath)
    elif doctype is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)
    else:  # could be docx; if not: this is the old default code path
        logger.debug('Process file as word 2007+ (docx)')
        return process_docx(filepath, field_filter_mode)
Esempio n. 5
0
def process_file(filepath, field_filter_mode=None):
    """ decides which of process_doc/x or process_xls/x to call

    OLE files go to process_xls or process_doc. Everything else is typed
    with ooxml.get_type: excel goes to process_xlsx, any other result (or a
    failing type detection) goes to process_docx.

    :param filepath: path of the file to inspect
    :param field_filter_mode: forwarded to process_xlsx and process_docx
    :returns: whatever the selected process_* function returns
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    # Only the type detection is guarded: with the processing calls inside
    # the try, an error raised by process_docx was swallowed silently and
    # process_docx was then run a second time on the same file.
    try:
        doctype = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)
        doctype = None
    else:
        log.debug('Detected file type: {0}'.format(doctype))

    if doctype == ooxml.DOCTYPE_EXCEL:
        return process_xlsx(filepath, field_filter_mode)
    # unknown or undetectable type: fall back to docx processing
    return process_docx(filepath, field_filter_mode)
Esempio n. 6
0
def process_file(filepath, field_filter_mode=None):
    """ decides which of the process_* functions to call

    Checks run in this order: OLE container (xls, ppt, otherwise doc), rtf
    start bytes, then the type reported by ooxml.get_type. If type detection
    raises, the file is handed to process_csv.

    :param filepath: path of the file to inspect; it is opened from disk
    :param field_filter_mode: forwarded to process_rtf and to process_docx on
                              the default (docx) path only
    :returns: result of the selected process_* function; u'' for powerpoint
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)

        ole = olefile.OleFileIO(filepath, path_encoding=None)
        try:
            if is_ppt(ole):
                logger.debug('is ppt - cannot have DDE')
                return u''
            logger.debug('Process file as word 2003 (doc)')
            # NOTE(review): assumes process_doc is done with `ole` once it
            # returns, so closing right afterwards is safe - confirm
            return process_doc(ole)
        finally:
            # release the underlying file on every exit path, including
            # the powerpoint early return
            ole.close()

    with open(filepath, 'rb') as file_handle:
        if file_handle.read(4) == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    # any failure during type detection is logged and mapped to "unknown"
    try:
        doctype = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        return process_docx(filepath)
    if doctype is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)
    # could be docx; if not: this is the old default code path
    logger.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
Esempio n. 7
0
def process_file(filepath, field_filter_mode=None):
    """Pick the process_* function that fits the file and return its result.

    Checks run in this order: OLE container (xls, ppt, otherwise doc), rtf
    start bytes, then the type reported by ooxml.get_type. If type detection
    raises, the file is handed to process_csv.

    :param filepath: path of the file to inspect; it is opened from disk
    :param field_filter_mode: forwarded to process_rtf and to process_docx on
                              the default (docx) path only
    :returns: result of the selected process_* function; u"" for powerpoint
    """
    if olefile.isOleFile(filepath):
        logger.debug("Is OLE. Checking streams to see whether this is xls")
        if xls_parser.is_xls(filepath):
            logger.debug("Process file as excel 2003 (xls)")
            return process_xls(filepath)
        elif is_ppt(filepath):
            logger.debug("is ppt - cannot have DDE")
            return u""
        else:
            logger.debug("Process file as word 2003 (doc)")
            with olefile.OleFileIO(filepath, path_encoding=None) as ole_file:
                return process_doc(ole_file)

    with open(filepath, "rb") as stream:
        # TODO: filepath may be a file object, so opening it as a path on
        #       disk should not be taken for granted here
        magic = stream.read(4)
        if magic == RTF_START:
            logger.debug("Process file as rtf")
            return process_rtf(stream, field_filter_mode)

    # any failure during type detection is logged and mapped to "unknown"
    try:
        detected = ooxml.get_type(filepath)
        logger.debug("Detected file type: {0}".format(detected))
    except Exception as error:
        logger.debug("Exception trying to xml-parse file: {0}".format(error))
        detected = None

    if detected == ooxml.DOCTYPE_EXCEL:
        logger.debug("Process file as excel 2007+ (xlsx)")
        return process_xlsx(filepath)
    elif detected in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug("Process file as xml from excel 2003/2007+")
        return process_excel_xml(filepath)
    elif detected in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug("Process file as xml from word 2003/2007+")
        return process_docx(filepath)
    elif detected is None:
        logger.debug("Process file as csv")
        return process_csv(filepath)
    else:
        # could be docx; if not: this is the old default code path
        logger.debug("Process file as word 2007+ (docx)")
        return process_docx(filepath, field_filter_mode)
Esempio n. 8
0
def process_file(filepath, field_filter_mode=None):
    """ decides which of process_doc/x or process_xls/x to call

    OLE files go to process_xls or process_doc, files starting with the rtf
    start bytes go to process_rtf. Everything else is typed with
    ooxml.get_type: excel goes to process_xlsx, any other result (or a
    failing type detection) goes to process_docx.

    :param filepath: path of the file to inspect; it is opened from disk
    :param field_filter_mode: forwarded to process_rtf, process_xlsx and
                              process_docx
    :returns: whatever the selected process_* function returns
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    # read the start bytes inside a with-block so the handle is closed again
    # (the bare open(...).read(4) left the file open)
    with open(filepath, 'rb') as file_handle:
        is_rtf = file_handle.read(4) == b'{\\rt'
    if is_rtf:
        # This is a RTF file
        return process_rtf(filepath, field_filter_mode)

    # Only the type detection is guarded: with the processing calls inside
    # the try, an error raised by process_docx was caught and process_docx
    # was then run a second time on the same file.
    try:
        doctype = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)
        doctype = None
    else:
        log.debug('Detected file type: {0}'.format(doctype))

    if doctype == ooxml.DOCTYPE_EXCEL:
        return process_xlsx(filepath, field_filter_mode)
    # unknown or undetectable type: fall back to docx processing
    return process_docx(filepath, field_filter_mode)