Example #1
0
def process_file(filepath, field_filter_mode=None):
    """ dispatch filepath to the process_* function matching its file type

    OLE files are handed to process_xls or process_doc, files that start with
    RTF_START to process_rtf. Everything else is treated as ooxml:
    process_xlsx if the detected type is excel, process_docx otherwise (also
    when the type could not be determined).

    Returns whatever the selected process_* function returns.
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        ole_handler = process_xls if xls_parser.is_xls(filepath) \
            else process_doc
        return ole_handler(filepath)

    with open(filepath, 'rb') as file_handle:
        starts_like_rtf = file_handle.read(4) == RTF_START
        if starts_like_rtf:
            # rtf is processed from the still-open handle, not from the path
            return process_rtf(file_handle, field_filter_mode)

    # not being able to determine the type is not fatal: fall back to docx
    doctype = None
    try:
        doctype = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)

    if doctype:
        log.debug('Detected file type: {0}'.format(doctype))

    if doctype == ooxml.DOCTYPE_EXCEL:
        return process_xlsx(filepath, field_filter_mode)
    return process_docx(filepath, field_filter_mode)
Example #2
0
def process_file(filepath, field_filter_mode=None):
    """ pick the process_* function that fits the type of the given file

    Order of checks: OLE (xls or doc), rtf (first 4 bytes equal RTF_START),
    then ooxml type detection. An excel ooxml type goes to process_xlsx, an
    undetectable type is processed as csv and every other type as docx.

    Returns whatever the selected process_* function returns.
    """
    if olefile.isOleFile(filepath):
        log.debug('Is OLE. Checking streams to see whether this is xls')
        if not xls_parser.is_xls(filepath):
            log.debug('Process file as word 2003 (doc)')
            return process_doc(filepath)
        log.debug('Process file as excel 2003 (xls)')
        return process_xls(filepath)

    with open(filepath, 'rb') as file_handle:
        magic = file_handle.read(4)
        if magic == RTF_START:
            log.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(doctype))
    except Exception as error:
        # type detection failed; remember that so csv is tried below
        doctype = None
        log.debug('Exception trying to xml-parse file: {0}'.format(error))

    if doctype == ooxml.DOCTYPE_EXCEL:
        log.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)

    if doctype is None:
        log.debug('Process file as csv')
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    log.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
Example #3
0
    def test_all_rough(self):
        """Run ooxml.get_type on every non-OLE sample and check the result

        Samples listed in the exception name/extension tuples and all OLE
        files are skipped. For every other file below DATA_BASE_DIR the
        doctype must be determined without exception and be one of excel,
        word or powerpoint.
        """
        acceptable = (ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD,
                      ooxml.DOCTYPE_POWERPOINT)

        # samples that are neither OLE nor xml, by name and by extension
        except_files = ('empty', 'text')
        except_extns = ('.xml', '.rtf', '.csv')

        # walk over the complete data dir
        for base_dir, _, files in os.walk(DATA_BASE_DIR):
            for filename in files:
                full_name = join(base_dir, filename)

                # find out whether (and why) this sample is not checked
                if filename in except_files:
                    skip_reason = 'skip file: '
                elif splitext(filename)[1] in except_extns:
                    skip_reason = 'skip extn: '
                elif isOleFile(full_name):
                    skip_reason = 'skip ole: '
                else:
                    skip_reason = None

                if skip_reason is not None:
                    if self.DO_DEBUG:
                        print(skip_reason + filename)
                    continue

                try:
                    doctype = ooxml.get_type(full_name)
                except Exception:
                    self.fail('Failed to get doctype of {0}'.format(filename))

                is_acceptable = doctype in acceptable
                self.assertTrue(
                    is_acceptable,
                    msg='Doctype "{0}" for {1} not acceptable'.format(
                        doctype, full_name))
                if self.DO_DEBUG:
                    print('ok: {0} --> {1}'.format(filename, doctype))
Example #4
0
    def test_rough_doctype(self):
        """Check that ooxml.get_type agrees with the extension of each sample

        Samples listed in the exception name/extension tuples and all OLE
        files are skipped. For every other file below DATA_BASE_DIR the
        detected doctype must be (one of) the doctype(s) registered for the
        file's extension.
        """
        # expected doctype per extension (without dot); a tuple means that
        # any of its entries is fine
        ext2doc = {
            'docx': ooxml.DOCTYPE_WORD,
            'docm': ooxml.DOCTYPE_WORD,
            'dotx': ooxml.DOCTYPE_WORD,
            'dotm': ooxml.DOCTYPE_WORD,
            'xml': (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_WORD_XML),
            'xlsx': ooxml.DOCTYPE_EXCEL,
            'xlsm': ooxml.DOCTYPE_EXCEL,
            'xlsb': ooxml.DOCTYPE_EXCEL,
            'xlam': ooxml.DOCTYPE_EXCEL,
            'xltx': ooxml.DOCTYPE_EXCEL,
            'xltm': ooxml.DOCTYPE_EXCEL,
            'pptx': ooxml.DOCTYPE_POWERPOINT,
            'pptm': ooxml.DOCTYPE_POWERPOINT,
            'ppsx': ooxml.DOCTYPE_POWERPOINT,
            'ppsm': ooxml.DOCTYPE_POWERPOINT,
            'potx': ooxml.DOCTYPE_POWERPOINT,
            'potm': ooxml.DOCTYPE_POWERPOINT,
        }

        # samples that are neither OLE nor xml, by name and by extension
        except_files = ('empty', 'text')
        except_extns = ('rtf', 'csv')

        # walk over the complete data dir
        for base_dir, _, files in os.walk(DATA_BASE_DIR):
            for filename in files:
                if filename in except_files:
                    if self.DO_DEBUG:
                        print('skip file: ' + filename)
                    continue

                # extension without the leading dot ('' if there is none)
                extn = splitext(filename)[1][1:]
                if extn in except_extns:
                    if self.DO_DEBUG:
                        print('skip extn: ' + filename)
                    continue

                full_name = join(base_dir, filename)
                if isOleFile(full_name):
                    if self.DO_DEBUG:
                        print('skip ole: ' + filename)
                    continue

                # normalize the expectation to a tuple of doctypes
                expected = ext2doc[extn]
                acceptable = expected if isinstance(expected, tuple) \
                    else (expected, )

                try:
                    doctype = ooxml.get_type(full_name)
                except Exception:
                    self.fail('Failed to get doctype of {0}'.format(filename))

                is_acceptable = doctype in acceptable
                self.assertTrue(
                    is_acceptable,
                    msg='Doctype "{0}" for {1} not acceptable'.format(
                        doctype, full_name))
                if self.DO_DEBUG:
                    print('ok: {0} --> {1}'.format(filename, doctype))
Example #5
0
    def test_all_rough(self):
        """Run ooxml.get_type on every non-OLE sample and check the result

        Files named 'empty' or 'text', files with extension .xml or .rtf and
        all OLE files are skipped. For every other file below DATA_BASE_DIR
        the doctype must be determined without exception and be one of
        excel, word or powerpoint.
        """
        acceptable = (ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD,
                      ooxml.DOCTYPE_POWERPOINT)
        except_files = ('empty', 'text')
        except_extns = ('.xml', '.rtf')

        for base_dir, _, files in os.walk(DATA_BASE_DIR):
            for filename in files:
                full_name = join(base_dir, filename)

                # cheap name checks first; the file is only opened for the
                # OLE check if it was not skipped by name or extension
                is_skipped = (filename in except_files
                              or splitext(filename)[1] in except_extns
                              or isOleFile(full_name))
                if is_skipped:
                    continue

                try:
                    doctype = ooxml.get_type(full_name)
                except Exception:
                    self.fail('Failed to get doctype of {0}'.format(filename))

                is_acceptable = doctype in acceptable
                self.assertTrue(
                    is_acceptable,
                    msg='Doctype "{0}" for {1} not acceptable'.format(
                        doctype, full_name))
Example #6
0
def process_file(filepath, field_filter_mode=None):
    """ decides which of process_doc/x or process_xls/x to call

    OLE files are handed to process_xls or process_doc. Everything else is
    treated as ooxml: process_xlsx if the detected type is excel,
    process_docx otherwise.

    If determining the type or processing the ooxml file raises an
    exception, the exception is logged (debug level, with traceback) and
    process_docx is tried as a last resort.

    Returns whatever the selected process_* function returns.
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    try:
        doctype = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(doctype))
        if doctype == ooxml.DOCTYPE_EXCEL:
            return process_xlsx(filepath, field_filter_mode)
        return process_docx(filepath, field_filter_mode)
    except Exception:
        # keep the best-effort fallback, but do not swallow the error
        # silently: without this there is no trace of why the fallback ran
        log.debug('Exception processing file as ooxml, falling back to docx',
                  exc_info=True)
        return process_docx(filepath, field_filter_mode)
Example #7
0
def test(*filenames):
    """ log the rough structure of all given files

    Sets up debug logging, then logs every stream of each OLE file and, for
    workbook streams, every record. Files that are not OLE are skipped.

    Returns 2 if no file name was given, 0 otherwise.
    """
    logging.basicConfig(level=logging.DEBUG)
    if not filenames:
        logging.info('need file name[s]')
        return 2

    for name in filenames:
        logging.info('checking file {0}'.format(name))
        if olefile.isOleFile(name):
            xls = XlsFile(name)
            for stream in xls.get_streams():
                logging.info(stream)
                # only workbook streams are parsed down to record level
                if not isinstance(stream, WorkbookStream):
                    continue
                for record in stream.iter_records():
                    logging.info('  {0}'.format(record))
        else:
            logging.info('not an ole file - skip')
    return 0
Example #8
0
def test(*filenames):
    """ parse the given files and log what streams and records they contain

    Debug logging is enabled first. Non-OLE files are skipped with a log
    message; for OLE files each stream is logged and workbook streams are
    additionally listed record by record.

    Returns 2 if called without file names, 0 otherwise.
    """
    logging.basicConfig(level=logging.DEBUG)

    if not filenames:
        logging.info('need file name[s]')
        return 2

    for current_file in filenames:
        logging.info('checking file {0}'.format(current_file))

        is_ole = olefile.isOleFile(current_file)
        if not is_ole:
            logging.info('not an ole file - skip')
            continue

        xls = XlsFile(current_file)
        for stream in xls.get_streams():
            logging.info(stream)
            has_records = isinstance(stream, WorkbookStream)
            if has_records:
                for record in stream.iter_records():
                    logging.info('  {0}'.format(record))

    return 0
Example #9
0
 def check(self):
     """
     Run all checks on self.filename and collect the results.

     An 'ole_format' indicator is always appended to self.indicators; its
     value is set to False if the file is not an OLE file, in which case no
     further check is run. Otherwise the file is opened, all check_* methods
     are run and the file is closed again, even if one of the checks raises.

     :returns: self.indicators
     """
     # check if it is actually an OLE file:
     oleformat = Indicator('ole_format', True, name='OLE format')
     self.indicators.append(oleformat)
     if not olefile.isOleFile(self.filename):
         oleformat.value = False
         return self.indicators
     # parse file:
     self.ole = olefile.OleFileIO(self.filename)
     try:
         # checks:
         self.check_properties()
         self.check_encrypted()
         self.check_word()
         self.check_excel()
         self.check_powerpoint()
         self.check_visio()
         self.check_ObjectPool()
         self.check_flash()
     finally:
         # do not leak the open file if one of the checks fails
         self.ole.close()
     return self.indicators
Example #10
0
 def check(self):
     """
     Check the file given by self.filename and return all indicators found.

     First an 'ole_format' indicator is added to self.indicators. If the
     file is not an OLE file, its value becomes False and nothing else is
     checked. Otherwise the file is parsed and handed to every check_*
     method in turn; the file is closed afterwards, also when a check
     raises an exception (which is then propagated to the caller).

     :returns: self.indicators
     """
     # check if it is actually an OLE file:
     oleformat = Indicator('ole_format', True, name='OLE format')
     self.indicators.append(oleformat)
     if not olefile.isOleFile(self.filename):
         oleformat.value = False
         return self.indicators
     # parse file:
     self.ole = olefile.OleFileIO(self.filename)
     try:
         # checks:
         self.check_properties()
         self.check_encrypted()
         self.check_word()
         self.check_excel()
         self.check_powerpoint()
         self.check_visio()
         self.check_ObjectPool()
         self.check_flash()
     finally:
         # closing must not depend on all checks succeeding
         self.ole.close()
     return self.indicators
Example #11
0
def process_file(filepath, field_filter_mode=None):
    """ decides which of process_doc/x, process_xls/x or process_rtf to call

    OLE files are handed to process_xls or process_doc, files whose first 4
    bytes are b'{\\\\rt' to process_rtf. Everything else is treated as ooxml:
    process_xlsx if the detected type is excel, process_docx otherwise.

    If determining the type or processing the ooxml file raises an
    exception, it is logged (debug level, with traceback) and process_docx
    is tried as a last resort.

    Returns whatever the selected process_* function returns.
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    # read the magic bytes inside a with-block so the handle is closed
    # right away instead of being left to the garbage collector
    with open(filepath, 'rb') as file_handle:
        is_rtf = file_handle.read(4) == b'{\\rt'
    if is_rtf:
        # This is a RTF file
        return process_rtf(filepath, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(doctype))
        if doctype == ooxml.DOCTYPE_EXCEL:
            return process_xlsx(filepath, field_filter_mode)
        return process_docx(filepath, field_filter_mode)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)
        return process_docx(filepath, field_filter_mode)
Example #12
0
def test(filenames,
         ole_file_class=OleRecordFile,
         must_parse=None,
         do_per_record=None,
         verbose=False):
    """ parse all given file names and log their rough structure

    Each OLE file is opened with ole_file_class; all its streams and their
    records are logged and do_per_record (if given) is called with every
    record. Files that are not OLE are skipped.

    An error while parsing a stream is re-raised if must_parse is empty/None
    or if the stream is an instance of must_parse; otherwise the error is
    only logged and parsing continues with the next stream.

    Returns 2 if filenames is empty, 0 otherwise.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level)

    if do_per_record is None:

        def do_per_record(record):  # pylint: disable=function-redefined
            return None  # default: ignore the record

    if not filenames:
        logging.info('need file name[s]')
        return 2

    for filename in filenames:
        logging.info('checking file {0}'.format(filename))
        if not olefile.isOleFile(filename):
            logging.info('not an ole file - skip')
            continue

        ole = ole_file_class(filename)
        for stream in ole.iter_streams():
            logging.info('  parse ' + str(stream))
            try:
                for record in stream.iter_records():
                    logging.info('    ' + str(record))
                    do_per_record(record)
            except Exception:
                # errors are fatal unless must_parse is given and does not
                # cover this stream's type
                if not must_parse or isinstance(stream, must_parse):
                    raise
                logging.info('  failed to parse', exc_info=True)
    return 0
Example #13
0
def find_ole(filename, data):
    """ try to open somehow as zip/ole/... ; yield None if fail

    If data is given, filename is (mostly) ignored: it is then only used in
    log messages.

    This is a generator. Depending on the input it yields:

    - for an OLE file: the results of find_ole_in_ppt (only if is_ppt says
      the input is a ppt file), followed by an OleFileIO of the input itself
    - for a zip file: one OleFileIO per sub file that starts with
      olefile.MAGIC; None for every sub file whose start cannot be read
      because of a RuntimeError (logged as "zip is encrypted")
    - None if the input is neither OLE nor zip or if an unexpected exception
      is caught

    The OleFileIO objects created in this function are closed here once the
    consumer asks for the next item or the generator is finalized, so they
    must not be used after that.
    """

    if data is not None:
        # isOleFile and is_ppt can work on data directly but zip need file
        # --> wrap data in a file-like object without copying data
        log.debug('working on data, file is not touched below')
        arg_for_ole = data
        arg_for_zip = FakeFile(data)
    else:
        # we only have a file name
        log.debug('working on file by name')
        arg_for_ole = filename
        arg_for_zip = filename

    # the ole object that still has to be closed by the outer finally clause;
    # reset to None whenever it was already closed elsewhere
    ole = None
    try:
        if olefile.isOleFile(arg_for_ole):
            if is_ppt(arg_for_ole):
                log.info('is ppt file: ' + filename)
                for ole in find_ole_in_ppt(arg_for_ole):
                    yield ole
                    ole = None  # is closed in find_ole_in_ppt
            # in any case: check for embedded stuff in non-sectored streams
            log.info('is ole file: ' + filename)
            ole = olefile.OleFileIO(arg_for_ole)
            yield ole
        elif is_zipfile(arg_for_zip):
            log.info('is zip file: ' + filename)
            # NOTE(review): zipper is never closed explicitly in this
            # function -- confirm whether that is intended
            zipper = ZipFile(arg_for_zip, 'r')
            for subfile in zipper.namelist():
                # read only as many bytes as needed to compare with the OLE
                # magic, so sub files that are not OLE are skipped cheaply
                head = b''
                try:
                    with zipper.open(subfile) as file_handle:
                        head = file_handle.read(len(olefile.MAGIC))
                except RuntimeError:
                    log.error('zip is encrypted: ' + filename)
                    yield None
                    continue

                if head == olefile.MAGIC:
                    log.info('  unzipping ole: ' + subfile)
                    with ZipSubFile(zipper, subfile) as file_handle:
                        try:
                            ole = olefile.OleFileIO(file_handle)
                            yield ole
                        except IOError:
                            log.warning('Error reading data from {0}/{1} or '
                                        'interpreting it as OLE object'.format(
                                            filename, subfile))
                            log.debug('', exc_info=True)
                        finally:
                            # close the ole before the sub file it reads
                            # from is closed by the surrounding with
                            if ole is not None:
                                ole.close()
                                ole = None
                else:
                    log.debug('unzip skip: ' + subfile)
        else:
            log.warning(
                'open failed: {0} (or its data) is neither zip nor OLE'.format(
                    filename))
            yield None
    except Exception:
        # report failure to the consumer as None instead of raising
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None
    finally:
        # close whatever ole is still open (e.g. the one yielded last)
        if ole is not None:
            ole.close()
Example #14
0
def process_file(filepath, field_filter_mode=None):
    """ hand the file to process_ole if it is OLE, else to process_openxml

    field_filter_mode is only forwarded to process_openxml.
    Returns whatever the selected function returns.
    """
    if not olefile.isOleFile(filepath):
        return process_openxml(filepath, field_filter_mode)
    return process_ole(filepath)
Example #15
0
def process_file(filepath):
    """ hand the file to process_ole if it is OLE, else to process_openxml

    Returns whatever the selected function returns.
    """
    if not olefile.isOleFile(filepath):
        return process_openxml(filepath)
    return process_ole(filepath)