Example #1
0
def office_parser(document_page):
    """Parse an office document page by converting it to PDF first.

    The page's document is passed to ``document_save_to_temp_dir`` and the
    resulting file is run through ``OfficeConverter``. When the converter
    reports that its output exists, that output is handed to ``pdf_parser``.

    Args:
        document_page: page object whose ``document`` attribute supplies the
            ``checksum`` and ``file_mimetype`` used for the conversion.

    Raises:
        ParserError: if the converter reports no output, or if it raises
            ``OfficeConversionError``.
    """
    logger.debug('executing')
    try:
        office_converter = OfficeConverter()
        document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)
        logger.debug('document_file: %s', document_file)

        office_converter.convert(document_file, mimetype=document_page.document.file_mimetype)
        if not office_converter.exists:
            raise ParserError

        input_filepath = office_converter.output_filepath
        logger.debug('office_converter.output_filepath: %s', input_filepath)

        # A PDF is binary data, so open it in binary mode; the context
        # manager guarantees the descriptor is closed even when the PDF
        # parser raises.
        with open(input_filepath, 'rb') as pdf_descriptor:
            pdf_parser(document_page, descriptor=pdf_descriptor)

    except OfficeConversionError as exception:
        # Report through the module logger instead of printing to stdout.
        logger.error(exception)
        raise ParserError
Example #2
0
    def parse(self, document_page, descriptor=None):
        """Parse an office document page by converting it to PDF first.

        The page's document is saved through its own
        ``document_save_to_temp_dir`` method and run through
        ``OfficeConverter``. When the converter reports that its output
        exists, that output is passed to ``parse_document_page`` with the
        ``application/pdf`` mimetype.

        Args:
            document_page: page object whose ``document`` attribute supplies
                the ``checksum`` and ``file_mimetype`` used for the
                conversion.
            descriptor: accepted as part of the method signature; this
                implementation does not read it.

        Raises:
            ParserError: if the converter reports no output, or if it raises
                ``OfficeConversionError``.
        """
        logger.debug('executing')
        try:
            office_converter = OfficeConverter()
            document_file = document_page.document.document_save_to_temp_dir(document_page.document.checksum)
            logger.debug('document_file: %s', document_file)

            office_converter.convert(document_file, mimetype=document_page.document.file_mimetype)
            if not office_converter.exists:
                raise ParserError

            input_filepath = office_converter.output_filepath
            logger.debug('office_converter.output_filepath: %s', input_filepath)

            # Now that the office document has been converted to PDF,
            # call the corresponding PDF parser on this new file. The PDF is
            # binary data, and the context manager closes the descriptor
            # even when the parser raises.
            with open(input_filepath, 'rb') as pdf_descriptor:
                parse_document_page(document_page, descriptor=pdf_descriptor, mimetype='application/pdf')

        except OfficeConversionError as exception:
            logger.error(exception)
            raise ParserError