Ejemplo n.º 1
0
 def invoice2data_parse_invoice(self, file_data):
     """Analyze the content of a PDF invoice with the invoice2data lib.

     ``file_data`` is written to a temporary file whose name is handed to
     ``extract_data``. The temporary file is removed before returning,
     whether or not the extraction succeeded.

     Templates come from the directory named by the
     ``invoice2data_templates_dir`` config option (when it is set and is
     an existing directory) plus the templates shipped in the
     ``invoice2data`` package, unless the
     ``invoice2data_exclude_built_in_templates`` config option is set.

     :param file_data: raw content of the invoice, as accepted by
         ``os.write``
     :raises UserError: if ``extract_data`` raises any exception
     """
     logger.info('Trying to analyze PDF invoice with invoice2data lib')
     fd, file_name = mkstemp()
     try:
         try:
             os.write(fd, file_data)
         finally:
             os.close(fd)
         # Transfer log level of Odoo to invoice2data
         loggeri2data.setLevel(logger.getEffectiveLevel())
         local_templates_dir = tools.config.get(
             'invoice2data_templates_dir', False)
         logger.debug('invoice2data local_templates_dir=%s',
                      local_templates_dir)
         templates = []
         if local_templates_dir and os.path.isdir(local_templates_dir):
             templates += read_templates(local_templates_dir)
         exclude_built_in_templates = tools.config.get(
             'invoice2data_exclude_built_in_templates', False)
         if not exclude_built_in_templates:
             templates += read_templates(
                 pkg_resources.resource_filename('invoice2data', 'templates'))
         logger.debug('Calling invoice2data.extract_data with templates=%s',
                      templates)
         try:
             # NOTE(review): the result is not used in the code visible
             # here; the name is kept for whatever follows this excerpt.
             invoice2data_res = extract_data(file_name, templates=templates)
         except Exception as e:
             raise UserError(
                 _("PDF Invoice parsing failed. Error message: %s") % e)
     finally:
         # mkstemp() leaves deletion of the file to the caller.
         os.remove(file_name)
Ejemplo n.º 2
0
def main():
    """Command-line entry point: extract data from each given invoice file.

    Options handled here:

    * ``--debug``: log at DEBUG level instead of INFO.
    * ``--copy/-c FOLDER``: copy each recognised file into FOLDER under a
      name built from ``FILENAME`` with the extracted ``date`` and ``desc``.
    * ``--template-folder/-t FOLDER``: load templates from FOLDER.
    * ``--exclude-built-in-templates``: do not load the templates shipped
      in the ``invoice2data`` package.
    * ``--csv-output/-o NAME``: name passed to ``invoices_to_csv``
      (default ``invoices-output.csv``).

    Files for which ``extract_data`` returns a falsy value are skipped.
    """
    parser = argparse.ArgumentParser(
        description='Extract structured data from invoice files.')

    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Print debug information.')

    parser.add_argument('--copy', '-c', dest='copy',
                        help='Copy renamed PDFs to specified folder.')

    parser.add_argument('--template-folder', '-t', dest='template_folder',
                        help='Folder containing invoice templates in yml file. Always adds built-in templates.')

    parser.add_argument('--exclude-built-in-templates', dest='exclude_built_in_templates',
                        default=False, help='Ignore built-in templates.', action="store_true")

    parser.add_argument('--csv-output', '-o', dest='csv_output_name', default='invoices-output.csv',
                        help='Custom name for output CSV.')

    parser.add_argument('input_files', type=argparse.FileType('r'), nargs='+',
                        help='File or directory to analyze.')

    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    templates = []

    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates(pkg_resources.resource_filename('invoice2data', 'templates'))

    output = []
    for f in args.input_files:
        # argparse.FileType already opened the file, but only its name is
        # used below; close the handle right away so it does not leak.
        f.close()
        res = extract_data(f.name, templates=templates)
        if res:
            logger.info(res)
            output.append(res)
            if args.copy:
                filename = FILENAME.format(
                    date=res['date'].strftime('%Y-%m-%d'),
                    desc=res['desc'])
                shutil.copyfile(f.name, join(args.copy, filename))
    invoices_to_csv(output, args.csv_output_name)
Ejemplo n.º 3
0
def main():
    """Command-line entry point: extract data from each given invoice file.

    Options handled here:

    * ``--debug``: log at DEBUG level instead of INFO.
    * ``--copy/-c FOLDER``: copy each recognised file into FOLDER under a
      name built from ``FILENAME`` with the extracted ``date`` and ``desc``.
    * ``--template-folder/-t FOLDER``: load templates from FOLDER.
    * ``--exclude-built-in-templates``: do not load the templates shipped
      in the ``invoice2data`` package.

    All results are handed to ``invoices_to_csv`` with the fixed name
    ``invoices-output.csv``. Files for which ``extract_data`` returns a
    falsy value are skipped.
    """
    parser = argparse.ArgumentParser(
        description='Extract structured data from invoice files.')

    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Print debug information.')

    parser.add_argument('--copy', '-c', dest='copy',
                        help='Copy renamed PDFs to specified folder.')

    parser.add_argument('--template-folder', '-t', dest='template_folder',
                        help='Folder containing invoice templates in yml file. Always adds built-in templates.')

    parser.add_argument('--exclude-built-in-templates', dest='exclude_built_in_templates',
                        default=False, help='Ignore built-in templates.', action="store_true")

    parser.add_argument('input_files', type=argparse.FileType('r'), nargs='+',
                        help='File or directory to analyze.')

    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    templates = []

    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates(pkg_resources.resource_filename('invoice2data', 'templates'))

    output = []
    for f in args.input_files:
        # argparse.FileType already opened the file, but only its name is
        # used below; close the handle right away so it does not leak.
        f.close()
        res = extract_data(f.name, templates=templates)
        if res:
            logger.info(res)
            output.append(res)
            if args.copy:
                filename = FILENAME.format(
                    date=res['date'].strftime('%Y-%m-%d'),
                    desc=res['desc'])
                shutil.copyfile(f.name, join(args.copy, filename))
    invoices_to_csv(output, 'invoices-output.csv')
Ejemplo n.º 4
0
def extract_data(invoicefile, templates=None, debug=False):
    """Extract invoice data from a file using the first matching template.

    Args:
        invoicefile: value handed to ``pdftotext.to_text``; also used in
            the error log message.
        templates: iterable of template objects. When ``None``, the
            templates shipped in the ``invoice2data`` package are loaded.
        debug: accepted but not used by this function.

    Returns:
        The result of ``extract`` on the first template whose
        ``matches_input`` is true, or ``False`` when no template matches.
    """
    if templates is None:
        builtin_dir = pkg_resources.resource_filename(
            'invoice2data', 'templates')
        templates = read_templates(builtin_dir)

    raw_output = pdftotext.to_text(invoicefile)
    text = raw_output.decode('utf-8')

    logger.debug('number of char in pdf2text extract: %d', len(text))
    # The OCR fallback (image_to_text, for extracts shorter than 40
    # characters) is disabled for now.
    logger.debug('START pdftotext result ===========================')
    logger.debug(text)
    logger.debug('END pdftotext result =============================')

    template_count = len(templates)
    logger.debug('Testing {} template files'.format(template_count))
    for template in templates:
        prepared = template.prepare_input(text)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    logger.error('No template for %s', invoicefile)
    return False
Ejemplo n.º 5
0
def extract_data(invoicefile, templates=None, debug=False):
    """Run pdftotext on a file and apply the first template that matches.

    Args:
        invoicefile: value handed to ``pdftotext.to_text``; also used in
            the error log message.
        templates: iterable of template objects. When ``None``, the
            templates shipped in the ``invoice2data`` package are loaded.
        debug: accepted but not used by this function.

    Returns:
        Whatever ``extract`` returns for the first matching template, or
        ``False`` if none of the templates matches.
    """
    if templates is None:
        templates = read_templates(
            pkg_resources.resource_filename('invoice2data', 'templates'))

    pdf_text = pdftotext.to_text(invoicefile).decode('utf-8')
    logger.debug('number of char in pdf2text extract: %d', len(pdf_text))

    # The Tesseract/OCR fallback (for extracts shorter than 40 characters)
    # is disabled for now.
    for line in ('START pdftotext result ===========================',
                 pdf_text,
                 'END pdftotext result ============================='):
        logger.debug(line)

    logger.debug('Testing {} template files'.format(len(templates)))

    # Lazily pair each template with its prepared input so that
    # preparation stops at the first match, exactly like a plain loop.
    candidates = ((tpl, tpl.prepare_input(pdf_text)) for tpl in templates)
    for tpl, prepared in candidates:
        if tpl.matches_input(prepared):
            return tpl.extract(prepared)

    logger.error('No template for %s', invoicefile)
    return False
Ejemplo n.º 6
0
def main():
    """Command-line entry point: extract data from each given invoice file.

    Options handled here:

    * ``--debug``: log at DEBUG level instead of INFO.
    * ``--copy/-c FOLDER``: copy each recognised file into FOLDER under a
      name built from ``FILENAME`` with the extracted ``invoice_number``,
      ``transaction_type`` and ``account``.
    * ``--template-folder/-t FOLDER``: folder to read templates from;
      defaults to the templates shipped in the ``invoice2data`` package.
    * ``--output-folder/-o FOLDER``: folder for the CSV (default ``.``).

    The CSV is named ``invoices-output-<time()>.csv``. Files for which
    ``extract_data`` returns a falsy value are skipped.
    """
    parser = argparse.ArgumentParser(
        description='Extract structured data from invoice files.')

    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Print debug information.')

    parser.add_argument('--copy', '-c', dest='copy',
                        help='Copy renamed PDFs to specified folder.')

    parser.add_argument('--template-folder', '-t', dest='template_folder',
                        default=pkg_resources.resource_filename('invoice2data', 'templates'),
                        help='Folder containing invoice templates in yml file. Required.')

    parser.add_argument('--output-folder', '-o', dest='output_folder', default='.',
                        help='Folder to place output csv file.')

    parser.add_argument('input_files', type=argparse.FileType('r'), nargs='+',
                        help='File or directory to analyze.')

    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    output = []
    templates = read_templates(args.template_folder)
    for f in args.input_files:
        # argparse.FileType already opened the file, but only its name is
        # used below; close the handle right away so it does not leak.
        f.close()
        res = extract_data(f.name, templates=templates)
        if res:
            logger.info(res)
            output.append(res)
            if args.copy:
                filename = FILENAME.format(
                    ref=res['invoice_number'],
                    type=res['transaction_type'],
                    acc=res['account'])
                shutil.copyfile(f.name, join(args.copy, filename))
    output_name = join(args.output_folder, 'invoices-output-{0}.csv'.format(time()))
    invoices_to_csv(output, output_name)
Ejemplo n.º 7
0
def extract_data(invoicefile, templates=None, debug=False, encoding='ASCII7'):
    """Extract invoice data from a text or PDF file.

    Args:
        invoicefile (str): a path to an invoice file. A path ending in
            ``.txt`` (case-insensitive) is read directly; any other path
            is converted with ``pdftotext.to_text``.
        templates: iterable of template objects. When ``None``, the
            templates shipped in the ``invoice2data`` package are loaded.
        debug: accepted but not used by this function.
        encoding (str): passed to ``pdftotext.to_text``. When it is
            ``'ASCII7'`` the text is additionally passed through
            ``replace_unicode_characters``.

    Returns:
        The result of ``extract`` on the first template whose
        ``matches_input`` is true, or ``False`` when no template matches.
    """
    if templates is None:
        templates = read_templates(
            pkg_resources.resource_filename('invoice2data', 'templates'))

    if invoicefile.lower().endswith(".txt"):
        # Use a context manager so the file handle is always released.
        with open(invoicefile, "r") as textfile:
            extracted_str = textfile.read()
    else:
        extracted_str = pdftotext.to_text(invoicefile, encoding=encoding)

    if encoding == 'ASCII7':
        extracted_str = replace_unicode_characters(extracted_str)

    charcount = len(extracted_str)
    logger.debug('number of char in pdf2text extract: %d', charcount)
    # The Tesseract/OCR fallback (for extracts shorter than 40 characters)
    # is disabled for now.
    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    for t in templates:
        logger.debug('Trying template {}'.format(t))
        optimized_str = t.prepare_input(extracted_str)

        if t.matches_input(optimized_str):
            return t.extract(optimized_str)

    logger.error('No template for %s', invoicefile)
    return False
Ejemplo n.º 8
0
 def setUp(self):
     """Load the templates shipped in the ``invoice2data`` package."""
     builtin_dir = pkg_resources.resource_filename(
         'invoice2data', 'templates')
     self.templates = read_templates(builtin_dir)
Ejemplo n.º 9
0
 def setUp(self):
     """Read the ``invoice2data`` package templates into ``self.templates``."""
     package, resource = 'invoice2data', 'templates'
     templates_path = pkg_resources.resource_filename(package, resource)
     self.templates = read_templates(templates_path)
Ejemplo n.º 10
0
def main():
    """Command-line entry point: extract data from invoices and write reports.

    Input files are taken from ``--input_files`` when given, otherwise
    from every ``*.<extension>`` file directly inside ``input_directory``.

    For each file where ``extract_data`` returns a truthy result:

    * the result is grouped by its ``issuer`` key;
    * with ``--include-file-name``, ``file_name`` and ``hyperlink`` keys
      are added to the result;
    * a ``title`` key is set from ``pdftotext.get_document_title``,
      falling back to the file name if that call raises;
    * with ``--copy``, the file is copied to that folder under a name
      built from ``FILENAME`` with the extracted ``date`` and ``desc``.

    With ``--report-per-vendor`` one report per issuer is written via
    ``write_issuer_invoices``; otherwise all results go to
    ``invoices-output.csv`` in the output directory.
    """
    parser = argparse.ArgumentParser(
        description='Extract structured data from invoice files.')

    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Print debug information.')

    parser.add_argument('--copy', '-c', dest='copy',
                        help='Copy renamed PDFs to specified folder.')

    parser.add_argument('--template-folder', '-t', dest='template_folder',
                        help='Folder containing invoice templates in yml file. Always adds built-in templates.')

    parser.add_argument('--exclude-built-in-templates', dest='exclude_built_in_templates',
                        default=False, help='Ignore built-in templates.', action="store_true")

    parser.add_argument('--include-file-name', dest='include_file_name',
                        default=False, help='Write the file name of the quote in the report.', action="store_true")

    parser.add_argument('--report-per-vendor', dest='report_per_vendor',
                        default=False, help='Generates a seperate report for each vendor.', action="store_true")

    parser.add_argument('--encoding', dest='encoding',
                        default='ASCII7', help='Encoding of the text')

    parser.add_argument('--extension', dest='extension', type=str,
                        default='pdf', help='File extension (pdf or txt)')

    parser.add_argument('--input_files', type=str, nargs='+',
                        help='Files to analyze.')

    parser.add_argument('--output-directory', type=str, default='.', dest='output_dir',
                        help='Out directory for the report files.')

    parser.add_argument('input_directory', help='Input directory with PDF files to analyze.')

    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    templates = []

    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates(pkg_resources.resource_filename('invoice2data', 'templates'))

    output = []
    out_per_issuer = dict()
    if args.input_files:
        files = args.input_files
    else:
        files = glob.iglob(args.input_directory + '/*.'+args.extension)

    for file_name in files:
        logging.info("processing file %s", file_name)
        res = extract_data(file_name, templates=templates, encoding=args.encoding)

        if res:
            # Group results by issuer for the per-vendor reports.
            out_per_issuer.setdefault(res['issuer'], []).append(res)

            if args.include_file_name:
                basename = os.path.basename(file_name)
                res['file_name'] = basename
                pdf_file_name = basename.replace('.txt','.pdf')
                res['hyperlink'] =  '=HYPERLINK("%s", "%s")' % ('Q:\\'+pdf_file_name, basename[11:27])

            try:
                pdf_title = pdftotext.get_document_title(file_name)
                logging.info("file title: %s", pdf_title)
                res['title'] = pdf_title
            except Exception:
                # Best effort: any failure to read a title falls back to
                # the file name, but KeyboardInterrupt/SystemExit are no
                # longer swallowed as they were by the bare ``except``.
                logging.info("%s doesn't have a title... using filename instaed", file_name)
                res['title'] = file_name
            logger.info(res)
            output.append(res)
            if args.copy:
                filename = FILENAME.format(
                    date=res['date'].strftime('%Y-%m-%d'),
                    desc=res['desc'])
                # The loop variable is the path itself; there is no file
                # object ``f`` in this function.
                shutil.copyfile(file_name, join(args.copy, filename))

    if args.report_per_vendor:
        # items() works on both Python 2 and 3, unlike iteritems().
        for issuer, invoices in out_per_issuer.items():
            write_issuer_invoices(issuer, invoices, args.encoding, args.output_dir)
    else:
        invoices_to_csv(output, os.path.join(args.output_dir, 'invoices-output.csv'))