Ejemplos de read_templates en Python, ejemplos de invoice2data.extract.loader.read_templates en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: account_invoice_import.py Proyecto: CURE-EMR/oca10

 def invoice2data_parse_invoice(self, file_data):
     """Analyze a PDF invoice with the invoice2data library.

     The raw bytes in ``file_data`` are written to a temporary file, the
     templates are collected (the optional local directory configured as
     ``invoice2data_templates_dir`` first, then the built-in ones unless
     ``invoice2data_exclude_built_in_templates`` is set) and
     ``extract_data`` is run on the temporary file.

     :param file_data: content of the PDF file, as accepted by ``os.write``
     :raises UserError: when ``extract_data`` raises any exception
     """
     logger.info('Trying to analyze PDF invoice with invoice2data lib')
     fd, file_name = mkstemp()
     try:
         try:
             os.write(fd, file_data)
         finally:
             os.close(fd)
         # Transfer log level of Odoo to invoice2data
         loggeri2data.setLevel(logger.getEffectiveLevel())
         local_templates_dir = tools.config.get(
             'invoice2data_templates_dir', False)
         logger.debug('invoice2data local_templates_dir=%s',
                      local_templates_dir)
         templates = []
         if local_templates_dir and os.path.isdir(local_templates_dir):
             templates += read_templates(local_templates_dir)
         exclude_built_in_templates = tools.config.get(
             'invoice2data_exclude_built_in_templates', False)
         if not exclude_built_in_templates:
             templates += read_templates()
         logger.debug('Calling invoice2data.extract_data with templates=%s',
                      templates)
         try:
             invoice2data_res = extract_data(file_name, templates=templates)
         # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
         # Python-2-only "except X, e" spelling.
         except Exception as e:
             raise UserError(
                 _("PDF Invoice parsing failed. Error message: %s") % e)
     finally:
         # mkstemp() never deletes the file it creates; remove it once the
         # extraction is over so temporary PDFs do not pile up.
         try:
             os.unlink(file_name)
         except OSError:
             logger.debug('Could not remove temporary file %s', file_name)

Ejemplo n.º 2

0

Mostrar archivo

def main(args=None):
    """Take folder or single file and analyze each.

    Parameters
    ----------
    args : object, optional
        Parsed command-line options. When None they are obtained from the
        parser returned by ``create_parser()``.

    Notes
    -----
    The function does not return: it ends with ``sys.exit(missed)``, where
    ``missed`` is the value reported by ``extract_data`` for the last input
    file, or 0 when no input file was given.
    """
    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Both command lists are optional "+"-separated strings. Bind them to
    # None when absent so the extract_data() call below never hits an
    # unbound local.
    cmdlist = args.cmdlist.split("+") if args.cmdlist else None
    imgcmd = args.imgcmd.split("+") if args.imgcmd else None

    templates = []
    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates()

    output = []
    # Exit status used when there is nothing to process.
    missed = 0

    for f in args.input_files:
        try:
            res, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(
                f.name,
                templates=templates,
                input_module=args.input_reader,
                cmdlist=cmdlist,
                conv_cmdlist=imgcmd,
                tid=args.tid,
            )
            if res:
                logger.info(res)
                output.append(res)
                if args.copy or args.move:
                    # The date is passed to the pattern as extracted,
                    # without any reformatting.
                    filename = args.filename.format(
                        date=res["date"],
                        invoice_number=res["invoice_number"],
                        desc=res["desc"],
                    )
                    if args.copy:
                        shutil.copyfile(f.name, join(args.copy, filename))
                    if args.move:
                        shutil.move(f.name, join(args.move, filename))
        finally:
            # Release the handle even when extraction or copying fails.
            f.close()

    generate_output(output, output_name=args.output_name, output_date_format=args.output_date_format, output_module=args.output_format)

    sys.exit(missed)

Ejemplo n.º 3

0

Mostrar archivo

def main(args=None):
    """Analyze every input file and hand the results to the output module.

    When ``args`` is None the options are parsed from the command line via
    ``create_parser()``. Each input file is run through ``extract_data``;
    successful results are logged, collected and, if requested, the source
    file is copied and/or moved under a name built from ``args.filename``.
    """
    if args is None:
        args = create_parser().parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    input_module = input_mapping[args.input_reader]
    output_module = output_mapping[args.output_format]

    templates = []
    # External template folder first, when one is given.
    if args.template_folder:
        templates.extend(read_templates(os.path.abspath(args.template_folder)))
    # Then the internal templates, unless they are disabled.
    if not args.exclude_built_in_templates:
        templates.extend(read_templates())

    output = []
    for invoice in args.input_files:
        res = extract_data(invoice.name,
                           templates=templates,
                           input_module=input_module)
        if res:
            logger.info(res)
            output.append(res)
            # Copy first, then move, exactly as requested on the CLI.
            for target_dir, transfer in ((args.copy, shutil.copyfile),
                                         (args.move, shutil.move)):
                if not target_dir:
                    continue
                filename = args.filename.format(
                    date=res['date'].strftime('%Y-%m-%d'),
                    invoice_number=res['invoice_number'],
                    desc=res['desc'],
                )
                transfer(invoice.name, join(target_dir, filename))
        invoice.close()

    if output_module is None:
        return
    output_module.write_to_file(output, args.output_name,
                                args.output_date_format)

Ejemplo n.º 4

0

Mostrar archivo

def read_file(filename, debug):
    """Extract one invoice from ``INPUT/`` and write it as JSON to ``OUTPUT/``.

    Parameters
    ----------
    filename : str
        Name of the file inside the ``INPUT/`` folder.
    debug : bool
        When exactly True, the PDF text is printed first (useful for
        debugging and template creation).

    On success the result is written to ``OUTPUT/<stem>.json``; if that JSON
    has no ``date_due`` key, one is added using
    ``helper_functions.add_month`` on the invoice date. On failure the file
    name is passed to ``helper_functions.append_error``.
    """
    source_path = 'INPUT/' + filename

    # If debug is active, get PDF as string for debugging/template creation
    if debug is True:
        with open(source_path, 'rb') as f:
            pdf = pdftotext.PDF(f)
        print('\n\n'.join(pdf))

    templates = read_templates('TEMPLATES/')
    result = extract_data(source_path, templates, pdftotextdef)

    # Extraction failed: add file name to error list and move on.
    if result is False:
        helper_functions.append_error(filename)
        return

    json_path = 'OUTPUT/' + os.path.splitext(filename)[0] + '.json'
    to_json.write_to_file(result, json_path, '%Y-%m-%d')

    # Add a due date one month after the invoice date when none was extracted.
    with open(json_path, 'r+') as json_file:
        data = json.load(json_file)
        if "date_due" not in data:
            date_obj = datetime.strptime(data["date"], '%Y-%m-%d')
            data["date_due"] = (
                helper_functions.add_month(date_obj).strftime('%Y-%m-%d'))
            json_file.seek(0)
            json.dump(data, json_file, indent=4, sort_keys=True)
            # Rewriting in place: drop any bytes of the previous content
            # left beyond the new end, which would corrupt the JSON.
            json_file.truncate()

Ejemplo n.º 5

0

Mostrar archivo

def extract_data(invoicefile, templates=None, input_module=pdftotext, debug=False):
    """Extract invoice fields, falling back to a 'default' template.

    The text produced by ``input_module.to_text`` is tried against each
    template in order. If none matches, the first template whose
    ``template_name`` contains ``'default'`` is used instead. Returns the
    extraction result, or False when no template can be applied.
    """
    if debug:
        logging.basicConfig(level=logging.DEBUG)

    if templates is None:
        templates = read_templates()

    raw_text = input_module.to_text(invoicefile)
    extracted_str = raw_text.decode('utf-8')

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    # No regular match: look for the first template flagged as default.
    default_template = None
    for template in templates:
        if 'default' in template['template_name']:
            default_template = template
            break

    if default_template:
        logger.error("Falling back to default template.")
        prepared = default_template.prepare_input(extracted_str)
        return default_template.extract(prepared)

    # Master template
    logger.error('No template for %s', invoicefile)
    return False

Ejemplo n.º 6

0

Mostrar archivo

def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Match an invoice's text against the templates and return its fields.

    The file is converted to text by `input_module`; the templates are then
    tried in order and the first one whose `matches_input` accepts the
    prepared text performs the extraction.

    Parameters
    ----------
    invoicefile : str
        Path of the invoice file in PDF, JPEG or PNG
        (example: "/home/duskybomb/pdf/invoice.pdf").
    templates : list, optional
        Template objects to try. When None, `read_templates()` is called
        without arguments to load them.
    input_module : module, optional
        Object exposing `to_text(path)` whose result is decoded as UTF-8.
        Defaults to `pdftotext`.

    Returns
    -------
    dict or False
        Fields extracted by the first matching template, or False if no
        template matches.

    Notes
    -----
    Import the required `input_module` yourself when using invoice2data as
    a library.

    Examples
    --------
    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    raw_text = input_module.to_text(invoicefile)
    extracted_str = raw_text.decode("utf-8")

    logger.debug("START pdftotext result ===========================")
    logger.debug(extracted_str)
    logger.debug("END pdftotext result =============================")

    logger.debug("Testing {} template files".format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    logger.error("No template for %s", invoicefile)
    return False

Ejemplo n.º 7

0

Mostrar archivo

Archivo: filedata.py Proyecto: shahrukhgellani/DemoFRE

def extract_multi(file_refrence, catagory):
    """Split a PDF, extract every resulting page and return an HTML report.

    The pages found in the ``pdf_processing`` folder are extracted one by
    one and rendered with ``to_table``; the module-level ``total`` is
    appended to the report and then reset to 0.
    """
    global total
    pdf_splitter(file_refrence, catagory)
    pages = glob.glob(r"C:\Users\Shahrukh\Desktop\djangofilesupload\filesupload\pdf_processing\{}".format("*.pdf"))
    print(pages)
    templates = read_templates(r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates')
    # One rendered fragment per page, concatenated in glob order.
    fragments = [
        to_table(extract_data(page, templates=templates), page)
        for page in pages
    ]
    result = ''.join(fragments)
    remove_file()
    ret_data = '{result}<h3 style="float:right">Total: {total}</h3>'.format(result=result, total=total)
    total = 0
    return ret_data

Ejemplo n.º 8

0

Mostrar archivo

Archivo: invoice_manipulation.py Proyecto: joco8/Invoice-Handler

 def __init__(self, invDirectory, filename):
     """Extract ``invDirectory/filename`` and expose its fields as attributes.

     Every key of the extraction result becomes an attribute of the
     instance. When nothing is extracted, ``issuer`` is set to ``"null"``
     and ``filename`` to the given file name.
     """
     templates = read_templates(cwd + '/tplf')
     pdfFile = invDirectory + '/' + filename
     result = extract_data(pdfFile, templates=templates)
     print(result)
     print(filename)
     if not result:
         # Three blank lines visually separate failed files in the output.
         for _ in range(3):
             print()
         setattr(self, "issuer", "null")
         setattr(self, "filename", filename)
         return
     for field, value in result.items():
         setattr(self, field, value)

Ejemplo n.º 9

0

Mostrar archivo

def get_parsed_data(templates, extracted_str, partiallyExtracted):
    """Run the first matching template on already-extracted invoice text.

    ``templates`` defaults to ``read_templates()`` when None. The matching
    template's ``extract`` receives the prepared text together with
    ``partiallyExtracted``. Returns its result, or False if nothing matches.
    """
    for message in ('START pdftotext result ===========================',
                    extracted_str,
                    'END pdftotext result ============================='):
        logger.debug(message)

    if templates is None:
        templates = read_templates()

    logger.debug('Testing {} template files'.format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared, partiallyExtracted)

    return False

Ejemplo n.º 10

0

Mostrar archivo

Archivo: main.py Proyecto: us241098/invoice2data

def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Return the fields extracted by the first template matching the invoice.

    The invoice is converted to text with ``input_module.to_text`` and
    decoded as UTF-8. ``templates`` defaults to ``read_templates()``.
    Returns False when no template matches.
    """
    if templates is None:
        templates = read_templates()

    extracted_str = input_module.to_text(invoicefile).decode('utf-8')

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    # Lazily pair each template with its prepared text so that templates
    # after the first match are never prepared.
    candidates = ((tpl, tpl.prepare_input(extracted_str)) for tpl in templates)
    for tpl, prepared in candidates:
        if tpl.matches_input(prepared):
            return tpl.extract(prepared)

    logger.error('No template for %s', invoicefile)
    return False

Ejemplo n.º 11

0

Mostrar archivo

Archivo: test.py Proyecto: aksonlyaks/invoice2data

def thread_fn(file, dummy):
    """Extract one invoice and log a one-line quality report for it.

    Parameters
    ----------
    file : str
        File name, appended to the module-level prefix ``r``
        (presumably the input directory; confirm against the module).
    dummy
        Unused.

    The report is logged in yellow and recorded in ``issue_list`` when
    ``missed`` is non-zero or ``qtyerr`` is not ``"Match"``; otherwise it is
    logged in green.
    """
    cmdlist = ["tesseract", "-c", "tessedit_char_whitelist=/.: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"]
    templates = read_templates('./templates')
    _result, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(r+file, input_module=INPUT_MODULE, templates=templates, cmdlist=cmdlist, conv_cmdlist=None, tid = TID)

    # The per-call "total_*" accumulators were removed: they were reset on
    # every call and never read, so they carried no information.
    report = f'=============================> missed: {missed} corrected: {corrected} line with issue: {len(issue_lines)} Qty issue: {qtyerr} Total Items: {noofitem}<============================'
    if missed != 0 or qtyerr != "Match":
        logger.error(CYELLOW + report + CEND)
        issue_list.append(file+"\t"+report)
    else:
        logger.error(CGREEN + report + CEND)

Ejemplo n.º 12

0

Mostrar archivo

def extract_invoice_details(filename):
    """Extract an uploaded invoice and store the result as a JSON file.

    Parameters
    ----------
    filename : str
        Name of the file under ``input/uploads/``. An empty string makes
        the function a no-op.

    Files whose extension is not ``pdf`` are first converted with
    ``MakingSearchablePDFs.convert_image_to_searchable_pdf``. The serialized
    result is written under ``output/processed/`` on success and under
    ``output/failed/`` otherwise (where the content is the JSON literal
    ``false``).
    """
    if filename == '':
        return

    # Images are converted to a searchable PDF; PDFs are used as they are.
    if filename.split('.')[-1] != 'pdf':
        filename = MakingSearchablePDFs.convert_image_to_searchable_pdf(
            filename)

    # YAML Template System
    source = 'input/uploads/' + filename
    templates = read_templates('Templates/')
    result = extract_data(source, templates=templates)
    print('\n', type(result))
    print('\n', result)

    # Serialize once, up front, so that both branches have text to write.
    # default=str turns non-JSON values (e.g. datetimes) into strings.
    json_data = json.dumps(result,
                           indent=4,
                           sort_keys=True,
                           default=str)

    if result is not False:
        destination = 'output/processed/' + filename
        print(type(json_data), json_data)
    else:
        destination = 'output/failed/' + filename
        print('Failed for Processing of Invoice!!!')

    # NOTE(review): name_of_image is not defined in this function; it must
    # exist at module level. Confirm where it comes from.
    with open(destination + name_of_image + '_json.json', 'w') as file:
        # Write the JSON text: file.write() rejects the raw dict/bool.
        file.write(json_data)

Ejemplo n.º 13

0

Mostrar archivo

import os
from pathlib import Path
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import tesseract4
import xlsxwriter
from my_logger import logger

# Rename the logger imported from my_logger.
logger.name = "FX"
# Working folders live in "work_dir", next to this file.
work_dir_path = Path(__file__).parent / 'work_dir'
invoice_folder = work_dir_path / "original"
# Load the templates stored under work_dir/templates.
templates = read_templates(work_dir_path / "templates")

# file_name = "invoice-test.pdf"
#file_name = "57001194492019.tiff"
#file_path = work_dir_path / "original" / file_name
#result = extract_data(str(file_path), templates=templates, input_module=tesseract4)
#print(result)


#Excel setting
def no_change(val):
    """Return ``val`` untouched (identity converter)."""
    unchanged = val
    return unchanged


def to_number(val):
    """Convert a numeric string with comma separators to a float."""
    without_commas = ''.join(val.split(','))
    return float(without_commas)


# Create the output workbook 'invoices.xlsx' and add a worksheet to it.
workbook = xlsxwriter.Workbook('invoices.xlsx')
worksheet = workbook.add_worksheet()

Ejemplo n.º 14

0

Mostrar archivo

Archivo: filedata.py Proyecto: shahrukhgellani/DemoFRE

def extract(name):
    """Extract ``name`` using the templates of the invoiceX template folder."""
    template_dir = r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates'
    return extract_data(name, templates=read_templates(template_dir))

Ejemplo n.º 15

0

Mostrar archivo

Archivo: pdfinvoice2data.py Proyecto: senthilsweb/Invoicepdf2Data

from flask import Flask, render_template, request
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import yaml
import glob
import errno
import os
import pymongo
#import PyPDF2

# Flask application object for this module.
app = Flask(__name__)

# Templates loaded once, at import time, from the "invoice_templates" folder.
read_template = read_templates(folder="invoice_templates")

# Module-level lists, all created empty here.
pdfFiles = []
pdfFiles1 = []
pdfFiles2 = []
filenames = []
path_filename = []

#path = "/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files"
# MongoDB on localhost: database "pdfinvdata", collection "invoicedata".
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["pdfinvdata"]
mycol = mydb["invoicedata"]

for filename in os.listdir(
        '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/'
):
    if filename.endswith(".pdf"):
        filenames.append(filename)
        pathname = '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/' + str(

Ejemplo n.º 16

0

Mostrar archivo

#!/Library/Frameworks/Python.framework/Versions/3.7/bin/python3

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import sys

# The invoice file name is the first command-line argument; stop with a
# usage message instead of an IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit('usage: {} <invoice-file>'.format(sys.argv[0]))

file = sys.argv[1]

filename = '/Applications/XAMPP/xamppfiles/htdocs/invoice-app/admin/' + file

templates = read_templates('/Users/krzemson/Desktop/aed/tpl')
result = extract_data(filename, templates=templates)

# Extraction results can hold datetime objects (e.g. the invoice date),
# which json cannot serialize on its own; default=str stringifies them.
print(json.dumps(result, default=str))

Ejemplo n.º 17

0

Mostrar archivo

Archivo: invoice2data.py Proyecto: hilali-msc/financial-documents-ocr-deep-learning

# Importing all the required libraries

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import pdftotext
import pandas as pd

# Importing custom template
# Load every template found in the local ./template/ folder.
templates = read_templates('./template/')

#print(templates)

# Extract data from PDF
result = extract_data('./data/pnlsheet.pdf',
                      templates=templates,
                      input_module=pdftotext)

# Store the extracted data to a Data-frame
# NOTE(review): if `result` is a dict holding only scalar values, pandas
# needs an index (or a list of dicts) to build the frame, and extract_data
# may return False when no template matches. Confirm what the template yields.
df = pd.DataFrame(data=result)

# Export Data-frame to a csv file
df.to_csv('./data/invoice2data_simple.csv')
''' 
You can use any of the supported input modules (pdftotext, pdfminer, tesseract) to extract the text from the file.
The input_module argument is optional; pdftotext is used by default if it is not specified.

The custom template named temp.yml is placed in the templates. You can remove the templates parameter in
extract_data(). Default templates will be used

'''

Ejemplo n.º 18

0

Mostrar archivo

Archivo: test_cli.py Proyecto: yucer/invoice2data

 def setUp(self):
     """Load the default templates and build the CLI parser for each test."""
     loaded_templates = read_templates()
     self.templates = loaded_templates
     self.parser = create_parser()

Ejemplo n.º 19

0

Mostrar archivo

Archivo: extract_data.py Proyecto: audacesk/blue_ml

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import datetime
import argparse

if __name__ == '__main__':
    # Initialize the arguments parser
    parser = argparse.ArgumentParser(description="Extract data from invoices")

    # The templates directory is optional; the invoice path is mandatory,
    # since there is nothing to extract without it.
    parser.add_argument('-t', '--templates_dirpath', help="Templates directory path", type=str)
    parser.add_argument('-i', '--invoice_path', help="Invoice file path", type=str, required=True)

    # Parse the arguments
    args = parser.parse_args()

    templates = read_templates(args.templates_dirpath)

    output_data = extract_data(args.invoice_path, templates=templates)

    # A falsy result means nothing could be extracted; subscripting it
    # below would raise a confusing TypeError, so stop with a clear message
    # and a non-zero exit status instead.
    if not output_data:
        raise SystemExit(
            "No data could be extracted from {}".format(args.invoice_path))

    # Format the invoice date as YYYY-MM-DD when it is a date-like object.
    invoice_date = output_data.get('date')
    if hasattr(invoice_date, 'strftime'):
        output_data['date'] = invoice_date.strftime('%Y-%m-%d')

    # then print the formatted JSON data output; default=str covers any
    # other value json cannot serialize on its own (e.g. further dates).
    print(json.dumps(output_data, indent=2, default=str))

Ejemplo n.º 20

0

Mostrar archivo

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import os

# Templates and the sample invoice both live in the local "datasets" folder.
templates = read_templates('./datasets')
# Extract the Myntra sample invoice with those templates.
result = extract_data('datasets/MktPlace-Myntra.pdf', templates=templates)

Ejemplo n.º 21

0

Mostrar archivo

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import sys

# Hard-coded sample invoice; the commented line below takes it from the
# command line instead.
filename = 'C://Users//Shahrukh//Desktop//document_ai//invoiceX//pdf_input//sample_pg_6.pdf'
#filename = sys.argv[1]
# Load the templates from the invoiceX templates folder and show them.
templates = read_templates(
    'C://Users//Shahrukh//Desktop//document_ai//invoiceX//templates')
print(templates)

# Run the extraction and print its result.
result = extract_data(filename, templates=templates)
#print("\n")
print(result)
#print("Working inside the jojo code")

# Preprocessing: re-arrange and re-formating the extracted output
'''
date = result['date'].strftime('%d, %b %Y')
total = result['total']
invoice_number = result['invoice_number']
addr_from = result['From_Address']
addr_to = result['To_Address']
'''

Ejemplo n.º 22

0

Mostrar archivo

def parse_pdf(pdf_path):
    """Extract ``pdf_path`` with the templates bundled under peco_assistant."""
    template_dir = 'peco_assistant/data/templates'
    return extract_data(pdf_path, templates=read_templates(template_dir))

Ejemplo n.º 23

0

Mostrar archivo

Archivo: test_extraction.py Proyecto: Chatbot123/pdftotext

 def setUp(self):
     """Load the default templates before each test."""
     loaded_templates = read_templates()
     self.templates = loaded_templates

Ejemplo n.º 24

0

Mostrar archivo

Archivo: test_cli.py Proyecto: m3nu/invoice2data

 def setUp(self):
     """Prepare the fixtures: default templates, then the CLI parser."""
     default_templates = read_templates()
     self.templates = default_templates
     cli_parser = create_parser()
     self.parser = cli_parser

Ejemplo n.º 25

0

Mostrar archivo

 def read_templates(self):
     """Return the templates loaded from ``settings.TEMPLATE_DIR``."""
     loaded = read_templates(settings.TEMPLATE_DIR)
     return loaded

Ejemplo n.º 26

0

Mostrar archivo

def extract_data(invoicefile, templates=None, input_module="png", cmdlist=None, conv_cmdlist=None, tid=None):
    """Extract structured data from a PDF/image invoice.

    A template is first looked up by ``tid``; its ``psm`` and ``imgcmd``
    options, when present, override ``cmdlist`` and ``conv_cmdlist`` for the
    text extraction. Without a ``tid`` match, the templates are tried in
    order and the first one matching the extracted text is used.

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG
        (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list, optional
        Template objects; loaded with ``read_templates()`` when None.
    input_module : str, optional
        Key into ``input_mapping`` selecting the text-extraction module.
    cmdlist : list, optional
        Command list forwarded to the input module's ``to_text``.
    conv_cmdlist : list, optional
        Conversion command list forwarded to ``to_text``.
    tid : str, optional
        Identifier compared with ``str()`` of each entry of a template's
        ``tid`` option.

    Returns
    -------
    tuple
        ``(output, missed, corrected, issue_lines, qtyerr, noofitem)`` on
        every path, so callers can always unpack six values. ``output`` is
        the template's extraction result, ``[]`` when no template matches,
        or ``False`` when an exception occurred. The other five values come
        from ``post_process`` when the ``tid``-selected template has a
        ``decimal`` option, and are ``-1, -1, [], "", -1`` otherwise.

    Notes
    -----
    Exceptions are logged with their traceback and are not propagated.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Metrics reported whenever post_process() does not run.
    missed = -1
    corrected = -1
    issue_lines = []
    qtyerr = ""
    noofitem = -1

    try:
        if templates is None:
            templates = read_templates()

        input_module = input_mapping[input_module]

        logger.error("Input tid is %s and Input module is %s", tid, input_module)

        # The first template listing `tid` among its "tid" options wins.
        t = None
        for tt in templates:
            if "tid" not in tt.options:
                continue
            if any(str(tid_option) == tid for tid_option in tt.options["tid"]):
                t = tt
                logger.error(f'Template found based on tid {t.options["tid"]} {t["issuer"]}')
                break

        if t is not None and "psm" in t.options:
            psm = str(t.options["psm"])
            # %s rather than %d: the option may be stored as a string.
            logger.error("PSM is %s", psm)
            if psm == "3":
                cmdlist = copy.deepcopy(cmdlist_psm3)
            else:
                # Patch the copy, not the shared module-level list, so that
                # concurrent calls cannot see each other's psm value.
                cmdlist = copy.deepcopy(cmdlist_psm6)
                cmdlist[6] = psm

        if t is not None and "imgcmd" in t.options:
            logger.error("imgcmd is %s", t.options["imgcmd"])
            conv_cmdlist = t.options["imgcmd"]

        extracted_str = input_module.to_text(invoicefile, cmdlist=cmdlist, conv_cmdlist=conv_cmdlist).decode("utf-8")

        logger.debug("START pdftotext result ===========================")
        logger.error(extracted_str)
        logger.debug("END pdftotext result =============================")

        logger.debug("Testing {} template files".format(len(templates)))

        if t is None:
            # No tid-selected template: fall back to content matching.
            for candidate in templates:
                optimized_str = candidate.prepare_input(extracted_str)
                if candidate.matches_input(optimized_str):
                    output = candidate.extract(optimized_str)
                    return output, missed, corrected, issue_lines, qtyerr, noofitem

            logger.error("No template for %s", invoicefile)
            return [], missed, corrected, issue_lines, qtyerr, noofitem

        optimized_str = t.prepare_input(extracted_str)
        output = t.extract(optimized_str)
        if "decimal" in t.options:
            missed, corrected, issue_lines, qtyerr, noofitem = post_process(output, t.options)

        return output, missed, corrected, issue_lines, qtyerr, noofitem
    except Exception as ex:
        # logger.exception keeps the traceback that str(ex) alone loses.
        logger.exception("Exception occured in invoice conversion %s", ex)

    return False, -1, -1, [], "", -1

Ejemplo n.º 27

0

Mostrar archivo

import os
import invoice2data as ntd
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
file_name = 'NT_01.pdf'
temp_name = 'pdf2inv.py'
file_path = os.path.join(
    r'C:\Users\fkhalil\primeStone\docrecog\sampleDocs\EURO DIESEL\B1959500485',
    file_name)
temp_path = os.path.join(r'C:\Users\fkhalil\primeStone\docrecog\templates',
                         temp_name)
print(file_path, temp_path)

# read_templates() expects the *folder* that holds the template files, not
# the path of a single file, so pass the directory containing temp_path.
templates = read_templates(os.path.dirname(temp_path))
result = extract_data(file_path, templates=templates)

Ejemplo n.º 28

0

Mostrar archivo

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates

# NOTE(review): read_templates() is given the invoice file itself here.
# Other callers pass a templates *folder*; confirm which folder was
# intended, or call read_templates() with no argument for the defaults.
templates = read_templates('test.pdf')
# Extract the invoice with whatever templates were loaded above.
result = extract_data('test.pdf', templates=templates)

# st = "Total: 4.00 4,123.00"
# result = st.split()
# result.pop(1)
# print(" ".join(result))

Ejemplo n.º 29

0

Mostrar archivo

Archivo: main.py Proyecto: jfitoussi/invoice2data

def extract_data(invoicefile, templates=None, input_module=pdftotext, with_extracted_str=False, **kwargs):
    """Extracts structured data from PDF/image invoices.

    The text returned by `input_module.to_text` is decoded as UTF-8,
    cleaned of U+00A0 and U+00C2 characters and handed, together with the
    templates, to `data_from_template`.

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG
        (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list, optional
        Template objects; loaded with `read_templates()` when None.
    input_module : module, optional
        Object exposing `to_text(path, **kwargs)`. Defaults to `pdftotext`.
    with_extracted_str : bool, optional
        When true, the extracted text is returned alongside the data.
    **kwargs
        Forwarded unchanged to `input_module.to_text`.

    Returns
    -------
    data or False
        The truthy result of `data_from_template`, or False when it yields
        nothing. With `with_extracted_str`, a `(data_or_False,
        extracted_str)` tuple is returned instead.

    Examples
    --------
    When using `invoice2data` as an library

    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    extracted_str = input_module.to_text(invoicefile, **kwargs).decode('utf-8')
    # Replace non-breaking spaces and stray U+00C2 characters with plain
    # spaces. decode() always yields text and replace() on text cannot
    # fail, so no exception handling is needed here (the previous bare
    # "except: pass" could only hide unrelated errors).
    extracted_str = extracted_str.replace(u'\xa0', u' ').replace(u"\xc2", " ")

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    data = data_from_template(templates, extracted_str)
    # NOTE(review): only the truthiness test is guarded below;
    # data_from_template() runs outside the try, so a ValueError raised
    # there propagates to the caller. Confirm that this is intended.
    try:
        if data:
            if with_extracted_str:
                return data, extracted_str
            return data
    except ValueError:
        logger.warning("Exception in parsing", exc_info=True)

    logger.warning('No template for %s', invoicefile)
    if with_extracted_str:
        return False, extracted_str
    return False