Python read_templatesの例、invoice2data.extract.loader.read_templates Pythonの例

コード例 #1

0

ファイルを表示

ファイル: account_invoice_import.py プロジェクト: CURE-EMR/oca10

 def invoice2data_parse_invoice(self, file_data):
     """Write the invoice bytes to a temp file and run invoice2data on it.

     :param file_data: raw content of the invoice file, written verbatim to
         a file created with ``mkstemp()``.
     :raises UserError: if ``extract_data`` raises any exception; the
         original error text is embedded in the message.

     NOTE(review): in this excerpt the temporary file is never removed and
     ``invoice2data_res`` is not returned -- confirm against the full method.
     """
     logger.info('Trying to analyze PDF invoice with invoice2data lib')
     fd, file_name = mkstemp()
     try:
         os.write(fd, file_data)
     finally:
         # Always release the descriptor, even if the write fails.
         os.close(fd)
     # Transfer log level of Odoo to invoice2data
     loggeri2data.setLevel(logger.getEffectiveLevel())
     local_templates_dir = tools.config.get('invoice2data_templates_dir',
                                            False)
     logger.debug('invoice2data local_templates_dir=%s',
                  local_templates_dir)
     templates = []
     # Local templates are loaded first, built-in ones are appended after.
     if local_templates_dir and os.path.isdir(local_templates_dir):
         templates += read_templates(local_templates_dir)
     exclude_built_in_templates = tools.config.get(
         'invoice2data_exclude_built_in_templates', False)
     if not exclude_built_in_templates:
         templates += read_templates()
     logger.debug('Calling invoice2data.extract_data with templates=%s',
                  templates)
     try:
         invoice2data_res = extract_data(file_name, templates=templates)
     # "as" form is valid on Python 2.6+ and 3; the comma form is
     # a SyntaxError on Python 3.
     except Exception as e:
         raise UserError(
             _("PDF Invoice parsing failed. Error message: %s") % e)

コード例 #2

0

ファイルを表示

def main(args=None):
    """Take folder or single file and analyze each.

    Parses command-line arguments when ``args`` is None, loads templates,
    runs ``extract_data`` on every file in ``args.input_files``, optionally
    copies/moves each successfully parsed file under a formatted name, hands
    the collected results to ``generate_output`` and finally exits the
    process with the ``missed`` value reported for the last processed file
    (0 when there were no input files).
    """
    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Both options are "+"-separated strings; default to None so the names
    # are always bound (cmdlist used to be unbound when the option was unset).
    cmdlist = args.cmdlist.split("+") if args.cmdlist else None
    imgcmd = args.imgcmd.split("+") if args.imgcmd else None

    templates = []
    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates()
    output = []

    # Bound up front so sys.exit() below works with an empty input list.
    missed = 0

    for f in args.input_files:
        res, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(f.name, templates=templates, input_module=args.input_reader, cmdlist=cmdlist, conv_cmdlist=imgcmd, tid=args.tid)
        if res:
            logger.info(res)
            output.append(res)
            if args.copy or args.move:
                # The date is inserted as-is (no strftime), for both the
                # copy and the move target.
                filename = args.filename.format(
                    date=res["date"],
                    invoice_number=res["invoice_number"],
                    desc=res["desc"],
                )
                if args.copy:
                    shutil.copyfile(f.name, join(args.copy, filename))
                if args.move:
                    shutil.move(f.name, join(args.move, filename))
        f.close()

    generate_output(output, output_name=args.output_name, output_date_format=args.output_date_format, output_module=args.output_format)

    sys.exit(missed)

コード例 #3

0

ファイルを表示

def main(args=None):
    """Analyze every input file and write the collected results.

    When ``args`` is None the arguments come from ``create_parser()``.
    Templates are gathered from the optional external folder and, unless
    excluded, from the built-in set. Each file that yields a result is
    logged, collected and optionally copied and/or moved under a name built
    from ``args.filename``. Results are written through the selected output
    module when one is configured.
    """
    if args is None:
        args = create_parser().parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    input_module = input_mapping[args.input_reader]
    output_module = output_mapping[args.output_format]

    templates = []
    # External folder first, built-in templates afterwards.
    if args.template_folder:
        external_dir = os.path.abspath(args.template_folder)
        templates += read_templates(external_dir)
    if not args.exclude_built_in_templates:
        templates += read_templates()

    output = []
    for invoice in args.input_files:
        res = extract_data(invoice.name,
                           templates=templates,
                           input_module=input_module)
        if not res:
            invoice.close()
            continue

        logger.info(res)
        output.append(res)

        if args.copy:
            copy_name = args.filename.format(
                date=res['date'].strftime('%Y-%m-%d'),
                invoice_number=res['invoice_number'],
                desc=res['desc'],
            )
            shutil.copyfile(invoice.name, join(args.copy, copy_name))
        if args.move:
            move_name = args.filename.format(
                date=res['date'].strftime('%Y-%m-%d'),
                invoice_number=res['invoice_number'],
                desc=res['desc'],
            )
            shutil.move(invoice.name, join(args.move, move_name))
        invoice.close()

    if output_module is None:
        return
    output_module.write_to_file(output, args.output_name,
                                args.output_date_format)

コード例 #4

0

ファイルを表示

def read_file(filename, debug):
    """Extract one invoice from ``INPUT/`` and write it as JSON to ``OUTPUT/``.

    :param filename: name of the invoice file inside ``INPUT/``.
    :param debug: when exactly ``True``, the PDF text is printed first to
        help with debugging/template creation.

    On success the result is written to ``OUTPUT/<stem>.json``; if that JSON
    has no ``date_due`` key, one is added using
    ``helper_functions.add_month`` applied to the invoice date. On failure
    the file name is passed to ``helper_functions.append_error``.
    """
    input_path = 'INPUT/' + filename

    # If debug is active, get PDF as string for debugging/template creation
    if debug is True:
        with open(input_path, 'rb') as f:
            pdf = pdftotext.PDF(f)
        print('\n\n'.join(pdf))

    templates = read_templates('TEMPLATES/')
    result = extract_data(input_path, templates, pdftotextdef)

    # No usable result: add file name to error list and move on.
    if result == False:
        helper_functions.append_error(filename)
        return

    json_path = 'OUTPUT/' + os.path.splitext(filename)[0] + '.json'
    to_json.write_to_file(result, json_path, '%Y-%m-%d')

    # Add a due date derived from the invoice date when none is present.
    with open(json_path, 'r+') as file:
        data = json.load(file)
        if "date_due" not in data:
            date_obj = datetime.strptime(data["date"], '%Y-%m-%d')
            json_in = {
                "date_due":
                helper_functions.add_month(date_obj).strftime('%Y-%m-%d')
            }
            data.update(json_in)
            file.seek(0)
            json.dump(data, file, indent=4, sort_keys=True)
            # Drop any bytes of the previous content left beyond the new
            # end, otherwise the file could end with stale JSON fragments.
            file.truncate()

コード例 #5

0

ファイルを表示

def extract_data(invoicefile, templates=None, input_module=pdftotext, debug=False):
    """Extract invoice fields using the first matching template.

    The text produced by ``input_module.to_text`` is decoded as UTF-8 and
    offered to each template in order. If none matches, the first template
    whose ``template_name`` contains ``'default'`` is used as a fallback.
    Returns the template's extraction result, or ``False`` when neither a
    matching nor a default template exists.
    """
    if debug:
        logging.basicConfig(level=logging.DEBUG)

    if templates is None:
        templates = read_templates()

    raw_text = input_module.to_text(invoicefile)
    extracted_str = raw_text.decode('utf-8')

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    # Nothing matched: look for a template flagged as the default one.
    default_template = None
    for candidate in templates:
        if 'default' in candidate['template_name']:
            default_template = candidate
            break

    if default_template:
        logger.error("Falling back to default template.")
        fallback_input = default_template.prepare_input(extracted_str)
        return default_template.extract(fallback_input)

    # Master template
    logger.error('No template for %s', invoicefile)
    return False

コード例 #6

0

ファイルを表示

def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Extract structured data from a PDF or image invoice.

    The invoice is converted to text by ``input_module`` and the text is
    tested against each template in turn; the first template that matches
    performs the extraction. When ``templates`` is None they are loaded with
    ``read_templates()``.

    Parameters
    ----------
    invoicefile : str
        Path of the electronic invoice file in PDF, JPEG or PNG
        (example: "/home/duskybomb/pdf/invoice.pdf").
    templates : list of instances of class `InvoiceTemplate`, optional
        Templates to try, in order. Loaded via `read_templates` if omitted.
    input_module : {'pdftotext', 'pdfminer', 'tesseract'}, optional
        Library used to extract text from the given `invoicefile`.

    Returns
    -------
    dict or False
        Extracted and matched fields, or False if no template matches.

    Notes
    -----
    Import the required `input_module` when using invoice2data as a library.

    See Also
    --------
    read_templates : Function where templates are loaded
    InvoiceTemplate : Class representing single template files that live as .yml files on the disk

    Examples
    --------
    When using `invoice2data` as a library

    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    raw_text = input_module.to_text(invoicefile)
    extracted_str = raw_text.decode("utf-8")

    logger.debug("START pdftotext result ===========================")
    logger.debug(extracted_str)
    logger.debug("END pdftotext result =============================")

    logger.debug("Testing {} template files".format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    logger.error("No template for %s", invoicefile)
    return False

コード例 #7

0

ファイルを表示

ファイル: filedata.py プロジェクト: shahrukhgellani/DemoFRE

def extract_multi(file_refrence, catagory):
    """Split a PDF, extract each page and return the combined HTML.

    Calls ``pdf_splitter``, runs ``extract_data`` on every ``*.pdf`` found in
    the processing folder, concatenates the ``to_table`` output of each page,
    calls ``remove_file()`` and appends a total line built from the global
    ``total``, which is reset to 0 before returning.
    """
    global total

    pdf_splitter(file_refrence, catagory)

    page_pattern = r"C:\Users\Shahrukh\Desktop\djangofilesupload\filesupload\pdf_processing\{}".format("*.pdf")
    pages = glob.glob(page_pattern)
    print(pages)

    templates = read_templates(r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates')

    result = ''
    for page in pages:
        extracted = extract_data(page, templates=templates)
        result += to_table(extracted, page)

    remove_file()

    # `total` is read here before being reset; presumably to_table
    # accumulates into it -- verify against its definition.
    ret_data = '{result}<h3 style="float:right">Total: {total}</h3>'.format(result=result, total=total)
    total = 0
    return ret_data

コード例 #8

0

ファイルを表示

ファイル: invoice_manipulation.py プロジェクト: joco8/Invoice-Handler

 def __init__(self, invDirectory, filename):
     """Parse one invoice and expose its fields as instance attributes.

     Every key of a truthy extraction result becomes an attribute. When the
     extraction yields nothing, ``issuer`` is set to the string "null" and
     ``filename`` to the given file name instead.
     """
     templates = read_templates(cwd + '/tplf')
     pdfFile = invDirectory + '/' + filename
     result = extract_data(pdfFile, templates=templates)
     print(result)
     print(filename)

     if not result:
         # Three blank lines separate failures in the console output.
         for _ in range(3):
             print()
         setattr(self, "issuer", "null")
         setattr(self, "filename", filename)
         return

     for key, value in result.items():
         setattr(self, key, value)

コード例 #9

0

ファイルを表示

def get_parsed_data(templates, extracted_str, partiallyExtracted):
    """Run the first matching template over already-extracted text.

    Logs the text, loads the default templates when ``templates`` is None,
    and returns ``extract(prepared_text, partiallyExtracted)`` of the first
    template whose ``matches_input`` accepts the prepared text. Returns
    ``False`` when no template matches.
    """
    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    if templates is None:
        templates = read_templates()

    logger.debug('Testing {} template files'.format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared, partiallyExtracted)

    return False

コード例 #10

0

ファイルを表示

ファイル: main.py プロジェクト: us241098/invoice2data

def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Return the fields extracted by the first template matching the invoice.

    The invoice text comes from ``input_module.to_text`` decoded as UTF-8.
    Templates default to ``read_templates()``. Returns ``False`` and logs an
    error when no template matches.
    """
    if templates is None:
        templates = read_templates()

    raw_text = input_module.to_text(invoicefile)
    extracted_str = raw_text.decode('utf-8')

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    logger.error('No template for %s', invoicefile)
    return False

コード例 #11

0

ファイルを表示

ファイル: test.py プロジェクト: aksonlyaks/invoice2data

def thread_fn(file, dummy):
    """Extract one invoice and log a one-line quality report for it.

    The report is logged in green when ``missed`` is 0 and ``qtyerr`` equals
    "Match"; otherwise it is logged in yellow and also appended, prefixed by
    the file name, to ``issue_list``. ``dummy`` is not used.
    """
    # NOTE(review): these running totals are local and discarded when the
    # function returns -- confirm whether they were meant to be shared.
    total_missed = 0
    total_corrected = 0
    total_line_with_issue = 0

    templates = read_templates('./templates')
    cmdlist = ["tesseract", "-c", "tessedit_char_whitelist=/.: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"]
    result, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(r+file, input_module=INPUT_MODULE, templates=templates, cmdlist=cmdlist, conv_cmdlist=None, tid = TID)

    total_missed += missed
    total_corrected += corrected
    total_line_with_issue += len(issue_lines)

    report = f'=============================> missed: {missed} corrected: {corrected} line with issue: {len(issue_lines)} Qty issue: {qtyerr} Total Items: {noofitem}<============================'

    is_clean = missed == 0 and qtyerr == "Match"
    if is_clean:
        logger.error(CGREEN + report + CEND)
    else:
        logger.error(CYELLOW + report + CEND)
        issue_list.append(file+"\t"+report)

コード例 #12

0

ファイルを表示

def extract_invoice_details(filename):
    """Extract an uploaded invoice and store the result as a JSON file.

    :param filename: name of the file inside ``input/uploads/``; an empty
        string makes the function a no-op. Files whose extension is not
        ``pdf`` are first converted with
        ``MakingSearchablePDFs.convert_image_to_searchable_pdf``.

    On success the extracted fields are serialised to JSON (non-JSON values
    such as dates are stringified) and written next to
    ``output/processed/<filename>``. On failure a message is printed and
    nothing is written.
    """
    if filename == '':
        return

    # Images are converted to a searchable PDF first; PDFs are used as-is.
    if filename.split('.')[-1] != 'pdf':
        filename = MakingSearchablePDFs.convert_image_to_searchable_pdf(
            filename)

    # YAML Template System
    source = 'input/uploads/' + filename
    templates = read_templates('Templates/')
    result = extract_data(source, templates=templates)
    print('\n', type(result))
    print('\n', result)

    if result == False:
        # Nothing was extracted, so there is no JSON document to store.
        print('Failed for Processing of Invoice!!!')
        return

    destination = 'output/processed/' + filename
    json_data = json.dumps(result,
                           indent=4,
                           sort_keys=True,
                           default=str)
    print(type(json_data), json_data)

    # NOTE(review): name_of_image is not defined in this function;
    # presumably a module-level name -- confirm.
    with open(destination + name_of_image + '_json.json', 'w') as file:
        # Write the serialised text: file.write() rejects the raw dict.
        file.write(json_data)

コード例 #13

0

ファイルを表示

import os
from pathlib import Path
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import tesseract4
import xlsxwriter
from my_logger import logger

# Rename the shared logger imported from my_logger.
logger.name = "FX"
# All paths are resolved relative to a work_dir folder next to this file.
work_dir_path = Path(__file__).parent / 'work_dir'
invoice_folder = work_dir_path / "original"
# Templates are loaded once, at import time, from work_dir/templates.
templates = read_templates(work_dir_path / "templates")

# file_name = "invoice-test.pdf"
#file_name = "57001194492019.tiff"
#file_path = work_dir_path / "original" / file_name
#result = extract_data(str(file_path), templates=templates, input_module=tesseract4)
#print(result)


#Excel setting
def no_change(val):
    """Return ``val`` untouched (identity converter)."""
    return val


def to_number(val):
    """Convert a string with comma group separators to a float."""
    without_commas = val.replace(',', '')
    return float(without_commas)


# Create the invoices.xlsx workbook and add one worksheet to it.
workbook = xlsxwriter.Workbook('invoices.xlsx')
worksheet = workbook.add_worksheet()

コード例 #14

0

ファイルを表示

ファイル: filedata.py プロジェクト: shahrukhgellani/DemoFRE

def extract(name):
    """Run ``extract_data`` on ``name`` with the invoiceX templates."""
    invoicex_templates = read_templates(r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates')
    return extract_data(name, templates=invoicex_templates)

コード例 #15

0

ファイルを表示

ファイル: pdfinvoice2data.py プロジェクト: senthilsweb/Invoicepdf2Data

from flask import Flask, render_template, request
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import yaml
import glob
import errno
import os
import pymongo
#import PyPDF2

# Flask application object for this module.
app = Flask(__name__)

# Templates are loaded once, at import time, from the invoice_templates folder.
read_template = read_templates(folder="invoice_templates")

# Module-level lists, all starting empty.
pdfFiles = []
pdfFiles1 = []
pdfFiles2 = []
filenames = []
path_filename = []

#path = "/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files"
# MongoDB on localhost: database "pdfinvdata", collection "invoicedata".
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["pdfinvdata"]
mycol = mydb["invoicedata"]

for filename in os.listdir(
        '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/'
):
    if filename.endswith(".pdf"):
        filenames.append(filename)
        pathname = '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/' + str(

コード例 #16

0

ファイルを表示

#!/Library/Frameworks/Python.framework/Versions/3.7/bin/python3

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import sys

# First command-line argument: invoice file name relative to the admin folder.
file = sys.argv[1]

filename = '/Applications/XAMPP/xamppfiles/htdocs/invoice-app/admin/' + file

templates = read_templates('/Users/krzemson/Desktop/aed/tpl')
result = extract_data(filename, templates=templates)

# Print the result as JSON on stdout. default=str stringifies values that
# json cannot encode natively (e.g. a datetime invoice date), which would
# otherwise make json.dumps raise TypeError.
print(json.dumps(result, default=str))

コード例 #17

0

ファイルを表示

ファイル: invoice2data.py プロジェクト: hilali-msc/financial-documents-ocr-deep-learning

# Importing all the required libraries

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import pdftotext
import pandas as pd

# Load the custom templates from the ./template/ folder
templates = read_templates('./template/')

#print(templates)

# Extract data from the PDF using the custom templates and pdftotext
result = extract_data('./data/pnlsheet.pdf',
                      templates=templates,
                      input_module=pdftotext)

# Store the extracted data in a DataFrame
# NOTE(review): pandas raises ValueError for a dict made only of scalar
# values when no index is given -- confirm the template yields list-valued
# fields, and that `result` is not False here.
df = pd.DataFrame(data=result)

# Export the DataFrame to a CSV file
df.to_csv('./data/invoice2data_simple.csv')
''' 
You can use any of the supported input modules to extract the text: pdftotext, pdfminer or tesseract.
Specifying one is optional; pdftotext is used by default.

The custom template named temp.yml is placed in the templates. You can remove the templates parameter in
extract_data(). Default templates will be used

'''

コード例 #18

0

ファイルを表示

ファイル: test_cli.py プロジェクト: yucer/invoice2data

 def setUp(self):
     """Store the templates and a parser on the test instance.

     ``read_templates()`` is called without arguments, and the parser comes
     from ``create_parser()``.
     """
     self.templates = read_templates()
     self.parser = create_parser()

コード例 #19

0

ファイルを表示

ファイル: extract_data.py プロジェクト: audacesk/blue_ml

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import datetime
import argparse

if __name__ == '__main__':
    # Initialize the arguments parser
    parser = argparse.ArgumentParser(description="Extract data from invoices")

    # Add the parameters positional/optional
    parser.add_argument('-t','--templates_dirpath', help="Templates directory path", type=str)
    parser.add_argument('-i','--invoice_path', help="Invoice file path", type=str)

    # Parse the arguments
    args = parser.parse_args()

    templates = read_templates(args.templates_dirpath)

    output_data = extract_data(args.invoice_path, templates=templates)

    # A falsy result cannot be indexed below, so stop with a clear message
    # (printed to stderr, exit status 1) instead of a TypeError.
    if not output_data:
        raise SystemExit(
            "No data could be extracted from %s" % args.invoice_path)

    # json cannot encode the date object, so replace it by its ISO string.
    date_time = output_data['date'].strftime('%Y-%m-%d')
    output_data['date'] = date_time
    # then print the formatted JSON data output
    print(json.dumps(output_data, indent=2))

コード例 #20

0

ファイルを表示

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import os

# Load templates from ./datasets and run the extraction on one sample PDF.
templates = read_templates('./datasets')
result = extract_data('datasets/MktPlace-Myntra.pdf', templates=templates)

コード例 #21

0

ファイルを表示

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import sys

# Hard-coded sample invoice; the command-line variant is kept disabled below.
filename = 'C://Users//Shahrukh//Desktop//document_ai//invoiceX//pdf_input//sample_pg_6.pdf'
#filename = sys.argv[1]
# Load the invoiceX templates and show what was loaded.
templates = read_templates(
    'C://Users//Shahrukh//Desktop//document_ai//invoiceX//templates')
print(templates)

# Run the extraction and print the raw result.
result = extract_data(filename, templates=templates)
#print("\n")
print(result)
#print("Working inside the jojo code")

# Preprocessing: re-arrange and re-formating the extracted output
'''
date = result['date'].strftime('%d, %b %Y')
total = result['total']
invoice_number = result['invoice_number']
addr_from = result['From_Address']
addr_to = result['To_Address']
'''

コード例 #22

0

ファイルを表示

def parse_pdf(pdf_path):
    """Return the ``extract_data`` result for ``pdf_path`` using the peco_assistant templates."""
    peco_templates = read_templates('peco_assistant/data/templates')
    return extract_data(pdf_path, templates=peco_templates)

コード例 #23

0

ファイルを表示

ファイル: test_extraction.py プロジェクト: Chatbot123/pdftotext

 def setUp(self):
     """Store the result of ``read_templates()`` (no arguments) on the test instance."""
     self.templates = read_templates()

コード例 #24

0

ファイルを表示

ファイル: test_cli.py プロジェクト: m3nu/invoice2data

 def setUp(self):
     """Store the templates and a parser on the test instance.

     ``read_templates()`` is called without arguments, and the parser comes
     from ``create_parser()``.
     """
     self.templates = read_templates()
     self.parser = create_parser()

コード例 #25

0

ファイルを表示

 def read_templates(self):
     """Load the templates found in ``settings.TEMPLATE_DIR``."""
     template_dir = settings.TEMPLATE_DIR
     return read_templates(template_dir)

コード例 #26

0

ファイルを表示

def extract_data(invoicefile, templates=None, input_module="png", cmdlist=None, conv_cmdlist=None, tid=None):
    """Extracts structured data from PDF/image invoices.

    This function uses the text extracted from a PDF file or image and
    pre-defined regex templates to find structured data.

    Reads template if no template assigned
    Required fields are matches from templates

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list of instances of class `InvoiceTemplate`, optional
        Templates are loaded using `read_templates` function in `loader.py`
    input_module : str, optional
        key into `input_mapping` selecting the module used to extract text
        from the given `invoicefile` (default "png")
    cmdlist : list, optional
        passed to `input_module.to_text`; replaced by a copy of
        `cmdlist_psm3` / `cmdlist_psm6` when the tid-selected template has a
        "psm" option
    conv_cmdlist : optional
        passed to `input_module.to_text`; replaced by the "imgcmd" option of
        the tid-selected template when present
    tid : str, optional
        template id; the first template whose "tid" option contains it
        (compared as strings) is used without calling `matches_input`

    Returns
    -------
    tuple, extraction result or False
        * template selected by `tid`, or no template matched at all:
          ``(output, missed, corrected, issue_lines, qtyerr, noofitem)``
        * no `tid` match but a template matched the text: the bare result of
          ``template.extract(...)``
        * any exception: ``False`` (the exception is logged)

        NOTE(review): the return shape differs per path -- callers that
        unpack six values will fail on the last two; confirm intent.

    Notes
    -----
    Import required `input_module` when using invoice2data as a library

    See Also
    --------
    read_templates : Function where templates are loaded
    InvoiceTemplate : Class representing single template files that live as .yml files on the disk

    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    try:
        t = None
        if templates is None:
            templates = read_templates()

        input_module = input_mapping[input_module]

        logger.error("Input tid is %s and Input module is %s", tid, input_module)
        # Pick the first template that lists `tid` among its "tid" options.
        for tt in templates:
            if t is not None:
                break
            if "tid" in tt.options:
                for tid_option in tt.options["tid"]:
                    if str(tid_option) == tid:
                        t = tt
                        logger.error(f'Template found based on tid {t.options["tid"]} {t["issuer"]}')

        # The selected template may override the OCR command line.
        if t is not None and "psm" in t.options:
            logger.error("PSM is %d", t.options["psm"])
            if str(t.options["psm"]) == "3":
                cmdlist = copy.deepcopy(cmdlist_psm3)
            else:
                # Writes the psm value into the shared cmdlist_psm6 list
                # before copying it.
                cmdlist_psm6[6] = str(t.options["psm"])
                cmdlist = copy.deepcopy(cmdlist_psm6)

        if t is not None and "imgcmd" in t.options:
            logger.error("imgcmd is %s", t.options["imgcmd"])
            conv_cmdlist = t.options["imgcmd"]

        extracted_str = input_module.to_text(invoicefile, cmdlist=cmdlist, conv_cmdlist=conv_cmdlist).decode("utf-8")

        logger.debug("START pdftotext result ===========================")
        logger.error(extracted_str)
        logger.debug("END pdftotext result =============================")

        logger.debug("Testing {} template files".format(len(templates)))
        # Defaults reported when post_process() is not run.
        missed = -1
        corrected = -1
        issue_lines = []
        qtyerr = ""
        noofitem = -1
        output = []
        if t is None:
            # No tid match: fall back to content-based template matching.
            for t in templates:
                optimized_str = t.prepare_input(extracted_str)

                if t.matches_input(optimized_str):
                    return t.extract(optimized_str)
        else:
            optimized_str = t.prepare_input(extracted_str)
            output = t.extract(optimized_str)
            if "decimal" in t.options:
                missed, corrected, issue_lines, qtyerr, noofitem = post_process(output, t.options)

            return output, missed, corrected, issue_lines, qtyerr, noofitem

        logger.error("No template for %s", invoicefile)
        return output, missed, corrected, issue_lines, qtyerr, noofitem
    except Exception as ex:
        logger.error("Exception occured in invoice conversion "+ str(ex))

    return False

コード例 #27

0

ファイルを表示

import os
import invoice2data as ntd
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
# Sample invoice and "template" file names, joined onto fixed local folders.
file_name = 'NT_01.pdf'
temp_name = 'pdf2inv.py'
file_path = os.path.join(
    r'C:\Users\fkhalil\primeStone\docrecog\sampleDocs\EURO DIESEL\B1959500485',
    file_name)
temp_path = os.path.join(r'C:\Users\fkhalil\primeStone\docrecog\templates',
                         temp_name)
print(file_path, temp_path)

# NOTE(review): temp_path points at a .py file, while every other call in
# these examples passes a templates folder -- confirm this is intended.
templates = read_templates(temp_path)
result = extract_data(file_path, templates=templates)

コード例 #28

0

ファイルを表示

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates

# NOTE(review): the same 'test.pdf' path is passed both as the templates
# location and as the invoice file -- confirm the templates path is intended.
templates = read_templates('test.pdf')
result = extract_data('test.pdf', templates=templates)

# st = "Total: 4.00 4,123.00"
# result = st.split()
# result.pop(1)
# print(" ".join(result))

コード例 #29

0

ファイルを表示

ファイル: main.py プロジェクト: jfitoussi/invoice2data

def extract_data(invoicefile, templates=None, input_module=pdftotext, with_extracted_str=False, **kwargs):
    """Extracts structured data from PDF/image invoices.

    This function uses the text extracted from a PDF file or image and
    pre-defined regex templates to find structured data.

    Reads template if no template assigned
    Required fields are matches from templates

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list of instances of class `InvoiceTemplate`, optional
        Templates are loaded using `read_templates` function in `loader.py`
    input_module : {'pdftotext', 'pdfminer', 'tesseract'}, optional
        library to be used to extract text from given `invoicefile`,
    with_extracted_str : bool, optional
        when true, the extracted text is returned alongside the result
    **kwargs
        forwarded unchanged to ``input_module.to_text``

    Returns
    -------
    dict or False
        extracted and matched fields or False if no template matches.
        With ``with_extracted_str`` set, a ``(result, extracted_str)`` tuple
        is returned instead, where ``result`` is the dict or False.

    Notes
    -----
    Import required `input_module` when using invoice2data as a library

    See Also
    --------
    read_templates : Function where templates are loaded
    InvoiceTemplate : Class representing single template files that live as .yml files on the disk

    Examples
    --------
    When using `invoice2data` as a library

    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    extracted_str = input_module.to_text(invoicefile, **kwargs).decode('utf-8')
    # Replace U+00A0 and U+00C2 characters by plain spaces. str.replace
    # cannot fail on the decoded text, so no exception guard is needed.
    extracted_str = extracted_str.replace(u'\xa0', u' ').replace(u"\xc2", " ")

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    data = data_from_template(templates, extracted_str)
    try:
        if data:
            if with_extracted_str:
                return data, extracted_str
            return data
    except ValueError:
        # NOTE(review): only the truth test above can raise here; presumably
        # guards against results with ambiguous truthiness -- confirm.
        logger.warning("Exception in parsing", exc_info=True)

    logger.warning('No template for %s', invoicefile)
    if with_extracted_str:
        return False, extracted_str
    return False