def invoice2data_parse_invoice(self, file_data):
    """Run invoice2data on the raw bytes of a PDF invoice.

    The bytes are written to a temporary file because ``extract_data`` is
    called with a file path. Templates come from the directory configured
    under ``invoice2data_templates_dir`` (when it exists) plus the built-in
    templates, unless ``invoice2data_exclude_built_in_templates`` is set.

    :param file_data: content of the PDF file, as accepted by ``os.write``
    :raises UserError: when ``extract_data`` raises any exception
    """
    logger.info('Trying to analyze PDF invoice with invoice2data lib')
    fd, file_name = mkstemp()
    try:
        try:
            os.write(fd, file_data)
        finally:
            os.close(fd)
        # Transfer log level of Odoo to invoice2data
        loggeri2data.setLevel(logger.getEffectiveLevel())
        local_templates_dir = tools.config.get(
            'invoice2data_templates_dir', False)
        logger.debug(
            'invoice2data local_templates_dir=%s', local_templates_dir)
        templates = []
        if local_templates_dir and os.path.isdir(local_templates_dir):
            templates += read_templates(local_templates_dir)
        exclude_built_in_templates = tools.config.get(
            'invoice2data_exclude_built_in_templates', False)
        if not exclude_built_in_templates:
            templates += read_templates()
        logger.debug(
            'Calling invoice2data.extract_data with templates=%s', templates)
        try:
            # NOTE(review): the extraction result is not used or returned in
            # the visible code -- confirm whether the function was truncated.
            invoice2data_res = extract_data(file_name, templates=templates)
        except Exception as e:
            # "as" form is valid on Python 2.6+ and Python 3; the former
            # "except Exception, e" is a syntax error on Python 3.
            raise UserError(
                _("PDF Invoice parsing failed. Error message: %s") % e)
    finally:
        # mkstemp() never deletes the file on its own; remove it so that
        # every parsed invoice does not leave a temp file behind.
        os.remove(file_name)
def main(args=None):
    """Take folder or single file and analyze each.

    Parses the command line when ``args`` is None, loads the external and/or
    built-in templates, runs ``extract_data`` on every input file, optionally
    copies/moves matched files, writes the collected results through
    ``generate_output`` and finally exits the process.

    The process exit status is the ``missed`` value reported for the last
    processed file (0 when no file reported one).
    """
    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Both command lists are "+"-separated strings; default to None so the
    # names are always bound (cmdlist used to be undefined when not given).
    cmdlist = args.cmdlist.split("+") if args.cmdlist else None
    imgcmd = args.imgcmd.split("+") if args.imgcmd else None

    templates = []
    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))
    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates()

    output = []
    missed = 0  # stays 0 when there are no input files
    for f in args.input_files:
        try:
            outcome = extract_data(
                f.name,
                templates=templates,
                input_module=args.input_reader,
                cmdlist=cmdlist,
                conv_cmdlist=imgcmd,
                tid=args.tid,
            )
            if isinstance(outcome, tuple):
                # (result, missed, corrected, issue_lines, qtyerr, noofitem)
                res, missed = outcome[0], outcome[1]
            else:
                # A bare result (e.g. False on failure): nothing to unpack.
                res = outcome

            if res:
                logger.info(res)
                output.append(res)
                if args.copy:
                    filename = args.filename.format(
                        date=res["date"],
                        invoice_number=res["invoice_number"],
                        desc=res["desc"],
                    )
                    shutil.copyfile(f.name, join(args.copy, filename))
                if args.move:
                    filename = args.filename.format(
                        date=res["date"],
                        invoice_number=res["invoice_number"],
                        desc=res["desc"],
                    )
                    shutil.move(f.name, join(args.move, filename))
        finally:
            # Close the handle even when extraction or copy/move raises.
            f.close()

    generate_output(
        output,
        output_name=args.output_name,
        output_date_format=args.output_date_format,
        output_module=args.output_format,
    )
    sys.exit(missed)
def main(args=None):
    """Analyze every input file given on the command line.

    When ``args`` is None the arguments are parsed with ``create_parser()``.
    Each input file is passed to ``extract_data``; matched results are
    logged, collected, and the source file is optionally copied and/or moved
    under a name built from ``args.filename``. The collected results are
    written with the selected output module, if there is one.
    """
    if args is None:
        args = create_parser().parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    input_module = input_mapping[args.input_reader]
    output_module = output_mapping[args.output_format]

    templates = []
    # Templates from the external folder, when one was given.
    if args.template_folder:
        templates.extend(read_templates(os.path.abspath(args.template_folder)))
    # Built-in templates, unless explicitly excluded.
    if not args.exclude_built_in_templates:
        templates.extend(read_templates())

    def _target_name(fields):
        # File name for the copied/moved invoice, built from extracted fields.
        return args.filename.format(
            date=fields['date'].strftime('%Y-%m-%d'),
            invoice_number=fields['invoice_number'],
            desc=fields['desc'],
        )

    output = []
    for invoice_file in args.input_files:
        res = extract_data(
            invoice_file.name, templates=templates, input_module=input_module)
        if res:
            logger.info(res)
            output.append(res)
            if args.copy:
                shutil.copyfile(
                    invoice_file.name, join(args.copy, _target_name(res)))
            if args.move:
                shutil.move(
                    invoice_file.name, join(args.move, _target_name(res)))
        invoice_file.close()

    if output_module is not None:
        output_module.write_to_file(
            output, args.output_name, args.output_date_format)
def read_file(filename, debug):
    """Extract one invoice from ``INPUT/`` and write it to ``OUTPUT/`` as JSON.

    :param filename: name of the file inside the ``INPUT/`` folder
    :param debug: when exactly ``True``, the PDF text is printed first to
        help with debugging / template creation

    On success the JSON file is written and, if it has no ``date_due`` key,
    a due date one month after ``date`` is added. On failure the file name
    is passed to ``helper_functions.append_error``.
    """
    source_path = 'INPUT/' + filename
    json_path = 'OUTPUT/' + os.path.splitext(filename)[0] + '.json'

    # If debug is active, get PDF as string for debugging/template creation
    if debug is True:
        with open(source_path, 'rb') as f:
            pdf = pdftotext.PDF(f)
            print('\n\n'.join(pdf))

    templates = read_templates('TEMPLATES/')
    result = extract_data(source_path, templates, pdftotextdef)

    # No template matched: add file name to error list and move on.
    if result is False:
        helper_functions.append_error(filename)
        return

    to_json.write_to_file(result, json_path, '%Y-%m-%d')

    # Default the due date to one month after the invoice date.
    with open(json_path, 'r+') as file:
        data = json.load(file)
        if "date_due" not in data:
            date_obj = datetime.strptime(data["date"], '%Y-%m-%d')
            data["date_due"] = (
                helper_functions.add_month(date_obj).strftime('%Y-%m-%d'))
            file.seek(0)
            json.dump(data, file, indent=4, sort_keys=True)
            # Rewriting in place: drop any bytes of the previous content
            # that extend past the new end of the document.
            file.truncate()
def extract_data(invoicefile, templates=None, input_module=pdftotext, debug=False):
    """Extract structured data from an invoice file using templates.

    The text returned by ``input_module.to_text`` is tried against each
    template in order; the first one whose ``matches_input`` accepts it
    performs the extraction. When none matches, the first template whose
    ``template_name`` contains ``'default'`` is used as a fallback.

    Returns the extraction result, or ``False`` when no template (and no
    default template) applies.
    """
    if debug:
        logging.basicConfig(level=logging.DEBUG)

    if templates is None:
        templates = read_templates()

    raw_text = input_module.to_text(invoicefile)
    extracted_str = raw_text.decode('utf-8')

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')
    logger.debug('Testing {} template files'.format(len(templates)))

    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    # No template matched: look for the first "default" (master) template.
    default_template = None
    for template in templates:
        if 'default' in template['template_name']:
            default_template = template
            break

    if default_template:
        logger.error("Falling back to default template.")
        prepared = default_template.prepare_input(extracted_str)
        return default_template.extract(prepared)

    logger.error('No template for %s', invoicefile)
    return False
def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Extract structured data from a PDF/image invoice.

    The invoice is converted to text by ``input_module`` and the text is
    tested against each template in turn; the first matching template
    performs the extraction.

    Parameters
    ----------
    invoicefile : str
        Path of the invoice file (example: "/home/duskybomb/pdf/invoice.pdf").
    templates : list, optional
        Template objects to try, in order. When None they are loaded with
        ``read_templates()``.
    input_module : module, optional
        Object exposing ``to_text(path)`` that returns UTF-8 encoded bytes.
        Defaults to ``pdftotext``.

    Returns
    -------
    dict or False
        Extracted fields, or False if no template matches.

    Examples
    --------
    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087', 'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}
    """
    if templates is None:
        templates = read_templates()

    raw_bytes = input_module.to_text(invoicefile)
    extracted_str = raw_bytes.decode("utf-8")

    logger.debug("START pdftotext result ===========================")
    logger.debug(extracted_str)
    logger.debug("END pdftotext result =============================")
    logger.debug("Testing {} template files".format(len(templates)))

    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    logger.error("No template for %s", invoicefile)
    return False
def extract_multi(file_refrence, catagory):
    """Split a PDF, extract every resulting page and return an HTML report.

    The pages produced by ``pdf_splitter`` are read back from the
    ``pdf_processing`` folder, each one is extracted and rendered with
    ``to_table``, and the concatenated markup is followed by a total line
    built from the module-level ``total``, which is then reset to 0.
    """
    global total

    pdf_splitter(file_refrence, catagory)

    page_paths = glob.glob(
        r"C:\Users\Shahrukh\Desktop\djangofilesupload\filesupload\pdf_processing\{}".format("*.pdf"))
    print(page_paths)

    templates = read_templates(
        r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates')

    html = ''
    for page_path in page_paths:
        extracted = extract_data(page_path, templates=templates)
        html += to_table(extracted, page_path)

    remove_file()

    ret_data = '{result}<h3 style="float:right">Total: {total}</h3>'.format(
        result=html, total=total)
    total = 0
    return ret_data
def __init__(self, invDirectory, filename):
    """Extract ``filename`` from ``invDirectory`` and expose fields as attributes.

    Every key of the extraction result becomes an attribute of the instance.
    When nothing could be extracted, ``issuer`` is set to the string "null".
    ``filename`` is always stored on the instance.
    """
    pdf_path = invDirectory + '/' + filename
    result = extract_data(pdf_path, templates=read_templates(cwd + '/tplf'))
    print(result)
    print(filename)

    if not result:
        # Three blank lines to make failed files stand out in the output.
        for _ in range(3):
            print()
        setattr(self, "issuer", "null")
    else:
        for key in result.keys():
            setattr(self, key, result[key])

    setattr(self, "filename", filename)
def get_parsed_data(templates, extracted_str, partiallyExtracted):
    """Run the first matching template over already-extracted text.

    ``templates`` defaults to ``read_templates()`` when None. The first
    template whose ``matches_input`` accepts the prepared text extracts the
    data, receiving ``partiallyExtracted`` as its second argument.

    Returns the template's extraction result, or ``False`` if none matches.
    """
    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    if templates is None:
        templates = read_templates()
    logger.debug('Testing {} template files'.format(len(templates)))

    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared, partiallyExtracted)

    return False
def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Convert an invoice to text and extract it with the first matching template.

    ``templates`` defaults to ``read_templates()``; ``input_module`` must
    provide ``to_text(path)`` returning UTF-8 encoded bytes.

    Returns the matching template's extraction result, or ``False`` when no
    template matches.
    """
    if templates is None:
        templates = read_templates()

    extracted_str = input_module.to_text(invoicefile).decode('utf-8')

    for message in (
        'START pdftotext result ===========================',
        extracted_str,
        'END pdftotext result =============================',
        'Testing {} template files'.format(len(templates)),
    ):
        logger.debug(message)

    # Lazily pair each template with its prepared text so that templates
    # after the first match are never touched.
    prepared_pairs = ((tpl, tpl.prepare_input(extracted_str)) for tpl in templates)
    for tpl, prepared in prepared_pairs:
        if tpl.matches_input(prepared):
            return tpl.extract(prepared)

    logger.error('No template for %s', invoicefile)
    return False
def thread_fn(file, dummy):
    """Extract one invoice and log a one-line quality report for it.

    :param file: file name, appended to the module-level prefix ``r``
    :param dummy: unused

    Files with missed items or a quantity mismatch are logged in yellow and
    recorded in the module-level ``issue_list``; clean files are logged in
    green. A file for which ``extract_data`` does not return its statistics
    tuple is recorded as an issue as well instead of raising.
    """
    cmdlist = ["tesseract", "-c", "tessedit_char_whitelist=/.: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"]
    templates = read_templates('./templates')

    outcome = extract_data(
        r + file,
        input_module=INPUT_MODULE,
        templates=templates,
        cmdlist=cmdlist,
        conv_cmdlist=None,
        tid=TID,
    )
    if not isinstance(outcome, tuple):
        # No statistics available (e.g. False after an internal error):
        # unpacking would raise, so report the file as failed instead.
        failure = 'conversion failed: extract_data returned %r' % (outcome,)
        logger.error(CYELLOW + failure + CEND)
        issue_list.append(file + "\t" + failure)
        return

    _result, missed, corrected, issue_lines, qtyerr, noofitem = outcome

    report = f'=============================> missed: {missed} corrected: {corrected} line with issue: {len(issue_lines)} Qty issue: {qtyerr} Total Items: {noofitem}<============================'
    if missed != 0 or qtyerr != "Match":
        logger.error(CYELLOW + report + CEND)
        issue_list.append(file + "\t" + report)
    else:
        logger.error(CGREEN + report + CEND)
def extract_invoice_details(filename):
    """Extract an uploaded invoice and store the result as a JSON file.

    Non-PDF uploads are first converted with
    ``MakingSearchablePDFs.convert_image_to_searchable_pdf``. The invoice is
    read from ``input/uploads/``; the serialized result is written next to
    ``output/processed/<filename>`` on success or ``output/failed/<filename>``
    on failure. Nothing happens for an empty ``filename``.
    """
    if filename == '':
        return

    # Images are converted to a searchable PDF; PDFs are used as they are.
    if filename.split('.')[-1] != 'pdf':
        filename = MakingSearchablePDFs.convert_image_to_searchable_pdf(
            filename)

    # YAML Template System
    source = 'input/uploads/' + filename
    templates = read_templates('Templates/')
    result = extract_data(source, templates=templates)
    print('\n', type(result))
    print('\n', result)

    # Serialize once, for both outcomes. default=str covers values json
    # cannot encode natively (e.g. datetime objects).
    json_data = json.dumps(result, indent=4, sort_keys=True, default=str)

    if result != False:
        destination = 'output/processed/' + filename
        print(type(json_data), json_data)
    else:
        destination = 'output/failed/' + filename
        print('Failed for Processing of Invoice!!!')

    # NOTE(review): name_of_image is not defined in this function; it is
    # presumably a module-level name -- confirm.
    with open(destination + name_of_image + '_json.json', 'w') as out_file:
        # Write the JSON text: file.write() rejects the raw dict/bool result.
        out_file.write(json_data)
"""Set up templates and the Excel workbook used to export invoices."""
import os
from pathlib import Path
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import tesseract4
import xlsxwriter
from my_logger import logger

logger.name = "FX"

# Working folders live next to this script.
work_dir_path = Path(__file__).parent.joinpath('work_dir')
invoice_folder = work_dir_path.joinpath("original")
templates = read_templates(work_dir_path.joinpath("templates"))

# Single-file example, kept for reference:
#   file_name = "invoice-test.pdf"
#   file_name = "57001194492019.tiff"
#   file_path = work_dir_path / "original" / file_name
#   result = extract_data(str(file_path), templates=templates, input_module=tesseract4)
#   print(result)


# Excel setting
def no_change(val):
    """Return ``val`` unchanged."""
    return val


def to_number(val):
    """Convert a string with comma thousands separators to a float."""
    without_commas = ''.join(val.split(','))
    return float(without_commas)


workbook = xlsxwriter.Workbook('invoices.xlsx')
worksheet = workbook.add_worksheet()
def extract(name):
    """Extract the invoice at path ``name`` using the invoiceX templates folder."""
    return extract_data(
        name,
        templates=read_templates(
            r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates'),
    )
from flask import Flask, render_template, request from invoice2data import extract_data from invoice2data.extract.loader import read_templates import yaml import glob import errno import os import pymongo #import PyPDF2 app = Flask(__name__) read_template = read_templates(folder="invoice_templates") pdfFiles = [] pdfFiles1 = [] pdfFiles2 = [] filenames = [] path_filename = [] #path = "/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files" myclient = pymongo.MongoClient("mongodb://localhost:27017/") mydb = myclient["pdfinvdata"] mycol = mydb["invoicedata"] for filename in os.listdir( '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/' ): if filename.endswith(".pdf"): filenames.append(filename) pathname = '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/' + str(
#!/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
"""Extract the invoice named on the command line and print it as JSON."""
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import datetime
import json
import sys


def _json_default(value):
    """Serialize the date/datetime values found in extraction results."""
    if isinstance(value, (datetime.date, datetime.datetime)):
        return value.isoformat()
    raise TypeError(
        'Object of type %s is not JSON serializable' % type(value).__name__)


if len(sys.argv) < 2:
    sys.exit('usage: %s <invoice file name>' % sys.argv[0])

file = sys.argv[1]
filename = '/Applications/XAMPP/xamppfiles/htdocs/invoice-app/admin/' + file
templates = read_templates('/Users/krzemson/Desktop/aed/tpl')
result = extract_data(filename, templates=templates)
# Extraction results carry datetime objects, which json.dumps cannot encode
# without a default handler.
print(json.dumps(result, default=_json_default))
"""Extract one PDF with a custom template and export the result to CSV."""
# Required libraries
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import pdftotext
import pandas as pd

# Custom templates folder
templates = read_templates('./template/')
#print(templates)

# Extract data from the PDF
result = extract_data(
    './data/pnlsheet.pdf',
    templates=templates,
    input_module=pdftotext,
)

# Store the extracted data in a data frame and export it as CSV
df = pd.DataFrame(data=result)
df.to_csv('./data/invoice2data_simple.csv')

# Notes:
# - input_module is optional; pdftotext is passed here. The original note
#   also lists pdfminer and tesseract as alternatives.
# - The custom template named temp.yml is placed in the templates folder.
# - The templates argument of extract_data() can be dropped, in which case
#   the default templates are used.
def setUp(self):
    """Load the default templates and build the argument parser for each test."""
    self.templates, self.parser = read_templates(), create_parser()
"""Command-line helper: extract one invoice and print it as JSON."""
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import datetime
import argparse
import sys

if __name__ == '__main__':
    # Initialize the arguments parser
    parser = argparse.ArgumentParser(description="Extract data from invoices")

    # Add the parameters positional/optional
    parser.add_argument('-t', '--templates_dirpath',
                        help="Templates directory path", type=str)
    # Nothing can be extracted without an invoice, so let argparse report a
    # missing path instead of failing later inside extract_data.
    parser.add_argument('-i', '--invoice_path',
                        help="Invoice file path", type=str, required=True)

    # Parse the arguments
    args = parser.parse_args()

    templates = read_templates(args.templates_dirpath)
    output_data = extract_data(args.invoice_path, templates=templates)

    # extract_data returns False when no template matches; indexing that
    # value would raise a TypeError.
    if not output_data:
        sys.exit('No data could be extracted from %s' % args.invoice_path)

    # Format the invoice date when it is present as a date/datetime.
    invoice_date = output_data.get('date')
    if isinstance(invoice_date, (datetime.date, datetime.datetime)):
        output_data['date'] = invoice_date.strftime('%Y-%m-%d')

    # then print the formatted JSON data output; default=str covers any
    # other value json cannot encode natively.
    print(json.dumps(output_data, indent=2, default=str))
"""Extract a single marketplace invoice with the templates in ./datasets."""
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import os

# Templates are loaded from the same folder that holds the sample invoice.
templates = read_templates('./datasets')
result = extract_data(
    'datasets/MktPlace-Myntra.pdf',
    templates=templates,
)
"""Extract one sample invoice with the invoiceX templates and print the result."""
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import sys

filename = 'C://Users//Shahrukh//Desktop//document_ai//invoiceX//pdf_input//sample_pg_6.pdf'
#filename = sys.argv[1]

templates = read_templates(
    'C://Users//Shahrukh//Desktop//document_ai//invoiceX//templates'
)
print(templates)

result = extract_data(filename, templates=templates)
#print("\n")
print(result)
#print("Working inside the jojo code")

# Preprocessing: re-arrange and re-format the extracted output (disabled)
#   date = result['date'].strftime('%d, %b %Y')
#   total = result['total']
#   invoice_number = result['invoice_number']
#   addr_from = result['From_Address']
#   addr_to = result['To_Address']
def parse_pdf(pdf_path):
    """Extract ``pdf_path`` with the templates in peco_assistant/data/templates."""
    return extract_data(
        pdf_path,
        templates=read_templates('peco_assistant/data/templates'),
    )
def setUp(self):
    """Load the default templates before each test."""
    loaded_templates = read_templates()
    self.templates = loaded_templates
def read_templates(self):
    """Return the templates loaded from ``settings.TEMPLATE_DIR``."""
    template_dir = settings.TEMPLATE_DIR
    return read_templates(template_dir)
def extract_data(invoicefile, templates=None, input_module="png", cmdlist=None, conv_cmdlist=None, tid=None):
    """Extract structured data and quality statistics from an invoice.

    A template is selected either by ``tid`` (first template whose ``tid``
    option contains it) or, failing that, by matching the extracted text.
    The selected template may override the OCR command line (``psm`` option)
    and the image conversion command (``imgcmd`` option).

    Parameters
    ----------
    invoicefile : str
        Path of the invoice file.
    templates : list, optional
        Templates to consider; loaded with ``read_templates()`` when None.
    input_module : str, optional
        Key into ``input_mapping`` selecting the text extraction module.
    cmdlist : list, optional
        Command line passed to the input module's ``to_text``.
    conv_cmdlist : list, optional
        Conversion command passed to the input module's ``to_text``.
    tid : str, optional
        Template id; compared with ``str()`` of each template ``tid`` option.

    Returns
    -------
    tuple or False
        ``(output, missed, corrected, issue_lines, qtyerr, noofitem)``.
        The statistics keep their defaults (``-1``, ``-1``, ``[]``, ``""``,
        ``-1``) unless the template has a ``decimal`` option, in which case
        they come from ``post_process``. ``output`` is ``[]`` when no
        template applies. ``False`` is returned when any exception occurs.

    Notes
    -----
    Every successful path returns the 6-tuple. Previously a template found
    by text matching returned the bare extraction result, which tuple
    unpacking callers cannot consume.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    try:
        if templates is None:
            templates = read_templates()
        input_module = input_mapping[input_module]
        logger.error("Input tid is %s and Input module is %s", tid, input_module)

        # First choice: the template explicitly designated by tid.
        selected = None
        for candidate in templates:
            if "tid" not in candidate.options:
                continue
            if any(str(option) == tid for option in candidate.options["tid"]):
                selected = candidate
                logger.error(f'Template found based on tid {selected.options["tid"]} {selected["issuer"]}')
                break

        if selected is not None and "psm" in selected.options:
            psm = str(selected.options["psm"])
            logger.error("PSM is %s", psm)
            if psm == "3":
                cmdlist = copy.deepcopy(cmdlist_psm3)
            else:
                # Patch the copy, not the shared module-level list, so that
                # concurrent or later calls are not affected.
                cmdlist = copy.deepcopy(cmdlist_psm6)
                cmdlist[6] = psm
        if selected is not None and "imgcmd" in selected.options:
            logger.error("imgcmd is %s", selected.options["imgcmd"])
            conv_cmdlist = selected.options["imgcmd"]

        extracted_str = input_module.to_text(
            invoicefile, cmdlist=cmdlist, conv_cmdlist=conv_cmdlist
        ).decode("utf-8")
        logger.debug("START pdftotext result ===========================")
        logger.error(extracted_str)
        logger.debug("END pdftotext result =============================")
        logger.debug("Testing {} template files".format(len(templates)))

        # Defaults reported when no post-processing takes place.
        missed = -1
        corrected = -1
        issue_lines = []
        qtyerr = ""
        noofitem = -1
        output = []

        if selected is None:
            # Second choice: the first template that matches the text.
            for candidate in templates:
                optimized_str = candidate.prepare_input(extracted_str)
                if candidate.matches_input(optimized_str):
                    selected = candidate
                    break
            else:
                logger.error("No template for %s", invoicefile)
                return output, missed, corrected, issue_lines, qtyerr, noofitem
        else:
            optimized_str = selected.prepare_input(extracted_str)

        output = selected.extract(optimized_str)
        if "decimal" in selected.options:
            missed, corrected, issue_lines, qtyerr, noofitem = post_process(
                output, selected.options)
        return output, missed, corrected, issue_lines, qtyerr, noofitem
    except Exception as ex:
        logger.error("Exception occured in invoice conversion " + str(ex))
        return False
"""Extract one sample document with the docrecog templates."""
import os
import invoice2data as ntd
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates

file_name = 'NT_01.pdf'
temp_name = 'pdf2inv.py'
file_path = os.path.join(
    r'C:\Users\fkhalil\primeStone\docrecog\sampleDocs\EURO DIESEL\B1959500485',
    file_name)
temp_path = os.path.join(r'C:\Users\fkhalil\primeStone\docrecog\templates',
                         temp_name)
print(file_path, temp_path)

# Templates are loaded from a folder. temp_path names a single file, so fall
# back to the folder that contains it unless it really is a directory.
templates_dir = temp_path if os.path.isdir(temp_path) else os.path.dirname(temp_path)
templates = read_templates(templates_dir)
result = extract_data(file_path, templates=templates)
"""Extract test.pdf with the built-in invoice2data templates."""
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates

# The invoice itself used to be passed as the templates folder, which cannot
# provide any template. Load the built-in templates instead.
templates = read_templates()
result = extract_data('test.pdf', templates=templates)

# Scratch example, kept for reference:
#   st = "Total: 4.00 4,123.00"
#   result = st.split()
#   result.pop(1)
#   print(" ".join(result))
def extract_data(invoicefile, templates=None, input_module=pdftotext, with_extracted_str=False, **kwargs):
    """Extract structured data from a PDF/image invoice.

    The invoice is converted to text by ``input_module`` and the text is
    handed to ``data_from_template`` together with the templates.

    Parameters
    ----------
    invoicefile : str
        Path of the invoice file (example: "/home/duskybomb/pdf/invoice.pdf").
    templates : list, optional
        Templates to use; loaded with ``read_templates()`` when None.
    input_module : module, optional
        Object exposing ``to_text(path, **kwargs)`` that returns UTF-8
        encoded bytes. Defaults to ``pdftotext``.
    with_extracted_str : bool, optional
        When true, the extracted text is returned alongside the data.
    **kwargs
        Forwarded unchanged to ``input_module.to_text``.

    Returns
    -------
    dict or False, or a tuple
        The extracted data, or False when nothing was extracted. With
        ``with_extracted_str`` the return value is ``(data_or_False,
        extracted_str)``.

    Examples
    --------
    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087', 'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}
    """
    if templates is None:
        templates = read_templates()

    extracted_str = input_module.to_text(invoicefile, **kwargs).decode('utf-8')
    # Normalize non-breaking spaces and stray "\xc2" characters. decode()
    # always yields text, so no exception handling is needed around this.
    extracted_str = extracted_str.replace(u'\xa0', u' ').replace(u"\xc2", " ")

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')
    logger.debug('Testing {} template files'.format(len(templates)))

    data = data_from_template(templates, extracted_str)
    try:
        if data:
            if with_extracted_str:
                return data, extracted_str
            return data
    except ValueError:
        # NOTE(review): only the truth test of ``data`` can raise here --
        # confirm whether data_from_template was meant to be guarded too.
        logger.warning("Exception in parsing", exc_info=True)

    logger.warning('No template for %s', invoicefile)
    if with_extracted_str:
        return False, extracted_str
    return False