def main(args=None):
    """Analyze every input file and write the collected results as JSON.

    Args:
        args: Parsed command-line arguments. When ``None`` they are obtained
            from ``create_parser().parse_args()``. The attributes read here
            are ``debug``, ``template_folder``, ``input_files``,
            ``output_name`` and ``output_date_format``.
    """
    # Extract Args
    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    # Configure Debug
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Load Templates
    templates = read_templates()

    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Extracting data
    output = []
    for f in args.input_files:
        try:
            input_module = select_input_module(f.name)
            res = extract_data(f.name, templates=templates, input_module=input_module)
            # extract_data returns False when no template matches, so the
            # file name may only be attached once a result is known to exist.
            if res:
                res["file_name"] = f.name
                logger.info(res)
                output.append(res)
        finally:
            # Release the handle even when extraction raises.
            f.close()

    # Writing Output
    # TODO: fix output file
    to_json.write_to_file(output, args.output_name, args.output_date_format)
def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Extract structured data from a PDF/image invoice.

    The text of the invoice is obtained through ``input_module.to_text`` and
    then offered to each template in turn; the first template whose keywords
    match performs the field extraction.

    Args:
        invoicefile (`Str`): path of the invoice file
            (example: "/home/duskybomb/pdf/invoice.pdf")
        templates (`list`, optional, default: `None`): templates to try.
            When `None` they are loaded with `read_templates()`.
        input_module (optional, default: `pdftotext`): object providing
            ``to_text(path)``; the returned bytes are decoded as UTF-8.

    Returns:
        The value returned by the matching template's ``extract_info``, or
        ``False`` if no template matches.
    """
    if templates is None:
        templates = read_templates()

    # Text extraction: the input module hands back bytes.
    raw_output = input_module.to_text(invoicefile)
    extracted_str = raw_output.decode("utf-8")

    # Dump the extracted text for troubleshooting.
    logger.debug("START pdftotext result ===========================")
    logger.debug(extracted_str)
    logger.debug("END pdftotext result =============================")

    template_count = len(templates)
    logger.debug("Testing {} template files".format(template_count))

    # First template whose keywords match wins.
    for candidate in templates:
        prepared = candidate.prepare_input(extracted_str)
        if not candidate.match_keywords(prepared):
            continue
        return candidate.extract_info(prepared)

    logger.error("No template for %s", invoicefile)
    return False
def _split_pdf_pages(pdf, page_count, invoicefile):
    """Write each page of `pdf` to ``<name>_<index>.pdf`` beside `invoicefile`."""
    pdfdirectory = os.path.dirname(invoicefile)
    pdfname = os.path.basename(invoicefile)
    base_name = pdfname.replace('.pdf', '').replace('.PDF', '')
    for i in range(page_count):
        output = PdfFileWriter()
        output.addPage(pdf.getPage(i))
        objectfile = join(pdfdirectory, base_name + '_' + str(i) + '.pdf')
        with open(objectfile, "wb") as outputStream:
            output.write(outputStream)


def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Extracts structured data from PDF/image invoices.

    The text returned by ``input_module.to_text`` is matched against the
    templates. If the default ``pdftotext`` module yields fewer than 20
    characters, the text is re-read with ``tesseract``. When no template
    produces a result and tesseract has not been used yet, the whole matching
    pass is repeated on the tesseract text.

    A multi-page PDF matching a template whose ``fields`` contain
    ``multiple_page: 'True'`` is split into one file per page, the original
    file is deleted, and the marker string ``'pdf seperated'`` is returned so
    the caller can pick the pages up in a later run.

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG
        (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list, optional
        Templates to try. Loaded with `read_templates()` when `None`.
    input_module : optional
        Object providing ``to_text(path)`` returning UTF-8 encoded bytes.
        Defaults to `pdftotext`.

    Returns
    -------
    dict, str or False
        The result of the matching template's ``extract``; the string
        ``'pdf seperated'`` when the file was split into single pages;
        ``False`` if no template matches.

    Notes
    -----
    Import required `input_module` when using invoice2data as a library

    Examples
    --------
    When using `invoice2data` as an library

    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    extracted_str = input_module.to_text(invoicefile).decode('utf-8')

    # Almost no text from pdftotext: fall back to tesseract right away.
    tried_tesseract = False
    if len(extracted_str) < 20 and input_module == pdftotext:
        extracted_str = tesseract.to_text(invoicefile).decode('utf-8')
        tried_tesseract = True

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))

    # Get the page count of invoicefile. Failures (e.g. the input is an
    # image, not a PDF) are logged and page handling is simply skipped.
    pdf = None
    page_count = None
    pdf_stream = None
    try:
        pdf_stream = open(invoicefile, 'rb')
        pdf = PdfFileReader(pdf_stream)
        page_count = pdf.getNumPages()
    except Exception as e:
        logger.error(str(e))

    try:
        for t in templates:
            optimized_str = t.prepare_input(extracted_str)

            if t.matches_input(optimized_str):
                if page_count is not None and page_count > 1:
                    # Multi-page input: split only when the template asks for it.
                    split_requested = any(
                        k == 'multiple_page' and v == 'True'
                        for k, v in t['fields'].items())
                    if split_requested:
                        try:
                            _split_pdf_pages(pdf, page_count, invoicefile)
                            # Release our own handle before deleting the
                            # source file; the pages are already written.
                            pdf_stream.close()
                            os.remove(invoicefile)
                            logger.warning(
                                'Seperate pdf into multiple files, process in next scanning loop'
                            )
                            return 'pdf seperated'
                        except Exception as e:
                            # Best effort: fall through to normal extraction.
                            logger.error(str(e))

                ret = t.extract(optimized_str, invoicefile)
                if ret is not None and ret is not False:
                    return ret
    finally:
        # The reader loads pages from the stream on demand, so the handle is
        # kept until page handling is over and always released here.
        if pdf_stream is not None:
            pdf_stream.close()

    if not tried_tesseract:
        logger.debug(
            'No template matched, now try tesseract ==========================='
        )
        tried_tesseract = True
        extracted_str2 = tesseract.to_text(invoicefile).decode('utf-8')
        logger.debug('START tesseract result ===========================')
        logger.debug(extracted_str2)
        logger.debug('END tesseract result =============================')
        logger.debug('Testing {} template files'.format(len(templates)))
        for t in templates:
            optimized_str2 = t.prepare_input(extracted_str2)
            if t.matches_input(optimized_str2):
                tesseract_result = t.extract(optimized_str2, invoicefile)
                if tesseract_result is None:
                    # tesseract found the right template but did not match
                    # all required fields, so use this template with the
                    # originally extracted string instead.
                    optimized_str = t.prepare_input(extracted_str)
                    return t.extract(optimized_str, invoicefile)
                else:
                    return tesseract_result

    logger.error('No template for %s', invoicefile)
    return False
def _move_pdf_to(pdfpath, target_dir):
    """Move `pdfpath` into `target_dir`, creating the directory if missing."""
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    shutil.move(pdfpath, join(target_dir, os.path.basename(pdfpath)))


def main2(args=None):
    """Take folder or single file and analyze each (mapping-based variant).

    Parameters
    ----------
    args : mapping, optional
        Options read by key: ``output_format``, ``input_files`` (paths),
        ``output_name``, ``dbpass``, and optionally ``template_folder``.
        When ``dbpass`` is not ``None`` the keys ``dbhost``, ``dbuser``,
        ``dbname``, ``azure_account``, ``azure_key``, ``pdf_path``,
        ``pdf_succeed``, ``pdf_moved_failed`` and ``pdf_failed`` are read as
        well. ``output_date_format`` is always overwritten with
        ``'%Y-%m-%d'``. When ``None``, the command-line arguments are parsed
        and converted to a dict.

    Returns
    -------
    The extraction result of the last processed input file, or ``None`` when
    there were no input files.
    """
    # Needed on both the success path and its fallback handler.
    from datetime import date

    if args is None:
        parser = create_parser()
        # parse_args() returns a Namespace, which cannot be subscripted;
        # everything below accesses args by key.
        args = vars(parser.parse_args())

    args['output_date_format'] = '%Y-%m-%d'
    input_module = input_mapping['pdftotext']
    output_module = output_mapping[args['output_format']]

    templates = []
    # Load templates from the external folder if set, otherwise the built-in
    # ones. A key that is present but None (argparse default) counts as unset.
    if args.get('template_folder') is not None:
        templates += read_templates(os.path.abspath(args['template_folder']))
    else:
        templates += read_templates()

    output = []
    res = None  # stays None when there is nothing to process
    for fs in args['input_files']:
        # Opening still validates that the path is readable, but the handle
        # is released at once: the file may be deleted or moved below.
        with open(fs, 'r') as f:
            pdfpath = f.name

        res = extract_data(pdfpath, templates=templates, input_module=input_module)
        if res == 'pdf seperated':
            # The file was split into single pages and removed.
            continue

        db_status = None
        if res:
            logger.info(res)
            output.append(res)
            if args['dbpass'] is not None:
                db_status = output_module.write_to_db(
                    res, pdfpath, args['output_date_format'], args['dbhost'],
                    args['dbuser'], args['dbpass'], args['dbname'],
                    args['azure_account'], args['azure_key'], args['pdf_path'])

        if args['dbpass'] is not None:
            # Move the source pdf according to the database outcome.
            pdfdirectory = os.path.dirname(pdfpath)
            pdfname = os.path.basename(pdfpath)

            if db_status == 'succeed':
                if args['pdf_succeed']:
                    succeed_path = args['pdf_succeed']
                else:
                    # Public "successful" folder next to the source folder.
                    succeed_path = os.path.abspath(
                        os.path.join(pdfdirectory, os.pardir))
                    succeed_path = join(succeed_path, 'successful')
                # NOTE(review): the original formatting was lost; the dated
                # sub-folder is assumed to apply to configured folders too.
                succeed_path = join(succeed_path,
                                    date.today().strftime('%d-%m-%Y'))
                try:
                    _move_pdf_to(pdfpath, succeed_path)
                except Exception:
                    logger.exception('Failed to move %s to %s', pdfpath,
                                     succeed_path)
                    if args['pdf_moved_failed']:
                        fallback_path = args['pdf_moved_failed']
                    else:
                        fallback_path = join(pdfdirectory, 'failedToMove')
                    # NOTE(review): same assumption about the dated folder.
                    fallback_path = join(fallback_path,
                                         date.today().strftime('%d-%m-%Y'))
                    _move_pdf_to(pdfpath, fallback_path)
            elif db_status == 'link db failed':
                # Leave the file in place so it can be retried.
                pass
            elif db_status == 'exists':
                print('data already exists in edms: ' + pdfname)
                os.remove(pdfpath)
            else:
                if args['pdf_failed']:
                    failed_path = args['pdf_failed']
                else:
                    father_path = os.path.abspath(
                        os.path.join(pdfdirectory, os.pardir))
                    failed_path = join(father_path, 'failed')
                _move_pdf_to(pdfpath, failed_path)

    # Database output was already written inside the loop.
    if output_module is not None and args['dbpass'] is None:
        logger.warning(output)
        output_module.write_to_file(output, args['output_name'],
                                    args['output_date_format'])
    return res
def main(args=None):
    """Take folder or single file and analyze each.

    Parameters
    ----------
    args : optional
        Parsed command-line arguments; obtained from
        ``create_parser().parse_args()`` when ``None``. Attributes read:
        ``debug``, ``input_reader``, ``output_format``, ``template_folder``,
        ``exclude_built_in_templates``, ``input_files``, ``copy``, ``move``,
        ``filename``, ``output_name``, ``output_date_format``, ``dbpass``
        and, when ``dbpass`` is set, ``dbhost``, ``dbuser``, ``dbname``,
        ``azure_account``, ``azure_key`` and ``pdf_path``.
    """
    from datetime import date

    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    input_module = input_mapping[args.input_reader]
    output_module = output_mapping[args.output_format]

    templates = []
    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates()

    output = []
    for f in args.input_files:
        try:
            res = extract_data(f.name, templates=templates, input_module=input_module)
            if res == 'pdf seperated':
                # File was split into single pages; the handle is still
                # released by the finally clause below.
                continue

            # Named db_status rather than `re` to avoid shadowing the
            # standard regex module name.
            db_status = None
            if res:
                logger.info(res)
                output.append(res)
                if args.dbpass is not None:
                    db_status = output_module.write_to_db(
                        res, f.name, args.output_date_format, args.dbhost,
                        args.dbuser, args.dbpass, args.dbname,
                        args.azure_account, args.azure_key, args.pdf_path)
                if args.copy or args.move:
                    filename = args.filename.format(
                        date=res['date'].strftime('%Y-%m-%d'),
                        invoice_number=res['invoice_number'],
                        desc=res['desc'],
                    )
                    if args.copy:
                        shutil.copyfile(f.name, join(args.copy, filename))
                    if args.move:
                        shutil.move(f.name, join(args.move, filename))
        finally:
            f.close()

        if args.dbpass is not None:
            # Move the source pdf according to the database outcome.
            pdfdirectory = os.path.dirname(f.name)
            pdfpath = f.name
            pdfname = os.path.basename(f.name)

            if db_status == 'succeed':
                # Public "successful" folder next to the source folder, with
                # one sub-folder per day.
                succeed_path = os.path.abspath(
                    os.path.join(pdfdirectory, os.pardir))
                succeed_path = join(succeed_path, 'successful')
                succeed_path = join(succeed_path,
                                    date.today().strftime('%d-%m-%Y'))
                if not os.path.exists(succeed_path):
                    os.makedirs(succeed_path)
                shutil.move(pdfpath, join(succeed_path, pdfname))
            elif db_status == 'link db failed':
                # Leave the file in place so it can be retried.
                pass
            elif db_status == 'exists':
                os.remove(pdfpath)
            else:
                failed_path = join(pdfdirectory, 'failed')
                if not os.path.exists(failed_path):
                    os.makedirs(failed_path)
                shutil.move(pdfpath, join(failed_path, pdfname))

    # Database output was already written inside the loop.
    if output_module is not None and args.dbpass is None:
        output_module.write_to_file(output, args.output_name,
                                    args.output_date_format)
def _extract_with_templates(text, templates):
    """Return ``(True, result)`` from the first template matching `text`, else ``(False, None)``."""
    logger.debug("Testing {} template files".format(len(templates)))
    for t in templates:
        optimized_str = t.prepare_input(text)
        if t.matches_input(optimized_str):
            return True, t.extract(optimized_str)
    return False, None


def extract_data(invoicefile, templates=None, input_module_lang="deu"):
    """Extracts structured data from PDF/image invoices.

    The text is first read with ``pdftotext``. If that yields only
    whitespace, ``tesseract`` is used instead. The text is then matched
    against the templates; if nothing matches and tesseract has not been
    used yet, the file is re-read with tesseract and matched once more.

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG
        (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list, optional
        Templates to try. Loaded with `read_templates()` when `None`.
    input_module_lang : str, optional
        Value passed as second argument to both ``pdftotext.to_text`` and
        ``tesseract.to_text`` (default ``"deu"``; presumably a language
        code understood by the readers - confirm against those modules).

    Returns
    -------
    The value returned by the first matching template's ``extract``, or
    ``False`` if no template matches.

    Examples
    --------
    When using `invoice2data` as an library

    >>> extract_data("invoice2data/test/pdfs/oyo.pdf")
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    tesseracted = False
    extracted_str = pdftotext.to_text(invoicefile, input_module_lang).decode("utf-8")
    if extracted_str.strip() == "":
        logger.debug("No extractable text, running OCR...")
        extracted_str = tesseract.to_text(invoicefile, input_module_lang).decode("utf-8")
        tesseracted = True

    logger.debug("START text result ===========================")
    logger.debug(extracted_str)
    logger.debug("END text result =============================")

    matched, result = _extract_with_templates(extracted_str, templates)
    if matched:
        return result

    if tesseracted:
        # OCR text was already what we matched against; nothing left to try.
        logger.error("No template for %s", invoicefile)
        return False

    logger.debug("No template match! Re-reading...")
    extracted_str = tesseract.to_text(invoicefile, input_module_lang).decode("utf-8")

    logger.debug("START tesseract result ===========================")
    logger.debug(extracted_str)
    logger.debug("END tesseract result =============================")

    matched, result = _extract_with_templates(extracted_str, templates)
    if matched:
        return result

    logger.error("No template for %s", invoicefile)
    return False
def main(args=None):
    """Take folder or single file and analyze each.

    Every input file is passed to ``extract_data``. Files with a result are
    optionally copied and/or moved under a name built from
    ``args.filename`` (and anonymized when ``args.anonymize`` is set), and
    all results are finally written through the selected output module.

    Parameters
    ----------
    args : optional
        Parsed command-line arguments; obtained from
        ``create_parser().parse_args()`` when ``None``.
    """
    if args is None:
        args = create_parser().parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    # Looked up as in the original even though only the language is passed on.
    input_module = input_mapping[args.input_reader]
    input_module_lang = args.input_reader_lang or "deu"
    output_module = output_mapping[args.output_format]

    # External templates first (if a folder is set), then the built-in ones
    # unless they are disabled.
    templates = []
    if args.template_folder:
        templates.extend(read_templates(os.path.abspath(args.template_folder)))
    if not args.exclude_built_in_templates:
        templates.extend(read_templates())

    def _target_name(fields):
        # File name derived from the extracted invoice fields.
        return args.filename.format(
            date=fields["date"].strftime("%Y-%m-%d"),
            invoice_number=makeFilename(fields["invoice_number"]),
            desc=makeFilename(fields["desc"]),
        )

    output = []
    for f in args.input_files:
        res = extract_data(f.name, templates=templates, input_module_lang=input_module_lang)
        if res:
            logger.info(res)
            output.append(res)

            if args.copy:
                copied_to = join(args.copy, _target_name(res))
                shutil.copyfile(f.name, copied_to)
                if args.anonymize:
                    from anonymize import anonymize
                    anonymize(copied_to, args.anonymize)

            if args.move:
                moved_to = join(args.move, _target_name(res))
                shutil.move(f.name, moved_to)
                if args.anonymize:
                    from anonymize import anonymize
                    anonymize(moved_to, args.anonymize)
        f.close()

    if output_module is not None:
        output_module.write_to_file(output, args.output_name, args.output_date_format)