Ejemplo n.º 1
0
 def invoice2data_parse_invoice(self, file_data):
     """Run the invoice2data library on the content of a PDF invoice.

     The content is first written to a temporary file because
     ``extract_data`` is called with a file name.  Templates are read from
     the directory named by the ``invoice2data_templates_dir`` config option
     (when it is set and is a directory), followed by the built-in ones
     unless ``invoice2data_exclude_built_in_templates`` is set.

     :param file_data: PDF content, in the form accepted by ``os.write``
     :raises UserError: when ``extract_data`` raises any exception
     """
     logger.info('Trying to analyze PDF invoice with invoice2data lib')
     # NOTE(review): the temporary file is not removed in the visible part
     # of this method -- confirm it is cleaned up further down.
     fd, file_name = mkstemp()
     try:
         os.write(fd, file_data)
     finally:
         os.close(fd)
     # Transfer log level of Odoo to invoice2data
     loggeri2data.setLevel(logger.getEffectiveLevel())
     local_templates_dir = tools.config.get('invoice2data_templates_dir',
                                            False)
     logger.debug('invoice2data local_templates_dir=%s',
                  local_templates_dir)
     templates = []
     if local_templates_dir and os.path.isdir(local_templates_dir):
         templates += read_templates(local_templates_dir)
     exclude_built_in_templates = tools.config.get(
         'invoice2data_exclude_built_in_templates', False)
     if not exclude_built_in_templates:
         templates += read_templates()
     logger.debug('Calling invoice2data.extract_data with templates=%s',
                  templates)
     try:
         invoice2data_res = extract_data(file_name, templates=templates)
     # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
     # Python-2-only "except X, e" form.
     except Exception as e:
         raise UserError(
             _("PDF Invoice parsing failed. Error message: %s") % e)
Ejemplo n.º 2
0
def main(args=None):
    """Take folder or single file and analyze each.

    Parameters
    ----------
    args : parsed-arguments object, optional
        When None, arguments are parsed from the command line with the
        parser returned by ``create_parser()``.

    Notes
    -----
    Every input file is passed to ``extract_data``.  Non-empty results are
    logged and collected; when ``args.copy`` / ``args.move`` are set the
    source file is copied / moved into that directory under a name built
    from ``args.filename``.  The collected results are handed to
    ``generate_output`` and the process exits with the ``missed`` value of
    the last processed file (0 when there were no input files).
    """
    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Both command lists are optional "+"-separated strings.  Defaulting to
    # None keeps ``cmdlist`` bound even when the option was not given.
    cmdlist = args.cmdlist.split("+") if args.cmdlist else None
    imgcmd = args.imgcmd.split("+") if args.imgcmd else None

    templates = []
    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates()
    output = []
    # Exit status used when the loop below never runs.
    missed = 0

    for f in args.input_files:
        try:
            res, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(
                f.name,
                templates=templates,
                input_module=args.input_reader,
                cmdlist=cmdlist,
                conv_cmdlist=imgcmd,
                tid=args.tid,
            )
            if res:
                logger.info(res)
                output.append(res)
                if args.copy or args.move:
                    # The date is inserted exactly as returned by the
                    # extraction (no strftime formatting).
                    filename = args.filename.format(
                        date=res["date"],
                        invoice_number=res["invoice_number"],
                        desc=res["desc"],
                    )
                    if args.copy:
                        shutil.copyfile(f.name, join(args.copy, filename))
                    if args.move:
                        shutil.move(f.name, join(args.move, filename))
        finally:
            # Release the input handle even if extraction or the file
            # transfer raised.
            f.close()

    generate_output(output, output_name=args.output_name, output_date_format=args.output_date_format, output_module=args.output_format)

    sys.exit(missed)
Ejemplo n.º 3
0
def main(args=None):
    """Analyze every input file and write the collected results.

    When ``args`` is None the arguments are parsed from the command line
    using ``create_parser()``.  Each input file is run through
    ``extract_data``; truthy results are logged, collected and optionally
    copied and/or moved under a name derived from ``args.filename``.  The
    collected results are finally written by the selected output module, if
    there is one.
    """
    if args is None:
        args = create_parser().parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    input_module = input_mapping[args.input_reader]
    output_module = output_mapping[args.output_format]

    # External templates (if a folder was given) come before built-in ones.
    templates = []
    if args.template_folder:
        templates.extend(read_templates(os.path.abspath(args.template_folder)))
    if not args.exclude_built_in_templates:
        templates.extend(read_templates())

    output = []
    for invoice in args.input_files:
        parsed = extract_data(invoice.name,
                              templates=templates,
                              input_module=input_module)
        if parsed:
            logger.info(parsed)
            output.append(parsed)
            # Copy first, then move, each only when its target is set.
            transfers = ((args.copy, shutil.copyfile), (args.move, shutil.move))
            for target_dir, transfer in transfers:
                if not target_dir:
                    continue
                filename = args.filename.format(
                    date=parsed['date'].strftime('%Y-%m-%d'),
                    invoice_number=parsed['invoice_number'],
                    desc=parsed['desc'],
                )
                transfer(invoice.name, join(target_dir, filename))
        invoice.close()

    if output_module is None:
        return
    output_module.write_to_file(output, args.output_name,
                                args.output_date_format)
Ejemplo n.º 4
0
def read_file(filename, debug):
    """Extract one invoice from ``INPUT/`` and write it as JSON to ``OUTPUT/``.

    Parameters
    ----------
    filename : str
        Name of the invoice file inside the ``INPUT/`` directory.
    debug : bool
        When exactly ``True``, the PDF text is printed first to help with
        debugging / template creation.

    Notes
    -----
    On success the result is written to ``OUTPUT/<stem>.json``; when the
    written JSON has no ``date_due`` key, one is added using
    ``helper_functions.add_month`` applied to the invoice date.  When
    extraction returns ``False`` the file name is passed to
    ``helper_functions.append_error`` instead.
    """
    input_path = 'INPUT/' + filename

    # If debug is active, get PDF as string for debugging/template creation
    if debug is True:
        with open(input_path, 'rb') as f:
            pdf = pdftotext.PDF(f)
        print('\n\n'.join(pdf))

    templates = read_templates('TEMPLATES/')
    result = extract_data(input_path, templates, pdftotextdef)

    # Extraction failed: add file name to error list and move on.
    if result is False:
        helper_functions.append_error(filename)
        return

    output_path = 'OUTPUT/' + os.path.splitext(filename)[0] + '.json'
    to_json.write_to_file(result, output_path, '%Y-%m-%d')

    # Add a due date when the JSON does not already carry one.
    with open(output_path, 'r+') as json_file:
        data = json.load(json_file)
        if "date_due" not in data:
            invoice_date = datetime.strptime(data["date"], '%Y-%m-%d')
            data["date_due"] = helper_functions.add_month(
                invoice_date).strftime('%Y-%m-%d')
            json_file.seek(0)
            json.dump(data, json_file, indent=4, sort_keys=True)
            # The rewrite happens in place; drop any bytes of the previous
            # content that extend past the new document.
            json_file.truncate()
Ejemplo n.º 5
0
def extract_data(invoicefile, templates=None, input_module=pdftotext, debug=False):
    """Extract invoice fields using the first matching template.

    The text of ``invoicefile`` is obtained from ``input_module.to_text``.
    Templates are tried in order; when none matches, the first template
    whose ``template_name`` contains ``'default'`` is used as a fallback.

    Returns the value of the chosen template's ``extract`` call, or
    ``False`` when neither a match nor a fallback exists.  ``debug=True``
    configures root logging at DEBUG level; ``templates=None`` loads them
    with ``read_templates()``.
    """
    if debug:
        logging.basicConfig(level=logging.DEBUG)

    if templates is None:
        templates = read_templates()

    raw_text = input_module.to_text(invoicefile).decode('utf-8')

    logger.debug('START pdftotext result ===========================')
    logger.debug(raw_text)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(raw_text)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    # No template matched: look for a "default" template to fall back on.
    fallback = None
    for candidate in templates:
        if 'default' in candidate['template_name']:
            fallback = candidate
            break

    if fallback:
        logger.error("Falling back to default template.")
        fallback_input = fallback.prepare_input(raw_text)
        return fallback.extract(fallback_input)

    logger.error('No template for %s', invoicefile)
    return False
Ejemplo n.º 6
0
def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Extract structured data from an invoice file.

    The file is converted to text by ``input_module`` and the templates are
    tried in order; the first one whose ``matches_input`` accepts the
    prepared text performs the extraction.

    Parameters
    ----------
    invoicefile : str
        Path of the invoice file (example: "/home/duskybomb/pdf/invoice.pdf").
    templates : list, optional
        Template objects offering ``prepare_input``, ``matches_input`` and
        ``extract``.  Loaded with ``read_templates()`` when None.
    input_module : module, optional
        Object whose ``to_text(invoicefile)`` returns the invoice text as
        UTF-8 encoded bytes.  Defaults to ``pdftotext``.

    Returns
    -------
    object or False
        Whatever the matching template's ``extract`` returns, or False when
        no template matches.

    Examples
    --------
    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    text = input_module.to_text(invoicefile).decode("utf-8")

    logger.debug("START pdftotext result ===========================")
    logger.debug(text)
    logger.debug("END pdftotext result =============================")

    logger.debug("Testing {} template files".format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(text)
        if template.matches_input(prepared):
            break
    else:
        # Loop ran to completion without a match.
        logger.error("No template for %s", invoicefile)
        return False

    return template.extract(prepared)
Ejemplo n.º 7
0
def extract_multi(file_refrence, catagory):
    """Split a PDF, extract every resulting page and return the combined markup.

    ``pdf_splitter`` is called first; every ``*.pdf`` found in the
    processing folder is then run through ``extract_data`` and rendered
    with ``to_table``.  After ``remove_file()`` the rendered tables are
    returned followed by a "Total" heading built from the module-level
    ``total``, which is reset to 0 before returning.
    """
    global total
    pdf_splitter(file_refrence, catagory)

    page_paths = glob.glob(r"C:\Users\Shahrukh\Desktop\djangofilesupload\filesupload\pdf_processing\{}".format("*.pdf"))
    print(page_paths)

    templates = read_templates(r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates')

    tables = ''
    for page_path in page_paths:
        extracted = extract_data(page_path, templates=templates)
        tables = tables + to_table(extracted, page_path)

    remove_file()

    # Read ``total`` before resetting it for the next call.
    markup = '{result}<h3 style="float:right">Total: {total}</h3>'.format(result=tables, total=total)
    total = 0
    return markup
Ejemplo n.º 8
0
 def __init__(self, invDirectory, filename):
     """Extract ``invDirectory/filename`` and expose the fields as attributes.

     Every key of a truthy extraction result becomes an attribute of the
     instance.  Otherwise ``issuer`` is set to the string "null" and
     ``filename`` to the given file name.
     """
     templates = read_templates(cwd + '/tplf')
     pdfFile = invDirectory + '/' + filename
     result = extract_data(pdfFile, templates=templates)
     print(result)
     print(filename)

     if not result:
         # Three blank lines, then the placeholder attributes.
         for _ in range(3):
             print()
         setattr(self, "issuer", "null")
         setattr(self, "filename", filename)
         return

     for field, value in result.items():
         setattr(self, field, value)
Ejemplo n.º 9
0
def get_parsed_data(templates, extracted_str, partiallyExtracted):
    """Run the first matching template on already extracted text.

    ``templates`` falls back to ``read_templates()`` when None.  The
    templates are tried in order; the first whose ``matches_input`` accepts
    the prepared text is asked to ``extract`` it, with
    ``partiallyExtracted`` passed as second argument.  Returns that result,
    or False when no template matches.
    """
    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    if templates is None:
        templates = read_templates()

    logger.debug('Testing {} template files'.format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(extracted_str)
        if template.matches_input(prepared):
            break
    else:
        # No template accepted the text.
        return False

    return template.extract(prepared, partiallyExtracted)
Ejemplo n.º 10
0
def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Extract invoice data with the first template that matches.

    The invoice text comes from ``input_module.to_text`` (decoded as
    UTF-8).  ``templates`` defaults to ``read_templates()``.  Returns the
    matching template's ``extract`` result, or False when none matches.
    """
    if templates is None:
        templates = read_templates()

    invoice_text = input_module.to_text(invoicefile).decode('utf-8')

    logger.debug('START pdftotext result ===========================')
    logger.debug(invoice_text)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    for template in templates:
        prepared = template.prepare_input(invoice_text)
        if not template.matches_input(prepared):
            continue
        return template.extract(prepared)

    logger.error('No template for %s', invoicefile)
    return False
Ejemplo n.º 11
0
def thread_fn(file, dummy):
    """Extract one invoice and log a one-line quality report for it.

    Parameters
    ----------
    file : str
        File name appended to the module-level ``r``.
        NOTE(review): ``r`` is presumably a path prefix -- confirm.
    dummy
        Unused.

    Notes
    -----
    The report is logged in yellow and appended to ``issue_list`` when
    ``missed`` is non-zero or ``qtyerr`` is not "Match"; otherwise it is
    logged in green.  Nothing is returned.
    """
    cmdlist = ["tesseract", "-c", "tessedit_char_whitelist=/.: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"]
    templates = read_templates('./templates')
    result, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(
        r+file,
        input_module=INPUT_MODULE,
        templates=templates,
        cmdlist=cmdlist,
        conv_cmdlist=None,
        tid=TID,
    )

    report = f'=============================> missed: {missed} corrected: {corrected} line with issue: {len(issue_lines)} Qty issue: {qtyerr} Total Items: {noofitem}<============================'
    if missed != 0 or qtyerr != "Match":
        logger.error(CYELLOW + report + CEND)
        issue_list.append(file+"\t"+report)
    else:
        logger.error(CGREEN + report + CEND)
Ejemplo n.º 12
0
def extract_invoice_details(filename):
    """Extract an uploaded invoice and store the result as a JSON file.

    Parameters
    ----------
    filename : str
        Name of the uploaded file inside ``input/uploads/``.  An empty
        string makes the function a no-op.

    Notes
    -----
    Files whose extension is not ``pdf`` are first passed to
    ``MakingSearchablePDFs.convert_image_to_searchable_pdf``.  The
    extraction result is serialised with ``json.dumps(..., default=str)``
    and written below ``output/processed/`` on success or ``output/failed/``
    when extraction returned ``False`` (the file then contains ``false``).
    """
    if filename != '':
        # Anything that is not a PDF is treated as an image and converted
        # first; PDFs are used as they are.
        if filename.split('.')[-1] != 'pdf':
            filename = MakingSearchablePDFs.convert_image_to_searchable_pdf(
                filename)

        # YAML Template System
        source = 'input/uploads/' + filename
        templates = read_templates('Templates/')
        result = extract_data(source, templates=templates)
        print('\n', type(result))
        print('\n', result)

        succeeded = result is not False
        # default=str makes non-JSON values serialisable as strings.
        json_data = json.dumps(result,
                               indent=4,
                               sort_keys=True,
                               default=str)
        if succeeded:
            destination = 'output/processed/' + filename
            print(type(json_data), json_data)
        else:
            destination = 'output/failed/' + filename
            print('Failed for Processing of Invoice!!!')

        # NOTE(review): ``name_of_image`` is not defined in this function;
        # it is assumed to be a module-level name -- confirm.
        # Write the serialised text: the raw result is a dict or False,
        # which ``write`` rejects.
        with open(destination + name_of_image + '_json.json', 'w') as json_file:
            json_file.write(json_data)
Ejemplo n.º 13
0
import os
from pathlib import Path
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import tesseract4
import xlsxwriter
from my_logger import logger

# Rename the shared logger so records from this script are tagged "FX".
logger.name = "FX"
# All working paths are resolved relative to this script's directory.
work_dir_path = Path(__file__).parent / 'work_dir'
invoice_folder = work_dir_path / "original"
# Templates are loaded once, at import time, from work_dir/templates.
templates = read_templates(work_dir_path / "templates")

# file_name = "invoice-test.pdf"
#file_name = "57001194492019.tiff"
#file_path = work_dir_path / "original" / file_name
#result = extract_data(str(file_path), templates=templates, input_module=tesseract4)
#print(result)


#Excel setting
def no_change(val):
    """Return ``val`` unchanged (identity converter)."""
    return val


def to_number(val):
    """Convert a string with comma thousands separators to a float."""
    # Dropping every comma leaves a plain literal that float() accepts.
    without_commas = "".join(val.split(','))
    return float(without_commas)


# Create the output workbook 'invoices.xlsx' and add one worksheet to it.
workbook = xlsxwriter.Workbook('invoices.xlsx')
worksheet = workbook.add_worksheet()
Ejemplo n.º 14
0
def extract(name):
    """Run ``extract_data`` on ``name`` with the project's template folder.

    Returns whatever ``extract_data`` returns.
    """
    return extract_data(
        name,
        templates=read_templates(r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates'),
    )
Ejemplo n.º 15
0
from flask import Flask, render_template, request
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import yaml
import glob
import errno
import os
import pymongo
#import PyPDF2

app = Flask(__name__)

# Templates are loaded once, at import time, from the "invoice_templates" folder.
read_template = read_templates(folder="invoice_templates")

# Module-level lists; the loop that follows appends PDF names to `filenames`.
pdfFiles = []
pdfFiles1 = []
pdfFiles2 = []
filenames = []
path_filename = []

#path = "/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files"
# MongoDB handles: local server, database "pdfinvdata", collection "invoicedata".
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["pdfinvdata"]
mycol = mydb["invoicedata"]

for filename in os.listdir(
        '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/'
):
    if filename.endswith(".pdf"):
        filenames.append(filename)
        pathname = '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/' + str(
Ejemplo n.º 16
0
#!/Library/Frameworks/Python.framework/Versions/3.7/bin/python3

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import sys

# Invoice file name, given as the first command-line argument and resolved
# against the admin directory below.
file = sys.argv[1]

filename = '/Applications/XAMPP/xamppfiles/htdocs/invoice-app/admin/' + file

templates = read_templates('/Users/krzemson/Desktop/aed/tpl')
result = extract_data(filename, templates=templates)

# default=str: extraction results can contain datetime values (e.g. the
# invoice date), which json cannot serialise on its own.
print(json.dumps(result, default=str))


# Importing all the required libraries

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import pdftotext
import pandas as pd

# Importing custom template
# Load the custom templates from the ./template/ folder
templates = read_templates('./template/')

#print(templates)

# Extract data from the PDF, using pdftotext for the text conversion
result = extract_data('./data/pnlsheet.pdf',
                      templates=templates,
                      input_module=pdftotext)

# Store the extracted data in a DataFrame
# NOTE(review): `result` is passed to DataFrame as-is; pandas rejects a dict
# of only scalar values without an index, and extract_data may return False
# -- confirm the expected shape of `result` for this template.
df = pd.DataFrame(data=result)

# Export the DataFrame to a CSV file
df.to_csv('./data/invoice2data_simple.csv')
'''
You can use any of the supported input libraries (pdftotext, pdfminer, tesseract) to extract the
text. The input_module argument is optional; pdftotext is used by default when it is not specified.

The custom template named temp.yml is placed in the template folder (./template/). You can remove
the templates parameter from extract_data(); the default templates will then be used.

'''
Ejemplo n.º 18
0
 def setUp(self):
     """Load the default templates and build an argument parser for each test."""
     self.templates = read_templates()
     self.parser = create_parser()
Ejemplo n.º 19
0
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import datetime
import argparse

if __name__ == '__main__':
    # Initialize the arguments parser
    parser = argparse.ArgumentParser(description="Extract data from invoices")

    # Add the parameters positional/optional
    parser.add_argument('-t','--templates_dirpath', help="Templates directory path", type=str)
    parser.add_argument('-i','--invoice_path', help="Invoice file path", type=str)

    # Parse the arguments
    args = parser.parse_args()

    templates = read_templates(args.templates_dirpath)

    output_data = extract_data(args.invoice_path, templates=templates)

    # A falsy result means nothing was extracted; stop with a message
    # instead of failing on the subscript below.
    if not output_data:
        raise SystemExit('No data could be extracted from %s' % args.invoice_path)

    # Render the invoice date as YYYY-MM-DD when it supports strftime.
    date_value = output_data.get('date')
    if hasattr(date_value, 'strftime'):
        output_data['date'] = date_value.strftime('%Y-%m-%d')

    # then print the formatted JSON data output; default=str covers any
    # remaining values json cannot serialise on its own.
    print(json.dumps(output_data, indent=2, default=str))
Ejemplo n.º 20
0
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import os

# Load the templates found in ./datasets and run them against one sample invoice.
templates = read_templates('./datasets')
result = extract_data('datasets/MktPlace-Myntra.pdf', templates=templates)
Ejemplo n.º 21
0
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import sys

# Hard-coded sample invoice; the commented-out line below would take the
# path from the command line instead.
filename = 'C://Users//Shahrukh//Desktop//document_ai//invoiceX//pdf_input//sample_pg_6.pdf'
#filename = sys.argv[1]
templates = read_templates(
    'C://Users//Shahrukh//Desktop//document_ai//invoiceX//templates')
# Show which templates were loaded.
print(templates)

# Extract the invoice and print the raw result.
result = extract_data(filename, templates=templates)
#print("\n")
print(result)
#print("Working inside the jojo code")

# Preprocessing: re-arrange and re-formating the extracted output
'''
date = result['date'].strftime('%d, %b %Y')
total = result['total']
invoice_number = result['invoice_number']
addr_from = result['From_Address']
addr_to = result['To_Address']
'''
Ejemplo n.º 22
0
def parse_pdf(pdf_path):
    """Extract ``pdf_path`` with the templates in ``peco_assistant/data/templates``.

    Returns whatever ``extract_data`` returns.
    """
    return extract_data(
        pdf_path,
        templates=read_templates('peco_assistant/data/templates'),
    )
Ejemplo n.º 23
0
 def setUp(self):
     """Load the default templates before each test."""
     self.templates = read_templates()
Ejemplo n.º 24
0
 def setUp(self):
     """Load the default templates and build an argument parser for each test."""
     self.templates = read_templates()
     self.parser = create_parser()
Ejemplo n.º 25
0
 def read_templates(self):
     """Return the templates loaded from ``settings.TEMPLATE_DIR``.

     The call inside the body resolves to the module-level
     ``read_templates`` function, not to this method.
     """
     return read_templates(settings.TEMPLATE_DIR)
Ejemplo n.º 26
0
def extract_data(invoicefile, templates=None, input_module="png", cmdlist=None, conv_cmdlist=None, tid=None):
    """Extracts structured data from PDF/image invoices.

    This function uses the text extracted from a PDF file or image and
    pre-defined regex templates to find structured data.  A template can be
    selected up front through ``tid``; otherwise the templates are matched
    against the extracted text.

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list of instances of class `InvoiceTemplate`, optional
        Loaded with `read_templates()` when None.
    input_module : str, optional
        Key looked up in `input_mapping` to obtain the module whose
        `to_text` converts `invoicefile` to text.  Defaults to "png".
    cmdlist : list, optional
        Passed to `to_text`.  Replaced by a copy of `cmdlist_psm3` or
        `cmdlist_psm6` when the tid-selected template has a "psm" option.
    conv_cmdlist : optional
        Passed to `to_text`.  Replaced by the tid-selected template's
        "imgcmd" option when present.
    tid : str, optional
        Template id.  It is compared with `str()` of each entry of a
        template's "tid" option, so a non-string value never matches.

    Returns
    -------
    tuple, object or False
        * tid-selected template: the 6-tuple ``(output, missed, corrected,
          issue_lines, qtyerr, noofitem)``; the last five keep their
          placeholder values (-1, -1, [], "", -1) unless the template has a
          "decimal" option, in which case they come from `post_process`.
        * template matched by content: the bare result of `t.extract`.
        * no template matched: the 6-tuple with ``output == []`` and the
          placeholder values.
        * any exception: False (the exception is logged, not raised).

    Notes
    -----
    NOTE(review): the return shapes above are inconsistent; a caller that
    unpacks six values fails on the content-matched and exception paths --
    confirm which shape callers rely on before changing it.

    """
    # Configures root logging to stdout at INFO level on every call.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    try:
        # `t` holds the template selected by tid, if any.
        t = None
        if templates is None:
            templates = read_templates()

        # Resolve the reader name to the module that provides to_text().
        input_module = input_mapping[input_module]
        
        logger.error("Input tid is %s and Input module is %s", tid, input_module)
        # Select the first template that lists `tid` among its "tid" options.
        for tt in templates:
            if t != None:
                break
            if "tid" in tt.options:
                for tid_option in tt.options["tid"]:
                    if str(tid_option) == tid:
                        t = tt
                        logger.error(f'Template found based on tid {t.options["tid"]} {t["issuer"]}')
                        
                
        # A tid-selected template may override the command list via "psm".
        if t != None and "psm" in t.options:
            logger.error("PSM is %d", t.options["psm"])
            if str(t.options["psm"]) == "3":
                cmdlist = copy.deepcopy(cmdlist_psm3)
            else:
                # NOTE(review): this writes into the shared cmdlist_psm6
                # list before copying it, so the change outlives this call.
                cmdlist_psm6[6] = str(t.options["psm"])
                cmdlist = copy.deepcopy(cmdlist_psm6)
        
        # ... and the conversion command list via "imgcmd".
        if t!=None and "imgcmd" in t.options:
            logger.error("imgcmd is %s", t.options["imgcmd"])
            conv_cmdlist = t.options["imgcmd"]
            
        # print(templates[0])
        extracted_str = input_module.to_text(invoicefile, cmdlist=cmdlist, conv_cmdlist=conv_cmdlist).decode("utf-8")

        logger.debug("START pdftotext result ===========================")
        logger.error(extracted_str)
        logger.debug("END pdftotext result =============================")

        logger.debug("Testing {} template files".format(len(templates)))
        # Placeholder values returned when no post-processing takes place.
        missed = -1
        corrected = -1
        issue_lines = []
        qtyerr = ""
        noofitem = -1
        output = []
        if t == None:
            # No tid match: fall back to matching templates against the text.
            for t in templates:
                optimized_str = t.prepare_input(extracted_str)

                if t.matches_input(optimized_str):
                    # Bare extract() result, not the 6-tuple.
                    return t.extract(optimized_str)
        else:
            # tid match: extract without checking matches_input().
            optimized_str = t.prepare_input(extracted_str) 
            output = t.extract(optimized_str)
            if t != None and "decimal" in t.options:
                missed, corrected, issue_lines, qtyerr, noofitem = post_process(output, t.options)
            
            return output, missed, corrected, issue_lines, qtyerr, noofitem

        logger.error("No template for %s", invoicefile)
        return output, missed, corrected, issue_lines, qtyerr, noofitem
    except Exception as ex:
        # Any failure above is logged and reported as False.
        logger.error("Exception occured in invoice conversion "+ str(ex))

    return False
Ejemplo n.º 27
0
import os
import invoice2data as ntd
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
# Invoice to process and the path handed to read_templates().
file_name = 'NT_01.pdf'
temp_name = 'pdf2inv.py'
file_path = os.path.join(
    r'C:\Users\fkhalil\primeStone\docrecog\sampleDocs\EURO DIESEL\B1959500485',
    file_name)
temp_path = os.path.join(r'C:\Users\fkhalil\primeStone\docrecog\templates',
                         temp_name)
print(file_path, temp_path)

# NOTE(review): temp_path names a single file ('pdf2inv.py'); read_templates
# presumably expects a folder of template files -- confirm.
templates = read_templates(temp_path)
result = extract_data(file_path, templates=templates)
Ejemplo n.º 28
0
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates

# NOTE(review): the invoice path 'test.pdf' is also passed to read_templates,
# which presumably expects a folder of template files -- confirm.
templates = read_templates('test.pdf')
result = extract_data('test.pdf', templates=templates)

# st = "Total: 4.00 4,123.00"
# result = st.split()
# result.pop(1)
# print(" ".join(result))
Ejemplo n.º 29
0
def extract_data(invoicefile, templates=None, input_module=pdftotext, with_extracted_str=False, **kwargs):
    """Extracts structured data from PDF/image invoices.

    This function uses the text extracted from a PDF file or image and
    the given templates to find structured data.

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list, optional
        Templates handed to `data_from_template`.  Loaded with
        `read_templates()` when None.
    input_module : module, optional
        Object whose `to_text(invoicefile, **kwargs)` returns the invoice
        text as UTF-8 encoded bytes.  Defaults to `pdftotext`.
    with_extracted_str : bool, optional
        When true, the extracted text is returned alongside the data.
    **kwargs
        Forwarded unchanged to `input_module.to_text`.

    Returns
    -------
    data or False, or a tuple
        The truthy result of `data_from_template`, or False when it yields
        nothing usable.  With `with_extracted_str` the return value is the
        tuple ``(data_or_False, extracted_str)``.

    Notes
    -----
    Import required `input_module` when using invoice2data as a library

    Examples
    --------
    When using `invoice2data` as a library

    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    extracted_str = input_module.to_text(invoicefile, **kwargs).decode('utf-8')
    # Turn non-breaking spaces and stray "\xc2" characters into plain
    # spaces.  str.replace does not raise here, so no guard is needed.
    extracted_str = extracted_str.replace(u'\xa0', u' ').replace(u"\xc2", " ")

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))
    data = data_from_template(templates, extracted_str)
    try:
        # Truth-testing is the only step that can raise ValueError here
        # (some container types refuse to be used as a boolean).
        found = bool(data)
    except ValueError:
        logger.warning("Exception in parsing", exc_info=True)
        found = False

    if found:
        if with_extracted_str:
            return data, extracted_str
        return data

    logger.warning('No template for %s', invoicefile)
    if with_extracted_str:
        return False, extracted_str
    return False