def add_invoice(request):
    result = None
    form = None
    if request.method == 'POST':
        form = browse(request.POST, request.FILES)
        if form.is_valid():
            handle_uploaded_file(
                request.FILES['file'])  # store file in upload folder
            path = "pdfextractor/static/upload/" + str(
                request.FILES['file'])  #path of selected file
            result1 = extract_data(path)  # extract data from invoice pdf file
            listl = ['issuer', 'invoice_number', 'date', 'amount', 'currency']
            new_in_dict = {}
            c = {}
            for key in result1:
                if key in listl:
                    new_in_dict[key] = result1[key]
                else:
                    c[key] = result1[key]

            new_in_dict['other'] = c
            result = new_in_dict  # this is final dictionary

    else:
        form = browse()
    context = {"form": form, "result": result}
    return render(request, 'add_invoice.html', context)
Beispiel #2
0
def read_file(filename, debug):
    # If debug is active, get PDF as string for debugging/template creation
    if debug is True:
        with open('INPUT/' + filename, 'rb') as f:
            pdf = pdftotext.PDF(f)
        print('\n\n'.join(pdf))

    templates = read_templates('TEMPLATES/')
    result = extract_data('INPUT/' + filename, templates, pdftotextdef)
    # if pdf read successful write JSON file
    if result != False:
        to_json.write_to_file(
            result, 'OUTPUT/' + os.path.splitext(filename)[0] + '.json',
            '%Y-%m-%d')

        # checks if due_date present in JSON and if not sets due date 1 month after invoice date
        with open('OUTPUT/' + os.path.splitext(filename)[0] + '.json',
                  'r+') as file:
            data = json.load(file)
            if "date_due" not in data:
                date = data["date"]
                date_obj = datetime.strptime(date, '%Y-%m-%d')
                json_in = {
                    "date_due":
                    helper_functions.add_month(date_obj).strftime('%Y-%m-%d')
                }
                data.update(json_in)
                file.seek(0)
                json.dump(data, file, indent=4, sort_keys=True)
    # else add file name to error list and move on
    else:
        helper_functions.append_error(filename)
Beispiel #3
0
def flipkartInvoice(path,request):
    '''Extract flipkartinvoice data'''
    f_invoice = extract_data(str(settings.MEDIA_ROOT+path),input_module=pdftotext)
    insert = Userpdfdata(buyer=f_invoice['issuer'],invoice_number=f_invoice['invoice_number'],seller=f_invoice['issuer'],invoice_date=f_invoice['date'],items=f_invoice['desc'],digitalized="digitized")
    insert.save(force_insert=True)
    digital_data=list(Userpdfdata.objects.all().values_list())
    return digital_data
def extract_invoice(filename):
    template = invoice2data.extract.loader.read_templates(
        "env\\Lib\\site-packages\\invoice2data\\extract\\templates\\max\\")
    result = invoice2data.extract_data(filename, template)
    if result:
        return to_list(result)
    else:
        exit()
Beispiel #5
0
    def parse_pdfs(self) -> List[Dict]:
        result = []
        pdf_list = self.fm.get_pdf_list()
        templates = self.read_templates()

        for pdf in pdf_list:
            data = extract_data(pdf, templates=templates)
            result.append(data)
        return result
Beispiel #6
0
def amazonInvoice(path,request):
    ''' Extract amazoninvoice data'''
    a_invoice = extract_data(str(settings.MEDIA_ROOT+path),input_module=pdftotext)
    desc=""
    for key in a_invoice['lines']:
        for val in key:
            if val=='description':
                desc+=key[val]+","
    insert = Userpdfdata(buyer=a_invoice['issuer'],invoice_number=a_invoice['invoice_number'],seller=a_invoice['partner_name'],invoice_date=a_invoice['date'],items=desc,digitalized="digitized")
    insert.save(force_insert=True)
    digital_data=list(Userpdfdata.objects.all().values_list())
    return digital_data
Beispiel #7
0
def mailExtract(request):
    result = None
    form = None
    if request.method == 'POST':
        form = browse(request.POST, request.FILES)
        if form.is_valid():
            handle_uploaded_file(
                request.FILES['file'])  # store file in upload folder
            path = "invoicedata/static/upload/" + str(
                request.FILES['file'])  #path of selected file
            result = extract_data(path)  # extract data from file
            print(result)
            fields = ['issuer', 'invoice_number', 'date', 'amount', 'currency']
            alist = []
            other = []
            for i in result:
                if i in fields:
                    print(i)
                    if i == 'issuer':
                        alist.insert(0, result[i])
                    elif i == 'invoice_number':
                        alist.insert(1, result[i])
                    elif i == 'date':
                        alist.insert(2, result[i])
                    elif i == 'amount':
                        alist.insert(3, result[i])
                    elif i == 'currency':
                        alist.insert(4, result[i])
                    else:
                        pass
                else:
                    temp = str(str(i) + "-" + str(result[i]))
                    other.append(temp)

            nalist = alist

            nother = Reverse(other)
            print(nalist)
            print(nother)

            p = Invoicelist(issuer=nalist[0],
                            invoice_number=nalist[1],
                            amount=nalist[2],
                            date=nalist[3],
                            currency=nalist[4],
                            other=nother,
                            author=request.user)
            p.save()
    else:
        form = browse()
    context = {"form": form, "result": result}
    return render(request, 'invoicedata/showdownloadedFile1.html', context)
Beispiel #8
0
def extract_multi(file_refrence, catagory):
    pdf_splitter(file_refrence, catagory)
    global total
    pages = glob.glob(r"C:\Users\Shahrukh\Desktop\djangofilesupload\filesupload\pdf_processing\{}".format("*.pdf"))
    print(pages)
    result = ''
    templates = read_templates(r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates')
    for page in pages:
        # path = r"C:\Users\Shahrukh\Desktop\djangofilesupload\filesupload\pdf_processing\{pdf}".format(pdf=page)
        result += to_table(extract_data(page, templates=templates), page)
    remove_file()
    ret_data = '{result}<h3 style="float:right">Total: {total}</h3>'.format(result=result, total=total)
    total = 0
    return ret_data
 def __init__(self, invDirectory, filename):
     templates = read_templates(cwd + '/tplf')
     pdfFile = invDirectory + '/' + filename
     result = extract_data(pdfFile, templates=templates)
     print(result)
     print(filename)
     if(result): 
         for item in result.keys():
             self.__setattr__(item, result[item]) 
     else:
         print()
         print()
         print()
         self.__setattr__("issuer", "null")
         self.__setattr__("filename", filename)
Beispiel #10
0
def index(request):
    result = None
    form = None
    if request.method == 'POST':
        form = browse(request.POST, request.FILES)
        if form.is_valid():
            handle_uploaded_file(
                request.FILES['file'])  # store file in upload folder
            path = "pdfextractor/static/upload/" + str(
                request.FILES['file'])  #path of selected file
            result = extract_data(path)  # extract data from file
    else:
        form = browse()
    context = {"form": form, "result": result}
    return render(request, 'pdfextractor/index.html', context)
Beispiel #11
0
def thread_fn(file, dummy):
    total_missed = 0
    total_corrected = 0
    total_line_with_issue = 0

    cmdlist = ["tesseract", "-c", "tessedit_char_whitelist=/.: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"]
    templates = read_templates('./templates')
    result, missed, corrected, issue_lines, qtyerr, noofitem = extract_data(r+file, input_module=INPUT_MODULE, templates=templates, cmdlist=cmdlist, conv_cmdlist=None, tid = TID)

    total_missed = total_missed + missed
    total_corrected = total_corrected + corrected
    total_line_with_issue = total_line_with_issue + len(issue_lines)

    #logger.error("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    report = f'=============================> missed: {missed} corrected: {corrected} line with issue: {len(issue_lines)} Qty issue: {qtyerr} Total Items: {noofitem}<============================'
    if missed != 0 or qtyerr != "Match":
        logger.error(CYELLOW + report + CEND)
        issue_list.append(file+"\t"+report)
    else:
        logger.error(CGREEN + report + CEND)
Beispiel #12
0
def extract_invoice_details(filename):
    if filename != '':
        filename_splitted = filename.split('.')
        # 1.Case for images
        if filename_splitted[-1] != 'pdf':
            # makingSearchablePDF = MakingSearchablePDFs()
            filename = MakingSearchablePDFs.convert_image_to_searchable_pdf(
                filename)
            # filename = convert_image_to_searchable_pdf(filename)
        # 2.Case for pdfs
        elif filename_splitted[-1] == 'pdf':
            pass

        # YAML Template System
        source = 'input/uploads/' + filename
        templates = read_templates('Templates/')
        result = extract_data(source, templates=templates)
        print('\n', type(result))
        print('\n', result)

        # from json import dumps
        # print(dumps(datetime.now(), default=json_serial))
        if result != False:
            destination = 'output/processed/' + filename
            json_data = json.dumps(result,
                                   indent=4,
                                   sort_keys=True,
                                   default=str)
            print(type(json_data), json_data)
            # shutil.move(source, destination)
        else:
            destination = 'output/failed/' + filename
            print('Failed for Processing of Invoice!!!')

        # Move processed file to respective actioned folder.
        # shutil.move(source, destination)
        # json_data = json.dumps(result)
        # print('\n', type(json_data))
        with open(destination + name_of_image + '_json.json', 'w') as file:
            file.write(result)
Beispiel #13
0
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import sys

filename = 'C://Users//Shahrukh//Desktop//document_ai//invoiceX//pdf_input//sample_pg_6.pdf'
#filename = sys.argv[1]
templates = read_templates(
    'C://Users//Shahrukh//Desktop//document_ai//invoiceX//templates')
print(templates)

result = extract_data(filename, templates=templates)
#print("\n")
print(result)
#print("Working inside the jojo code")

# Preprocessing: re-arrange and re-formating the extracted output
'''
date = result['date'].strftime('%d, %b %Y')
total = result['total']
invoice_number = result['invoice_number']
addr_from = result['From_Address']
addr_to = result['To_Address']
'''
Beispiel #14
0
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import os

templates = read_templates('./datasets')
result = extract_data('datasets/MktPlace-Myntra.pdf', templates=templates)
Beispiel #15
0
def parse_pdf(pdf_path):
    templates = read_templates('peco_assistant/data/templates')
    results = extract_data(pdf_path, templates=templates)
    return results
Beispiel #16
0
    def submit_com():
        filename = input_entry.get()
        status_entry.delete(0, "")
        status_entry.insert(0, "Cheking for file")

        path = folder_entry.get()

        content = extract_data(filename)

        if str(content) == "False":
            status_entry.delete(0, "")
            status_entry.insert(0, "File Error!!!")

        else:
            file_type = list_option.get()
            file_type = file_type.lower()
            file_name = "output." + file_type

            if file_type == "xml":
                content = dicttoxml(content,
                                    custom_root='test',
                                    attr_type=False)

                try:
                    out_file_open = open(join(path, file_name), 'w')
                    out_file_open.write(str(content))
                    out_file_open.close()
                    status_entry.delete(0, "")
                    status_entry.insert(0, "Check Folder")
                    input_entry.delete(0, "")
                    print join(path, file_name)

                except:
                    out_file_open = open((file_name), 'w')
                    out_file_open.write(str(content))
                    out_file_open.close()
                    status_entry.delete(0, "")
                    status_entry.insert(
                        0, ("Successfully written in " + file_name))
                    input_entry.delete(0, "")

            if file_type == "csv":
                try:
                    with open(join(path, file_name),
                              'wb') as file_opner:  # Just use 'w' mode in 3.x
                        file_csv = csv.DictWriter(file_opner, content.keys())
                        file_csv.writeheader()
                        file_csv.writerow(content)
                    status_entry.delete(0, "")
                    status_entry.insert(0, "Check Folder")
                    input_entry.delete(0, "")

                except:
                    with open((file_name),
                              'w') as file_opner:  # Just use 'w' mode in 3.x
                        file_csv = csv.DictWriter(file_opner, content.keys())
                        file_csv.writeheader()
                        file_csv.writerow(content)
                    status_entry.delete(0, "")
                    status_entry.insert(
                        0, ("Successfully written in " + file_name))
                    input_entry.delete(0, "")

            if file_type == "json":
                try:
                    out_file_open = open(join(path, file_name), 'w')
                    out_file_open.write(str(content))
                    out_file_open.close()
                    status_entry.delete(0, "")
                    status_entry.insert(0, "Check floder")
                    input_entry.delete(0, "")

                except:
                    out_file_open = open((file_name), 'w')
                    out_file_open.write(str(content))
                    out_file_open.close()
                    status_entry.delete(0, "")
                    status_entry.insert(
                        0, ("Successfully written in " + file_name))
                    input_entry.delete(0, "")
Beispiel #17
0
    "foreign_bruto": (worksheet.write_number, to_number),
    "local_base": (worksheet.write_number, to_number),
    "local_vat": (worksheet.write_number, to_number),
    "local_bruto": (worksheet.write_number, to_number),
    "exchange_rate": (worksheet.write_number, to_number)
}

#OCR
count = 1
total = len([f for f in invoice_folder.iterdir()])
for invoice in invoice_folder.iterdir():
    try:
        logger.info(" {} Processing ... {} / {}".format(
            invoice.name, count, total))
        result = extract_data(str(invoice),
                              templates=templates,
                              input_module=tesseract4)
        if not result:
            result = {"document_number": str(invoice.name)}
            logger.info(" {} - No template".format(invoice.name))
        else:
            result["document_number"] = invoice.name

        results.append(result)
        count += 1
    except Exception as e:
        print('Error:', e)
        logger.exception(" {} - Exception occurred...".format(invoice.name))
        continue

try:
Beispiel #18
0
import os
import invoice2data as ntd
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
file_name = 'NT_01.pdf'
temp_name = 'pdf2inv.py'
file_path = os.path.join(
    r'C:\Users\fkhalil\primeStone\docrecog\sampleDocs\EURO DIESEL\B1959500485',
    file_name)
temp_path = os.path.join(r'C:\Users\fkhalil\primeStone\docrecog\templates',
                         temp_name)
print(file_path, temp_path)

templates = read_templates(temp_path)
result = extract_data(file_path, templates=templates)
# Importing all the required libraries

from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
from invoice2data.input import pdftotext
import pandas as pd

# Importing custom template
templates = read_templates('./template/')

#print(templates)

# Extract data from PDF
result = extract_data('./data/pnlsheet.pdf',
                      templates=templates,
                      input_module=pdftotext)

# Store the extracted data to a Data-frame
df = pd.DataFrame(data=result)

# Export Data-frame to a csv file
df.to_csv('./data/invoice2data_simple.csv')
''' 
You can use any desired library to extract data from pdftotext, pdftotext, pdfminer, tesseract. It is optional
and by default pdftotext will be used if not specified.

The custom template named temp.yml is placed in the templates. You can remove the templates parameter in
extract_data(). Default templates will be used

'''
#path = "/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files"
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["pdfinvdata"]
mycol = mydb["invoicedata"]

for filename in os.listdir(
        '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/'
):
    if filename.endswith(".pdf"):
        filenames.append(filename)
        pathname = '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/' + str(
            filename)
        path_filename.append(pathname)
        result = extract_data(
            '/home/taher/Desktop/iconnect-opensource-invoice_reader_ai-b82f8f46c28c/static/pdf_files/'
            + filename, read_template)
        pdfFiles1.append(result)
        #print(type(result))
        #print("----------------------")
        if (result == False):
            pdfFiles.append(filename)
        else:
            pdfFiles.append(result)

#pdfFiles.sort()
#print(pdfFiles)
#print(filenames)
#print(path_filename)

i = 0
Beispiel #21
0
import json, pandas
import csv, os
import sys, subprocess

HOME = os.environ['HOME']
template_folder = sys.argv[2].strip()
invoices_folder = sys.argv[1].strip()
invoice_files = [
    os.path.join(invoices_folder, f) for f in os.listdir(invoices_folder)
    if os.path.isfile(os.path.join(invoices_folder, f))
]
print('Template:', template_folder)
for each in invoice_files:
    if not '.pdf' in each:
        continue
    result = extract_data(each)
    if not result:
        cmd = ['invoice2data', '--template-folder', template_folder, each]
        session = subprocess.Popen(cmd,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        result = session.communicate()[0].decode('UTF-8')
        print(result)
        result_json = json.loads(result)
        print(result_json)
    #result = extract_data('/Users/hithyshikrishnamurthy/Downloads/Pearson_Converted_3.split/Pearson_Converted_3.1.pdf')
    csvfile_name = HOME + '/Desktop/invoice_details.csv'
    first_write = True

    if os.path.exists(csvfile_name):
        first_write = False
Beispiel #22
0
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates
import json
import datetime
import argparse

if __name__ == '__main__':
    # Initialize the arguments parser
    parser = argparse.ArgumentParser(description="Extract data from invoices")

    # Add the parameters positional/optional
    parser.add_argument('-t','--templates_dirpath', help="Templates directory path", type=str)
    parser.add_argument('-i','--invoice_path', help="Invoice file path", type=str)

    # Parse the arguments
    args = parser.parse_args()

    templates = read_templates(args.templates_dirpath)

    output_data = extract_data(args.invoice_path, templates=templates)

    date_time = output_data['date'].strftime('%Y-%m-%d')
    output_data['date'] = date_time
    # then print the formatted JSON data output
    print(json.dumps(output_data, indent=2))
Beispiel #23
0
def extract(name):
    templates = read_templates(r'C:\Users\Shahrukh\Desktop\djangofilesupload\MlEngine\invoiceX\templates')
    result = extract_data(name, templates=templates)
    return result
Beispiel #24
0
from invoice2data import extract_data
from invoice2data.extract.loader import read_templates

templates = read_templates('test.pdf')
result = extract_data('test.pdf', templates=templates)

# st = "Total: 4.00 4,123.00"
# result = st.split()
# result.pop(1)
# print(" ".join(result))