Example #1
0
def main(args=None):
    """Take folder or single file and analyze each.

    Args:
        args: parsed command-line namespace. When ``None`` the arguments are
            parsed from the command line via ``create_parser()``. The
            attributes read here are ``debug``, ``template_folder``,
            ``input_files``, ``output_name`` and ``output_date_format``.

    Every input file is run through ``extract_data``; successful results are
    tagged with the source file name, logged, and finally written out with
    ``to_json.write_to_file``.
    """

    # Extract Args
    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    # Configure Debug
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Load Templates
    templates = read_templates()

    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Extracting data
    output = []
    for f in args.input_files:
        try:
            input_module = select_input_module(f.name)
            res = extract_data(f.name,
                               templates=templates,
                               input_module=input_module)
        finally:
            # Only the file name is used, so release the handle even when
            # extraction raises.
            f.close()

        # A failed extraction is reported as a non-dict value (e.g. False);
        # tagging it with the file name would raise TypeError, so skip it.
        if res is None or res is False:
            continue

        res["file_name"] = f.name
        logger.info(res)
        output.append(res)

    # Writing Output
    # TODO: fix output file
    to_json.write_to_file(output, args.output_name, args.output_date_format)
Example #2
0
def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Pull structured fields out of an invoice file using regex templates.

    The file is converted to text by ``input_module``; the first template
    whose keywords match that text is used to extract the fields.

    Args:
        invoicefile (`Str`):
            path of the invoice file (example: "/home/duskybomb/pdf/invoice.pdf")
        templates (`list`, optional, default: `None`):
            templates to try, in order; loaded with `read_templates()` when
            omitted
        input_module (optional, default: `pdftotext`):
            object whose `to_text(path)` returns the file's text as UTF-8
            encoded bytes
    Returns:
        The value of `extract_info` from the first matching template, or
        `False` when no template matches.
    """
    # Fall back to the default template set when the caller gave none.
    templates = read_templates() if templates is None else templates

    # Convert the invoice to text.
    raw_text = input_module.to_text(invoicefile)
    invoice_text = raw_text.decode("utf-8")

    # Dump the extracted text for debugging.
    logger.debug("START pdftotext result ===========================")
    logger.debug(invoice_text)
    logger.debug("END pdftotext result =============================")
    logger.debug("Testing {} template files".format(len(templates)))

    # The first template whose keywords match wins.
    for candidate in templates:
        prepared_text = candidate.prepare_input(invoice_text)
        if not candidate.match_keywords(prepared_text):
            continue
        return candidate.extract_info(prepared_text)

    logger.error("No template for %s", invoicefile)
    return False
Example #3
0
def extract_data(invoicefile, templates=None, input_module=pdftotext):
    """Extracts structured data from PDF/image invoices.

    This function uses the text extracted from a PDF file or image and
    pre-defined regex templates to find structured data.

    Reads template if no template assigned
    Required fields are matches from templates

    Parameters
    ----------
    invoicefile : str
        path of electronic invoice file in PDF,JPEG,PNG (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list of instances of class `InvoiceTemplate`, optional
        Templates are loaded using `read_template` function in `loader.py`
    input_module : {'pdftotext', 'pdfminer', 'tesseract'}, optional
        library to be used to extract text from given `invoicefile`,

    Returns
    -------
    dict, str or False
        extracted and matched fields, or False if no template matches.
        The string ``'pdf seperated'`` is returned when a multi-page PDF was
        split into one file per page and the original file was deleted.

    Notes
    -----
    Import required `input_module` when using invoice2data as a library

    If the first pass yields no result and tesseract has not been used yet,
    the file is read again with tesseract and the templates are tried a
    second time.

    See Also
    --------
    read_template : Function where templates are loaded
    InvoiceTemplate : Class representing single template files that live as .yml files on the disk

    Examples
    --------
    When using `invoice2data` as an library

    >>> from invoice2data.input import pdftotext
    >>> extract_data("invoice2data/test/pdfs/oyo.pdf", None, pdftotext)
    {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
     'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}

    """
    if templates is None:
        templates = read_templates()

    extracted_str = input_module.to_text(invoicefile).decode('utf-8')
    tried_tesseract = False
    # Fewer than 20 characters from pdftotext: retry straight away with
    # tesseract instead.
    if len(extracted_str) < 20 and input_module == pdftotext:
        extracted_str = tesseract.to_text(invoicefile).decode('utf-8')
        tried_tesseract = True

    logger.debug('START pdftotext result ===========================')
    logger.debug(extracted_str)
    logger.debug('END pdftotext result =============================')

    logger.debug('Testing {} template files'.format(len(templates)))

    # Get the page count of invoicefile. The stream has to stay open while
    # pages are copied out of the reader below, so it is closed in the
    # `finally` that wraps the template loop rather than right here.
    pdf = None
    pdf_stream = None
    page_count = None
    try:
        pdf_stream = open(invoicefile, 'rb')
        pdf = PdfFileReader(pdf_stream)
        page_count = pdf.getNumPages()
    except Exception as e:
        # Not readable as a PDF (or not a PDF at all): skip page handling.
        logger.error(str(e))

    try:
        for t in templates:
            optimized_str = t.prepare_input(extracted_str)
            if not t.matches_input(optimized_str):
                continue

            if page_count is not None and page_count > 1:
                # Split only when the template asks for it with the string
                # value 'True' under fields -> multiple_page.
                split_requested = any(
                    k == 'multiple_page' and v == 'True'
                    for k, v in t['fields'].items())
                if split_requested:
                    try:
                        pdfdirectory = os.path.dirname(invoicefile)
                        pdfname = os.path.basename(invoicefile)
                        stem = pdfname.replace('.pdf', '').replace('.PDF', '')
                        for i in range(pdf.numPages):
                            # Write page i into its own single-page PDF
                            # next to the original file.
                            output = PdfFileWriter()
                            output.addPage(pdf.getPage(i))
                            objectfile = join(pdfdirectory,
                                              stem + '_' + str(i) + '.pdf')
                            with open(objectfile, "wb") as outputStream:
                                output.write(outputStream)
                        os.remove(invoicefile)
                        logger.warning(
                            'Seperate pdf into multiple files, process in next scanning loop'
                        )
                        # Callers compare against this exact string.
                        return 'pdf seperated'
                    except Exception as e:
                        # Splitting failed: fall through and extract from
                        # the unsplit text instead.
                        logger.error(str(e))

            ret = t.extract(optimized_str, invoicefile)
            if ret is not None and ret is not False:
                return ret
    finally:
        if pdf_stream is not None:
            pdf_stream.close()

    if not tried_tesseract:
        logger.debug(
            'No template matched, now try tesseract ==========================='
        )
        extracted_str2 = tesseract.to_text(invoicefile).decode('utf-8')
        logger.debug('START tesseract result ===========================')
        logger.debug(extracted_str2)
        logger.debug('END tesseract result =============================')
        logger.debug('Testing {} template files'.format(len(templates)))
        for t in templates:
            optimized_str2 = t.prepare_input(extracted_str2)

            if t.matches_input(optimized_str2):
                tesseract_result = t.extract(optimized_str2, invoicefile)
                if tesseract_result is None:
                    # tesseract found the right template but did not match
                    # all required fields, so use this template with the
                    # text from the first extraction pass.
                    optimized_str = t.prepare_input(extracted_str)
                    return t.extract(optimized_str, invoicefile)
                return tesseract_result

    logger.error('No template for %s', invoicefile)
    return False
Example #4
0
def _main2_move_into(pdfpath, folder):
    """Move `pdfpath` into `folder`, creating the folder first if needed."""
    if not os.path.exists(folder):
        os.makedirs(folder)
    shutil.move(pdfpath, join(folder, os.path.basename(pdfpath)))


def _main2_archive_pdf(pdfpath, db_status, args):
    """Move or delete the source PDF according to the database write status."""
    pdfdirectory = os.path.dirname(pdfpath)
    pdfname = os.path.basename(pdfpath)
    parent_directory = os.path.abspath(os.path.join(pdfdirectory, os.pardir))

    if db_status == 'succeed':
        from datetime import date
        today = date.today().strftime('%d-%m-%Y')
        # Default target is the "successful" folder beside the PDF's own
        # directory, in a sub-folder named after today's date.
        succeed_root = args['pdf_succeed'] or join(parent_directory,
                                                   'successful')
        try:
            _main2_move_into(pdfpath, join(succeed_root, today))
        except Exception:
            # Could not file the PDF under "successful"; park it in the
            # fallback folder instead of leaving it in the input directory.
            logger.exception('Could not move %s to %s', pdfpath, succeed_root)
            fallback_root = args['pdf_moved_failed'] or join(
                pdfdirectory, 'failedToMove')
            _main2_move_into(pdfpath, join(fallback_root, today))
    elif db_status == 'link db failed':
        # Leave the file where it is.
        pass
    elif db_status == 'exists':
        print('data already exists in edms: ' + pdfname)
        os.remove(pdfpath)
    else:
        # Any other status (including no result at all) counts as a failure.
        failed_root = args['pdf_failed'] or join(parent_directory, 'failed')
        _main2_move_into(pdfpath, failed_root)


def main2(args=None):
    """Take folder or single file and analyze each.

    Parameters
    ----------
    args : dict, optional
        Options keyed by name. When ``None`` they are parsed from the command
        line. Keys read here: ``output_format``, ``template_folder``
        (optional), ``input_files``, ``dbpass``, ``dbhost``, ``dbuser``,
        ``dbname``, ``azure_account``, ``azure_key``, ``pdf_path``,
        ``pdf_succeed``, ``pdf_moved_failed``, ``pdf_failed`` and
        ``output_name``. ``output_date_format`` is always overwritten with
        ``'%Y-%m-%d'`` in the passed dict.

    Returns
    -------
    The result of ``extract_data`` for the last input file, or ``None`` when
    there were no input files.

    Notes
    -----
    With ``dbpass`` set, each result is written to the database inside the
    loop and the source PDF is then moved or deleted depending on the status
    returned by ``write_to_db``. Without it, all results are written to a
    file at the end.
    """
    if args is None:
        parser = create_parser()
        # The rest of this function uses item access, which
        # argparse.Namespace does not support, so convert it to a dict.
        args = vars(parser.parse_args())

    args['output_date_format'] = '%Y-%m-%d'
    input_module = input_mapping['pdftotext']
    output_module = output_mapping[args['output_format']]

    # An external template folder replaces the built-in templates. A key
    # that is present but None (argparse default) means "not set".
    templates = []
    if args.get('template_folder') is not None:
        templates += read_templates(os.path.abspath(args['template_folder']))
    else:
        templates += read_templates()

    output = []
    res = None  # keeps the final return defined when there are no input files
    for fs in args['input_files']:
        # The handle is used only for its name; `with` closes it on every
        # path, including the early `continue` and exceptions.
        with open(fs, 'r') as f:
            pdfpath = f.name
            res = extract_data(pdfpath,
                               templates=templates,
                               input_module=input_module)
            if res == 'pdf seperated':
                # The PDF was split into pages; nothing more to do with it.
                continue
            db_status = None
            if res:
                logger.info(res)
                output.append(res)
                if args['dbpass'] is not None:
                    db_status = output_module.write_to_db(
                        res, pdfpath, args['output_date_format'],
                        args['dbhost'], args['dbuser'], args['dbpass'],
                        args['dbname'], args['azure_account'],
                        args['azure_key'], args['pdf_path'])

        if args['dbpass'] is not None:
            _main2_archive_pdf(pdfpath, db_status, args)

    # Database output was already written inside the loop.
    if output_module is not None and args['dbpass'] is None:
        logger.warning(output)
        output_module.write_to_file(output, args['output_name'],
                                    args['output_date_format'])
    return res
Example #5
0
def main(args=None):
    """Take folder or single file and analyze each.

    Parameters
    ----------
    args : argparse.Namespace, optional
        Parsed options; parsed from the command line when ``None``.

    Notes
    -----
    Each input file is run through ``extract_data``. A successful result is
    collected and, depending on the options, written to the database
    (``dbpass``) and copied or moved under a formatted name (``copy`` /
    ``move`` / ``filename``). With ``dbpass`` set, the source PDF is then
    moved or deleted depending on the status returned by ``write_to_db``.
    Without ``dbpass``, all results are written to a file at the end.
    """

    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    input_module = input_mapping[args.input_reader]
    output_module = output_mapping[args.output_format]

    templates = []
    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates()

    output = []
    for f in args.input_files:
        pdfpath = f.name
        try:
            res = extract_data(pdfpath,
                               templates=templates,
                               input_module=input_module)
        finally:
            # Only the path is used from here on. Closing now releases the
            # handle on every path (including the `continue` below) and
            # before the file is moved or deleted.
            f.close()

        if res == 'pdf seperated':
            # The PDF was split into pages; nothing more to do with it.
            continue

        db_status = None
        if res:
            logger.info(res)
            output.append(res)
            if args.dbpass is not None:
                db_status = output_module.write_to_db(
                    res, pdfpath, args.output_date_format, args.dbhost,
                    args.dbuser, args.dbpass, args.dbname,
                    args.azure_account, args.azure_key, args.pdf_path)

            if args.copy or args.move:
                # Same target name for both the copy and the move.
                filename = args.filename.format(
                    date=res['date'].strftime('%Y-%m-%d'),
                    invoice_number=res['invoice_number'],
                    desc=res['desc'],
                )
                if args.copy:
                    shutil.copyfile(pdfpath, join(args.copy, filename))
                if args.move:
                    shutil.move(pdfpath, join(args.move, filename))

        if args.dbpass is None:
            continue

        # Move or delete the source pdf according to the database status.
        pdfdirectory = os.path.dirname(pdfpath)
        pdfname = os.path.basename(pdfpath)
        if db_status == 'succeed':
            # Move to the "successful" folder beside the PDF's own
            # directory, in a sub-folder named after today's date.
            from datetime import date
            succeed_path = os.path.abspath(
                os.path.join(pdfdirectory, os.pardir))
            succeed_path = join(succeed_path, 'successful',
                                date.today().strftime('%d-%m-%Y'))
            if not os.path.exists(succeed_path):
                os.makedirs(succeed_path)
            shutil.move(pdfpath, join(succeed_path, pdfname))
        elif db_status == 'link db failed':
            # Leave the file where it is.
            pass
        elif db_status == 'exists':
            os.remove(pdfpath)
        else:
            # Any other status (including no result at all): move to failed.
            failed_path = join(pdfdirectory, 'failed')
            if not os.path.exists(failed_path):
                os.makedirs(failed_path)
            shutil.move(pdfpath, join(failed_path, pdfname))

    # Database output was already written inside the loop.
    if output_module is not None and args.dbpass is None:
        output_module.write_to_file(output, args.output_name,
                                    args.output_date_format)
Example #6
0
def extract_data(invoicefile, templates=None, input_module_lang="deu"):
    """Extract structured data from an invoice file using templates.

    The text is first read with `pdftotext`. If that yields only whitespace,
    `tesseract` is used instead. If text was found but no template matches
    it, the file is read once more with `tesseract` and the templates are
    tried again.

    Parameters
    ----------
    invoicefile : str
        path of the invoice file (example: "/home/duskybomb/pdf/invoice.pdf")
    templates : list, optional
        templates to try, in order; loaded with `read_templates()` when
        omitted
    input_module_lang : str, optional
        passed as the second argument to `pdftotext.to_text` and
        `tesseract.to_text`; defaults to "deu"

    Returns
    -------
    The value of `extract` from the first matching template, or False if no
    template matches.
    """
    if templates is None:
        templates = read_templates()

    text = pdftotext.to_text(invoicefile, input_module_lang).decode("utf-8")

    # Whitespace-only output: go straight to OCR.
    used_ocr = not text.strip()
    if used_ocr:
        logger.debug("No extractable text, running OCR...")
        text = tesseract.to_text(invoicefile,
                                 input_module_lang).decode("utf-8")

    logger.debug("START text result ===========================")
    logger.debug(text)
    logger.debug("END text result =============================")

    logger.debug("Testing {} template files".format(len(templates)))

    for candidate in templates:
        prepared = candidate.prepare_input(text)
        if not candidate.matches_input(prepared):
            continue
        return candidate.extract(prepared)

    # Second chance with OCR, unless the text above already came from it.
    if not used_ocr:
        logger.debug("No template match! Re-reading...")
        text = tesseract.to_text(invoicefile,
                                 input_module_lang).decode("utf-8")

        logger.debug("START tesseract result ===========================")
        logger.debug(text)
        logger.debug("END tesseract result =============================")

        logger.debug("Testing {} template files".format(len(templates)))

        for candidate in templates:
            prepared = candidate.prepare_input(text)
            if not candidate.matches_input(prepared):
                continue
            return candidate.extract(prepared)

    logger.error("No template for %s", invoicefile)
    return False
Example #7
0
def main(args=None):
    """Take folder or single file and analyze each.

    Parameters
    ----------
    args : argparse.Namespace, optional
        Parsed options; parsed from the command line when ``None``.

    Notes
    -----
    Each input file is run through ``extract_data``. A successful result is
    collected and, depending on the options, the source file is copied
    and/or moved under a name built from ``args.filename``. When
    ``args.anonymize`` is set, ``anonymize`` is called on the new file.
    All results are written with the selected output module at the end.
    """
    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # NOTE(review): the looked-up reader is not passed to extract_data; the
    # lookup is kept because it still rejects an unknown reader name.
    input_module = input_mapping[args.input_reader]

    input_module_lang = args.input_reader_lang or "deu"

    output_module = output_mapping[args.output_format]

    templates = []
    # Load templates from external folder if set.
    if args.template_folder:
        templates += read_templates(os.path.abspath(args.template_folder))

    # Load internal templates, if not disabled.
    if not args.exclude_built_in_templates:
        templates += read_templates()

    output = []
    for f in args.input_files:
        source_path = f.name
        try:
            res = extract_data(source_path,
                               templates=templates,
                               input_module_lang=input_module_lang)
        finally:
            # Only the path is used from here on. Closing now releases the
            # handle even if extraction raises, and before the file is moved.
            f.close()

        if not res:
            continue

        logger.info(res)
        output.append(res)

        if not (args.copy or args.move):
            continue

        # Same target name for both the copy and the move.
        filename = args.filename.format(
            date=res["date"].strftime("%Y-%m-%d"),
            invoice_number=makeFilename(res["invoice_number"]),
            desc=makeFilename(res["desc"]),
        )

        if args.copy:
            newName = join(args.copy, filename)
            shutil.copyfile(source_path, newName)

            if args.anonymize:
                from anonymize import anonymize
                anonymize(newName, args.anonymize)

        if args.move:
            newName = join(args.move, filename)
            shutil.move(source_path, newName)

            if args.anonymize:
                from anonymize import anonymize
                anonymize(newName, args.anonymize)

    if output_module is not None:
        output_module.write_to_file(output, args.output_name,
                                    args.output_date_format)