# Example #1
class ProcessingSettings:
	"""Receipt-recognition options sent as URL parameters to the ABBYY
	processReceipt API.

	All values are read from the external configuration file via
	``read_config``. NOTE(review): in this chunk the ``import read_config``
	statement appears *below* this class — confirm import order in the
	full file.
	"""
	#Language = "English,Japanese"
	#OutputFormat = "docx"

	# Parse the configuration once instead of once per attribute
	# (the original re-read it six times at class-definition time).
	_params = read_config.get_parameter_values()

	Country               = str(_params[0]).replace(" ", "")
	ImageSource           = str(_params[1])
	correctOrientation    = str(_params[2])
	correctSkew           = str(_params[3])
	ExtendedCharacterInfo = str(_params[4])
	fieldRegionExportMode = str(_params[5])
#performs cleanup on the working folder, by moving the processed files
# to a folder named with the timestamp

import glob, os
import shutil
import time
import read_config

# Resolve all working-folder paths with a single configuration read
# (the original re-parsed the configuration five times here).
_params = read_config.get_parameter_values()
# NOTE(review): 'username' env var is Windows-specific — confirm this
# tool only runs on Windows.
_user = os.getenv('username')

# Archive folder named after the current user and a run timestamp.
timestamped_dir = _params[11].format(_user, time.strftime("%Y%m%d-%H%M%S"))
converted_pdf = _params[14].format(_user)
output_XML = _params[12].format(_user)
scanned_PDFs = _params[10].format(_user)
separated_receipts = _params[9].format(_user)

# Create the timestamped archive folder. os.mkdir raises FileExistsError
# if the folder already exists (possible if two runs start within the
# same second) and FileNotFoundError if the parent path is missing.
os.mkdir(timestamped_dir)


def cleanup():
    """Archive processed files by moving everything matched by the four
    configured path patterns into the timestamped folder."""
    for pattern in (converted_pdf, output_XML, scanned_PDFs, separated_receipts):
        for entry in glob.glob(pattern):
            # copy_function=shutil.copytree lets shutil.move relocate whole
            # directories even when the move crosses filesystems.
            shutil.move(entry, timestamped_dir, copy_function=shutil.copytree)
# Example #3
class AbbyyOnlineSdk:
	"""Minimal HTTP client for the ABBYY Cloud OCR SDK receipt API.

	Server URL and credentials are read from the external configuration
	(via read_config) when this class body executes at import time.
	NOTE(review): ``requests``, ``json``, ``xml.dom.minidom`` and ``Task``
	are referenced but not imported in this chunk — presumably they are
	brought in elsewhere in the full file; confirm.
	"""

	# Warning! This is for easier out-of-the box usage of the sample only. Change to https://
	#  for production use. Change to http://cloud-westus.ocrsdk.com if you created your 
	#  application in US location
	ServerUrl = str(read_config.get_parameter_values()[6])

	# To create an application and obtain a password,
	# register at https://cloud.ocrsdk.com/Account/Register
	# More info on getting your application id and password at
	# https://ocrsdk.com/documentation/faq/#faq3
	ApplicationId = str(read_config.get_parameter_values()[7])
	Password = str(read_config.get_parameter_values()[8])

	# Proxy mapping passed to every requests call; empty means "no proxy".
	# NOTE(review): this is a mutable class-level dict shared by all
	# instances — populate per-instance if different proxies are needed.
	Proxies = {
	
		}

	def process_image(self, file_path, settings):
		"""Upload the image at *file_path* to the v2/processReceipt endpoint.

		*settings* must expose Country, ImageSource, correctOrientation,
		correctSkew, ExtendedCharacterInfo and fieldRegionExportMode
		attributes (see ProcessingSettings). Returns the Task parsed from
		the JSON response; raises requests.HTTPError on non-200 responses.
		"""

		url_params = {
			"country"						:	settings.Country,
			"ImageSource"					: 	settings.ImageSource,
			"correctOrientation"			:   settings.correctOrientation,
			"correctSkew"					:	settings.correctSkew,
			"xml:writeExtendedCharacterInfo":	settings.ExtendedCharacterInfo,
			"xml:fieldRegionExportMode"     :   settings.fieldRegionExportMode
		}

	
		request_url = self.get_request_url("v2/processReceipt")

		# Read the whole file into memory; the image is sent as the raw
		# request body, not as a multipart upload.
		with open(file_path, 'rb') as image_file:
			image_data = image_file.read()

		#s = requests.Session()
		response = requests.post(request_url, data=image_data, params=url_params,
								 auth=(self.ApplicationId, self.Password), proxies=self.Proxies)

		# Any response other than HTTP 200 means error - in this case exception will be thrown
		response.raise_for_status()

		# parse response xml and extract task ID
		task = self.decode_response_JSON(response.text)
		return task

	def get_task_status(self, task):
		"""Poll v2/getTaskStatus for *task* and return the refreshed Task.

		Returns None if the task carries a null GUID. NOTE(review): unlike
		process_image, no raise_for_status() is called here, so HTTP errors
		surface later as JSON decode failures — confirm whether intended.
		"""
		if task.Id.find('00000000-0') != -1:
			# GUID_NULL is being passed. This may be caused by a logical error in the calling code
			print("Null task id passed")
			return None

		url_params = {"taskId": task.Id}
		status_url = self.get_request_url("v2/getTaskStatus")

		#s = requests.Session()
		response = requests.get(status_url, params=url_params,
								auth=(self.ApplicationId, self.Password), proxies=self.Proxies)

		task = self.decode_response_JSON(response.text)
		return task

	def download_result(self, task, output_path):
		"""Stream the recognition result of *task* to *output_path*.

		Silently returns if the task has no DownloadUrl (i.e. it is not
		in the Completed state).
		"""
		get_result_url = task.DownloadUrl
		if get_result_url is None:
			print("No download URL found")
			return

		# Stream the body straight to disk to avoid buffering large results.
		file_response = requests.get(get_result_url, stream=True, proxies=self.Proxies)
		with open(output_path, 'wb') as output_file:
			shutil.copyfileobj(file_response.raw, output_file)

	#original function definition below (from the sample ABBYY code);
	#this is for parsing the HTTP request for the v1\processReceipt API
	def decode_response(self, xml_response):
		""" Decode xml response of the server. Return Task object """
		dom = xml.dom.minidom.parseString(xml_response)
		task_node = dom.getElementsByTagName("task")[0]

		task = Task()
		task.Id = task_node.getAttribute("id")
		task.Status = task_node.getAttribute("status")
		if task.Status == "Completed":
			# resultUrl is only present once recognition has finished
			task.DownloadUrl = task_node.getAttribute("resultUrl")
		return task

	#added function definition for parsing the HTTP request for v2/processReceipt API
	def decode_response_JSON (self, json_response):
		"""Parse a v2 API JSON response body into a Task object.

		DownloadUrl is set only when status is "Completed"; the first
		entry of resultUrls is used. Raises KeyError if the expected
		fields are missing (e.g. on an API error payload).
		"""
		parsed_json = json.loads(json_response)
		print (parsed_json)
		task = Task()
		task.Id = parsed_json["taskId"]
		task.Status = parsed_json["status"]
		print("Task ID :" + str(task.Id),"Task status: " + str(task.Status))

		if task.Status == "Completed":
			task.DownloadUrl = parsed_json["resultUrls"][0]
			print("Task URL: " + str(task.DownloadUrl))
		return task


	def get_request_url(self, url):
		"""Join the configured server URL and *url* with exactly one slash."""
		return self.ServerUrl.strip('/') + '/' + url.strip('/')
def separate_receipts():
    """Convert scanned PDFs to JPEGs, then detect and save individual receipts.

    Every PDF in ``pdf_dir`` is rendered to a 300-dpi JPEG in ``jpg_dir``.
    Each JPEG is border-trimmed, edge-detected and morphologically closed;
    external contours taller than 500 px are cropped out and written as
    sequentially numbered receipt images into the configured output folder.
    """

    def save_asJPG():
        """Render every PDF in pdf_dir to a 300-dpi JPEG in jpg_dir."""
        os.chdir(pdf_dir)
        for pdf_file in os.listdir(pdf_dir):
            if pdf_file.endswith(".pdf"):
                pages = convert_from_path(pdf_file, 300)

                filename, extension = os.path.splitext(pdf_file)
                save_to = os.path.join(jpg_dir, "%s.jpg" % (filename))
                # NOTE(review): every page is written to the same path, so
                # only the last page of a multi-page PDF survives — confirm
                # the scans are single-page.
                for page in pages:
                    page.save(save_to, "JPEG")

    def trimExcessBorder(im):
        """Crop the uniform border around *im*.

        Returns *im* unchanged when the page is entirely uniform (no
        bounding box); the original returned None in that case, which
        crashed the caller's ``.save()``.
        """
        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
        diff = ImageChops.difference(im, bg)
        diff = ImageChops.add(diff, diff, 2.0, -100)
        bbox = diff.getbbox()
        return im.crop(bbox) if bbox else im

    def processImage(file):
        """Return (cropped page, binary mask) for the JPEG at *file*.

        The mask is Canny edge-detected, dilated and morphologically closed
        so each receipt outline becomes one solid blob for contour search.
        """
        # Bug fix: the original ignored the parameter and read the loop
        # variable ``jpg_file`` from the enclosing scope (and loaded the
        # image twice).
        scanned_page = cv2.imread(file)
        # Crop a fixed margin off the scanner output.
        # NOTE(review): hard-coded for one scanner resolution — confirm.
        scanned_page = scanned_page[10:2440, 30:3480]

        gray = cv2.cvtColor(scanned_page, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
        # 70/250 are the Canny hysteresis thresholds
        canny_edged = cv2.Canny(blurred, 70, 250)

        # thicken the edges so nearby strokes merge
        kernel_dilate = np.ones((10, 10), np.uint8)
        dilated = cv2.dilate(canny_edged, kernel_dilate, iterations=1)

        # close gaps between the white pixels
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
        closed = cv2.morphologyEx(dilated, cv2.MORPH_CLOSE, kernel_close)

        return scanned_page, closed

    save_asJPG()

    # Destination for the receipt crops; read the configuration once here
    # instead of once per contour as the original did.
    path = (read_config.get_parameter_values()[9]).format(
        os.getenv('username'))

    count = 0
    for jpg_file in os.listdir(jpg_dir):
        if jpg_file.endswith("jpg"):
            os.chdir(jpg_dir)

            # crop excess scanner border in place
            trimExcessBorder(Image.open(jpg_file)).save(jpg_file)

            # Run the (expensive) image pipeline once per file; the
            # original invoked processImage twice for the same result.
            scanned_page, closed = processImage(jpg_file)

            (cnts, _) = cv2.findContours(closed.copy(), cv2.RETR_EXTERNAL,
                                         cv2.CHAIN_APPROX_SIMPLE)

            for contour in cnts:
                x, y, w, h = cv2.boundingRect(contour)

                # contours shorter than 500 px are noise, not receipts
                if h > 500:
                    individual_receipt = scanned_page[y:y + h, x:x + w]
                    cv2.imwrite(os.path.join(path, '{}.jpg'.format(count)),
                                individual_receipt)

                    count = count + 1
                    print(str(count) + " ----- Saved to {}".format(path))
#the poppler bin directory has to be added in the environment variable Path.

import glob, os
import cv2
import time
import numpy as np
import time

import read_config

from PIL import Image, ImageChops
from pdf2image import convert_from_path

#reading the PDF files in the directory, and then converting each to JPEG files

# Resolve both directory templates with a single configuration read
# (the original parsed the configuration once per variable).
_params = read_config.get_parameter_values()
pdf_dir = _params[10].format(os.getenv('username'))
jpg_dir = _params[14].format(os.getenv('username'))


def separate_receipts():
    # NOTE(review): this appears to be a truncated duplicate of the
    # separate_receipts() defined earlier in the file — the inner helper
    # is defined but never called and the rendered pages are never saved,
    # so invoking this function currently has no effect.
    def save_asJPG():
        # Render every PDF in pdf_dir at 300 dpi and build the target
        # JPEG path in jpg_dir (the save step is missing — see note above).
        os.chdir(pdf_dir)
        for pdf_file in os.listdir(pdf_dir):

            if pdf_file.endswith(".pdf"):
                pages = convert_from_path(pdf_file, 300)

                filename, extension = os.path.splitext(pdf_file)
                save_to = os.path.join(jpg_dir, "%s.jpg" % (filename))
import os
import xml.etree.ElementTree as ET
import pandas as pd
from io import StringIO
import time
import read_config

# Resolve the XML input and CSV output folders with a single
# configuration read (the original parsed the configuration twice).
_params = read_config.get_parameter_values()
xml_dir = _params[12].format(os.getenv('username'))
csv_dir = _params[13].format(os.getenv('username'))

# All subsequent relative file access happens inside the XML folder.
# NOTE(review): chdir at import time affects the whole process — confirm
# nothing else relies on the previous working directory.
os.chdir(xml_dir)

# Rows accumulated by parse_and_save() across all parsed XML files.
data_lst = []


def parse_and_save():
    for xml_file in os.listdir(xml_dir):
        if xml_file.endswith(".xml"):
            tree = ET.parse(xml_file)
            root = tree.getroot()

            namespace = {
                'xmlns':
                'https://www.abbyy.com/ReceiptCaptureSDK_xml/ReceiptCapture-1.1.xsd'
            }

            for element in root.findall('xmlns:receipt', namespace):
                #print (element.tag, element.attrib)
                try: