class ProcessingSettings:
    """Recognition options forwarded as URL parameters to the ABBYY
    Cloud OCR receipt API (see AbbyyOnlineSdk.process_image)."""

    # Read and parse the config file ONCE; the original called
    # read_config.get_parameter_values() six times, re-parsing the
    # config for every attribute.
    _params = read_config.get_parameter_values()

    #Language = "English,Japanese"
    #OutputFormat = "docx"
    Country = str(_params[0]).replace(" ", "")          # e.g. "Japan" (spaces stripped)
    ImageSource = str(_params[1])
    correctOrientation = str(_params[2])
    correctSkew = str(_params[3])
    ExtendedCharacterInfo = str(_params[4])
    fieldRegionExportMode = str(_params[5])
# Performs cleanup on the working folder by moving the processed files
# to a folder named with the current timestamp.
import glob, os
import shutil
import time

import read_config

# Parse the config once (the original re-parsed it for every path) and
# expand each path template with the current Windows username.
_params = read_config.get_parameter_values()
_user = os.getenv('username')

timestamped_dir = _params[11].format(_user, time.strftime("%Y%m%d-%H%M%S"))
converted_pdf = _params[14].format(_user)
output_XML = _params[12].format(_user)
scanned_PDFs = _params[10].format(_user)
separated_receipts = _params[9].format(_user)

# Create the archive folder. BUG FIX: os.mkdir raised FileExistsError on a
# re-run within the same second and failed when parent folders were missing;
# makedirs(exist_ok=True) tolerates both.
os.makedirs(timestamped_dir, exist_ok=True)


def cleanup():
    """Move every file matching the four working-folder globs into
    timestamped_dir.

    BUG FIX: the original passed copy_function=shutil.copytree to
    shutil.move; copytree copies directory trees, not files, so any
    cross-device move of a plain file would fail. The default copy
    function (shutil.copy2) is correct for both cases.
    """
    path_ls = [converted_pdf, output_XML, scanned_PDFs, separated_receipts]
    for path in path_ls:
        for file in glob.glob(path):
            shutil.move(file, timestamped_dir)
class AbbyyOnlineSdk:
    """Thin HTTP client for the ABBYY Cloud OCR SDK v2 receipt API."""

    # Warning! http:// is for easier out-of-the-box usage of the sample only.
    # Change to https:// for production use. Use http://cloud-westus.ocrsdk.com
    # if the application was created in the US location.
    ServerUrl = str(read_config.get_parameter_values()[6])

    # To create an application and obtain a password,
    # register at https://cloud.ocrsdk.com/Account/Register
    # More info on getting your application id and password at
    # https://ocrsdk.com/documentation/faq/#faq3
    ApplicationId = str(read_config.get_parameter_values()[7])
    Password = str(read_config.get_parameter_values()[8])
    Proxies = {}

    def process_image(self, file_path, settings):
        """POST the image at file_path to v2/processReceipt and return the
        created Task (parsed from the JSON response).

        Raises requests.HTTPError on any non-2xx response.
        """
        url_params = {
            "country": settings.Country,
            "ImageSource": settings.ImageSource,
            "correctOrientation": settings.correctOrientation,
            "correctSkew": settings.correctSkew,
            "xml:writeExtendedCharacterInfo": settings.ExtendedCharacterInfo,
            "xml:fieldRegionExportMode": settings.fieldRegionExportMode,
        }
        request_url = self.get_request_url("v2/processReceipt")
        with open(file_path, 'rb') as image_file:
            image_data = image_file.read()
        response = requests.post(request_url, data=image_data,
                                 params=url_params,
                                 auth=(self.ApplicationId, self.Password),
                                 proxies=self.Proxies)
        # Any response other than HTTP 2xx means error - raise in that case.
        response.raise_for_status()
        # Parse response JSON and extract the task ID.
        return self.decode_response_JSON(response.text)

    def get_task_status(self, task):
        """Poll v2/getTaskStatus for the current status of task.

        Returns an updated Task, or None when a null GUID is passed in.
        """
        if task.Id.find('00000000-0') != -1:
            # GUID_NULL is being passed. This may be caused by a logical
            # error in the calling code.
            print("Null task id passed")
            return None
        url_params = {"taskId": task.Id}
        status_url = self.get_request_url("v2/getTaskStatus")
        response = requests.get(status_url, params=url_params,
                                auth=(self.ApplicationId, self.Password),
                                proxies=self.Proxies)
        # BUG FIX: the original JSON-decoded the body without checking the
        # HTTP status, turning server error pages into confusing JSON
        # decode errors; fail fast like process_image does.
        response.raise_for_status()
        return self.decode_response_JSON(response.text)

    def download_result(self, task, output_path):
        """Stream the recognition result of a completed task to output_path.

        Prints a warning and returns without writing when the task carries
        no download URL.
        """
        get_result_url = task.DownloadUrl
        if get_result_url is None:
            print("No download URL found")
            return
        file_response = requests.get(get_result_url, stream=True,
                                     proxies=self.Proxies)
        with open(output_path, 'wb') as output_file:
            shutil.copyfileobj(file_response.raw, output_file)

    # Original function definition below (from the sample ABBYY code);
    # this parses the XML response of the v1/processReceipt API.
    def decode_response(self, xml_response):
        """Decode xml response of the server. Return Task object."""
        dom = xml.dom.minidom.parseString(xml_response)
        task_node = dom.getElementsByTagName("task")[0]
        task = Task()
        task.Id = task_node.getAttribute("id")
        task.Status = task_node.getAttribute("status")
        if task.Status == "Completed":
            task.DownloadUrl = task_node.getAttribute("resultUrl")
        return task

    # Added function definition for parsing the JSON response of the
    # v2/processReceipt API.
    def decode_response_JSON(self, json_response):
        """Decode JSON response of the server. Return Task object."""
        parsed_json = json.loads(json_response)
        print(parsed_json)
        task = Task()
        task.Id = parsed_json["taskId"]
        task.Status = parsed_json["status"]
        print("Task ID :" + str(task.Id), "Task status: " + str(task.Status))
        if task.Status == "Completed":
            # Only completed tasks expose result URLs; take the first.
            task.DownloadUrl = parsed_json["resultUrls"][0]
            print("Task URL: " + str(task.DownloadUrl))
        return task

    def get_request_url(self, url):
        """Join ServerUrl and url with exactly one '/' between them."""
        return self.ServerUrl.strip('/') + '/' + url.strip('/')
def separate_receipts():
    """Convert scanned PDFs to JPEGs, then cut each page image into
    individual receipt crops saved under the configured output folder.

    Relies on module globals pdf_dir / jpg_dir and the read_config module.
    """

    def save_asJPG():
        # Convert every PDF in pdf_dir to a 300-dpi JPEG in jpg_dir.
        # NOTE(review): multi-page PDFs overwrite the same .jpg per page —
        # only the last page survives; confirm inputs are single-page.
        os.chdir(pdf_dir)
        for pdf_file in os.listdir(pdf_dir):
            if pdf_file.endswith(".pdf"):
                pages = convert_from_path(pdf_file, 300)
                filename, extension = os.path.splitext(pdf_file)
                save_to = os.path.join(jpg_dir, "%s.jpg" % (filename))
                for page in pages:
                    page.save(save_to, "JPEG")

    def trimExcessBorder(im):
        # Crop the uniform scanner border by diffing against the colour of
        # the top-left corner pixel.
        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
        diff = ImageChops.difference(im, bg)
        diff = ImageChops.add(diff, diff, 2.0, -100)
        bbox = diff.getbbox()
        if bbox:
            return im.crop(bbox)
        # BUG FIX: the original fell through and returned None for an
        # all-background image, crashing the caller's .save(); return the
        # image unchanged instead.
        return im

    # This function takes the image, crops the excess border, and processes
    # the image for recognition (up to the morphological closing).
    def processImage(file):
        # BUG FIX: the original ignored its parameter and read the loop
        # variable jpg_file from the enclosing scope — and loaded the same
        # image twice with two identical cv2.imread calls.
        scanned_page = cv2.imread(file)
        scanned_page = scanned_page[10:2440, 30:3480]
        gray = cv2.cvtColor(scanned_page, cv2.COLOR_BGR2GRAY)
        # Second parameter is the kernel size to be convolved.
        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
        canny_edged = cv2.Canny(blurred, 70, 250)
        # Enlarge the detected edges.
        kernel_dilate = np.ones((10, 10), np.uint8)
        dilated = cv2.dilate(canny_edged, kernel_dilate, iterations=1)
        # Close gaps between the white pixels so each receipt is one blob.
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
        closed = cv2.morphologyEx(dilated, cv2.MORPH_CLOSE, kernel_close)
        return scanned_page, closed

    save_asJPG()
    count = 0
    # Hoisted out of the contour loop: the original re-parsed the config
    # file for every single contour of every image.
    save_path = (read_config.get_parameter_values()[9]).format(
        os.getenv('username'))
    for jpg_file in os.listdir(jpg_dir):
        if jpg_file.endswith("jpg"):
            os.chdir(jpg_dir)
            # Crop excess scanner border from the page image, in place.
            trimExcessBorder(Image.open(jpg_file)).save(jpg_file)
            # PERF FIX: the original called processImage twice per file,
            # doing all the OpenCV work twice; unpack a single call.
            scanned_page, closed = processImage(jpg_file)
            (cnts, _) = cv2.findContours(closed.copy(), cv2.RETR_EXTERNAL,
                                         cv2.CHAIN_APPROX_SIMPLE)
            for contour in cnts:
                # (removed: unused arcLength/approxPolyDP computations)
                x, y, w, h = cv2.boundingRect(contour)
                # Only blobs taller than 500 px are treated as receipts.
                if h > 500:
                    individual_receipt = scanned_page[y:y + h, x:x + w]
                    cv2.imwrite(
                        os.path.join(save_path, '{}.jpg'.format(count)),
                        individual_receipt)
                    count = count + 1
                    print(str(count) + " ----- Saved to {}".format(save_path))
#the poppler bin directory has to be added in the environment variable Path. import glob, os import cv2 import time import numpy as np import time import read_config from PIL import Image, ImageChops from pdf2image import convert_from_path #reading the PDF files in the directory, and then converting each to JPEG files pdf_dir = (read_config.get_parameter_values()[10]).format( os.getenv('username')) jpg_dir = (read_config.get_parameter_values()[14]).format( os.getenv('username')) def separate_receipts(): def save_asJPG(): os.chdir(pdf_dir) for pdf_file in os.listdir(pdf_dir): if pdf_file.endswith(".pdf"): pages = convert_from_path(pdf_file, 300) filename, extension = os.path.splitext(pdf_file) save_to = os.path.join(jpg_dir, "%s.jpg" % (filename))
import os import xml.etree.ElementTree as ET import pandas as pd from io import StringIO import time import read_config xml_dir = (read_config.get_parameter_values()[12]).format( os.getenv('username')) csv_dir = (read_config.get_parameter_values()[13]).format( os.getenv('username')) os.chdir(xml_dir) data_lst = [] def parse_and_save(): for xml_file in os.listdir(xml_dir): if xml_file.endswith(".xml"): tree = ET.parse(xml_file) root = tree.getroot() namespace = { 'xmlns': 'https://www.abbyy.com/ReceiptCaptureSDK_xml/ReceiptCapture-1.1.xsd' } for element in root.findall('xmlns:receipt', namespace): #print (element.tag, element.attrib) try: