import os
import glob
import random
import argparse
from tkinter import messagebox

import pdf2image
import simplejson
from tqdm import tqdm

# FIELDS, FIELD_TYPES and util are project-local (FIELDS maps field names to their
# types, util provides n-gram extraction and value normalisation). The exact import
# paths depend on the package layout and are assumed here, e.g.:
# from invoicenet import FIELDS, FIELD_TYPES
# from invoicenet.acp import util


def load_file(path):
    """Convert the first page of a PDF into the in-memory dict used at prediction time."""
    image = pdf2image.convert_from_path(path)[0]
    height = image.size[1]
    width = image.size[0]

    # Extract n-grams and normalize any amount/date parses they carry.
    ngrams = util.create_ngrams(image)
    for ngram in ngrams:
        if "amount" in ngram["parses"]:
            ngram["parses"]["amount"] = util.normalize(ngram["parses"]["amount"], key="amount")
        if "date" in ngram["parses"]:
            ngram["parses"]["date"] = util.normalize(ngram["parses"]["date"], key="date")

    # No ground-truth labels at prediction time, so every field gets a placeholder value.
    fields = {field: '0' for field in FIELDS}
    page = {
        "fields": fields,
        "nGrams": ngrams,
        "height": height,
        "width": width,
        "filename": path
    }
    return {'image': image, 'page': page}
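# Hedged usage sketch (illustration only, not part of the original source):
# loads one invoice PDF the way the prediction path would; 'invoices/sample.pdf'
# is a hypothetical path.
def _demo_load_file(path='invoices/sample.pdf'):
    sample = load_file(path)
    page = sample['page']
    print("{}: {}x{} px, {} n-grams".format(
        page['filename'], page['width'], page['height'], len(page['nGrams'])))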
def process_file(filename, out_dir, phase):
    """Render a labelled PDF to PNG, build its n-gram/field JSON, and write both to out_dir/phase."""
    try:
        page = pdf2image.convert_from_path(filename)[0]
        page.save(os.path.join(out_dir, phase, os.path.basename(filename)[:-3] + 'png'))

        height = page.size[1]
        width = page.size[0]

        ngrams = util.create_ngrams(page)
        for ngram in ngrams:
            if "amount" in ngram["parses"]:
                ngram["parses"]["amount"] = util.normalize(ngram["parses"]["amount"], key="amount")
            if "date" in ngram["parses"]:
                ngram["parses"]["date"] = util.normalize(ngram["parses"]["date"], key="date")

        # Ground-truth labels are expected in a JSON file next to the PDF.
        with open(filename[:-3] + 'json', 'r') as fp:
            labels = simplejson.loads(fp.read())

        fields = {}
        for field in FIELDS:
            if field in labels:
                if FIELDS[field] == FIELD_TYPES["amount"]:
                    fields[field] = util.normalize(labels[field], key="amount")
                elif FIELDS[field] == FIELD_TYPES["date"]:
                    fields[field] = util.normalize(labels[field], key="date")
                else:
                    fields[field] = labels[field]
            else:
                fields[field] = ''

        data = {
            "fields": fields,
            "nGrams": ngrams,
            "height": height,
            "width": width,
            "filename": os.path.abspath(
                os.path.join(out_dir, phase, os.path.basename(filename)[:-3] + 'png'))
        }

        with open(os.path.join(out_dir, phase,
                               os.path.basename(filename)[:-3] + 'json'), 'w') as fp:
            fp.write(simplejson.dumps(data, indent=2))
        return True
    except Exception as exp:
        print("Skipping {} : {}".format(filename, exp))
        return False
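# Hedged usage sketch (illustration only, not part of the original source):
# drives process_file() over a folder of labelled PDFs; 'invoices/' and the
# 'train' phase are assumptions made for the example.
def _demo_prepare_folder(data_dir='invoices/', out_dir='processed_data/', phase='train'):
    os.makedirs(os.path.join(out_dir, phase), exist_ok=True)
    pdfs = glob.glob(os.path.join(data_dir, '**', '*.pdf'), recursive=True)
    prepared = sum(process_file(f, out_dir, phase) for f in pdfs)
    print("Prepared {}/{} documents".format(prepared, len(pdfs)))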
def _prepare_data(self): self._get_inputs() if self.args["data_dir"] == '': messagebox.showerror("Error", "No files were selected!") return if not os.path.exists(self.args["data_dir"]): messagebox.showerror("Error", "No files were selected!") return self.progressbar["value"] = 0 self.progress_label.configure(text="Preparing Data:") data_dir = os.path.join(self.args["prepared_data"], 'predict') os.makedirs(data_dir, exist_ok=True) filenames = [os.path.abspath(f) for f in glob.glob(data_dir + "**/*.json", recursive=True)] filenames += [os.path.abspath(f) for f in glob.glob(data_dir + "**/*.png", recursive=True)] for f in filenames: os.remove(f) filenames = [] if self.args["data_dir"] and os.path.exists(self.args["data_dir"]): filenames = [os.path.abspath(f) for f in glob.glob(self.args["data_dir"] + "**/*.pdf", recursive=True)] if self.args["data_file"] and os.path.exists(self.args["data_file"]): filenames += [self.args["data_file"]] self.logger.log("Total: {}".format(len(filenames))) self.logger.log("Preparing data for extraction...") total_samples = len(filenames) sample_idx = 0 for filename in tqdm(filenames): try: page = pdf2image.convert_from_path(filename)[0] page.save(os.path.join(data_dir, os.path.basename(filename)[:-3] + 'png')) height = page.size[1] width = page.size[0] ngrams = util.create_ngrams(page) for ngram in ngrams: if "amount" in ngram["parses"]: ngram["parses"]["amount"] = util.normalize(ngram["parses"]["amount"], key="amount") if "date" in ngram["parses"]: ngram["parses"]["date"] = util.normalize(ngram["parses"]["date"], key="date") fields = {field: '0' for field in FIELDS} data = { "fields": fields, "nGrams": ngrams, "height": height, "width": width, "filename": os.path.abspath(os.path.join(data_dir, os.path.basename(filename)[:-3] + 'png')) } with open(os.path.join(data_dir, os.path.basename(filename)[:-3] + 'json'), 'w') as fp: fp.write(simplejson.dumps(data, indent=2)) except Exception as exp: self.logger.log("Skipping {} : {}".format(filename, exp)) sample_idx += 1 self.progress_label.configure(text="Preparing data [{}/{}]:".format(sample_idx, total_samples)) self.progressbar["value"] = (sample_idx / total_samples) * 100 self.progressbar.update() self.progress_label.configure(text="Completed!") self.progressbar["value"] = 100 self.progressbar.update() self.logger.log("Prepared data stored in '{}'".format(data_dir))
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--data_dir", type=str, required=True,
                    help="path to directory containing invoice document images")
    ap.add_argument("--out_dir", type=str, default='processed_data/',
                    help="path to save prepared data")
    ap.add_argument("--val_size", type=float, default=0.2,
                    help="validation split ratio")
    args = ap.parse_args()

    os.makedirs(os.path.join(args.out_dir, 'train'), exist_ok=True)
    os.makedirs(os.path.join(args.out_dir, 'val'), exist_ok=True)

    # filenames = [os.path.abspath(f) for f in glob.glob(args.data_dir + "**/*.pdf", recursive=True)]
    filenames = [
        os.path.abspath(item)
        for sublist in [glob.glob(args.data_dir + ext, recursive=True)
                        for ext in ["**/*.pdf", "**/*.jpg", "**/*.png"]]
        for item in sublist
    ]

    idx = int(len(filenames) * args.val_size)
    train_files = filenames[idx:]
    val_files = filenames[:idx]

    print("Total: {}".format(len(filenames)))
    print("Training: {}".format(len(train_files)))
    print("Validation: {}".format(len(val_files)))

    for phase, filenames in [('train', train_files), ('val', val_files)]:
        print("Preparing {} data...".format(phase))
        for filename in tqdm(filenames):
            try:
                # PDF path: render the first page at 500 DPI.
                page = pdf2image.convert_from_path(filename, dpi=500)[0]
                page.save(os.path.join(args.out_dir, phase,
                                       os.path.basename(filename)[:-3] + 'png'))

                height = page.size[1]
                width = page.size[0]

                ngrams = util.create_ngrams(page)
                for ngram in ngrams:
                    if "amount" in ngram["parses"]:
                        ngram["parses"]["amount"] = util.normalize(ngram["parses"]["amount"], key="amount")
                    if "date" in ngram["parses"]:
                        ngram["parses"]["date"] = util.normalize(ngram["parses"]["date"], key="date")

                with open(filename[:-3] + 'json', 'r') as fp:
                    labels = simplejson.loads(fp.read())

                fields = {}
                for field in FIELDS:
                    if field in labels:
                        if FIELDS[field] == FIELD_TYPES["amount"]:
                            fields[field] = util.normalize(labels[field], key="amount")
                        elif FIELDS[field] == FIELD_TYPES["date"]:
                            fields[field] = util.normalize(labels[field], key="date")
                        else:
                            fields[field] = labels[field]
                    else:
                        fields[field] = ''

                data = {
                    "fields": fields,
                    "nGrams": ngrams,
                    "height": height,
                    "width": width,
                    "filename": os.path.abspath(
                        os.path.join(args.out_dir, phase, os.path.basename(filename)[:-3] + 'png'))
                }

                with open(os.path.join(args.out_dir, phase,
                                       os.path.basename(filename)[:-3] + 'json'), 'w') as fp:
                    fp.write(simplejson.dumps(data, indent=2))

            except Exception:
                # pdf2image fails on plain image files, so fall back to OpenCV for png/jpg inputs.
                if "png" in filename or "jpg" in filename:
                    import cv2
                    page = cv2.imread(filename)
                    cv2.imwrite(os.path.join(args.out_dir, phase,
                                             os.path.basename(filename)[:-3] + 'png'), page)

                    height = page.shape[0]
                    width = page.shape[1]

                    ngrams = util.create_ngrams(page, height, width)
                    for ngram in ngrams:
                        if "amount" in ngram["parses"]:
                            ngram["parses"]["amount"] = util.normalize(ngram["parses"]["amount"], key="amount")
                        if "date" in ngram["parses"]:
                            ngram["parses"]["date"] = util.normalize(ngram["parses"]["date"], key="date")

                    with open(filename[:-3] + 'json', 'r') as fp:
                        labels = simplejson.loads(fp.read())

                    fields = {}
                    for field in FIELDS:
                        if field in labels:
                            if FIELDS[field] == FIELD_TYPES["amount"]:
                                fields[field] = util.normalize(labels[field], key="amount")
                            elif FIELDS[field] == FIELD_TYPES["date"]:
                                fields[field] = util.normalize(labels[field], key="date")
                            else:
                                fields[field] = labels[field]
                        else:
                            fields[field] = ''

                    data = {
                        "fields": fields,
                        "nGrams": ngrams,
                        "height": height,
                        "width": width,
                        "filename": os.path.abspath(
                            os.path.join(args.out_dir, phase,
                                         os.path.basename(filename)[:-3] + 'png'))
                    }

                    with open(os.path.join(args.out_dir, phase,
                                           os.path.basename(filename)[:-3] + 'json'), 'w') as fp:
                        fp.write(simplejson.dumps(data, indent=2))
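# Entry point guard (assumed; not shown in the excerpt above). The script would
# typically be run directly, e.g.:
#   python prepare_data.py --data_dir data/ --out_dir processed_data/ --val_size 0.2
# where 'prepare_data.py' is a hypothetical filename for this module.
if __name__ == '__main__':
    main()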
def _prepare_data(self): self._get_inputs() if self.args["data_dir"] == '': messagebox.showerror("Error", "Data folder does not exist!") return if not os.path.exists(self.args["data_dir"]): messagebox.showerror("Error", "Data folder does not exist!") return self.progressbar["value"] = 0 self.progress_label.configure(text="Preparing Data:") os.makedirs(os.path.join(self.args["prepared_data"], 'train'), exist_ok=True) os.makedirs(os.path.join(self.args["prepared_data"], 'val'), exist_ok=True) filenames = [ os.path.abspath(f) for f in glob.glob(self.args["data_dir"] + "**/*.pdf", recursive=True) ] random.shuffle(filenames) idx = int(len(filenames) * 0.2) train_files = filenames[idx:] val_files = filenames[:idx] self.logger.log("Total: {}".format(len(filenames))) self.logger.log("Training: {}".format(len(train_files))) self.logger.log("Validation: {}".format(len(val_files))) total_samples = len(filenames) sample_idx = 0 for phase, filenames in [('train', train_files), ('val', val_files)]: self.logger.log("Preparing {} data...".format(phase)) for filename in tqdm(filenames): # try: page = pdf2image.convert_from_path(filename)[0] page.save( os.path.join(self.args["prepared_data"], phase, os.path.basename(filename)[:-3] + 'png')) height = page.size[1] width = page.size[0] ngrams = util.create_ngrams(page) for ngram in ngrams: if "amount" in ngram["parses"]: ngram["parses"]["amount"] = util.normalize( ngram["parses"]["amount"], key="amount") if "date" in ngram["parses"]: ngram["parses"]["date"] = util.normalize( ngram["parses"]["date"], key="date") with open(filename[:-3] + 'json', 'r') as fp: labels = simplejson.loads(fp.read()) fields = {} for field in FIELDS: if field in labels: if FIELDS[field] == FIELD_TYPES["amount"]: fields[field] = util.normalize(labels[field], key="amount") elif FIELDS[field] == FIELD_TYPES["date"]: fields[field] = util.normalize(labels[field], key="date") else: fields[field] = labels[field] else: fields[field] = '' data = { "fields": fields, "nGrams": ngrams, "height": height, "width": width, "filename": os.path.abspath( os.path.join(self.args["prepared_data"], phase, os.path.basename(filename)[:-3] + 'png')) } with open( os.path.join(self.args["prepared_data"], phase, os.path.basename(filename)[:-3] + 'json'), 'w') as fp: fp.write(simplejson.dumps(data, indent=2)) # except Exception as exp: # self.logger.log("Skipping {} : {}".format(filename, exp)) sample_idx += 1 self.progress_label.configure( text="Preparing data [{}/{}]:".format( sample_idx, total_samples)) self.progressbar["value"] = (sample_idx / total_samples) * 100 self.progressbar.update() self.progress_label.configure(text="Completed!") self.progressbar["value"] = 100 self.progressbar.update() self.logger.log("Prepared data stored in '{}'".format( self.args["prepared_data"]))