def perform_ocr(self):
    while True:
        input_files = glob.glob(os.path.join(INPUT_DIR, "*.*"))
        for pdf_path in input_files:
            try:
                pdf_name = ntpath.basename(pdf_path)
                extension = pdf_name[pdf_name.rfind(".") + 1:]
                if extension != "pdf":
                    continue
                if pdf_name not in self.processed_files:
                    print(f"[INFO] {pdf_name} processing...")
                    extracted_info = self.pdf_extractor.main(pdf_path=pdf_path)
                    output_pdf_path = self.pdf_creator.repopulate_pdf(info=extracted_info,
                                                                      pdf_name=pdf_name)
                    self.s3_manager.upload_files(file_path=output_pdf_path)
                    self.processed_files.append(pdf_name)
            except Exception as e:
                log_print(e)
        # Persist the processed-file list, one name per line, after each pass.
        content = "\n".join(self.processed_files)
        save_file(content=content, filename=PROCESSED_FILE, method='w')
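# save_file() is the repo's own helper from utils.folder_file_manager (see the
# imports in the __main__ script at the end of this section). Its implementation
# is not shown here; a minimal sketch consistent with how it is called
# (content, filename, and an open() mode such as 'w' or 'a') might look like
# this -- an assumption, not the actual utility:
def save_file(content, filename, method):
    # 'method' is the open() mode: 'w' to overwrite, 'a' to append.
    with open(filename, method) as f:
        f.write(content)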
def create_corpus():
    text_preprocessor = TextPreprocessor()
    keyword_df = pd.read_csv(KEYWORD_CSV_FILE_PATH)
    keywords = keyword_df["Keyword"].values.tolist()
    word_freq = {}
    for keyword in keywords:
        token_words = text_preprocessor.tokenize_word(sample=keyword)
        for t_word in token_words:
            # Skip empty and non-alphabetic tokens.
            if t_word == "" or not t_word.isalpha():
                continue
            word_freq[t_word] = word_freq.get(t_word, 0) + 1
    save_file(filename=CORPUS_PATH, content=json.dumps(word_freq, indent=4), method="w")
    print(f"[INFO] Successfully saved corpus in {CORPUS_PATH}")
def extract_ocr_local(self, frame_path):
    file_name = ntpath.basename(frame_path).replace(".jpg", "")
    # Read document content
    with open(frame_path, 'rb') as document:
        image_bytes = bytearray(document.read())
    # Call Amazon Textract
    response = self.textract.detect_document_text(Document={'Bytes': image_bytes})
    json_file_path = os.path.join(CUR_DIR, 'test_json', f"temp_{file_name}.json")
    save_file(filename=json_file_path, content=json.dumps(response, indent=4), method="w")
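# self.textract is presumably a boto3 Textract client created elsewhere in the
# class (e.g. in __init__). A minimal sketch of that wiring -- the region name
# is a placeholder assumption:
import boto3

textract = boto3.client('textract', region_name='us-east-1')
# detect_document_text() accepts raw bytes for single-page images, as used above.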
def process_ocr_text(frame_path):
    container = ""
    temp_paths = separate_frame_by_size(f_path=frame_path)
    for i, t_path in enumerate(temp_paths):
        image_ocr_json = google_ocr.detect_text(img_path=t_path)
        if LOCAL:
            json_file_path = os.path.join(
                CUR_DIR, 'temp',
                "temp_{}_{}.json".format(ntpath.basename(frame_path).replace(".jpg", ""),
                                         ntpath.basename(t_path).replace(".jpg", "")))
            save_file(filename=json_file_path, content=json.dumps(image_ocr_json), method="w")
        container += extract_text_from_json(json_content_=image_ocr_json, path_=t_path,
                                            part_idx=i) + "\n"
    return container
def process_ocr_text(self, frame_path, file_name):
    sharpen_frame_path = convert_image_color(frame_path=frame_path, file_name=file_name)
    image_ocr_json = self.google_ocr.detect_text(path=sharpen_frame_path)
    if LOCAL:
        json_file_path = os.path.join(CUR_DIR, 'temp', "temp_{}.json".format(file_name))
        save_file(filename=json_file_path, content=json.dumps(image_ocr_json), method="w")
    self.extract_whole_info(frame_path=sharpen_frame_path, json_data=image_ocr_json)
    save_path = import_info_into_excel(info=self.info, file_name=file_name)
    return save_path
def process_ocr_text(frame_path):
    image_ocr_json = google_ocr.detect_text(img_path=frame_path)
    if LOCAL:
        json_file_path = os.path.join(CUR_DIR, 'temp', "temp.json")
        save_file(filename=json_file_path, content=json.dumps(image_ocr_json), method="w")
    content = extract_table_content(json_content=image_ocr_json, frame_path=frame_path)
    # Serialize the row/column dictionary as quoted, comma-separated lines.
    table_text = ""
    for row_id in content.keys():
        for col_id in content[row_id].keys():
            table_text += "'" + content[row_id][col_id] + "'" + ","
        table_text += "\n"
    return table_text
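# For illustration: with content = {0: {0: "Qty", 1: "Price"}, 1: {0: "2", 1: "9.50"}}
# the function above returns the quoted, comma-separated text
#   'Qty','Price',
#   '2','9.50',
# i.e. one line per table row, with a trailing comma after each cell.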
def calculate_optimum_threshold(f_array, t_array, thresh_array):
    # Pick the ROC point whose false-positive rate is closest to OPT_THRESH
    # without exceeding it.
    opt_index = None
    diff = float("inf")  # so the first qualifying FPR value can also be selected
    for i, f_value in enumerate(f_array):
        if f_value > OPT_THRESH:
            continue
        if abs(f_value - OPT_THRESH) < diff:
            opt_index = i
            diff = abs(f_value - OPT_THRESH)
    if opt_index is not None:
        opt_fpr = f_array[opt_index]
        opt_thresh = thresh_array[opt_index]
        opt_tpr = t_array[opt_index]
        print("Optimum Threshold for test data: ", opt_thresh)
        print("test_data FPR: ", opt_fpr)
        print("test_data TPR: ", opt_tpr)
        save_file(content=str(opt_thresh), filename=OPT_THRESH_PATH, method='w')
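# The fpr/tpr/threshold arrays are not built in this module; a plausible way to
# produce them is sklearn's roc_curve. This is a sketch under that assumption,
# with dummy labels and scores standing in for the project's real data:
from sklearn.metrics import roc_curve

y_true = [0, 0, 1, 1]            # dummy ground-truth labels, for illustration
y_score = [0.1, 0.4, 0.35, 0.8]  # dummy model scores, for illustration
fpr, tpr, thresholds = roc_curve(y_true, y_score)
calculate_optimum_threshold(f_array=fpr, t_array=tpr, thresh_array=thresholds)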
def download_files(invoice_name_str):
    if g.user:
        total_invoice_info = {}
        # The name string carries a trailing comma, hence the [:-1] slice.
        invoices = invoice_name_str.split(",")[:-1]
        for f_invoice in invoices:
            f_invoice_info = db_manager.get_invoice_info(file_name=f_invoice)
            total_invoice_info[f_invoice] = {
                "Barcode": f_invoice_info[0],
                "Lieferschein_Nr": f_invoice_info[1],
                "DTS_Date": f_invoice_info[2],
                "DTS_Time": f_invoice_info[3],
                "Gewicht": f_invoice_info[4],
                "Volume": f_invoice_info[5],
                "Fuhre": f_invoice_info[6]
            }
        output_file_path = os.path.join(OUTPUT_DIR, 'result.json')
        save_file(filename=output_file_path,
                  content=json.dumps(total_invoice_info, indent=4), method="w")
        return send_file(output_file_path, as_attachment=True)
    else:
        return render_template('login.html')
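# This view relies on Flask (g, send_file, render_template) and is presumably
# registered as a route elsewhere; a sketch of that wiring, with the URL rule
# being an assumption:
# @app.route('/download/<invoice_name_str>')
# def download_files(invoice_name_str): ...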
def detect_text(self, img_path, file_name, dir_name=None):
    """
    Uses the Vision API to detect text in the given file.
    """
    request_list = []
    feature_type = 'DOCUMENT_TEXT_DETECTION'
    with open(img_path, 'rb') as img_file:
        content_json_obj = {
            'content': base64.b64encode(img_file.read()).decode('UTF-8')
        }
    feature_json_obj = [{'type': feature_type}]
    request_list.append({
        'image': content_json_obj,
        'features': feature_json_obj
    })
    request = self.service.images().annotate(body={'requests': request_list})
    try:
        response = request.execute()
        ret_json = response['responses'][0]
        if LOCAL:
            json_file_path = os.path.join(CUR_DIR, 'test_json',
                                          f"temp_{dir_name}_{file_name}.json")
            save_file(filename=json_file_path, content=json.dumps(ret_json), method="w")
        return ret_json
    except Exception as e:
        # Covers request failures and missing 'responses' keys alike.
        print("Vision API error: %s" % e)
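# self.service here matches the legacy Google API discovery client (note the
# images().annotate(...) call) rather than the newer google-cloud-vision
# library. A sketch of how it is likely constructed -- the API key is a
# placeholder assumption:
from googleapiclient.discovery import build

service = build('vision', 'v1', developerKey='YOUR_API_KEY')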
def split_dataset_darknet():
    img_dir = os.path.join(CUR_DIR, 'darknet', 'custom_data', 'images')
    image_paths = glob.glob(os.path.join(img_dir, "*.jpg"))
    xml_paths = glob.glob(os.path.join(CUR_DIR, 'training_dataset', 'xml', '*.txt'))
    image_indices = []
    for image_path in image_paths:
        _, index = get_index_from_file_path(path=image_path)
        if index != "":
            image_indices.append(index)
    for xml_path in xml_paths:
        xml_name, xml_index = get_index_from_file_path(path=xml_path)
        if xml_index in image_indices:
            xml_content = load_text(filename=xml_path)
            # Remap class id 15 to 0 in the darknet label file.
            xml_content = xml_content.replace("15", "0")
            new_file_path = os.path.join(CUR_DIR, 'darknet', 'custom_data', 'images', xml_name)
            save_file(content=xml_content, filename=new_file_path, method='w')
    # 80/20 train/test split over the shuffled image indices.
    shuffle(image_indices)
    train_index = int(0.8 * len(image_indices))
    training_indices, test_indices = image_indices[:train_index], image_indices[train_index:]
    for idx in training_indices:
        path = "custom_data/images/image_{}.jpg".format(idx) + "\n"
        save_file(content=path,
                  filename=os.path.join(CUR_DIR, 'darknet', 'custom_data', 'train.txt'),
                  method='a')
    for idx in test_indices:
        path = "custom_data/images/image_{}.jpg".format(idx) + "\n"
        save_file(content=path,
                  filename=os.path.join(CUR_DIR, 'darknet', 'custom_data', 'test.txt'),
                  method='a')
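# For reference: a darknet label file holds one object per line in the form
#   <class_id> <x_center> <y_center> <width> <height>
# e.g. "15 0.512 0.430 0.210 0.180". The replace("15", "0") above remaps class
# id 15 to 0, but note it would also touch any coordinate digits containing
# "15"; a token-wise rewrite of the first field per line would be safer.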
processed_file_names = []
for processed_f_path in processed_files:
    f_name = extract_file_name(file_path=processed_f_path)
    processed_file_names.append(f_name)
total_lens = len(input_image_path)
for i, path in enumerate(input_image_path):
    file_name = ntpath.basename(path).replace(".jpg", "")
    if file_name in processed_file_names:
        continue
    print("Process {}-({} / {})".format(path, i + 1, total_lens))
    try:
        frame_content = process_ocr_text(frame_path=path)
        txt_file_path = os.path.join(OUTPUT_DIR, "{}.txt".format(file_name))
        save_file(content=frame_content, filename=txt_file_path, method='w')
        log_print(info_str=path + "\n" + "Successfully processed")
        print("Successfully processed {}".format(path))
    except Exception as e:
        log_print(info_str=path)
        log_print(info_str=e)
# Clean up intermediate frame images.
for jpg_path in glob.glob(os.path.join(OUTPUT_DIR, "*.jpg")):
    os.remove(jpg_path)
def extract_page_info(self, pdf_page_frame_path, file_name=None, index=None, ocr_result=None):
    def clean(value):
        # Strip punctuation that the OCR engine tends to attach to field values.
        return str(value).replace(",", "").replace(".", "").replace(":", "")

    cached_json_path = os.path.join(CUR_DIR, 'test_json', f"temp_{file_name}_{index}.json")
    if os.path.exists(cached_json_path):
        with open(cached_json_path) as f:
            ocr_result = json.load(f)
    if ocr_result is None:
        ocr_result = self.ocr_tool.detect_text(img_path=pdf_page_frame_path)
        if LOCAL:
            save_file(filename=cached_json_path, content=json.dumps(ocr_result), method="w")
    pdf_page_frame = cv2.imread(pdf_page_frame_path)
    height, width = pdf_page_frame.shape[:2]
    # OCR only the top eighth of the page to classify it by its title.
    page_title_frame = pdf_page_frame[:int(height / 8), :]
    title_frame_path = os.path.join(CUR_DIR, 'title.jpg')
    cv2.imwrite(title_frame_path, page_title_frame)
    title_ocr = self.ocr_tool.detect_text(img_path=title_frame_path)
    title_json = title_ocr["textAnnotations"][1:]
    needed_ocr = ocr_result["textAnnotations"][1:]
    self.box_row_lines, self.box_col_lines = extract_box_lines(frame_path=pdf_page_frame_path,
                                                               json_info=needed_ocr)
    for pdf_json in title_json:
        if pdf_json["description"].lower() == "ncic":
            if not self.report_page_ret:
                report_number, crash_date_time, crash_severity, county, ncic, agency, units, unit_error = \
                    self.extract_report_page(json_info=needed_ocr, frame_path=pdf_page_frame_path)
                county = str(county).replace(",", "").replace(".", "")
                if len(county) > 2:
                    county = county[:2]
                self.pdf_info["report"]["report_number"] = clean(report_number)
                self.pdf_info["report"]["crash_date_time"] = clean(crash_date_time)
                self.pdf_info["report"]["county"] = county
                self.pdf_info["report"]["agency_ncic"] = clean(ncic)
                self.pdf_info["report"]["agency"] = clean(agency)
                self.pdf_info["report"]["number_of_unit"] = clean(units)
                self.pdf_info["report"]["unit_in_error"] = clean(unit_error)
                self.pdf_info["report"]["crash_severity"] = clean(crash_severity)
                self.report_page_ret = True
            break
        elif pdf_json["description"].lower() == "unit" and \
                abs(pdf_json["boundingPoly"]["vertices"][0]["y"] -
                    pdf_json["boundingPoly"]["vertices"][3]["y"]) > 30:
            unit_number, owner_name, owner_address, owner_phone, occupants, damage_scale, \
                insurance_company, policy_number, year, make, model, unit_type = \
                self.extract_unit_page(json_info=needed_ocr, frame_path=pdf_page_frame_path)
            temp_dict = {
                "unit_number": clean(unit_number),
                "owner_name": clean(owner_name),
                "owner_address": clean(owner_address),
                "owner_phone": clean(owner_phone),
                "number_of_occupants": clean(occupants),
                "damage_scale": clean(damage_scale),
                "insurance_company": clean(insurance_company),
                "policy_number": clean(policy_number),
                "year": clean(year)[-4:],
                "make": clean(make),
                "model": clean(model),
                "unit_type": clean(unit_type),
            }
            # Skip pages where every extracted field came back empty.
            if any(value != "" for value in temp_dict.values()):
                self.pdf_info["unit"].append(temp_dict)
            break
        elif pdf_json["description"].lower() in ("motorist", "occupant"):
            info = self.extract_motorist_occupant_page(json_info=needed_ocr,
                                                       frame_path=pdf_page_frame_path)
            if pdf_json["description"].lower() == "motorist":
                self.pdf_info["motorist"] = info
            else:
                self.pdf_info["occupant"] = info
            break
    os.remove(title_frame_path)
    return self.pdf_info
from src.ocr.ocr_text import process_ocr_text
from utils.folder_file_manager import save_file
from settings import RESULT_FILE_PATH, INPUT_IMAGE_PATH

if __name__ == '__main__':
    result = process_ocr_text(frame_path=INPUT_IMAGE_PATH)
    save_file(content=result, filename=RESULT_FILE_PATH, method='w')
    print("Successfully saved in {}".format(RESULT_FILE_PATH))