def digitalpdf_data_extraction(paths, digital_text, scanned):
    """Locate the first page whose extracted text maps to a known content type.

    When the match is not on the first page, the PDF at ``paths["filepath"]``
    is rewritten so that it starts at the matching page.

    Args:
        paths: dict of working paths; ``"filepath"`` and ``"inputpath"`` are read.
        digital_text: per-page extracted text; ``None`` entries are treated as "".
        scanned: truthy when at least one page had no text layer. If truthy and
            the (possibly rewritten) PDF has more than one page,
            ``create_tiff(paths)`` is called.

    Returns:
        ``[content_type, module_name, digital_text]`` where ``digital_text`` is
        trimmed to start at the matching page. ``content_type`` stays
        ``"UNKNOWN"`` and ``module_name`` stays ``""`` when no page matches.
    """
    digital_text = ["" if page_text is None else page_text for page_text in digital_text]
    # Remember the page count before any trimming; it is the correct last_page.
    total_pages = len(digital_text)
    content_type = "UNKNOWN"
    module_name = ""
    for index, page_text in enumerate(digital_text):
        discovered = discover_content_type(page_text)
        logger.info("discovered content type" + discovered[0])
        # Compare by value: `is not` against a str literal tests object
        # identity, which is an implementation detail and not equality.
        if discovered[0] == "UNKNOWN":
            continue
        logger.info("inside not unknown")
        content_type = discovered[0]
        module_name = discovered[1]
        # removing pages that did not have usable data
        digital_text = digital_text[index:]
        if index > 0:
            merger = PdfFileMerger()
            # last_page refers to the source PDF, so it must be the original
            # page count, not the length of the trimmed text list.
            page_images = convert_from_path(
                paths["filepath"], first_page=index + 1, last_page=total_pages)
            temp_pdf = paths["inputpath"] + "/temp.pdf"
            for page_image in page_images:
                page_image.save(temp_pdf)
                merger.append(PdfFileReader(temp_pdf))
                os.remove(temp_pdf)
            merger.write(paths["filepath"])
        # Only the page count is needed, so release the handle right away.
        with pdfplumber.open(paths["filepath"]) as pdf:
            page_count = len(pdf.pages)
        if scanned and page_count > 1:
            create_tiff(paths)
        break
    return [content_type, module_name, digital_text]
def run_ml_api():
    """Handle one upload request: stage the file, validate it, run the pipeline.

    Takes the first entry of ``request.files``, creates a per-request working
    directory under the configured ``api.upload_folder``, determines the file
    type, enforces the size limit and hands over to ``process_forms``.

    Returns:
        ``jsonify`` of the result dict. ``code`` is ``-1`` when no file was sent
        or the file is too large; ``{"code": 1, ...}`` is returned when any
        step raises.
    """
    try:
        x = {}
        upload_folder = config.get('api', 'upload_folder')
        # Not named `path`, so the module-level `path` used elsewhere in this
        # file is not shadowed inside this function.
        work_dir = upload_folder + str(uuid.uuid4())
        if len(request.files) == 0:
            x["code"] = -1
            x["message"] = "no file received"
            return jsonify(x)
        print("identified")
        file = request.files[(list(request.files))[0]]
        paths = init_dir(work_dir)
        doc_type_or_x = file_type(paths, file)
        logger.info(doc_type_or_x)
        print("file type done")
        # file_type() returns a dict when it wants to short-circuit with a response.
        if isinstance(doc_type_or_x, dict):
            return jsonify(doc_type_or_x)
        doc_type = doc_type_or_x
        print("doc type is identified")
        file_size_validity = verify_file_size(paths["filepath"])
        print("bbefore size")
        if not file_size_validity:
            # process_forms() removes the working directories on its own
            # return paths; this branch never reaches it, so clean up here.
            flush_dir(paths)
            x["code"] = -1
            x["message"] = "File size exceeded"
        else:
            params = {"spam_flag": "", "structuredParsing_flag": "",
                      "unstructuredParsing_flag": "", "doc_type": doc_type,
                      "pass_header": ""}
            params = processing_headers(request.headers, params)
            x = process_forms(paths, params)
        logger.info(x)
        return jsonify(x)
    except Exception:
        # Keep the catch-all response, but leave a trace so failures can be diagnosed.
        logger.error(("\nstartTrace::::" + traceback.format_exc().strip() +
                      "::::endTrace").replace("\n", "\n$"))
        # NOTE(review): "suported" is misspelled; left as-is in case clients match on it.
        return jsonify({"code": 1, "message": "file not suported"})
def verify_file_size(filepath):
    """Check the file at *filepath* against the configured upload size limit.

    The limit is read from ``api.max_file_size`` (megabytes) and converted to
    bytes before comparing.

    Returns:
        ``True`` when the file is no larger than the limit, ``False`` otherwise.
    """
    limit_mb = int(config.get('api', 'max_file_size'))
    limit_bytes = limit_mb * 1024 * 1024  # configured in MB, compared in bytes
    actual_bytes = path.getsize(filepath)
    logger.info("Size of the file is " + str(actual_bytes))
    return not actual_bytes > limit_bytes
def flush_dir(paths):
    """Remove every per-request working directory listed in *paths*.

    Directories are removed in a fixed order with ``tmpdirpath`` last; the
    first ``rmtree`` failure propagates and stops the remaining removals.
    """
    logger.info("now cleaning temp directories")
    removal_order = (
        "inputpath",
        "imagepath",
        "deskewpath",
        "ocrpath",
        "cbimagepath",
        "tmpdirpath",
    )
    for key in removal_order:
        rmtree(paths[key])
def decode(password):
    """Decode an encoded password by running the bundled ``RXCODEC.jar``.

    The jar is looked up in the ``extdata`` directory next to this module and
    invoked as ``java -jar <jar> <password>``.

    Args:
        password: the encoded password string.

    Returns:
        The jar's standard output decoded as ASCII with its last character
        removed.

    Raises:
        subprocess.CalledProcessError: the java process exited non-zero.
        OSError: the ``java`` executable could not be started.
        UnicodeDecodeError: the output is not ASCII.
    """
    # Credential material must not reach the log, not even in encoded form,
    # since the same jar turns it back into the plaintext password.
    logger.info("inside decode pass")
    logger.info("init java done: ")
    jar_path = "/".join(
        path.dirname(path.abspath(__file__)).split("/") + ["extdata", "RXCODEC.jar"])
    raw_output = subprocess.check_output(["java", "-jar", jar_path, password])
    # The final character is dropped unconditionally — presumably the newline
    # printed by the jar; TODO confirm.
    return raw_output.decode("ascii")[:-1]
def get_passwd(pass_header):
    """Return the decoded password for *pass_header*, or "" when unavailable.

    NOTE(review): the original text of this function was damaged by a
    ``******`` redaction that swallowed the end of the ``try`` body and the
    ``except`` clause. The handler is reconstructed as ``except Exception``
    to match the trace-logging pattern used elsewhere in this file — confirm
    against version control.

    Args:
        pass_header: the encoded password header; empty or ``None`` means no
            password was supplied.

    Returns:
        The decoded password, or ``""`` when no header was given or decoding
        failed (the failure is logged with its traceback).
    """
    if not pass_header:
        return ""
    try:
        passw = decode(pass_header)
        # Record the outcome only; the decoded password itself must not be logged.
        logger.info("decoded pass: <redacted>")
    except Exception:
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        passw = ""
    return passw
def pdf_server(status, name):
    """HTML-to-PDF service.

    Accepts an HTML file, a URL or an HTML string and returns the compressed
    PDF result (translated from the original docstring).

    Args:
        status: input kind; when it equals ``'file'``, *name* is treated as
            base64-encoded HTML and written to ``Config.html_file`` first.
        name: the payload passed on to ``html_pdf_main``.

    Returns:
        Whatever ``html_pdf_main(status, name)`` returns, or
        ``{'status': 1, 'error': <message>}`` when any step raises.
    """
    try:
        logging.info('get_task')
        if status == 'file':
            html_data = base64.b64decode(name)
            with open(Config.html_file, 'wb') as f:
                f.write(html_data)
        result = html_pdf_main(status, name)
        logging.info('result success')
        return result
    except Exception as e:
        logging.exception('%s', e)
        # Return the message text: an exception instance cannot be serialised
        # to JSON, which is what this result is documented to be.
        return {'status': 1, 'error': str(e)}
def process_json(x):
    """Convert *x* via ``pviJSON`` and prune empty product/event entries.

    For each of the ``"products"`` and ``"events"`` sections that is present,
    entries for which ``count_nan(entry, 0, 0)`` reports equal values at
    positions 1 and 2 are dropped. A section that is empty, or becomes empty
    after pruning, is deleted from the result.

    Returns:
        The converted object with the pruned sections.
    """
    y = pviJSON.__addWithValuesToDictObject__(x)
    for section in ("products", "events"):
        if section not in y.keys():
            continue
        if len(y[section]) >= 1:
            nan_counts = [count_nan(entry, 0, 0) for entry in y[section]]
            y[section] = [
                y[section][position]
                for position, counts in enumerate(nan_counts)
                if counts[1] != counts[2]
            ]
        # Covers both "was empty to begin with" and "pruned down to nothing".
        if len(y[section]) < 1:
            logger.info("no %s exist" % section)
            del y[section]
    return y
def scannedpdf_data_extraction(paths, scan):
    """OCR the document and look for the first page with a known content type.

    Calls ``create_tiff(paths)``, then reads ``<ocrpath>/form.pdf`` page by
    page. When the matching page is page 2 or later, the PDF at
    ``paths["filepath"]`` is rewritten to start at that page.

    Args:
        paths: dict of working paths; ``"ocrpath"``, ``"filepath"`` and
            ``"inputpath"`` are read.
        scan: not used by this function.

    Returns:
        ``[content_type, module_name, complete_ocr_text]``. The text is every
        page read so far, each preceded by a newline; reading stops at the
        first recognised page.
    """
    logger.info("deskew and hocr generator for unknown content type")
    create_tiff(paths)
    content_type = "UNKNOWN"
    module_name = ""
    complete_ocr_text = ""
    with pdfplumber.open(paths["ocrpath"] + "/form.pdf") as pdf:
        for page in pdf.pages:
            # extract_text() can yield None for a page without text; treat it
            # as empty so the concatenation below cannot raise.
            ocr_text = page.extract_text() or ""
            complete_ocr_text = complete_ocr_text + "\n" + ocr_text
            discovered = discover_content_type(ocr_text)
            logger.info("discovered content type " + discovered[0])
            # Value comparison; `is not` on a str literal tests identity only.
            if discovered[0] == "UNKNOWN":
                continue
            content_type = discovered[0]
            module_name = discovered[1]
            logger.info("OCR_text_content_type: " + content_type)
            if page.page_number >= 2:
                merger = PdfFileMerger()
                page_images = convert_from_path(
                    paths["filepath"], first_page=page.page_number)
                out_pdf = paths["inputpath"] + "/out_form.pdf"
                for page_image in page_images:
                    page_image.save(out_pdf)
                    merger.append(PdfFileReader(out_pdf))
                    os.remove(out_pdf)
                merger.write(paths["filepath"])
            break
    return [content_type, module_name, complete_ocr_text]
def validate_password(filepath, passw):
    """Check that the PDF at *filepath* opens with password *passw*.

    Args:
        filepath: path of the PDF to probe.
        passw: password to try; an empty string means no password.

    Returns:
        ``{"code": None, "message": None}`` when the file opens. ``code`` is
        ``6`` when the password is rejected and ``1`` for any other failure,
        each with an explanatory ``message``.
    """
    # The password itself is deliberately kept out of the log.
    logger.info("inside if_passwd_works")
    x = {"code": None, "message": None}
    try:
        # Opening is only a probe, so release the handle immediately.
        pdfplumber.open(filepath, password=passw).close()
    except PDFPasswordIncorrect as err:
        logger.error(err)
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        x["code"] = 6
        x["message"] = "pdf file is password protected. Kindly provide right password."
    except Exception as err:
        logger.error(err)
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        x["code"] = 1
        x["message"] = "Can not open file. Please check password or file type."
    return x
def zip_server():
    """PNG compression service.

    Accepts an uploaded PNG file and returns the compressed PNG result
    (translated from the original docstring).

    Returns:
        ``{'status': 'no task'}`` when no acceptable file was uploaded, or
        ``{'status': 1, 'error': <message>}`` when any step raises.
    """
    try:
        file = request.files['img']  # the uploaded file
        # True when a file was sent and its extension is allowed
        if file and allowed_file(file.filename, Config.PNG_ALLOWED_EXTENSIONS):
            logging.info('get_task' + str(file.filename))
            filename = secure_filename(file.filename)  # sanitised upload name
            file.save(os.path.join(Config.image_save_path, filename))  # store the upload
            logging.info('result success')
            # FIXME(review): `result` is never assigned in this function.
            # Unless a module-level `result` exists, this line raises NameError
            # and every successful upload falls into the error branch below.
            # The compression call that should produce it appears to be missing.
            return result
        else:
            logging.info('result' + 'no task')
            return {'status': 'no task'}
    except Exception as e:
        logging.exception('%s', e)
        # Return the message text: an exception instance cannot be serialised to JSON.
        return {'status': 1, 'error': str(e)}
def init_dir(path):
    """Create the per-request working directory tree rooted at *path*.

    Creation is best-effort: an ``OSError`` from any ``mkdir`` is logged, the
    remaining directories are skipped, and the path mapping is still returned.

    Returns:
        dict with keys ``tmpdirpath``, ``inputpath``, ``ocrpath``,
        ``deskewpath``, ``imagepath`` and ``cbimagepath``.
    """
    logger.info("creating temp directories")
    layout = (
        ("tmpdirpath", ""),
        ("inputpath", "/input"),
        ("ocrpath", "/ocr"),
        ("deskewpath", "/deskew"),
        ("imagepath", "/image"),
        ("cbimagepath", "/cb_images"),
    )
    paths = {key: path + suffix for key, suffix in layout}
    # Parents are listed before their children so plain mkdir() succeeds.
    to_create = (
        paths["tmpdirpath"],
        paths["inputpath"],
        paths["inputpath"] + "/sample_dir",
        paths["imagepath"],
        paths["cbimagepath"],
        paths["deskewpath"],
        paths["ocrpath"],
        paths["ocrpath"] + "/ext_sec_data",
    )
    try:
        for directory in to_create:
            mkdir(directory)
    except OSError:
        logger.info("Creation of the directory %s failed" % path)
        trace = "\nstartTrace::::" + traceback.format_exc().strip() + "::::endTrace"
        logger.critical(trace.replace("\n", "\n$"))
    else:
        logger.info("Successfully created the directory %s " % path)
    return paths
def _has_single_unlicensed_product(y):
    """Return True when *y* holds exactly one product and it has no license value."""
    if "products" not in y.keys():
        return False
    products = y["products"]
    return len(products) == 1 and products[0].license_value is None


def spam_identification(x, y, spam_flag, text, paths):
    """Run spam detection when the parsed result lacks usable products/events.

    Detection is attempted only when ``y`` is missing ``"products"`` or
    ``"events"``, or holds a single product without a license value.

    Args:
        x: the response dict so far; updated in place when spam is found.
        y: the processed parse result (supports ``keys()`` and item access).
        spam_flag: when falsy, detection is skipped and a code-2 response is returned.
        text: the document text handed to the spam detector.
        paths: dict of working paths; ``"ocrpath"`` is used for scratch files.

    Returns:
        *x* unchanged when detection is not needed or the text is not spam;
        a new ``{"code": 2, ...}`` dict when detection is disabled; *x* with
        ``code`` 4, ``message`` ``"spam"`` and ``spam_acc`` set when flagged.
    """
    keys = y.keys()
    if "products" in keys and "events" in keys and not _has_single_unlicensed_product(y):
        return x

    logger.info("no product or event exists")
    if not spam_flag:
        return {
            "code": 2,
            "message": "Spam detection is set False. Form may or may not contain valid case data."
        }

    spam_input = paths["ocrpath"] + "/spam.txt"
    result_json = paths["ocrpath"] + "/result.json"
    # Mode "w" truncates any previous file, so no explicit remove is needed.
    with open(spam_input, "w") as handle:
        handle.write(str(text))
    # The %s placeholder is required; without it logging fails to format the record.
    logger.info("text going in spam is : %s", text)
    py_comm_comp.spam_detect.spam_detector(spam_input, spam_model, result_json)
    with open(result_json, "r") as data:
        t = json.load(data)
    logger.info(t)
    # The helper guards the "products" lookup: this point is also reached when
    # the key is absent, where indexing y["products"] directly would raise.
    if t["label"] == "spam" or _has_single_unlicensed_product(y):
        x["code"] = 4
        x["message"] = "spam"
        try:
            # accuracy rounded to two decimals
            x["spam_acc"] = float("%.2f" % t["label_accu"])
        except (KeyError, TypeError, ValueError):
            x["spam_acc"] = None
    return x
def check_pqc(paths, text):
    """Classify *text* with the PQC/AE detector and derive the category flag.

    Writes *text* to ``<ocrpath>/pqc-ae.txt``, runs the detector and reads its
    result from ``<ocrpath>/result_pqc_ae.json``.

    Args:
        paths: dict of working paths; ``"ocrpath"`` is used for scratch files.
        text: the document text to classify.

    Returns:
        ``(case_category, case_category_accu, category_flag)``:
        the detector label split on ``"-"`` with ``"AE"``/``"MI"`` expanded to
        ``"AE Case"``/``"Medical Inquiry"``; the detector's ``label_accu``;
        and ``True`` when the raw label is exactly ``"MI"`` or
        ``checkMalfunctions`` is truthy for the text.
    """
    py_path = "/".join(
        path.dirname(path.abspath(__file__)).split("/")[:-1] +
        ["unstructured", "extdata"])
    detector_input = paths["ocrpath"] + "/pqc-ae.txt"
    result_json = paths["ocrpath"] + "/result_pqc_ae.json"
    # Mode "w" truncates any previous file, so no explicit remove is needed.
    with open(detector_input, "w") as handle:
        handle.write(str(text))
    # The %s placeholders are required; without them logging fails to format the record.
    logger.info("text going in pqc-ae is : %s", text)
    py_comm_comp.pqc_ae_detect.pqc_ae_detector(detector_input, pqc_ae_model, result_json)
    with open(result_json, "r") as data:
        z = json.load(data)
    logger.info(z)
    case_category = z["label"].split("-")
    case_category_accu = z["label_accu"]
    logger.info("case_category_accu is -------------------- %s", case_category_accu)
    logger.info("case_category value is ")
    logger.info(case_category)
    # Evaluated on the raw labels, before they are expanded to display names.
    category_flag = bool(case_category == ["MI"] or checkMalfunctions(text, py_path))
    logger.info("category flag value is ")
    logger.info(category_flag)
    display_names = {"AE": "AE Case", "MI": "Medical Inquiry"}
    case_category = [display_names.get(label, label) for label in case_category]
    return case_category, case_category_accu, category_flag
def _collect_ocr_text(paths, text):
    """Gather OCR text for every page of the PDF at ``paths["filepath"]``.

    Page 1 text is read from ``<ocrpath>/ocred_text.txt`` (which is then
    deleted); later pages are rendered, OCR'd with tesseract and appended.
    """
    first_page_file = paths["ocrpath"] + "/ocred_text.txt"
    tiff_path = paths["imagepath"] + "/form.tiff"
    with pdfplumber.open(paths["filepath"]) as pdf:
        for page in pdf.pages:
            if page.page_number == 1:
                with open(first_page_file, "rb") as handle:
                    text = handle.read().decode("ASCII")
                os.remove(first_page_file)
                logger.info(text)
                continue
            # do ocr and append text
            # The original concatenated the Page object onto a str here, which
            # raised TypeError for every multi-page document.
            logger.info("unstructured: doing ocr for extended pages, page_num: %s",
                        page.page_number)
            page.to_image(resolution=300).save(tiff_path, format="tiff")
            with Image.open(tiff_path) as page_image:
                page_text = str(pytesseract.image_to_string(page_image))
            with open(paths["ocrpath"] + "/ocred.txt", "w") as handle:
                handle.write(page_text)
            # NOTE(review): the original never appended the OCR result and
            # instead reopened ocred_text.txt, which page 1 had already
            # deleted. Appending follows the intent stated in the comment
            # above — confirm this is the desired text layout.
            text = text + "\n" + page_text
    return text


def unstructured_form_parsing(paths, text, unstructuredParsing_flag, spam_flag):
    """Parse a document of unknown form type through the unstructured API.

    When *text* is blank, OCR text is collected from the PDF first. The text
    is then pre-processed, classified by ``check_pqc``, posted to the
    unstructured-parsing service, and screened by ``spam_identification``.

    Args:
        paths: dict of working paths (``filepath``, ``ocrpath``, ``imagepath``).
        text: document text; blank triggers the OCR path.
        unstructuredParsing_flag: not used by this function.
        spam_flag: forwarded to ``spam_identification``.

    Returns:
        The response dict. The spam/code-2 response from
        ``spam_identification`` is returned as is; otherwise ``code`` is 6 and
        ``message`` ``"Non Form AE Case"``, with ``categories`` added when the
        classifier accuracy is at least 0.98. On any exception a
        ``{"code": 3, ...}`` dict is returned and the traceback is logged.
    """
    logger.info("unstructured parsing in progress")
    x = {"code": None, "message": None, "spam_acc": None}
    try:
        if text.strip() == "":
            text = _collect_ocr_text(paths, text)
        text = process_text(paths, text)
        case_category, case_category_accu, category_flag = check_pqc(paths, text)

        # for unstructured api
        cioms_flag = None
        url_unst1 = config.get('unstructured', 'unstructure_api_url')
        print("----------------------------------------" + url_unst1)
        # NOTE(review): absolute, machine-specific path. The URL read from
        # `config` above is only printed; the one built below is what is used.
        config_path_uns = "/home/ubuntu/pvi-form-engine/structuredForms/py_generic/extdata/config/config.json"
        with open(config_path_uns) as config_file:
            config_json_uns = json.load(config_file)
        url_unst = config_json_uns["base-generic-url"] + ":" + "9888/unstruct/live"
        print("----------------------------------------" + url_unst)
        # process_text() is expected to have produced test.txt — TODO confirm.
        with open(paths["ocrpath"] + "/test.txt", 'rb') as upload:
            response = requests.post(url_unst,
                                     files={'file2': upload},
                                     headers={
                                         "PQC_FLAG": str(category_flag),
                                         "cioms_flag": str(cioms_flag)
                                     })
        x = response.json()
        logger.info("x from JSON")
        logger.info(x)
        y = process_json(x)
        x = spam_identification(x, y, spam_flag, text, paths)
        if x["message"] == "spam" or x["code"] == 2:
            return x
        x["code"] = 6
        x["message"] = "Non Form AE Case"
        if case_category_accu >= 0.98:
            x["categories"] = case_category
        return x
    except Exception:
        logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                     "::::endTrace").replace("\n", "\n$"))
        x = {
            "code": 3,
            "message": "error came parsing unstructured AE or checking for spam"
        }
        return x
def structured_form_parsing(paths, content_type, module_name):
    """Parse a recognised form with its form-specific ``py_<module_name>`` package.

    The package is looked up on the import path first and, failing that,
    loaded from ``structuredForms/py_<module_name>/__init__.py`` one directory
    above this module. Its ``main.parseFromModule`` is called with the
    temporary directory path (plus *content_type* for the ``generic`` module).

    Args:
        paths: dict of working paths; ``"tmpdirpath"`` is passed to the parser.
        content_type: recognised form type; must be enabled in the ``forms``
            config section.
        module_name: suffix of the parser package name.

    Returns:
        A response dict. ``code`` is 1 when the form is disabled or the parser
        reports ``"error in Parsing"``, 5 on success or when no module spec is
        found, and 3 when loading/parsing or anything else raises.
    """
    logger.info("parsing structured form")
    outcome = {"code": None, "content_type": None, "module_name": None, "message": None}
    try:
        logger.info("Content type is " + content_type)
        logger.info("Module name is " + module_name)
        logger.info("going for if condition")
        if not config.getboolean("forms", content_type):
            return {"code": 1,
                    "content_type": content_type,
                    "module_name": module_name,
                    "message": "Form is recognized from the config file but is not allowed"}

        # Form-specific parser packages are named py_<module_name>.
        package_name = "py_" + module_name
        parent_parts = path.dirname(path.abspath(__file__)).split("/")[:-1]
        package_init = "/".join(parent_parts + ["structuredForms", package_name, "__init__.py"])
        # Installed packages win; the local repository copy is the fallback
        # (easier for development purposes).
        spec = importlib.util.find_spec(package_name)
        if spec is None:
            source_loader = importlib.machinery.SourceFileLoader(package_name, package_init)
            spec = util.spec_from_loader(package_name, source_loader)
        logger.info(spec)

        if spec is None:
            logger.info("case where no module dir exists")
            outcome["code"] = 5
            outcome["module_name"] = module_name
            outcome["message"] = "Form is recognized from the config file but the module is missing"
            return outcome

        try:
            # Load the package and run parseFromModule() from its main.py.
            # New modules must expose parseFromModule() taking the temporary
            # directory path as its argument.
            imported_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(imported_module)
            if module_name == "generic":
                parser_args = (paths["tmpdirpath"], content_type)
            else:
                parser_args = (paths["tmpdirpath"],)
            outcome = imported_module.main.parseFromModule(*parser_args)
            outcome["model_type"] = content_type
            outcome["module_name"] = module_name
            logger.info("completed parsing in " + module_name)
            if outcome["message"] == "error in Parsing":
                outcome["code"] = 1
                outcome["message"] = "This AE form is not configure"
                outcome["model_type"] = None
                outcome["module_name"] = None
            else:
                outcome["code"] = 5
                outcome["message"] = "Form is recognized from the config file and parsed using module"
        except Exception as err:
            logger.info("parseFromModule missing in the " + module_name + " package")
            logger.error(err)
            trace = "\nstartTrace::::" + traceback.format_exc().strip() + "::::endTrace"
            logger.error(trace.replace("\n", "\n$"))
            outcome["code"] = 3
            outcome["module_name"] = module_name
            outcome["message"] = "generic-form-parser api is off"
        return outcome
    except Exception as err:
        logger.error(err)
        trace = "\nstartTrace::::" + traceback.format_exc().strip() + "::::endTrace"
        logger.error(trace.replace("\n", "\n$"))
        outcome["code"] = 3
        outcome["message"] = "AE form failed to Parse"
        return outcome
def _classify_extracted_text(digital_text):
    """Run content-type discovery on extracted text and package the result list."""
    logger.info(digital_text)
    disc_con_typ = discover_content_type(digital_text)
    logger.info("discovered content type")
    logger.info(disc_con_typ)
    # Defaults mirror digitalpdf_data_extraction, so an unrecognised document
    # still yields a well-formed result instead of unbound names.
    content_type = "UNKNOWN"
    module_name = ""
    # Value comparison; `is not` on a str literal tests identity only.
    if disc_con_typ[0] != "UNKNOWN":
        logger.info("inside not unknown")
        content_type = disc_con_typ[0]
        module_name = disc_con_typ[1]
    return [content_type, module_name, digital_text]


def find_content_type(paths, doc_type):
    """Extract text from the staged document and discover its content type.

    docx, plain-text and csv inputs are read with their dedicated text
    readers. Anything else is treated as a PDF: digital extraction is tried
    first and scanned (OCR) extraction is the fallback.

    Args:
        paths: dict of working paths; ``"inputpath"`` and ``"filepath"`` are read.
        doc_type: MIME type of the uploaded document.

    Returns:
        ``[content_type, module_name, text]`` on success, or a
        ``{"code": ..., "message": ...}`` dict when the PDF cannot be opened
        or is scanned while scanned PDFs are disabled in the config.
    """
    if doc_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        logger.info("file type is docx")
        return _classify_extracted_text(get_text(paths["inputpath"] + "/form.docx"))
    if doc_type == "text/plain":
        logger.info("file type is txt")
        return _classify_extracted_text(get_text_from_txt(paths["inputpath"] + "/form.txt"))
    if doc_type == "text/csv":
        logger.info("file type is csv")
        return _classify_extracted_text(get_text_from_csv(paths["inputpath"] + "/form.csv"))

    # The original logged "docx" here although this branch handles PDFs.
    logger.info("file type is pdf")
    try:
        pdf = pdfplumber.open(paths["filepath"])
    except Exception:
        return {"code": 1, "message": "password protected switch is off"}
    # Close the handle before the extraction helpers rewrite the file.
    with pdf:
        digital_text = [page.extract_text() for page in pdf.pages]
    # A page without extractable text marks the PDF as (partly) scanned.
    scanned = None in digital_text

    out_digital = digitalpdf_data_extraction(paths, digital_text, scanned)
    if out_digital[0] != "UNKNOWN":
        return out_digital

    if not config.getboolean('document_type', 'scanned_pdf') and scanned:
        # The original built this response but fell through to the OCR path
        # without returning it.
        return {"code": -1, "message": "scanned pdf not allowed"}

    out_scanned = scannedpdf_data_extraction(paths, scanned)
    # NOTE(review): out_scanned[2] is a str while out_digital[2] is a list of
    # page strings, so this compares character count with page count and can
    # change the type of the returned text — confirm the intent.
    if len(out_scanned[2]) < len(out_digital[2]):
        out_scanned[2] = out_digital[2]
    return out_scanned
def _trim_pdf_to_page_limit(paths, passwd):
    """Decrypt the staged PDF and cut it down to ``api.max_file_pages`` pages."""
    pdf_info = pdfinfo_from_path(paths["filepath"], userpw=passwd)
    decrypt_pdf(paths, pdf_info, passwd)
    max_pages = int(config.get('api', 'max_file_pages'))
    if pdf_info["Pages"] <= max_pages:
        return
    trimmed_path = paths["inputpath"] + "/trimmed_form.pdf"
    writer = pfw()
    # One reader for all pages; the original reopened the file per page and
    # copied a hard-coded 16 pages regardless of the configured limit.
    reader = pfr(paths["filepath"], 'rb')
    for page_index in range(max_pages):
        writer.addPage(reader.getPage(page_index))
    with open(trimmed_path, 'wb') as f:
        writer.write(f)
    os.rename(trimmed_path, paths["filepath"])


def process_forms(paths, params):
    """Run the full pipeline for one staged document and clean up afterwards.

    Steps: resolve the processing flags, decode and validate the PDF password,
    trim over-long PDFs, discover the content type, then dispatch to
    unstructured or structured parsing.

    Args:
        paths: dict of working paths created by ``init_dir``.
        params: dict with ``spam_flag``, ``structuredParsing_flag``,
            ``unstructuredParsing_flag``, ``doc_type`` and ``pass_header``.

    Returns:
        The response of whichever stage finished the request.
        The working directories are removed before returning, and also when
        an exception propagates.
    """
    # The %s placeholders are required; without them logging fails to format
    # the record. The password header is left out of the params dump.
    logger.info("list of all paths is %s", paths)
    logger.info("list of all params is %s",
                {key: value for key, value in params.items() if key != "pass_header"})
    try:
        # setting flag value
        flags = flag_setter(params["spam_flag"],
                            params["structuredParsing_flag"],
                            params["unstructuredParsing_flag"])
        spam_flag = flags[0]
        structuredParsing_flag = flags[1]
        unstructuredParsing_flag = flags[2]

        # checking password; credentials themselves are never logged
        logger.info("Pass header supplied: %s", bool(params["pass_header"]))
        passwd = get_passwd(params["pass_header"])
        if params["doc_type"] == "application/pdf":
            response = validate_password(paths["filepath"], passwd)
            logger.info(response)
            if response['code'] is not None and response['message'] is not None:
                return response
            try:
                _trim_pdf_to_page_limit(paths, passwd)
            except Exception:
                # Trimming stays best-effort, but the failure is now recorded.
                logger.info(("\nstartTrace::::" + traceback.format_exc().strip() +
                             "::::endTrace").replace("\n", "\n$"))

        result = find_content_type(paths, params["doc_type"])
        logger.info(result)
        # A non-list result is a ready-made response (or nothing) to pass back.
        if not isinstance(result, list):
            return result
        content_type = result[0]
        module_name = result[1]
        text = result[2]

        # parsing
        if content_type in ["UNKNOWN", "", " "]:
            # NOTE(review): text[0] is the first page when `text` is a list but
            # only the first character when it is a str — confirm the intent.
            return unstructured_form_parsing(paths, text[0],
                                             unstructuredParsing_flag, spam_flag)

        # code for structured form parsing
        if not structuredParsing_flag:
            return {"model_type": content_type,
                    "code": 5,
                    "message": "Form is medwatch or CIOMS."}
        return structured_form_parsing(paths, content_type, module_name)
    finally:
        # Single cleanup point, covering early returns and exceptions alike.
        flush_dir(paths)