def post(self):
    """Store an uploaded document with its companion XML file and return extracted data.

    Reads the ``file`` and ``xml_file`` parts of the request, saves both in a
    new per-upload directory under ``PDF_UPLOAD_DIRECTORY``, passes the
    document to ``read_scanned_image`` (``.jpg``/``.jpeg``/``.png``) or
    ``read_scanned_pdf`` (any other extension), and then calls
    ``extract_data`` with the upload's ``texts`` directory and the saved XML
    path.

    Returns:
        On success, ``jsonify({"data": result})`` where ``result`` is the
        ``extract_data`` output with ``pdf_file_path`` and
        ``excel_file_path`` added. On ``CustomClassifierException``, the
        tuple ``(e.response, e.http_code)``. On any other exception, the
        response of an ``InternalServerErrorException`` together with 500.
    """
    try:
        ts = time.time()
        upload = request.files['file']
        xml_upload = request.files['xml_file']

        # Client-supplied names are untrusted: keep only the last path
        # component so the XML file cannot be written outside the upload
        # directory (e.g. a name such as "../../x.xml").
        # NOTE(review): werkzeug's secure_filename would be stricter if the
        # project is happy to depend on it here.
        xml_file_name = os.path.basename(xml_upload.filename.replace(' ', '_'))

        file_name = upload.filename.replace(' ', '_')
        extension = os.path.splitext(file_name)[1]
        # The UUID suffix gives every upload its own directory and file name.
        file_name_without_ext = (
            os.path.basename(file_name).split('.')[0] + "_" + str(uuid.uuid1())
        )
        file_name = file_name_without_ext + extension

        doc_dir_location = os.path.join(PDF_UPLOAD_DIRECTORY,
                                        file_name_without_ext)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(doc_dir_location, exist_ok=True)

        file_location = os.path.join(doc_dir_location, file_name)
        xml_file_location = os.path.join(doc_dir_location, xml_file_name)
        print(f"-----CML_file--->{xml_file_location}")
        xml_upload.save(xml_file_location)
        upload.save(file_location)

        # Only one erosion value is configured at the moment; the loop is
        # kept so further values can be added to the list.
        erosion_val = [0]
        is_image = extension.lower() in ('.jpg', '.jpeg', '.png')
        for e_val in erosion_val:
            print("EROSION_VALUE-------->", e_val)
            # The return value is not used: the extraction step below works
            # from the 'texts' directory inside doc_dir_location
            # (presumably populated by these readers -- TODO confirm).
            if is_image:
                read_scanned_image(file_location, doc_dir_location, e_val)
            else:
                read_scanned_pdf(file_location, doc_dir_location, e_val)

        result = extract_data(os.path.join(doc_dir_location, 'texts'),
                              xml_file_location)
        result['pdf_file_path'] = 'pdf_file/' + file_name_without_ext
        result['excel_file_path'] = 'text_file/' + file_name_without_ext

        te = time.time()
        # End minus start, so the reported duration is positive.
        print(f"Time Taken---->{te - ts}")

        response = jsonify({"data": result})
        # Logged only on the success path; error paths log their own message.
        logging.info("API Call Finished Successfully - 200")
        return response
    except CustomClassifierException as e:
        print("1***ERROR***", e)
        logging.error("Error {} has occurred in controller".format(e))
        return e.response, e.http_code
    except Exception as e:
        print("2***ERROR***", e)
        logging.error("Error in service = {}".format(e), exc_info=True)
        return InternalServerErrorException(
            error_code=500,
            error_message="Data Extraction failed!").response, 500
# exit() #else: # continue print("fp---->", fp, os.path.basename(fp).split('.')[0]) extension = os.path.basename(fp).split('.')[1] if extension in ['png', 'jpg', 'jpeg']: continue if fp.startswith('.'): continue file_name_without_ext = os.path.basename(fp).split('.')[0] filename = path + '/' + fp doc_dir_location = os.path.join(upload_path, file_name_without_ext) if not os.path.exists(doc_dir_location): os.makedirs(doc_dir_location) result = read_scanned_pdf(filename, doc_dir_location) try: text_file_path = os.path.join(upload_path, file_name_without_ext, 'texts', 'stitched.txt') with open(text_file_path) as fp1: contents = fp1.readlines() result = parse_all_fields(contents, {}) print("RESULT**************************>", result) if result["username"] == "": name_count = name_count + 1 if result["program_name"] == "": program_name_count = program_name_count + 1
def post(self):
    """Store an uploaded document, parse it and write the parsed data to a template.

    Reads the ``file`` part of the request, saves it in a new per-upload
    directory under ``PDF_UPLOAD_DIRECTORY`` and passes it to
    ``read_scanned_pdf``. The file named by ``result['parse_file']`` is then
    read as UTF-8 lines, fed through ``ModelPdfData`` (``prepare_data``,
    ``compare_with_keywords``, ``list_data``), and the resulting ``obj.data``
    is handed to ``ExcelWriter.update`` for the template created by
    ``self.create_template``.

    Returns:
        On success, ``jsonify({"data": result})`` where ``result`` is the
        ``read_scanned_pdf`` output with ``excel_file_path`` and
        ``pdf_file_path`` added. On ``CustomClassifierException``, the
        tuple ``(e.response, e.http_code)``. On any other exception, the
        response of an ``InternalServerErrorException`` together with 500.
    """
    try:
        upload = request.files['file']
        file_name = upload.filename.replace(' ', '_')
        # basename() drops any directory part of the client-supplied name;
        # the UUID suffix gives every upload its own directory and file name.
        file_name_without_ext = (
            os.path.basename(file_name).split('.')[0] + "_" + str(uuid.uuid1())
        )
        file_name = file_name_without_ext + os.path.splitext(file_name)[1]

        doc_dir_location = os.path.join(PDF_UPLOAD_DIRECTORY,
                                        file_name_without_ext)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(doc_dir_location, exist_ok=True)
        print(doc_dir_location)

        file_location = os.path.join(doc_dir_location, file_name)
        print(file_location)
        upload.save(file_location)

        result = read_scanned_pdf(file_location, doc_dir_location)
        parse_file = result['parse_file']
        template_file = self.create_template(doc_dir_location)

        print("==0==", parse_file)
        with open(parse_file, encoding='utf-8') as fp:
            contents = fp.readlines()
        print("==1==")

        obj = ModelPdfData(contents)
        obj.prepare_data()
        obj.compare_with_keywords(keywords)
        obj.list_data()

        print("TEMPLATe_FILE**", template_file)
        ew = ExcelWriter(template_file)
        ew.update(obj.data)

        result['excel_file_path'] = 'excel_file/' + file_name_without_ext
        result['pdf_file_path'] = 'pdf_file/' + file_name_without_ext

        response = jsonify({"data": result})
        # Logged only on the success path; error paths log their own message.
        logging.info("API Call Finished Successfully - 200")
        return response
    except CustomClassifierException as e:
        print("1***ERROR***", e)
        logging.error("Error {} has occurred in controller".format(e))
        return e.response, e.http_code
    except Exception as e:
        print("2***ERROR***", e)
        logging.error("Error in service = {}".format(e), exc_info=True)
        return InternalServerErrorException(
            error_code=500,
            error_message="Data Extraction failed!").response, 500