コード例 #1
0
    def post(self):
        """Store an uploaded document and its XML, then return extracted data.

        Reads the ``file`` and ``xml_file`` parts from ``request.files``,
        saves both into a new per-upload directory under
        ``PDF_UPLOAD_DIRECTORY``, runs ``read_scanned_image`` (for
        .jpg/.jpeg/.png) or ``read_scanned_pdf`` (anything else) on the
        document, and finally calls ``extract_data`` with the upload's
        ``texts`` directory and the saved XML path.

        Returns:
            On success, ``jsonify({"data": result})`` where ``result`` is the
            value returned by ``extract_data`` with ``pdf_file_path`` and
            ``excel_file_path`` added.
            On ``CustomClassifierException``, ``(e.response, e.http_code)``.
            On any other exception, the response of an
            ``InternalServerErrorException`` together with status 500.
        """
        try:
            ts = time.time()
            save_path = PDF_UPLOAD_DIRECTORY
            file = request.files['file']
            xml_file = request.files['xml_file']

            # The client controls the file name. Keep only its last path
            # component so a name like "../../x" cannot be written outside
            # the per-upload directory (the document name below already gets
            # the same treatment).
            xml_file_name = os.path.basename(
                xml_file.filename.replace(' ', '_'))

            file_name = file.filename.replace(' ', '_')
            extension = os.path.splitext(file_name)[1]
            # A uuid suffix gives every upload its own directory and file name.
            file_name_without_ext = (
                os.path.basename(file_name).split('.')[0] + "_" +
                str(uuid.uuid1()))
            file_name = file_name_without_ext + extension

            doc_dir_location = os.path.join(save_path, file_name_without_ext)
            # exist_ok avoids the check-then-create race between requests.
            os.makedirs(doc_dir_location, exist_ok=True)

            file_location = os.path.join(doc_dir_location, file_name)
            xml_file_location = os.path.join(doc_dir_location, xml_file_name)
            print(f"-----CML_file--->{xml_file_location}")
            xml_file.save(xml_file_location)
            file.save(file_location)

            texts_dir = os.path.join(save_path, file_name_without_ext,
                                     'texts')

            # NOTE(review): the loop body returns unconditionally, so only the
            # first erosion value is ever used. The list form is kept in case
            # further values (a retry scheme) are reintroduced.
            erosion_val = [0]
            for e_val in erosion_val:
                print("EROSION_VALUE-------->", e_val)

                # The reader's return value is not used here; presumably it
                # is called for the files it writes under doc_dir_location
                # (e.g. the 'texts' directory read below) - TODO confirm.
                if extension.lower() in ['.jpg', '.jpeg', '.png']:
                    read_scanned_image(file_location, doc_dir_location, e_val)
                else:
                    read_scanned_pdf(file_location, doc_dir_location, e_val)

                result = extract_data(texts_dir, xml_file_location)
                result['pdf_file_path'] = 'pdf_file/' + file_name_without_ext
                result[
                    'excel_file_path'] = 'text_file/' + file_name_without_ext

                te = time.time()
                # End minus start; the reverse order printed a negative value.
                print(f"Time Taken---->{te - ts}")

                response = jsonify({"data": result})
                # Logged only on the success path; previously this ran in a
                # ``finally`` clause and was emitted after failures as well.
                logging.info("API Call Finished Successfully - 200")
                return response

        except CustomClassifierException as e:
            print("1***ERROR***", e)
            logging.error("Error {} has occurred in controller".format(e))
            return e.response, e.http_code

        except Exception as e:
            print("2***ERROR***", e)
            logging.error("Error in service = {}".format(e), exc_info=True)
            return InternalServerErrorException(
                error_code=500,
                error_message="Data Extraction failed!").response, 500
コード例 #2
0
        #		exit()
        #else:
        #		continue
        print("fp---->", fp, os.path.basename(fp).split('.')[0])
        extension = os.path.basename(fp).split('.')[1]
        if extension in ['png', 'jpg', 'jpeg']:
            continue
        if fp.startswith('.'):
            continue
        file_name_without_ext = os.path.basename(fp).split('.')[0]

        filename = path + '/' + fp
        doc_dir_location = os.path.join(upload_path, file_name_without_ext)
        if not os.path.exists(doc_dir_location):
            os.makedirs(doc_dir_location)
        result = read_scanned_pdf(filename, doc_dir_location)

        try:
            text_file_path = os.path.join(upload_path, file_name_without_ext,
                                          'texts', 'stitched.txt')

            with open(text_file_path) as fp1:
                contents = fp1.readlines()

            result = parse_all_fields(contents, {})

            print("RESULT**************************>", result)
            if result["username"] == "":
                name_count = name_count + 1
            if result["program_name"] == "":
                program_name_count = program_name_count + 1
コード例 #3
0
    def post(self):
        """Store an uploaded document, parse it and fill an Excel template.

        Reads the ``file`` part from ``request.files``, saves it into a new
        per-upload directory under ``PDF_UPLOAD_DIRECTORY`` and passes it to
        ``read_scanned_pdf``. The file named by ``result['parse_file']`` is
        read line by line into ``ModelPdfData``, compared against
        ``keywords``, and the resulting ``obj.data`` is written through
        ``ExcelWriter`` into the template returned by
        ``self.create_template``.

        Returns:
            On success, ``jsonify({"data": result})`` where ``result`` is the
            value returned by ``read_scanned_pdf`` with ``excel_file_path``
            and ``pdf_file_path`` added.
            On ``CustomClassifierException``, ``(e.response, e.http_code)``.
            On any other exception, the response of an
            ``InternalServerErrorException`` together with status 500.
        """
        try:
            save_path = PDF_UPLOAD_DIRECTORY
            file = request.files['file']

            file_name = file.filename.replace(' ', '_')
            extension = os.path.splitext(file_name)[1]
            # A uuid suffix gives every upload its own directory and file name.
            file_name_without_ext = (
                os.path.basename(file_name).split('.')[0] + "_" +
                str(uuid.uuid1()))
            file_name = file_name_without_ext + extension

            doc_dir_location = os.path.join(save_path, file_name_without_ext)
            # exist_ok avoids the check-then-create race between requests.
            os.makedirs(doc_dir_location, exist_ok=True)
            print(doc_dir_location)

            file_location = os.path.join(doc_dir_location, file_name)
            print(file_location)
            file.save(file_location)

            result = read_scanned_pdf(file_location, doc_dir_location)
            parse_file = result['parse_file']
            template_file = self.create_template(doc_dir_location)

            print("==0==", parse_file)
            with open(parse_file, encoding='utf-8') as fp:
                contents = fp.readlines()

            print("==1==")
            obj = ModelPdfData(contents)
            obj.prepare_data()
            obj.compare_with_keywords(keywords)
            obj.list_data()

            print("TEMPLATe_FILE**", template_file)
            ew = ExcelWriter(template_file)
            ew.update(obj.data)

            result['excel_file_path'] = 'excel_file/' + file_name_without_ext
            result['pdf_file_path'] = 'pdf_file/' + file_name_without_ext

            response = jsonify({"data": result})
            # Logged only on the success path; previously this ran in a
            # ``finally`` clause and was emitted after failures as well.
            logging.info("API Call Finished Successfully - 200")
            return response

        except CustomClassifierException as e:
            print("1***ERROR***", e)
            logging.error("Error {} has occurred in controller".format(e))
            return e.response, e.http_code

        except Exception as e:
            print("2***ERROR***", e)
            logging.error("Error in service = {}".format(e), exc_info=True)
            return InternalServerErrorException(
                error_code=500,
                error_message="Data Extraction failed!").response, 500