コード例 #1
0
ファイル: app.py プロジェクト: evg-cv/PDFInfoExtractor
    def perform_ocr(self):
        """Poll INPUT_DIR forever and process every PDF that has not been seen yet.

        Each unprocessed ``*.pdf`` file is passed to ``self.pdf_extractor.main``,
        the result is handed to ``self.pdf_creator.repopulate_pdf``, and the
        produced file is uploaded through ``self.s3_manager.upload_files``.
        The file name is then recorded in ``self.processed_files``.

        After every scan of the directory the full list of processed names is
        written to PROCESSED_FILE, one name per line (no trailing newline).

        This method never returns. Any exception raised while handling a single
        file is forwarded to ``log_print`` and the scan continues.
        """
        while True:
            for pdf_path in glob.glob(os.path.join(INPUT_DIR, "*.*")):
                try:
                    pdf_name = ntpath.basename(pdf_path)
                    # Everything after the last dot is treated as the extension
                    # (the comparison is case-sensitive).
                    if pdf_name[pdf_name.rfind(".") + 1:] != "pdf":
                        continue
                    if pdf_name in self.processed_files:
                        continue

                    print(f"[INFO] {pdf_name} processing...")
                    page_info = self.pdf_extractor.main(pdf_path=pdf_path)
                    rebuilt_path = self.pdf_creator.repopulate_pdf(
                        info=page_info, pdf_name=pdf_name)
                    self.s3_manager.upload_files(file_path=rebuilt_path)
                    # Only mark the file as done once the upload went through.
                    self.processed_files.append(pdf_name)
                except Exception as e:
                    log_print(e)

            # Persist the bookkeeping list after each pass over the directory.
            save_file(content="\n".join(self.processed_files),
                      filename=PROCESSED_FILE,
                      method='w')
コード例 #2
0
ファイル: creator.py プロジェクト: dl-proj/KeywordGrouper
def create_corpus():
    """Build a word-frequency table from the keyword CSV and save it as JSON.

    Reads the "Keyword" column of KEYWORD_CSV_FILE_PATH, tokenizes every
    keyword with ``TextPreprocessor.tokenize_word`` and counts each token that
    is non-empty and purely alphabetic. The resulting mapping is written to
    CORPUS_PATH as indented JSON.

    Returns:
        None.
    """
    preprocessor = TextPreprocessor()
    keywords = pd.read_csv(KEYWORD_CSV_FILE_PATH)["Keyword"].values.tolist()

    word_freq = {}
    for keyword in keywords:
        for token in preprocessor.tokenize_word(sample=keyword):
            # Skip blanks and anything containing digits/punctuation.
            if token == "" or not token.isalpha():
                continue
            word_freq[token] = word_freq.get(token, 0) + 1

    corpus_json = json.dumps(word_freq, indent=4)
    save_file(filename=CORPUS_PATH, content=corpus_json, method="w")
    print(f"[INFO] Successfully saved corpus in {CORPUS_PATH}")
コード例 #3
0
    def extract_ocr_local(self, frame_path):
        """Send a local image to Amazon Textract and dump the response to disk.

        The raw response of ``self.textract.detect_document_text`` is written
        as indented JSON to ``<CUR_DIR>/test_json/temp_<name>.json``, where
        ``<name>`` is the base name of ``frame_path`` with ".jpg" removed.

        Args:
            frame_path: Path of the image file to analyse.

        Returns:
            None.
        """
        file_name = ntpath.basename(frame_path).replace(".jpg", "")

        # Textract receives the document as an in-memory byte buffer.
        with open(frame_path, 'rb') as image_file:
            payload = bytearray(image_file.read())

        textract_response = self.textract.detect_document_text(
            Document={'Bytes': payload})

        dump_path = os.path.join(CUR_DIR, 'test_json', f"temp_{file_name}.json")
        save_file(filename=dump_path,
                  content=json.dumps(textract_response, indent=4),
                  method="w")
コード例 #4
0
def process_ocr_text(frame_path):
    """OCR a frame piece by piece and return the concatenated text.

    The frame is split by ``separate_frame_by_size``; each resulting image is
    sent to ``google_ocr.detect_text`` and converted to text with
    ``extract_text_from_json``. When LOCAL is truthy the raw OCR result of
    every piece is also saved under ``<CUR_DIR>/temp``.

    Args:
        frame_path: Path of the source image.

    Returns:
        The text of all pieces in order, each followed by a newline.
    """
    pieces = []

    for part_idx, part_path in enumerate(separate_frame_by_size(f_path=frame_path)):
        part_ocr = google_ocr.detect_text(img_path=part_path)

        if LOCAL:
            # Debug dump named after both the source frame and the piece.
            frame_stem = ntpath.basename(frame_path).replace(".jpg", "")
            part_stem = ntpath.basename(part_path).replace(".jpg", "")
            dump_path = os.path.join(
                CUR_DIR, 'temp', "temp_{}_{}.json".format(frame_stem, part_stem))
            save_file(filename=dump_path, content=json.dumps(part_ocr), method="w")

        part_text = extract_text_from_json(
            json_content_=part_ocr, path_=part_path, part_idx=part_idx)
        pieces.append(part_text + "\n")

    return "".join(pieces)
コード例 #5
0
ファイル: result.py プロジェクト: starpolar/well_ocr
    def process_ocr_text(self, frame_path, file_name):
        """OCR one frame, extract its information and export it.

        The frame is first passed through ``convert_image_color``; the image
        it returns is sent to ``self.google_ocr.detect_text``. The OCR result
        is fed to ``self.extract_whole_info`` and ``self.info`` is then handed
        to ``import_info_into_excel``. When LOCAL is truthy the raw OCR result
        is also saved to ``<CUR_DIR>/temp/temp_<file_name>.json``.

        Args:
            frame_path: Path of the input image.
            file_name: Name used for the converted image, the debug dump and
                the exported file.

        Returns:
            Whatever ``import_info_into_excel`` returns (presumably the path
            of the written file - confirm against that helper).
        """
        sharpen_frame_path = convert_image_color(frame_path=frame_path,
                                                 file_name=file_name)
        ocr_json = self.google_ocr.detect_text(path=sharpen_frame_path)

        if LOCAL:
            save_file(
                filename=os.path.join(CUR_DIR, 'temp',
                                      "temp_{}.json".format(file_name)),
                content=json.dumps(ocr_json),
                method="w")

        self.extract_whole_info(frame_path=sharpen_frame_path,
                                json_data=ocr_json)

        return import_info_into_excel(info=self.info, file_name=file_name)
コード例 #6
0
def process_ocr_text(frame_path):
    """OCR a table image and render its cells as quoted, comma-terminated rows.

    The image is sent to ``google_ocr.detect_text`` and the result is turned
    into a nested ``{row_id: {col_id: text}}`` mapping by
    ``extract_table_content``. When LOCAL is truthy the raw OCR result is
    also saved to ``<CUR_DIR>/temp/temp.json``.

    Args:
        frame_path: Path of the table image.

    Returns:
        One line per row; every cell is wrapped in single quotes and followed
        by a comma, and every row ends with a newline.
    """
    ocr_json = google_ocr.detect_text(img_path=frame_path)

    if LOCAL:
        save_file(filename=os.path.join(CUR_DIR, 'temp', "temp.json"),
                  content=json.dumps(ocr_json),
                  method="w")

    table = extract_table_content(json_content=ocr_json,
                                  frame_path=frame_path)

    rendered_rows = []
    for row_id in table.keys():
        row = table[row_id]
        cells = "".join("'" + row[col_id] + "'" + "," for col_id in row.keys())
        rendered_rows.append(cells + "\n")

    return "".join(rendered_rows)
コード例 #7
0
def calculate_optimum_threshold(f_array, t_array, thresh_array):
    """Select and persist the threshold whose FPR is closest to OPT_THRESH from below.

    Among all positions whose value in ``f_array`` does not exceed OPT_THRESH,
    the one with the smallest distance to OPT_THRESH is chosen (the first such
    position wins on ties). The matching entries of ``thresh_array`` and
    ``t_array`` are printed and the threshold is written to OPT_THRESH_PATH.
    Nothing is printed or saved when no value qualifies, including when
    ``f_array`` is empty.

    Args:
        f_array: Sequence of false-positive rates.
        t_array: Sequence of true-positive rates, aligned with ``f_array``.
        thresh_array: Sequence of thresholds, aligned with ``f_array``.

    Returns:
        None.
    """
    opt_index = None
    # Seed the best distance with +inf. Seeding it with the distance of
    # f_array[0] (as before) made index 0 impossible to select and rejected
    # every valid candidate whenever f_array[0] was above the cap but closer
    # to it than they were; it also crashed on an empty array.
    diff = float("inf")
    for i, f_value in enumerate(f_array):
        # Only rates at or below the cap are acceptable.
        if f_value > OPT_THRESH:
            continue
        candidate_diff = abs(f_value - OPT_THRESH)
        if candidate_diff < diff:
            opt_index = i
            diff = candidate_diff

    if opt_index is None:
        return

    opt_fpr = f_array[opt_index]
    opt_thresh = thresh_array[opt_index]
    opt_tpr = t_array[opt_index]

    print("Optimum Threshold for test data: ", opt_thresh)
    print("test_data FPR: ", opt_fpr)
    print("test_data TPR: ", opt_tpr)

    save_file(content=str(opt_thresh), filename=OPT_THRESH_PATH, method='w')
コード例 #8
0
ファイル: app.py プロジェクト: dl-proj/InvoicePDFAPI
def download_files(invoice_name_str):
    """Export the stored information of the requested invoices as a JSON download.

    Args:
        invoice_name_str: Comma-separated invoice names. The element after the
            last comma is discarded, so the string is expected to end with a
            trailing comma.

    Returns:
        The login page when ``g.user`` is falsy; otherwise the generated
        ``result.json`` (written to OUTPUT_DIR) sent as an attachment.
    """
    if not g.user:
        return render_template('login.html')

    # Output keys in the positional order of the record returned by
    # db_manager.get_invoice_info.
    field_names = ("Barcode", "Lieferschein_Nr", "DTS_Date", "DTS_Time",
                   "Gewicht", "Volume", "Fuhre")

    total_invoice_info = {}
    for invoice_name in invoice_name_str.split(",")[:-1]:
        record = db_manager.get_invoice_info(file_name=invoice_name)
        total_invoice_info[invoice_name] = {
            field: record[position]
            for position, field in enumerate(field_names)
        }

    output_file_path = os.path.join(OUTPUT_DIR, 'result.json')
    save_file(filename=output_file_path,
              content=json.dumps(total_invoice_info, indent=4),
              method="w")

    return send_file(output_file_path, as_attachment=True)
コード例 #9
0
ファイル: google_ocr.py プロジェクト: dl-proj/InvoicePDFAPI
    def detect_text(self, img_path, file_name, dir_name=None):
        """Use the Vision API to detect text in the given image file.

        The image is base64-encoded and submitted through
        ``self.service.images().annotate`` with the DOCUMENT_TEXT_DETECTION
        feature. When LOCAL is truthy the first response is also saved to
        ``<CUR_DIR>/test_json/temp_<dir_name>_<file_name>.json``.

        Args:
            img_path: Path of the image to analyse.
            file_name: Name used in the debug dump file name.
            dir_name: Optional prefix used in the debug dump file name.

        Returns:
            The first entry of the API response's ``responses`` list, or None
            when executing the request or handling its result raises (the
            error is printed, not re-raised).
        """
        with open(img_path, 'rb') as img_file:
            encoded_image = base64.b64encode(img_file.read()).decode('UTF-8')

        annotate_body = {
            'requests': [{
                'image': {'content': encoded_image},
                'features': [{'type': 'DOCUMENT_TEXT_DETECTION'}],
            }]
        }
        request = self.service.images().annotate(body=annotate_body)

        try:
            ret_json = request.execute()['responses'][0]
            if LOCAL:
                dump_path = os.path.join(
                    CUR_DIR, 'test_json', f"temp_{dir_name}_{file_name}.json")
                save_file(filename=dump_path,
                          content=json.dumps(ret_json),
                          method="w")
        except Exception as e2:
            print("Key error: %s" % e2)
            return None

        return ret_json
コード例 #10
0
def split_dataset_darknet():
    """Copy matching label files next to the images and write an 80/20 train/test split.

    Steps:
      1. Collect the index of every ``*.jpg`` in ``darknet/custom_data/images``
         (indices equal to "" are ignored).
      2. For every ``*.txt`` in ``training_dataset/xml`` whose index matches an
         image, copy its content into the images folder with every occurrence
         of "15" replaced by "0".
      3. Shuffle the indices, take the first 80% for training and the rest for
         testing, and append one image path per line to ``train.txt`` and
         ``test.txt`` in ``darknet/custom_data``.

    The list files are opened in append mode, so repeated runs accumulate
    entries.
    """
    custom_data_dir = os.path.join(CUR_DIR, 'darknet', 'custom_data')
    images_dir = os.path.join(custom_data_dir, 'images')

    image_paths = glob.glob(os.path.join(images_dir, "*.jpg"))
    label_paths = glob.glob(
        os.path.join(CUR_DIR, 'training_dataset', 'xml', '*.txt'))

    image_indices = []
    for image_path in image_paths:
        _, image_index = get_index_from_file_path(path=image_path)
        if image_index != "":
            image_indices.append(image_index)

    for label_path in label_paths:
        label_name, label_index = get_index_from_file_path(path=label_path)
        if label_index not in image_indices:
            continue
        # NOTE(review): this replaces every "15" substring, not only a leading
        # class id - confirm that the label values can never contain "15".
        relabelled = load_text(filename=label_path).replace("15", "0")
        save_file(content=relabelled,
                  filename=os.path.join(images_dir, label_name),
                  method='w')

    shuffle(image_indices)
    train_index = int(0.8 * len(image_indices))
    training_indices = image_indices[:train_index]
    test_indices = image_indices[train_index:]

    # Training entries are written first, then test entries.
    for list_name, indices in (('train.txt', training_indices),
                               ('test.txt', test_indices)):
        list_path = os.path.join(custom_data_dir, list_name)
        for idx in indices:
            entry = "custom_data/images/image_{}.jpg".format(idx) + "\n"
            save_file(content=entry, filename=list_path, method='a')
コード例 #11
0
    # Names of the files that were already handled, so they can be skipped below.
    # NOTE(review): `processed_files` and `input_image_path` are defined outside
    # this fragment - presumably lists of paths; confirm against the enclosing code.
    processed_file_names = []

    for processed_f_path in processed_files:
        f_name = extract_file_name(file_path=processed_f_path)
        processed_file_names.append(f_name)

    total_lens = len(input_image_path)
    for i, path in enumerate(input_image_path):

        # Base name without the ".jpg" suffix; also used for the output .txt name.
        file_name = ntpath.basename(path).replace(".jpg", "")
        if file_name in processed_file_names:
            continue

        print("Process {}-({} / {})".format(path, i + 1, total_lens))
        try:
            # OCR the image and store the text as <OUTPUT_DIR>/<file_name>.txt.
            frame_content = process_ocr_text(frame_path=path)
            txt_file_path = os.path.join(OUTPUT_DIR,
                                         "{}.txt".format(file_name))
            save_file(content=frame_content,
                      filename=txt_file_path,
                      method='w')
            log_print(info_str=path + "\n" + "Successfully processed")
            print("Successfully processed {}".format(path))

        except Exception as e:
            # A failure on one image is logged and the loop moves on to the next.
            log_print(info_str=path)
            log_print(info_str=e)

    # Delete every .jpg left in OUTPUT_DIR once all images have been handled.
    for jpg_path in glob.glob(os.path.join(OUTPUT_DIR, "*.jpg")):
        os.remove(jpg_path)
コード例 #12
0
    def extract_page_info(self, pdf_page_frame_path, file_name=None, index=None, ocr_result=None):
        """Classify one PDF page image by its title strip and extract its fields.

        The OCR result for the whole page is taken, in order of precedence,
        from the cached ``<CUR_DIR>/test_json/temp_<file_name>_<index>.json``
        (if that file exists), from the ``ocr_result`` argument, or from a
        fresh ``self.ocr_tool.detect_text`` call (which is cached to the same
        file when LOCAL is truthy).

        The top eighth of the page is written to ``<CUR_DIR>/title.jpg`` and
        OCR'd separately. Its words decide how the page is handled:

        * "ncic": report page. Parsed with ``self.extract_report_page`` and
          stored in ``self.pdf_info["report"]``, only while
          ``self.report_page_ret`` is still falsy.
        * "unit" drawn taller than 30 px: unit page. Parsed with
          ``self.extract_unit_page`` and appended to ``self.pdf_info["unit"]``
          unless every extracted field is empty.
        * "motorist" / "occupant": parsed with
          ``self.extract_motorist_occupant_page`` and stored under the
          matching key of ``self.pdf_info``.

        Extracted values have ",", "." and ":" removed (county: only "," and
        ".", then truncated to two characters; year: last four characters).

        Args:
            pdf_page_frame_path: Path of the page image.
            file_name: Name used to build the OCR cache file name.
            index: Page index used to build the OCR cache file name.
            ocr_result: Optional pre-computed OCR result for the whole page.

        Returns:
            ``self.pdf_info`` after the update.
        """

        def _strip_punct(value, chars=",.:"):
            # Remove each of the given characters from the string.
            for ch in chars:
                value = value.replace(ch, "")
            return value

        cache_path = os.path.join(CUR_DIR, 'test_json', f"temp_{file_name}_{index}.json")
        if os.path.exists(cache_path):
            with open(cache_path) as f:
                ocr_result = json.load(f)
        if ocr_result is None:
            ocr_result = self.ocr_tool.detect_text(img_path=pdf_page_frame_path)
            if LOCAL:
                save_file(filename=cache_path, content=json.dumps(ocr_result), method="w")

        pdf_page_frame = cv2.imread(pdf_page_frame_path)
        height, _ = pdf_page_frame.shape[:2]
        # The title strip is the top eighth of the page.
        page_title_frame = pdf_page_frame[:int(height / 8), :]
        title_frame_path = os.path.join(CUR_DIR, 'title.jpg')
        cv2.imwrite(title_frame_path, page_title_frame)

        try:
            title_ocr = self.ocr_tool.detect_text(img_path=title_frame_path)
            # The first annotation is skipped (presumably the full-text entry -
            # confirm against the OCR tool's output format).
            title_json = title_ocr["textAnnotations"][1:]
            needed_ocr = ocr_result["textAnnotations"][1:]
            self.box_row_lines, self.box_col_lines = extract_box_lines(frame_path=pdf_page_frame_path,
                                                                       json_info=needed_ocr)

            for pdf_json in title_json:
                description = pdf_json["description"].lower()

                if description == "ncic":
                    # Once a report page has been stored, later "ncic" words
                    # are ignored and the scan continues with the next word.
                    if not self.report_page_ret:
                        report_number, crash_date_time, crash_severity, county, ncic, agency, units, unit_error = \
                            self.extract_report_page(json_info=needed_ocr, frame_path=pdf_page_frame_path)
                        report = self.pdf_info["report"]
                        report["report_number"] = _strip_punct(str(report_number))
                        report["crash_date_time"] = _strip_punct(str(crash_date_time))
                        # County keeps colons and is limited to two characters.
                        report["county"] = _strip_punct(str(county), chars=",.")[:2]
                        report["agency_ncic"] = _strip_punct(str(ncic))
                        report["agency"] = _strip_punct(str(agency))
                        report["number_of_unit"] = _strip_punct(str(units))
                        report["unit_in_error"] = _strip_punct(str(unit_error))
                        report["crash_severity"] = _strip_punct(str(crash_severity))
                        self.report_page_ret = True
                        break
                elif description == "unit" and abs(pdf_json["boundingPoly"]["vertices"][0]["y"] -
                                                   pdf_json["boundingPoly"]["vertices"][3]["y"]) > 30:
                    unit_number, owner_name, owner_address, owner_phone, occupants, damage_scale, \
                        insurance_company, policy_number, year, make, model, unit_type = \
                        self.extract_unit_page(json_info=needed_ocr, frame_path=pdf_page_frame_path)
                    unit_info = {
                        "unit_number": _strip_punct(unit_number),
                        "owner_name": _strip_punct(owner_name),
                        "owner_address": _strip_punct(owner_address),
                        "owner_phone": _strip_punct(owner_phone),
                        "number_of_occupants": _strip_punct(occupants),
                        "damage_scale": _strip_punct(damage_scale),
                        "insurance_company": _strip_punct(insurance_company),
                        "policy_number": _strip_punct(policy_number),
                        "year": _strip_punct(year)[-4:],
                        "make": _strip_punct(make),
                        "model": _strip_punct(model),
                        "unit_type": _strip_punct(unit_type),
                    }
                    # A unit page whose fields are all empty is treated as blank.
                    if any(field_value != "" for field_value in unit_info.values()):
                        self.pdf_info["unit"].append(unit_info)
                    break
                elif description == "motorist" or description == "occupant":
                    info = self.extract_motorist_occupant_page(json_info=needed_ocr,
                                                               frame_path=pdf_page_frame_path)
                    # `description` is exactly "motorist" or "occupant" here.
                    self.pdf_info[description] = info
                    break
        finally:
            # Always remove the temporary title image, even when OCR or
            # parsing raised; previously it was left behind on any error.
            if os.path.exists(title_frame_path):
                os.remove(title_frame_path)

        return self.pdf_info
コード例 #13
0
from src.ocr.ocr_text import process_ocr_text
from utils.folder_file_manager import save_file
from settings import RESULT_FILE_PATH, INPUT_IMAGE_PATH


if __name__ == '__main__':
    # Script entry point: OCR the configured input image and store the text
    # returned by process_ocr_text in RESULT_FILE_PATH.
    save_file(content=process_ocr_text(frame_path=INPUT_IMAGE_PATH),
              filename=RESULT_FILE_PATH,
              method='w')
    print("Successfully saved in {}".format(RESULT_FILE_PATH))