def split_to_images_and_ocr(
    pdf_path,
    google_key='/home/vvasuki/sysconf/kunchikA/google/sanskritnlp/service_account_key.json'
):
    """Dump each PDF page as a jpg and OCR the images via Google Drive.

    The per-image OCR outputs ("<image>.txt") are concatenated into
    "<pdf_path>.txt", which is then cleaned of bad characters. Does nothing
    if that final output file already exists.

    :param pdf_path: Path to the source PDF.
    :param google_key: Path to a Google service-account JSON key
        (obtainable from the Google Cloud service-accounts console).
    """
    final_ocr_path = pdf_path + ".txt"
    if os.path.exists(final_ocr_path):
        logging.warning("Skipping %s: %s exists", pdf_path, final_ocr_path)
        return
    image_directory = _get_ocr_dir(pdf_path)
    os.makedirs(image_directory, exist_ok=True)
    pdf.dump_images(pdf_path, image_directory)
    # Reuse image_directory instead of recomputing _get_ocr_dir(pdf_path),
    # and sort once so the OCR order matches the concatenation order.
    image_segments = sorted(
        str(image_path) for image_path in Path(image_directory).glob("*.jpg"))
    ocr_segments = [img + ".txt" for img in image_segments]
    drive_client = drive.get_cached_client(google_key=google_key)
    for image_segment in image_segments:
        drive_client.ocr_file(local_file_path=str(image_segment))
        # Throttle so we don't hammer the Drive API.
        time.sleep(1)

    # Combine the ocr segments
    file_helper.concatenate_files(input_path_list=ocr_segments,
                                  output_path=final_ocr_path)
    file_helper.clear_bad_chars_in_file(file_path=final_ocr_path)
# Example #2 (score: 0)
def join_babylon_segments_in_dir(out_path_dir):
    """Concatenate every *.babylon file in out_path_dir into a single file.

    The combined file is written into out_path_dir's parent directory and is
    named after that parent directory ("<parent_name>.babylon").

    :param out_path_dir: Directory containing the .babylon segment files.
    """
    parent_dir = Path(out_path_dir).parent
    combined_name = os.path.basename(parent_dir) + ".babylon"
    # Sort so segments are concatenated in a deterministic order.
    segment_files = sorted(Path(out_path_dir).glob("*.babylon"))
    file_helper.concatenate_files(
        input_path_list=segment_files,
        output_path=os.path.join(parent_dir, combined_name))
# Example #3 (score: 0)
def split_and_ocr_on_drive(pdf_path, google_key='/home/vvasuki/sysconf/kunchikA/google/sanskritnlp/service_account_key.json',
        small_pdf_pages=25, start_page=1, end_page=None, pdf_compression_power=0):
    """
    OCR some pdf with google drive. Automatically splits into 25 page bits and ocrs them individually.
    
    We compress the pdf provided (if compression_power>0) because:

     -  If drive API detects text in your pdf it won't OCR the image and will just return the text it found
     - If a PDF has layers, google drive ocr fails. Need to print into a pdf in such a case. 
     - One does not need insane resolution to OCR. I guessed that file size and/or resolution is a critical factor in determining if OCR via Drive API succeeds.

    However, pdf compression results in reduction in OCR accuracy. So, beware that tradeoff.

    Still, sometimes, the operation may time out, or you might get an Internal service error. In that case, try reducing small_pdf_pages or increasing the compression power.
    
    :param pdf_path:
    :param google_key: A json key which can be obtained from https://console.cloud.google.com/iam-admin/serviceaccounts (create a project, generate a key via "Actions" column.). 
    :param small_pdf_pages: Number of pages per segment - an argument used for splitting the pdf into small bits for OCR-ing. 
    :param start_page: First page (1-based) to include in the split.
    :param end_page: Last page to include; None means "through the end".
    :param pdf_compression_power: 0,1,2,3,4
    :return: 
    """
    # Replace only the trailing ".pdf" — str.replace would substitute every
    # occurrence and mangle a name like "a.pdf.pdf".
    if pdf_path.endswith(".pdf"):
        compressed_pdf_path = pdf_path[:-len(".pdf")] + "_tiny.pdf"
    else:
        compressed_pdf_path = pdf_path + "_tiny.pdf"
    compress_with_gs(input_file_path=pdf_path, output_file_path=compressed_pdf_path, power=pdf_compression_power)
    split_into_small_pdfs(pdf_path=compressed_pdf_path, small_pdf_pages=small_pdf_pages, start_page=start_page, end_page=end_page)

    # Do the OCR
    from curation_utils.google import drive
    drive_client = drive.get_cached_client(google_key=google_key)
    # Sort once; OCR order and concatenation order must agree.
    pdf_segments = sorted(
        str(segment)
        for segment in Path(_get_ocr_dir(compressed_pdf_path)).glob("*.pdf"))
    ocr_segments = [segment + ".txt" for segment in pdf_segments]
    for pdf_segment in pdf_segments:
        drive_client.ocr_file(local_file_path=pdf_segment)
        os.remove(pdf_segment)

    # Combine the ocr segments
    final_ocr_path = pdf_path + ".txt"
    file_helper.concatenate_files(input_path_list=ocr_segments, output_path=final_ocr_path)
# Example #4 (score: 0)
def split_and_ocr_on_drive(
        pdf_path,
        google_key='/home/vvasuki/sysconf/kunchikA/google/sanskritnlp/service_account_key.json',
        small_pdf_pages=25,
        start_page=1,
        end_page=None):
    """
    OCR some pdf with google drive. Automatically splits into 25 page bits and ocrs them individually.
    
    Sometimes, the operation may time out, or you might get an Internal service error. In that case, try reducing small_pdf_pages.
    
    :param pdf_path:
    :param google_key: A json key which can be obtained from https://console.cloud.google.com/iam-admin/serviceaccounts (create a project, generate a key via "Actions" column.). 
    :param small_pdf_pages: Number of pages per segment - an argument used for splitting the pdf into small bits for OCR-ing. 
    :param start_page: First page (1-based) to include in the split.
    :param end_page: Last page to include; None means "through the end".
    :return: 
    """
    # TODO: If a PDF has layers, google drive ocr fails. Need to print into a pdf in such a case.
    # TODO: One does not need insane resolution to OCR. I guessed that file size and/or resolution is a critical factor in determining if OCR succeeds.  Could someone contribute python code to reduce the pdf resolution if it is too much?
    split_into_small_pdfs(pdf_path=pdf_path,
                          small_pdf_pages=small_pdf_pages,
                          start_page=start_page,
                          end_page=end_page)

    # Do the OCR
    from curation_utils.google import drive
    drive_client = drive.get_cached_client(google_key=google_key)
    # BUG FIX: Path.glob() returns a one-shot generator. The original code
    # drained it while building ocr_segments, so the OCR loop below iterated
    # an exhausted iterator and never OCR-ed anything. Materialize (and sort)
    # the listing exactly once. Also avoid shadowing the pdf_path parameter
    # inside the comprehension.
    pdf_segments = sorted(
        str(segment) for segment in Path(_get_ocr_dir(pdf_path)).glob("*.pdf"))
    ocr_segments = [segment + ".txt" for segment in pdf_segments]
    for pdf_segment in pdf_segments:
        drive_client.ocr_file(local_file_path=pdf_segment)
        os.remove(pdf_segment)

    # Combine the ocr segments
    final_ocr_path = pdf_path + ".txt"
    file_helper.concatenate_files(input_path_list=ocr_segments,
                                  output_path=final_ocr_path)
# Example #5 (score: 0)
def ocr_with_path(root_path):
    """OCR a single PDF file, or every .pdf directly inside a directory.

    Each PDF is split into 25-page chunks; each chunk is OCR-ed on Google
    Drive (compressed and re-split into 5-page pieces). The per-chunk OCR
    texts are concatenated into "<pdf>.txt", bad characters are cleared, and
    the temporary "<pdf stem>_small_originals" directory is removed.

    Chunks whose OCR fails are logged and skipped (best-effort).

    :param root_path: Path to a .pdf file, or to a directory containing them.
    :raises ValueError: If root_path is neither a directory nor a .pdf file.
    """
    # Build the list of PDFs to process.
    if os.path.isdir(root_path):
        pdf_list = [
            os.path.join(root_path, item) for item in os.listdir(root_path)
            if item.endswith(".pdf")
        ]
    elif os.path.isfile(root_path) and root_path.endswith("pdf"):
        pdf_list = [root_path]
    else:
        # Previously pdf_list was left unbound here, causing a NameError
        # below; fail loudly and clearly instead.
        raise ValueError(
            "root_path must be a directory or a .pdf file: %s" % root_path)

    for pdf_path in pdf_list:
        # Final OCR output for this pdf.
        ocr_file_for_pdf_path = pdf_path + ".txt"

        # Break the pdf into small 25-page pdfs.
        small_pdf_paths = break_to_small_pdf_paths_original(
            pdf_path,
            output_directory=None,
            start_page=1,
            end_page=None,
            small_pdf_pages=25,
        )
        # OCR outputs of each 25-page chunk, to be concatenated at the end.
        small_pdf_final_ocr_path_list = []
        for each_small_pdf_path in small_pdf_paths:
            try:
                # OCR each 25-page chunk by compressing it and re-splitting
                # into 5-page pieces (smaller, compressed pieces fail less
                # often on the Drive API).
                drive_ocr.split_and_ocr_on_drive(
                    pdf_path=each_small_pdf_path,
                    google_key="/path/to/key.json",
                    small_pdf_pages=5,
                    pdf_compression_power=1,  # raise for stronger compression
                )
                small_pdf_final_ocr_path_list.append(
                    each_small_pdf_path + ".txt")
            except Exception:
                # Best-effort: log (with traceback) and move on; remaining
                # chunks are still concatenated below.
                logging.exception("OCR failed for %s", each_small_pdf_path)
                continue

        # Combine the ocr segments
        small_pdf_final_ocr_path_list.sort()
        file_helper.concatenate_files(
            input_path_list=small_pdf_final_ocr_path_list,
            output_path=ocr_file_for_pdf_path,
        )
        # clear useless characters
        file_helper.clear_bad_chars_in_file(file_path=ocr_file_for_pdf_path)
        # delete useless PDF and split ocr
        output_directory = os.path.join(
            os.path.dirname(pdf_path),
            Path(pdf_path).stem + "_small_originals")
        if os.path.isdir(output_directory):
            shutil.rmtree(output_directory)