def test_ocr_rotated_small_angle():
    """OCR the sample 'rotated_small_angle.pdf' and check the recognized text.

    The page image is extracted, OCRed into a PDF, and the text extracted
    from that PDF must contain the word 'rotated'.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    # The three context managers are entered left to right and exited in
    # reverse order, exactly like the equivalent nested ``with`` statements.
    with extract_page_images(source_pdf, 1, 1) as page_images, \
            ocr_page_to_pdf(page_images[0]) as ocred_pdf, \
            extract_text_and_structure(ocred_pdf) as (text, _struct, _s, _d):
        assert 'rotated' in text
def p():
    """Scratch helper: OCR page images of a PDF and copy the artifacts out.

    For every image produced by ``extract_page_images(..., 54, 54)`` the
    image itself and the OCRed PDF built from it are copied to ``'..'``,
    then text and structure are extracted from the OCRed PDF.
    """
    source_pdf = '..'
    with extract_page_images(source_pdf, 54, 54) as page_images:
        for page_image in page_images:
            # Keep a copy of the raw page image for manual inspection.
            shutil.copy(page_image, '..')
            with ocr_page_to_pdf(page_image) as ocred_pdf:
                # ... and a copy of the OCRed page PDF.
                shutil.copy(ocred_pdf, '..')
                # NOTE(review): other callers use extract_text_and_structure
                # as a context manager yielding four values - confirm that the
                # plain two-value call below matches the current signature.
                text, struct = extract_text_and_structure(ocred_pdf)
def test_ocr_page():
    """OCR every page image of 'ocr1.pdf' and check for known phrases.

    Each page image is OCRed into a PDF, its text is extracted with
    pdfminer, and the per-page texts are concatenated, each one prefixed
    with a newline.
    """
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')

    page_texts = []
    with extract_page_images(source_pdf) as page_images:
        for page_image in page_images:
            with ocr_page_to_pdf(page_image) as ocred_pdf:
                page_texts.append(extract_text_pdfminer(ocred_pdf))

    txt = ''.join('\n' + page_text for page_text in page_texts)
    # NOTE(review): as written both arguments are a single space, which makes
    # this call a no-op - possibly whitespace was lost; confirm the intended
    # normalization (e.g. collapsing double spaces).
    txt = txt.replace(' ', ' ')

    expected_phrases = (
        'each Contributor hereby grants to You',
        'You may add Your own',
        'Submission of Contributions',
        'END OF TERMS AND CONDITIONS',
    )
    for phrase in expected_phrases:
        assert phrase in txt
def p2():
    """Scratch helper: OCR page 50 of a local sample PDF and save the result.

    Steps:
      1. split the original PDF into page files and take the 50th one;
      2. render the OCR image of that page file at 300 DPI;
      3. OCR the image into a glyph-less text-layer PDF with Tesseract
         orientation detection enabled;
      4. merge the text layer into page 1 of the page file;
      5. copy the merged PDF to ``...__00050.ocred.pdf``.

    Previously the merge was applied to page 1 of the whole original document
    and its result was discarded: the un-OCRed page file was copied to the
    ``.ocred.pdf`` destination instead of the merged PDF.
    """
    from text_extraction_system.pdf.pdf import merge_pdf_pages, split_pdf_to_page_blocks
    from text_extraction_system.ocr.ocr import ocr_page_to_pdf
    import shutil

    orig_pdf_fn = '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C.pdf'
    ocred_copy_fn = '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C__00050.ocred.pdf'

    with split_pdf_to_page_blocks(orig_pdf_fn) as page_fns:
        # Index 49 is the 50th entry, matching the "__00050" suffix of the output.
        page_fn = page_fns[49]
        with extract_page_ocr_images(page_fn, 1, 1, dpi=300) as images:
            with ocr_page_to_pdf(images.get(1),
                                 glyphless_text_only=True,
                                 tesseract_page_orientation_detection=True) as ocred_page_pdf:  # type: str
                # The text layer was produced from page 1 of page_fn, so it is
                # merged back into that same page file, not into the original.
                with merge_pdf_pages(page_fn,
                                     single_page_merge_num_file_rotate=(1, ocred_page_pdf, None)) as final_pdf:
                    # Save the merged (OCRed) PDF, not the untouched page file.
                    shutil.copy(final_pdf, ocred_copy_fn)
def test_table_ocr():
    """OCR 'table1.png' and check that exactly one table is detected.

    ``warnings.warn`` is replaced with a mock for the duration of the test so
    the test can assert that no warning was emitted. The original function is
    restored in ``finally``; without that the mock would stay installed for
    every test that runs afterwards in the same process.
    """
    fn = os.path.join(data_dir, 'table1.png')
    # ``name=`` only labels the mock; a positional first argument would be
    # interpreted by MagicMock as ``spec``.
    warn_mock = MagicMock(name='warn')

    original_warn = warnings.warn
    warnings.warn = warn_mock
    try:
        # Imported after patching, as before, so that anything emitted at
        # import time is also seen by the mock.
        from text_extraction_system.ocr.ocr import ocr_page_to_pdf

        with ocr_page_to_pdf(fn) as pdf_fn:
            with open(pdf_fn, 'rb') as ocred_in_file:
                ocred_page_layout = data_extract.get_first_page_layout(
                    ocred_in_file)
                camelot_tables = extract_tables(1, ocred_page_layout, fn)

        assert len(camelot_tables) == 1
        warn_mock.assert_not_called()
    finally:
        # Always undo the global monkeypatch, even when an assertion fails.
        warnings.warn = original_warn
def process_pdf_page(
        pdf_fn: str,
        page_num: int,
        ocr_enabled: bool = True,
        ocr_language: str = None,
        ocr_timeout_sec: int = 60,
        pdf_password: str = None
) -> Generator[PDFPageProcessingResults, None, None]:
    """Yield the processing result for a PDF page, OCRing it when needed.

    Exactly one ``PDFPageProcessingResults`` is yielded:

    * ``page_requires_ocr=True`` with ``ocred_page_fn`` set to a text-layer
      PDF when OCR is enabled and a "no-text" image of the page could be
      extracted. The value is yielded from inside the ``ocr_page_to_pdf``
      context, and the generator finishes once it is resumed.
    * ``page_requires_ocr=False`` otherwise.

    :param pdf_fn: path of the PDF file; page 1 of it is the page examined.
    :param page_num: not referenced in this body.
        NOTE(review): confirm whether it is kept only for callers.
    :param ocr_enabled: when false, no image extraction or OCR is attempted.
    :param ocr_language: passed to ``ocr_page_to_pdf`` as ``language``.
    :param ocr_timeout_sec: passed to ``ocr_page_to_pdf`` as ``timeout``.
    :param pdf_password: passed to ``extract_page_ocr_images``.
    """
    # NOTE(review): the file is opened (so a missing/unreadable path fails
    # here), but the handle itself is never read in this block.
    with open(pdf_fn, 'rb'):
        if ocr_enabled:
            # Try extracting the "no-text" image of the page: everything is
            # removed from the page except images that do not overlap any
            # text element. OCRing only that image avoids duplicating text
            # which is already present in the PDF.
            ocr_images_ctx = extract_page_ocr_images(pdf_fn,
                                                     start_page=1,
                                                     end_page=1,
                                                     pdf_password=pdf_password,
                                                     dpi=DPI,
                                                     reset_page_rotation=False)
            with ocr_images_ctx as images_by_page:
                if images_by_page:
                    text_free_image_fn = images_by_page.get(1)
                else:
                    text_free_image_fn = None

                if text_free_image_fn:
                    # Produces a text-based PDF holding glyph-less text only,
                    # meant to be merged in front of the original page layout.
                    with ocr_page_to_pdf(
                            page_image_fn=text_free_image_fn,
                            language=ocr_language,
                            timeout=ocr_timeout_sec,
                            glyphless_text_only=True,
                            tesseract_page_orientation_detection=True
                    ) as text_layer_pdf_fn:
                        # Only the transparent text layer is handed out, not a
                        # merged page: the final step merges these layers in
                        # front of the pages of the original PDF to keep its
                        # small size and its structure/bookmarks.
                        yield PDFPageProcessingResults(
                            page_requires_ocr=True,
                            ocred_page_fn=text_layer_pdf_fn)
                        return

        # OCR is disabled or there was nothing to OCR.
        yield PDFPageProcessingResults(page_requires_ocr=False)
from text_extraction_system.ocr.ocr import ocr_page_to_pdf
from text_extraction_system.pdf.pdf import merge_pdf_pages, log
import shutil
from text_extraction_system.commons.tests.commons import default_settings
from logging import DEBUG

# Scratch script: OCR a "no-text" page image into a glyph-less text-layer PDF,
# merge that layer into page 1 of the matching single-page PDF and copy the
# merged file into the 'ocred' directory. Everything runs inside
# default_settings() with the pdf module logger switched to DEBUG.
with default_settings():
    log.setLevel(DEBUG)
    # Both context managers are entered in order and exited in reverse,
    # exactly like the equivalent nested ``with`` statements.
    with ocr_page_to_pdf('/home/mikhail/lexpredict/misc/ocr_complicated1/page_no_text_00034.png',
                         glyphless_text_only=True) as fn, \
            merge_pdf_pages('/home/mikhail/lexpredict/misc/ocr_complicated1/ocr_complicated1_0034.pdf',
                            single_page_merge_num_file_rotate=(1, fn, None)) as fn1:
        shutil.copy(fn1, '/home/mikhail/lexpredict/misc/ocr_complicated1/ocred/')