Python ocr_page_to_pdfの例

プログラミング言語: Python

名前空間/パッケージ名: text_extraction_system.ocr.ocr

メソッド/関数: ocr_page_to_pdf

hotexamples.comのコード掲載数: 7

Python ocr_page_to_pdf - 7件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのtext_extraction_system.ocr.ocr.ocr_page_to_pdfの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

def test_ocr_rotated_small_angle():
    """Check that the word 'rotated' is found in the OCR output of page 1.

    The first page of ``rotated_small_angle.pdf`` is rendered to an image,
    OCRed into a PDF, and the text extracted from that PDF is inspected.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    # One flat ``with`` statement; the managers are entered left to right
    # and exited in reverse order, exactly like nested ``with`` blocks.
    with extract_page_images(source_pdf, 1, 1) as page_images, \
            ocr_page_to_pdf(page_images[0]) as ocred_pdf, \
            extract_text_and_structure(ocred_pdf) as (text, _structure, _s, _d):
        assert 'rotated' in text

コード例 #2

ファイルを表示

def p():
    """Debug helper: OCR one page image and copy the intermediate files.

    Page range 54..54 of the source PDF is rendered to images; each image
    and the PDF produced from it by OCR are copied to the output location,
    and text extraction is run on the OCRed PDF. The paths are the
    placeholder value '..'.
    """
    source_pdf = '..'
    output_dir = '..'
    with extract_page_images(source_pdf, 54, 54) as page_images:
        for page_image in page_images:
            shutil.copy(page_image, output_dir)
            with ocr_page_to_pdf(page_image) as ocred_pdf:
                shutil.copy(ocred_pdf, output_dir)
                # NOTE(review): another example in this file uses
                # extract_text_and_structure as a context manager yielding
                # four values - confirm this 2-value unpacking matches the
                # API version in use. The results are not used afterwards.
                _text, _structure = extract_text_and_structure(ocred_pdf)

コード例 #3

ファイルを表示

def test_ocr_page():
    """OCR every page of ``ocr1.pdf`` and check for known phrases.

    Each page image is OCRed into a PDF, its text is extracted with
    pdfminer, and the per-page texts (each prefixed with a newline) are
    concatenated. Double spaces are collapsed once before matching.
    """
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')
    page_texts = []
    with extract_page_images(source_pdf) as page_images:
        for page_image in page_images:
            with ocr_page_to_pdf(page_image) as ocred_pdf:
                page_texts.append('\n' + extract_text_pdfminer(ocred_pdf))
    full_text = ''.join(page_texts).replace('  ', ' ')

    expected_phrases = (
        'each Contributor hereby grants to You',
        'You may add Your own',
        'Submission of Contributions',
        'END OF TERMS AND CONDITIONS',
    )
    for phrase in expected_phrases:
        assert phrase in full_text

コード例 #4

ファイルを表示

def p2():
    """Debug helper: OCR one page block of a PDF and merge the text layer.

    The source PDF is split into page blocks, the block at index 49 is
    rendered to an OCR image at 300 dpi, the image is OCRed into a
    glyphless-text PDF with orientation detection enabled, and that PDF is
    passed to ``merge_pdf_pages`` together with the source PDF. Finally the
    page block file is copied to a fixed path.
    """
    from text_extraction_system.pdf.pdf import merge_pdf_pages, split_pdf_to_page_blocks
    from text_extraction_system.ocr.ocr import ocr_page_to_pdf
    import shutil
    source_pdf = '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C.pdf'
    copy_target = '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C__00050.ocred.pdf'
    page = 1  # NOTE(review): never read below - candidate for removal

    with split_pdf_to_page_blocks(source_pdf) as block_fns:
        block_fn = block_fns[49]
        # Flat ``with``: entered left to right, exited in reverse order,
        # same as the equivalent nested blocks.
        with extract_page_ocr_images(block_fn, 1, 1, dpi=300) as page_images, \
                ocr_page_to_pdf(page_images.get(1),
                                glyphless_text_only=True,
                                tesseract_page_orientation_detection=True) as text_layer_pdf, \
                merge_pdf_pages(source_pdf,
                                single_page_merge_num_file_rotate=(1, text_layer_pdf, None)) as merged_pdf:
            # NOTE(review): merged_pdf is not used; the file copied to the
            # '.ocred.pdf' target is the page block itself - confirm intent.
            shutil.copy(block_fn, copy_target)

コード例 #5

ファイルを表示

ファイル: test_data_extract.py プロジェクト: LexPredict/text-extraction-system

def test_table_ocr():
    """OCR a table image and check that one table is found without warnings.

    ``table1.png`` is OCRed into a PDF, the first page layout of that PDF is
    read, and tables are extracted from it. The test passes when exactly one
    table is returned and ``warnings.warn`` was never called meanwhile.

    ``warnings.warn`` is replaced by a mock only for the duration of the OCR
    and table extraction; the original function is restored afterwards so
    the patch does not leak into other tests in the same process.
    """
    fn = os.path.join(data_dir, 'table1.png')
    # ``name=`` labels the mock. A bare positional argument would be taken
    # as ``spec`` and make the mock imitate a ``str`` object.
    warn_mock = MagicMock(name='warn')
    original_warn = warnings.warn
    warnings.warn = warn_mock
    try:
        # Imported after patching, as in the original test, so that warnings
        # raised at import time are also captured by the mock.
        from text_extraction_system.ocr.ocr import ocr_page_to_pdf

        with ocr_page_to_pdf(fn) as pdf_fn:
            with open(pdf_fn, 'rb') as ocred_in_file:
                ocred_page_layout = data_extract.get_first_page_layout(
                    ocred_in_file)
                camelot_tables = extract_tables(1, ocred_page_layout, fn)
    finally:
        # Always undo the global monkeypatch, even if OCR/extraction fails.
        warnings.warn = original_warn

    assert len(camelot_tables) == 1
    warn_mock.assert_not_called()

コード例 #6

ファイルを表示

def process_pdf_page(
    pdf_fn: str,
    page_num: int,
    ocr_enabled: bool = True,
    ocr_language: str = None,
    ocr_timeout_sec: int = 60,
    pdf_password: str = None
) -> Generator[PDFPageProcessingResults, None, None]:
    """Yield a single processing result for the PDF page stored in ``pdf_fn``.

    When OCR is enabled and a "no-text" image can be extracted for page 1 of
    ``pdf_fn``, that image is OCRed into a glyphless-text PDF and the yielded
    result has ``page_requires_ocr=True`` with ``ocred_page_fn`` set to the
    OCRed PDF path. In every other case the yielded result has
    ``page_requires_ocr=False``.

    The yield happens while the surrounding ``with`` blocks are still open.

    NOTE(review): ``page_num`` is not referenced in this body, and the file
    handle opened on ``pdf_fn`` is never read - the ``open`` call only keeps
    the file open (and fails early if it cannot be opened).

    :param pdf_fn: path of the PDF file to process.
    :param page_num: not used in this function body.
    :param ocr_enabled: when false, no image extraction or OCR is attempted.
    :param ocr_language: forwarded to ``ocr_page_to_pdf`` as ``language``.
    :param ocr_timeout_sec: forwarded to ``ocr_page_to_pdf`` as ``timeout``.
    :param pdf_password: forwarded to ``extract_page_ocr_images``.
    """
    with open(pdf_fn, 'rb') as _pdf_handle:
        if ocr_enabled:
            # Ask for the "no-text" image of the page: per the original
            # author's note, everything is removed from the page except
            # images that do not overlap any text element, so that OCR does
            # not duplicate text the PDF already contains.
            with extract_page_ocr_images(
                    pdf_fn,
                    start_page=1,
                    end_page=1,
                    pdf_password=pdf_password,
                    dpi=DPI,
                    reset_page_rotation=False) as page_images:
                if page_images:
                    textless_image_fn = page_images.get(1)
                else:
                    textless_image_fn = None

                if textless_image_fn:
                    # Produces a text-based PDF holding glyph-less text only,
                    # meant to be merged in front of the original page layout.
                    ocr_options = dict(
                        page_image_fn=textless_image_fn,
                        language=ocr_language,
                        timeout=ocr_timeout_sec,
                        glyphless_text_only=True,
                        tesseract_page_orientation_detection=True,
                    )
                    with ocr_page_to_pdf(**ocr_options) as text_layer_pdf_fn:
                        # Only the transparent text layer is handed back, not
                        # a merged page: the final step merges these layers
                        # in front of the pages of the original PDF to keep
                        # its small size and structure/bookmarks.
                        yield PDFPageProcessingResults(
                            page_requires_ocr=True,
                            ocred_page_fn=text_layer_pdf_fn)
                        return
        # No OCR was needed (or possible) for this page.
        yield PDFPageProcessingResults(page_requires_ocr=False)

コード例 #7

ファイルを表示

ファイル: debug1.py プロジェクト: LexPredict/text-extraction-system

from text_extraction_system.ocr.ocr import ocr_page_to_pdf
from text_extraction_system.pdf.pdf import merge_pdf_pages, log
import shutil
from text_extraction_system.commons.tests.commons import default_settings
from logging import DEBUG

# Debug script: OCR one page image into a glyphless-text PDF, merge that PDF
# with the corresponding page PDF and copy the merged file to the output dir.
with default_settings():
    log.setLevel(DEBUG)
    # Single flat ``with``: the OCR result ``fn`` is created first and is
    # consumed by ``merge_pdf_pages``; both are cleaned up in reverse order.
    with ocr_page_to_pdf(
            '/home/mikhail/lexpredict/misc/ocr_complicated1/page_no_text_00034.png',
            glyphless_text_only=True) as fn, \
            merge_pdf_pages(
                '/home/mikhail/lexpredict/misc/ocr_complicated1/ocr_complicated1_0034.pdf',
                single_page_merge_num_file_rotate=(1, fn, None)) as fn1:
        shutil.copy(fn1, '/home/mikhail/lexpredict/misc/ocr_complicated1/ocred/')