コード例 #1
0
def test_split_pdf_text():
    """Check that splitting a 9-page PDF with a block size argument of 4 gives three blocks.

    The fixture ``pdf_9_pages.pdf`` is expected to contain the marker text
    ``This is page N.`` on page N.  Pages 1-4 must land in the first block,
    pages 5-8 in the second and page 9 in the third, and no marker may show
    up in any block other than its own.
    """
    fn = os.path.join(data_dir, 'pdf_9_pages.pdf')
    # Page numbers expected in each block, in block order.
    expected_pages = [(1, 2, 3, 4), (5, 6, 7, 8), (9,)]

    with split_pdf_to_page_blocks(fn, 4) as block_files:
        # Verify the block count before indexing into the result so that a
        # wrong number of blocks is reported as a failed assertion instead
        # of surfacing as an IndexError.
        assert len(block_files) == len(expected_pages)

        # str() mirrors the original test, which coerced the extractor output.
        block_texts = [str(extract_text_pdfminer(block_fn))
                       for block_fn in block_files]

        for owner_index, pages in enumerate(expected_pages):
            for page in pages:
                marker = 'This is page {}.'.format(page)
                for block_index, text in enumerate(block_texts):
                    if block_index == owner_index:
                        assert marker in text
                    else:
                        # A marker leaking into another block means the
                        # page ranges overlap.
                        assert marker not in text
コード例 #2
0
def test_ocr_page():
    """OCR every page image of ``ocr1.pdf`` and check the recognised text.

    Each page image produced by ``extract_page_images`` is passed through
    ``ocr_page_to_pdf``; the text of the resulting PDFs is concatenated
    (each chunk prefixed with a newline), double spaces are collapsed once,
    and a few known phrases are looked up in the result.
    """
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')
    chunks = []
    with extract_page_images(source_pdf) as image_fns:
        for page_image in image_fns:
            with ocr_page_to_pdf(page_image) as ocr_pdf:
                chunks.append('\n' + extract_text_pdfminer(ocr_pdf))

    txt = ''.join(chunks).replace('  ', ' ')

    expected_phrases = (
        'each Contributor hereby grants to You',
        'You may add Your own',
        'Submission of Contributions',
        'END OF TERMS AND CONDITIONS',
    )
    for phrase in expected_phrases:
        assert phrase in txt
コード例 #3
0
 def assert_pdf(fn: str):
     """Verify the PDF at ``fn``: 110 occurrences of 'document' and 3 pages."""
     occurrences = extract_text_pdfminer(fn).count('document')
     assert occurrences == 110
     with pikepdf.open(fn) as pdf:
         page_total = len(pdf.pages)
         assert page_total == 3
コード例 #4
0
 def assert_pdf(fn: str):
     """Verify the PDF at ``fn``: 104 occurrences of 'This' and 2 pages."""
     occurrences = extract_text_pdfminer(fn).count('This')
     assert occurrences == 104
     with pikepdf.open(fn) as pdf:
         page_total = len(pdf.pages)
         assert page_total == 2
コード例 #5
0
 def assert_pdf(fn: str):
     """Verify that the text extracted from the PDF at ``fn`` contains 'fitting' 144 times."""
     txt = extract_text_pdfminer(fn)
     # The xlsx -> pdf conversion does not go well for documents that do not
     # fit on the pages, but at least all of the text should be kept.
     assert txt.count('fitting') == 144  # expected count taken from the original xlsx