def test_split_pdf_text():
    """Check how the text of a 9-page PDF is distributed across split blocks.

    ``pdf_9_pages.pdf`` is split via ``split_pdf_to_page_blocks(fn, 4)``.
    The test expects exactly three block files: the text of pages 1-4 in
    the first one, pages 5-8 in the second one and page 9 in the third one.
    A sample of "wrong" page/block pairs is checked as well to make sure
    page text does not leak into a neighbouring block.
    """
    fn = os.path.join(data_dir, 'pdf_9_pages.pdf')
    with split_pdf_to_page_blocks(fn, 4) as block_files:
        # Verify the block count before indexing into the list so that a
        # wrong split is reported as a failed assertion, not an IndexError.
        assert len(block_files) == 3
        block_texts = [str(extract_text_pdfminer(block_fn))
                       for block_fn in block_files]

    # Page number -> index of the block that must contain the page text.
    expected_block = {1: 0, 2: 0, 3: 0, 4: 0,
                      5: 1, 6: 1, 7: 1, 8: 1,
                      9: 2}
    for page, block in expected_block.items():
        assert f'This is page {page}.' in block_texts[block]

    # Page number -> index of a block that must NOT contain the page text.
    foreign_block = {1: 1, 2: 2, 3: 1, 4: 2, 5: 0, 6: 2, 7: 0, 8: 2, 9: 1}
    for page, block in foreign_block.items():
        assert f'This is page {page}.' not in block_texts[block]
def test_ocr_page():
    """OCR every page image of ``ocr1.pdf`` and check the recognized text.

    Each page image produced by ``extract_page_images`` is passed through
    ``ocr_page_to_pdf``; the text extracted from the resulting PDFs is
    concatenated and searched for a few known phrases.
    """
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')
    page_texts = []
    with extract_page_images(source_pdf) as image_fns:
        for image_fn in image_fns:
            with ocr_page_to_pdf(image_fn) as ocred_pdf_fn:
                page_texts.append(extract_text_pdfminer(ocred_pdf_fn))

    # Every page text is prefixed with a newline, so the combined text
    # starts with '\n' whenever there is at least one page.
    txt = ''.join('\n' + page_text for page_text in page_texts)
    # NOTE(review): as written both arguments look like a single space,
    # which would make this call a no-op - the literal may have lost
    # whitespace (e.g. a double or non-breaking space); confirm the intent.
    txt = txt.replace(' ', ' ')

    expected_phrases = (
        'each Contributor hereby grants to You',
        'You may add Your own',
        'Submission of Contributions',
        'END OF TERMS AND CONDITIONS',
    )
    for phrase in expected_phrases:
        assert phrase in txt
def assert_pdf(fn: str):
    """Validate the PDF at ``fn``: word frequency first, then page count.

    The extracted text must contain the substring 'document' exactly 110
    times and the PDF opened with pikepdf must have exactly 3 pages.
    """
    extracted_text = extract_text_pdfminer(fn)
    occurrences = extracted_text.count('document')
    assert occurrences == 110

    with pikepdf.open(fn) as opened_pdf:
        page_total = len(opened_pdf.pages)
        assert page_total == 3
def assert_pdf(fn: str):
    """Validate the PDF at ``fn``: word frequency first, then page count.

    The extracted text must contain the substring 'This' exactly 104 times
    and the PDF opened with pikepdf must have exactly 2 pages.
    """
    extracted_text = extract_text_pdfminer(fn)
    occurrences = extracted_text.count('This')
    assert occurrences == 104

    with pikepdf.open(fn) as opened_pdf:
        page_total = len(opened_pdf.pages)
        assert page_total == 2
def assert_pdf(fn: str):
    """Validate that the PDF at ``fn`` kept all the text of its source.

    The extracted text must contain the substring 'fitting' exactly 144
    times.
    """
    extracted_text = extract_text_pdfminer(fn)
    # The xlsx -> pdf conversion does not work well for documents which do
    # not fit on the pages, but at least all of the text should be kept.
    # The expected number was counted by hand in the original xlsx.
    occurrences = extracted_text.count('fitting')
    assert occurrences == 144