def test_get_text_lines_skip_parse_faulty_broadcom_doc(self):
    """
    Check that extracting text lines from the faulty PDF fixture raises an
    exception.
    """
    test_file = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
    try:
        # list() forces evaluation in case get_text_lines() is lazy, so that
        # any extraction error is raised inside this try block.
        list(pdf.get_text_lines(test_file))
    except Exception:
        # Expected outcome: extraction failed on the faulty PDF.
        pass
    else:
        # self.fail() raises to signal the failure, so it must live outside
        # the try block: a handler wrapped around it would swallow the
        # failure and the test could never fail.
        self.fail('Exception should be thrown on faulty PDF')
def unicode_text_lines_from_pdf(location):
    """
    Yield the text lines extracted from the pdf file at `location`, each one
    passed through as_unicode().
    """
    extracted_lines = pdf.get_text_lines(location)
    for raw_line in extracted_lines:
        yield as_unicode(raw_line)
def unicode_text_lines_from_pdf(location):
    """
    Generate the as_unicode() conversion of every text line that
    pdf.get_text_lines() extracts from the pdf file at `location`.
    """
    for pdf_line in pdf.get_text_lines(location):
        converted = as_unicode(pdf_line)
        yield converted
def test_get_text_lines(self):
    """
    Check that pdf.get_text_lines() on the 'pdf/pdf.pdf' test file equals the
    expected unicode text split into lines.
    """
    test_file = self.get_test_loc('pdf/pdf.pdf')
    result = pdf.get_text_lines(test_file)
    # splitlines(True) keeps each line's terminator, so all whitespace inside
    # this literal is significant to the comparison below.
    expected = u'''pdf """ Extracts text from a pdf file. """ import contextlib from StringIO import StringIO from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfpage import PDFPage from pdfminer.converter import TextConverter def get_text(location): rs_mgr = PDFResourceManager() extracted_text = StringIO() with contextlib.closing(TextConverter(rs_mgr, extracted_text)) as extractor: with open(location, \'rb\') as pdf_file: interpreter = PDFPageInterpreter(rs_mgr, extractor) pages = PDFPage.get_pages(pdf_file, check_extractable=True) for page in pages: interpreter.process_page(page) return extracted_text Page 1 \x0c'''.splitlines(True)
    assert expected == result
def test_get_text_lines(self):
    """
    Check that pdf.get_text_lines() on the 'pdf/pdf.pdf' test file equals the
    expected byte string split into lines.
    """
    test_file = self.get_test_loc('pdf/pdf.pdf')
    result = pdf.get_text_lines(test_file)
    # splitlines(True) keeps each line's terminator, so all whitespace inside
    # this literal is significant to the comparison below.
    expected = b'''pdf """ Extracts text from a pdf file. """ import contextlib from StringIO import StringIO from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfpage import PDFPage from pdfminer.converter import TextConverter def get_text(location): rs_mgr = PDFResourceManager() extracted_text = StringIO() with contextlib.closing(TextConverter(rs_mgr, extracted_text)) as extractor: with open(location, \'rb\') as pdf_file: interpreter = PDFPageInterpreter(rs_mgr, extractor) pages = PDFPage.get_pages(pdf_file, check_extractable=True) for page in pages: interpreter.process_page(page) return extracted_text Page 1 \x0c'''.splitlines(True)
    assert expected == result
def test_pdfminer_can_parse_apache_fop_test_pdf(self):
    """
    Open the Apache FOP test PDF with pdfminer's PDFParser and PDFDocument,
    then check that pdf.get_text_lines() returns apache_fop_expected.
    """
    fop_pdf = self.get_test_loc('pdf/fop_test_pdf_1.5_test.pdf')
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument

    with open(fop_pdf, 'rb') as pdf_stream:
        # The constructed document is discarded; presumably the call is here
        # only so that a pdfminer parsing failure raises.
        PDFDocument(PDFParser(pdf_stream))

    extracted = pdf.get_text_lines(fop_pdf)
    assert apache_fop_expected == extracted
def test_pdfminer_cant_parse_apache_fop_test_pdf(self):
    """
    Build a pdfminer PDFDocument from the Apache FOP test PDF, then check
    that the text lines extracted from it equal apache_fop_expected.

    NOTE(review): the test name says "cant parse", yet the body asserts a
    successful extraction -- confirm which of the two is intended.
    """
    pdf_location = self.get_test_loc('pdf/fop_test_pdf_1.5_test.pdf')
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument

    with open(pdf_location, 'rb') as fop_file:
        fop_parser = PDFParser(fop_file)
        # Only constructing the document matters here; it is not kept.
        PDFDocument(fop_parser)

    text_lines = pdf.get_text_lines(pdf_location)
    assert apache_fop_expected == text_lines
def test_get_text_lines_can_parse_faulty_broadcom_doc(self):
    """
    Check the exact byte lines that pdf.get_text_lines() produces for the
    'pdfminer_bug_118/faulty.pdf' fixture.
    """
    faulty_pdf = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
    extracted = list(pdf.get_text_lines(faulty_pdf))

    expected_lines = [
        b'Programmer\xe2\x80\x99s Guide\n',
        b'BCM5756M\n',
        b'\n',
        b'Host Programmer Interface Specification for the \n',
        b'NetXtreme\xc2\xae and NetLink\xe2\x84\xa2 Family of Highly \n',
        b'Integrated Media Access Controllers\n',
        b'\n',
        b'5300 California Avenue \xe2\x80\xa2 Irvine, CA 92617 (cid:129) Phone: 949-926-5000 (cid:129) Fax: 949-926-5203\n',
        b'\n',
        b'5756M-PG101-R\n',
        b'\n',
        b'10/15/07\n',
        b'\n',
        b'\x0c',
    ]
    assert expected_lines == extracted
def test_get_text_lines_skip_parse_faulty_broadcom_doc(self):
    """
    Check that the text lines extracted from the faulty PDF fixture match
    the expected list of byte strings.

    NOTE(review): the name says "skip parse", but the body asserts that
    text is extracted -- confirm the name is still accurate.
    """
    pdf_location = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
    text_lines = list(pdf.get_text_lines(pdf_location))

    wanted = [
        b'Programmer\xe2\x80\x99s Guide\n',
        b'BCM5756M\n',
        b'\n',
        b'Host Programmer Interface Specification for the \n',
        b'NetXtreme\xc2\xae and NetLink\xe2\x84\xa2 Family of Highly \n',
        b'Integrated Media Access Controllers\n',
        b'\n',
        b'5300 California Avenue \xe2\x80\xa2 Irvine, CA 92617 (cid:129) Phone: 949-926-5000 (cid:129) Fax: 949-926-5203\n',
        b'\n',
        b'5756M-PG101-R\n',
        b'\n',
        b'10/15/07\n',
        b'\n',
        b'\x0c',
    ]
    assert wanted == text_lines
def test_pdfminer_can_parse_apache_fop_test_pdf(self):
    """
    Check that every entry of apache_fop_expected is among the text lines
    extracted from the Apache FOP test PDF.
    """
    test_file = self.get_test_loc('pdf/fop_test_pdf_1.5_test.pdf')
    # Materialize the lines once. If get_text_lines() returns a single-pass
    # iterator, each `in` test would otherwise consume it and later
    # membership checks would depend on what earlier ones already read.
    result = list(pdf.get_text_lines(test_file))
    for expected in apache_fop_expected:
        assert expected in result
def test_get_text_lines_skip_parse_faulty_broadcom_doc(self):
    """
    Check that pdf.get_text_lines() returns an empty list for the faulty PDF
    fixture.
    """
    # Presumably a regression test for pdfminer bug 118, going by the
    # fixture directory name -- TODO confirm.
    faulty_pdf = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
    extracted = pdf.get_text_lines(faulty_pdf)
    assert [] == extracted