Example #1
0
 def test_get_text_lines_skip_parse_faulty_broadcom_doc(self):
     # Extracting text lines from this faulty PDF is expected to raise.
     test_file = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
     try:
         pdf.get_text_lines(test_file)
     except Exception:
         # Expected outcome: the extraction raised on the faulty document.
         return
     # Reached only when no exception was raised above. The failure is
     # reported outside of the try block on purpose: self.fail() raises an
     # AssertionError, which a handler wrapping it would swallow, making it
     # impossible for this test to ever fail.
     self.fail('Exception should be thrown on faulty PDF')
Example #2
0
def unicode_text_lines_from_pdf(location):
    """
    Yield each text line extracted from the pdf file at `location`, passing
    every line through as_unicode() first.
    """
    extracted_lines = pdf.get_text_lines(location)
    for raw_line in extracted_lines:
        yield as_unicode(raw_line)
Example #3
0
def unicode_text_lines_from_pdf(location):
    """
    Generate the text lines of the pdf file at `location`, one at a time,
    each converted with as_unicode().
    """
    pdf_lines = pdf.get_text_lines(location)
    for pdf_line in pdf_lines:
        yield as_unicode(pdf_line)
Example #4
0
    def test_get_text_lines(self):
        test_file = self.get_test_loc('pdf/pdf.pdf')
        result = pdf.get_text_lines(test_file)
        expected = u'''pdf

"""
Extracts text from a pdf file.
"""
import contextlib
from StringIO import StringIO

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter

def get_text(location):
    rs_mgr = PDFResourceManager()
    extracted_text = StringIO()
    with contextlib.closing(TextConverter(rs_mgr, extracted_text)) as extractor:
        with open(location, \'rb\') as pdf_file:
            interpreter = PDFPageInterpreter(rs_mgr, extractor)
            pages = PDFPage.get_pages(pdf_file, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
    return extracted_text

Page 1

\x0c'''.splitlines(True)

        assert expected == result
Example #5
0
    def test_get_text_lines(self):
        test_file = self.get_test_loc('pdf/pdf.pdf')
        result = pdf.get_text_lines(test_file)
        expected = b'''pdf

"""
Extracts text from a pdf file.
"""
import contextlib
from StringIO import StringIO

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter

def get_text(location):
    rs_mgr = PDFResourceManager()
    extracted_text = StringIO()
    with contextlib.closing(TextConverter(rs_mgr, extracted_text)) as extractor:
        with open(location, \'rb\') as pdf_file:
            interpreter = PDFPageInterpreter(rs_mgr, extractor)
            pages = PDFPage.get_pages(pdf_file, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
    return extracted_text

Page 1

\x0c'''.splitlines(True)

        assert expected == result
Example #6
0
    def test_pdfminer_can_parse_apache_fop_test_pdf(self):
        # First build a pdfminer PDFDocument straight from the file, then
        # compare the extracted text lines with the expected ones.
        fop_pdf = self.get_test_loc('pdf/fop_test_pdf_1.5_test.pdf')
        from pdfminer.pdfparser import PDFParser
        from pdfminer.pdfdocument import PDFDocument
        with open(fop_pdf, 'rb') as pdf_stream:
            PDFDocument(PDFParser(pdf_stream))

        result = pdf.get_text_lines(fop_pdf)
        assert apache_fop_expected == result
Example #7
0
    def test_pdfminer_cant_parse_apache_fop_test_pdf(self):
        # Build a pdfminer PDFDocument straight from the file, then compare
        # the extracted text lines with the expected ones.
        # NOTE(review): the name says "cant parse" but the body asserts a
        # successful extraction -- confirm which one is intended.
        fop_pdf = self.get_test_loc('pdf/fop_test_pdf_1.5_test.pdf')
        from pdfminer.pdfparser import PDFParser
        from pdfminer.pdfdocument import PDFDocument
        with open(fop_pdf, 'rb') as pdf_stream:
            PDFDocument(PDFParser(pdf_stream))

        result = pdf.get_text_lines(fop_pdf)
        assert apache_fop_expected == result
Example #8
0
 def test_get_text_lines_can_parse_faulty_broadcom_doc(self):
     # The extracted lines of the faulty Broadcom pdf must match exactly
     # the byte strings listed below, in the same order.
     faulty_pdf = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
     extracted = pdf.get_text_lines(faulty_pdf)
     result = list(extracted)
     expected = [
         b'Programmer\xe2\x80\x99s Guide\n',
         b'BCM5756M\n',
         b'\n',
         b'Host Programmer Interface Specification for the \n',
         b'NetXtreme\xc2\xae and NetLink\xe2\x84\xa2 Family of Highly \n',
         b'Integrated Media Access Controllers\n',
         b'\n',
         b'5300 California Avenue  \xe2\x80\xa2  Irvine, CA 92617  (cid:129)  Phone: 949-926-5000  (cid:129)  Fax: 949-926-5203\n',
         b'\n',
         b'5756M-PG101-R\n',
         b'\n',
         b'10/15/07\n',
         b'\n',
         b'\x0c',
     ]
     assert expected == result
Example #9
0
 def test_get_text_lines_skip_parse_faulty_broadcom_doc(self):
     # The extracted lines of the faulty Broadcom pdf must match exactly
     # the byte strings listed below, in the same order.
     faulty_pdf = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
     extracted = pdf.get_text_lines(faulty_pdf)
     result = list(extracted)
     expected = [
         b'Programmer\xe2\x80\x99s Guide\n', b'BCM5756M\n', b'\n',
         b'Host Programmer Interface Specification for the \n',
         b'NetXtreme\xc2\xae and NetLink\xe2\x84\xa2 Family of Highly \n',
         b'Integrated Media Access Controllers\n', b'\n',
         b'5300 California Avenue  \xe2\x80\xa2  Irvine, CA 92617  (cid:129)  Phone: 949-926-5000  (cid:129)  Fax: 949-926-5203\n',
         b'\n', b'5756M-PG101-R\n', b'\n', b'10/15/07\n', b'\n', b'\x0c',
     ]
     assert expected == result
Example #10
0
 def test_pdfminer_can_parse_apache_fop_test_pdf(self):
     # Every expected line must be found among the lines extracted from
     # the Apache FOP test pdf.
     fop_pdf = self.get_test_loc('pdf/fop_test_pdf_1.5_test.pdf')
     extracted = pdf.get_text_lines(fop_pdf)
     for expected_line in apache_fop_expected:
         assert expected_line in extracted
Example #11
0
 def test_get_text_lines_skip_parse_faulty_broadcom_doc(self):
     # The faulty Broadcom pdf is expected to produce no text lines at all.
     faulty_pdf = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
     extracted = pdf.get_text_lines(faulty_pdf)
     assert [] == extracted