Python PDFParser Examples

Programming Language: Python

Namespace/Package Name: pdfparser

Class/Type: PDFParser

Examples at hotexamples.com: 8

Python PDFParser - 8 examples found. These are the top-rated real-world Python examples of pdfparser.PDFParser, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

PDFParser(6)

parse(3)

get_processed_stems(2)

set_document(2)

get_text(1)

Example #1

Show file

def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
                check_extractable=True):
    """Feed the selected pages of the PDF read from ``fp`` to an interpreter.

    Args:
        rsrcmgr: Passed as the first argument to ``PDFPageInterpreter``.
        device: Passed as the second argument to ``PDFPageInterpreter``.
        fp: Object handed to ``PDFParser``; it also appears (via ``%r``) in
            the error message.
        pagenos: When truthy, only pages whose zero-based index is contained
            in it are processed; when falsy, every page is processed.
        maxpages: When truthy, iteration stops right after a page has been
            processed whose one-based position is at least ``maxpages``.
            Pages skipped because of ``pagenos`` never trigger the stop.
        password: Value given to ``doc.initialize``; the default is ``''``.
        check_extractable: When true, ``doc.is_extractable`` is consulted
            and a falsy value aborts the run.

    Raises:
        PDFTextExtractionNotAllowed: If ``check_extractable`` is true and
            ``doc.is_extractable`` is falsy.
    """
    # Build the parser around the input, then an (initially empty) document.
    parser = PDFParser(fp)
    doc = PDFDocument()

    # Link the parser and the document to each other.
    parser.set_document(doc)
    doc.set_parser(parser)

    # Initialise the document with the supplied password.
    doc.initialize(password)

    # The extractability flag is only read when the caller asked for it.
    if check_extractable:
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: %r' % fp)

    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for index, page in enumerate(doc.get_pages()):
        # A falsy ``pagenos`` means "no filter"; otherwise the index must
        # be listed for the page to be handled.
        if not pagenos or index in pagenos:
            interpreter.process_page(page)
            # The page cap is only evaluated after a page was processed.
            if maxpages and index + 1 >= maxpages:
                break

Example #2

Show file

File: pdfinterp.py Project: ktisha/ebook-service

def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
    """Interpret the selected pages of the PDF read from ``fp``.

    Args:
        rsrc: First argument given to ``PDFPageInterpreter``.
        device: Second argument given to ``PDFPageInterpreter``.
        fp: Object handed to ``PDFParser``; also formatted with ``%r`` into
            the error message.
        pagenos: If truthy, restricts processing to pages whose zero-based
            index is contained in it.
        maxpages: If truthy, the loop ends right after processing a page
            whose one-based position is at least ``maxpages``.
        password: Forwarded to ``doc.initialize``; defaults to ``''``.

    Raises:
        PDFTextExtractionNotAllowed: If ``doc.is_extractable`` is falsy.
    """
    # Document first, then the parser that reads from ``fp``.
    doc = PDFDocument()
    parser = PDFParser(fp)

    # Link the two objects to each other before initialising.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)

    extractable = doc.is_extractable
    if not extractable:
        message = 'Text extraction is not allowed: %r' % fp
        raise PDFTextExtractionNotAllowed(message)

    interpreter = PDFPageInterpreter(rsrc, device)

    for position, page in enumerate(doc.get_pages()):
        # Skip pages that an active ``pagenos`` filter does not list.
        filtered_out = pagenos and (position not in pagenos)
        if filtered_out:
            continue
        interpreter.process_page(page)
        # ``position`` is zero-based, hence the +1 when comparing to the cap.
        reached_cap = maxpages and maxpages <= position + 1
        if reached_cap:
            break

Example #3

Show file

def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
    """Process pages of the PDF behind ``fp`` with a ``PDFPageInterpreter``.

    Args:
        rsrc: First constructor argument for ``PDFPageInterpreter``.
        device: Second constructor argument for ``PDFPageInterpreter``.
        fp: Input object given to ``PDFParser`` and shown (``%r``) in the
            error message.
        pagenos: Truthy container of zero-based page indices to process;
            a falsy value selects all pages.
        maxpages: Truthy page cap; once a processed page has a one-based
            position of at least this value, no further pages are visited.
        password: Passed to ``doc.initialize`` (``''`` by default).

    Raises:
        PDFTextExtractionNotAllowed: If ``doc.is_extractable`` is falsy.
    """
    doc = PDFDocument()
    parser = PDFParser(fp)
    # Link the parser and the document to each other.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)

    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed(
            'Text extraction is not allowed: %r' % fp)

    interpreter = PDFPageInterpreter(rsrc, device)
    for page_index, page in enumerate(doc.get_pages()):
        # No filter at all, or this index is explicitly requested.
        if not pagenos or page_index in pagenos:
            interpreter.process_page(page)
            # The cap is checked only for pages that were processed.
            if maxpages and page_index + 1 >= maxpages:
                break

Example #4

Show file

 def get_pages(klass, fp, pagenos=None, maxpages=0, password='',
               caching=True, check_extractable=True):
     """Yield the selected pages of the PDF read from ``fp``.

     Because this is a generator, none of the steps below run until the
     result is iterated.

     Args:
         klass: Object providing ``create_pages`` and the
             ``PDFTextExtractionNotAllowed`` exception type.
         fp: Object handed to ``PDFParser``; also formatted with ``%r``
             into the error message.
         pagenos: When truthy, only pages whose zero-based index is
             contained in it are yielded.
         maxpages: When truthy, iteration ends after yielding a page whose
             one-based position is at least ``maxpages``.
         password: Passed to ``doc.initialize``; defaults to ``''``.
         caching: Forwarded to ``PDFDocument`` as the ``caching`` keyword.
         check_extractable: When true, a falsy ``doc.is_extractable``
             raises instead of yielding pages.

     Raises:
         klass.PDFTextExtractionNotAllowed: If ``check_extractable`` is
             true and ``doc.is_extractable`` is falsy.
     """
     # The document is constructed directly from the parser here.
     parser = PDFParser(fp)
     doc = PDFDocument(parser, caching=caching)
     doc.initialize(password)

     # Only look at the extractability flag when the caller requested it.
     if check_extractable:
         if not doc.is_extractable:
             raise klass.PDFTextExtractionNotAllowed(
                 'Text extraction is not allowed: %r' % fp)

     for index, page in enumerate(klass.create_pages(doc)):
         # A falsy ``pagenos`` disables filtering.
         if not pagenos or index in pagenos:
             yield page
             # Evaluated when the consumer resumes the generator after
             # receiving the page above.
             if maxpages and index + 1 >= maxpages:
                 break

Example #5

Show file

File: pdfinterp_altered.py Project: srbbins/ETD_Processing_Scripts

def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
                caching=True, check_extractable=True):
    """Interpret the selected pages of the PDF read from ``fp``.

    Args:
        rsrcmgr: First argument given to ``PDFPageInterpreter``.
        device: Second argument given to ``PDFPageInterpreter``.
        fp: Object handed to ``PDFParser``; also formatted with ``%r`` into
            the error message.
        pagenos: When truthy, only pages whose zero-based index is contained
            in it are processed.
        maxpages: When truthy, the loop ends right after processing a page
            whose one-based position is at least ``maxpages``.
        password: Passed to ``doc.initialize``; defaults to ``''``.
        caching: Forwarded to ``PDFDocument`` as the ``caching`` keyword.
        check_extractable: When true, a falsy ``doc.is_extractable`` raises
            before any page is processed.

    Raises:
        PDFTextExtractionNotAllowed: If ``check_extractable`` is true and
            ``doc.is_extractable`` is falsy.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(caching=caching)

    # Link the parser and the document to each other, then initialise.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)

    # ``is_extractable`` is not read at all when the check is disabled.
    must_abort = check_extractable and not doc.is_extractable
    if must_abort:
        message = 'Text extraction is not allowed: %r' % fp
        raise PDFTextExtractionNotAllowed(message)

    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page_index, page in enumerate(doc.get_pages()):
        # Pages outside an active ``pagenos`` filter are passed over and
        # never reach the page-cap check below.
        if pagenos and page_index not in pagenos:
            continue
        interpreter.process_page(page)
        if maxpages and page_index + 1 >= maxpages:
            break

Example #6

Show file

File: main.py Project: koshreality/TextExtracting

def pdf_test():
    """Parse a fixed sample PDF and print what ``get_text()`` returns."""
    sample_path = r'D:\Test3.pdf'
    reader = PDFParser()
    reader.parse(sample_path)
    extracted = reader.get_text()
    print(extracted)

Example #7

Show file

File: main.py Project: Mashkin232/parsers

def pdf():
    """Parse a fixed sample PDF and print its processed stems and their count."""
    stem_parser = PDFParser()
    stem_parser.parse(r'files/Test3.pdf')
    stems = stem_parser.get_processed_stems()
    # The stems are fetched a second time for the count instead of reusing
    # ``stems``. NOTE(review): confirm get_processed_stems() is idempotent
    # before collapsing this into a single call.
    stem_count = len(stem_parser.get_processed_stems())
    print('pdf parser', stems, stem_count)

Example #8

Show file

File: test_pdfparser.py Project: Mashkin232/parsers

 def test_PDFParser(self):
     """Check that parsing the sample file yields an empty list of stems."""
     parser = PDFParser()
     parser.parse('files/Test.pdf')
     expected_stems = []
     assert parser.get_processed_stems() == expected_stems