Beispiel #1
0
def process_pdf(rsrcmgr,
                device,
                fp,
                pagenos=None,
                maxpages=0,
                password='',
                check_extractable=True):
    """Feed the selected pages of the PDF read from ``fp`` to an interpreter.

    Args:
        rsrcmgr: Resource manager handed to ``PDFPageInterpreter``.
        device: Output device handed to ``PDFPageInterpreter``.
        fp: File object handed to ``PDFParser``.
        pagenos: Optional container of zero-based page indices. When it is
            empty or ``None`` every page is eligible.
        maxpages: When non-zero, iteration stops as soon as a *processed*
            page has a one-based position that reaches this value.
        password: Passed to ``initialize``; use an empty string when the
            document has no password.
        check_extractable: When true, documents whose ``is_extractable``
            flag is false are rejected.

    Raises:
        PDFTextExtractionNotAllowed: If ``check_extractable`` is true and
            the document does not allow text extraction.
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument()

    # Wire the parser and the document to each other, then unlock it.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password)

    extraction_blocked = check_extractable and not document.is_extractable
    if extraction_blocked:
        raise PDFTextExtractionNotAllowed(
            'Text extraction is not allowed: %r' % fp)

    page_interpreter = PDFPageInterpreter(rsrcmgr, device)
    for index, current_page in enumerate(document.get_pages()):
        if not pagenos or index in pagenos:
            page_interpreter.process_page(current_page)
            # The limit is only examined after a page has been processed,
            # so skipped pages never end the loop.
            if maxpages and maxpages <= index + 1:
                break
Beispiel #2
0
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
    """Interpret the selected pages of the PDF read from ``fp``.

    Args:
        rsrc: Resource manager handed to ``PDFPageInterpreter``.
        device: Output device handed to ``PDFPageInterpreter``.
        fp: File object handed to ``PDFParser``.
        pagenos: Optional container of zero-based page indices; a falsy
            value selects every page.
        maxpages: When non-zero, stop once a processed page's one-based
            position reaches this value.
        password: Passed to the document's ``initialize`` call.

    Raises:
        PDFTextExtractionNotAllowed: If the document's ``is_extractable``
            flag is false.
    """
    document = PDFDocument()
    pdf_parser = PDFParser(fp)

    # Link parser and document in both directions before initializing.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password)

    if not document.is_extractable:
        message = 'Text extraction is not allowed: %r' % fp
        raise PDFTextExtractionNotAllowed(message)

    page_interpreter = PDFPageInterpreter(rsrc, device)
    for page_index, current_page in enumerate(document.get_pages()):
        selected = not pagenos or page_index in pagenos
        if not selected:
            continue
        page_interpreter.process_page(current_page)
        # Only reached for processed pages; skipped pages never stop the loop.
        limit_reached = maxpages and maxpages <= page_index + 1
        if limit_reached:
            break
Beispiel #3
0
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
    """Run a page interpreter over the chosen pages of the PDF in ``fp``.

    ``pagenos`` is an optional container of zero-based page indices (a falsy
    value means "all pages"). A non-zero ``maxpages`` ends the loop once a
    processed page's one-based position reaches it. ``password`` is passed
    to the document's ``initialize`` call.

    Raises:
        PDFTextExtractionNotAllowed: If the document's ``is_extractable``
            flag is false.
    """
    pdf_doc = PDFDocument()
    reader = PDFParser(fp)

    # The parser and the document each get a reference to the other.
    reader.set_document(pdf_doc)
    pdf_doc.set_parser(reader)
    pdf_doc.initialize(password)

    if not pdf_doc.is_extractable:
        raise PDFTextExtractionNotAllowed(
            'Text extraction is not allowed: %r' % fp)

    renderer = PDFPageInterpreter(rsrc, device)
    for position, pdf_page in enumerate(pdf_doc.get_pages()):
        if not pagenos or position in pagenos:
            renderer.process_page(pdf_page)
            # The page limit is checked only after a page was processed.
            if maxpages and maxpages <= position + 1:
                break
Beispiel #4
0
 def get_pages(klass,
               fp,
               pagenos=None,
               maxpages=0,
               password='',
               caching=True,
               check_extractable=True):
     """Yield the selected pages of the PDF read from ``fp``.

     This is a generator, so nothing below runs (and no exception is
     raised) until iteration starts.

     Args:
         klass: Object providing ``create_pages`` and
             ``PDFTextExtractionNotAllowed`` (presumably the enclosing
             class; its header is outside this view).
         fp: File object handed to ``PDFParser``.
         pagenos: Optional container of zero-based page indices; a falsy
             value selects every page.
         maxpages: When non-zero, stop once a yielded page's one-based
             position reaches this value.
         password: Passed to ``initialize``; use an empty string when the
             document has no password.
         caching: Forwarded to ``PDFDocument`` as ``caching``.
         check_extractable: When true, documents whose ``is_extractable``
             flag is false are rejected.

     Raises:
         klass.PDFTextExtractionNotAllowed: If ``check_extractable`` is
             true and the document does not allow text extraction.
     """
     pdf_parser = PDFParser(fp)
     document = PDFDocument(pdf_parser, caching=caching)
     document.initialize(password)

     extraction_blocked = check_extractable and not document.is_extractable
     if extraction_blocked:
         raise klass.PDFTextExtractionNotAllowed(
             'Text extraction is not allowed: %r' % fp)

     for index, current_page in enumerate(klass.create_pages(document)):
         if not pagenos or index in pagenos:
             yield current_page
             # The limit is only examined after a page has been yielded,
             # so skipped pages never end the iteration.
             if maxpages and maxpages <= index + 1:
                 break
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
                caching=True, check_extractable=True):
    """Feed the selected pages of the PDF read from ``fp`` to an interpreter.

    Args:
        rsrcmgr: Resource manager handed to ``PDFPageInterpreter``.
        device: Output device handed to ``PDFPageInterpreter``.
        fp: File object handed to ``PDFParser``.
        pagenos: Optional container of zero-based page indices; a falsy
            value selects every page.
        maxpages: When non-zero, stop once a processed page's one-based
            position reaches this value.
        password: Passed to ``initialize``; use an empty string when the
            document has no password.
        caching: Forwarded to ``PDFDocument`` as ``caching``.
        check_extractable: When true, documents whose ``is_extractable``
            flag is false are rejected.

    Raises:
        PDFTextExtractionNotAllowed: If ``check_extractable`` is true and
            the document does not allow text extraction.
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument(caching=caching)

    # Wire the parser and the document to each other, then unlock it.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password)

    extraction_blocked = check_extractable and not document.is_extractable
    if extraction_blocked:
        message = 'Text extraction is not allowed: %r' % fp
        raise PDFTextExtractionNotAllowed(message)

    page_interpreter = PDFPageInterpreter(rsrcmgr, device)
    for index, current_page in enumerate(document.get_pages()):
        if not pagenos or index in pagenos:
            page_interpreter.process_page(current_page)
            # Checked only after a processed page; skipped pages never
            # end the loop.
            if maxpages and maxpages <= index + 1:
                break
Beispiel #6
0
def pdf_test():
    """Parse the hard-coded sample PDF and print what ``get_text()`` returns."""
    parser = PDFParser()
    parser.parse(r'D:\Test3.pdf')
    extracted = parser.get_text()
    print(extracted)
Beispiel #7
0
def pdf():
    """Parse ``files/Test3.pdf`` and print its processed stems and their count."""
    parser = PDFParser()
    parser.parse(r'files/Test3.pdf')
    # The getter is deliberately invoked twice, in the same order as before:
    # once for the value that is printed and once for the length.
    stems = parser.get_processed_stems()
    stem_count = len(parser.get_processed_stems())
    print('pdf parser', stems, stem_count)
Beispiel #8
0
 def test_PDFParser(self):
     """Check that parsing ``files/Test.pdf`` yields an empty stem list."""
     parser = PDFParser()
     parser.parse('files/Test.pdf')
     expected_stems = []
     actual_stems = parser.get_processed_stems()
     assert actual_stems == expected_stems