def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
                check_extractable=True):
    """Run a page interpreter over the pages of the PDF read from ``fp``.

    A ``PDFParser`` is built on ``fp`` and linked to a fresh ``PDFDocument``;
    the document is initialized with ``password`` and each selected page is
    handed to ``PDFPageInterpreter(rsrcmgr, device).process_page``.

    Parameters:
        rsrcmgr: passed unchanged to ``PDFPageInterpreter``.
        device: passed unchanged to ``PDFPageInterpreter``.
        fp: file object given to ``PDFParser``.
        pagenos: optional container of zero-based page indexes; when it is
            empty or None every page is selected.
        maxpages: when non-zero, iteration stops after a page is processed
            whose position in the document (index + 1) has reached this
            value. The check only runs after a processed page, so it is a
            bound on document position, not a count of processed pages.
        password: document password (empty string when there is none).
        check_extractable: when true, refuse documents whose
            ``is_extractable`` attribute is false.

    Raises:
        PDFTextExtractionNotAllowed: ``check_extractable`` is true and the
            document reports that it is not extractable.
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument()

    # The parser and the document each need a reference to the other.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)

    document.initialize(password)

    if check_extractable:
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: %r' % fp)

    page_interpreter = PDFPageInterpreter(rsrcmgr, device)

    for index, pdf_page in enumerate(document.get_pages()):
        # An empty/None page selection means "all pages".
        selected = (not pagenos) or (index in pagenos)
        if not selected:
            continue
        page_interpreter.process_page(pdf_page)
        if not maxpages:
            continue
        if maxpages <= index + 1:
            break
def pdf2txt(outfp, rsrc, fname, pages, codec, password='', debug=0):
    """Dump the pages of the PDF file ``fname`` to ``outfp``.

    The output is wrapped in ``<document>`` / ``</document>`` lines. For each
    selected page the ``TextConverter`` device is reset, the page is
    interpreted, and the device is dumped to ``outfp`` with ``codec``.

    Parameters:
        outfp: writable object receiving the output.
        rsrc: passed unchanged to ``TextConverter`` and ``PDFPageInterpreter``.
        fname: path of the PDF file; opened in binary mode.
        pages: optional container of zero-based page indexes; when it is
            empty or None every page is dumped.
        codec: passed unchanged to ``device.dump``.
        password: document password handed to ``doc.initialize``.
        debug: debug level forwarded to the pdfminer objects created here.

    Raises:
        TextExtractionNotAllowed: the password is rejected
            (``PDFPasswordIncorrect`` is translated) or the document's
            ``is_extractable`` attribute is false.
    """
    device = TextConverter(rsrc, debug=debug)
    doc = PDFDocument(debug=debug)
    # open() is the portable spelling of the Python 2-only file() constructor.
    fp = open(fname, 'rb')
    try:
        # The parser is not used again below; the binding is kept because the
        # constructor receives ``doc`` and presumably registers itself with it
        # -- TODO confirm whether the document holds its own reference.
        parser = PDFParser(doc, fp, debug=debug)
        try:
            doc.initialize(password)
        except PDFPasswordIncorrect:
            raise TextExtractionNotAllowed('incorrect password')
        if not doc.is_extractable:
            raise TextExtractionNotAllowed(
                'text extraction is not allowed: %r' % fname)
        interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
        outfp.write('<document>\n')
        for (i, page) in enumerate(doc.get_pages(debug=debug)):
            if pages and (i not in pages):
                continue
            device.reset()
            interpreter.process_page(page)
            device.dump(outfp, codec)
    finally:
        # Close the input file even when initialization or page processing
        # raises; previously the handle leaked on every error path.
        fp.close()
    device.close()
    outfp.write('</document>\n')
    return
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
    """Interpret the selected pages of the PDF read from ``fp``.

    Creates a ``PDFDocument`` and a ``PDFParser`` on ``fp``, links them,
    initializes the document with ``password`` and calls
    ``PDFPageInterpreter(rsrc, device).process_page`` on each selected page.

    Parameters:
        rsrc: passed unchanged to ``PDFPageInterpreter``.
        device: passed unchanged to ``PDFPageInterpreter``.
        fp: file object given to ``PDFParser``.
        pagenos: optional container of zero-based page indexes; when it is
            empty or None every page is selected.
        maxpages: when non-zero, stop after processing a page whose position
            in the document (index + 1) has reached this value. Skipped
            pages never trigger the stop.
        password: password handed to ``doc.initialize``.

    Raises:
        PDFTextExtractionNotAllowed: the document's ``is_extractable``
            attribute is false.
    """
    document = PDFDocument()
    pdf_parser = PDFParser(fp)
    # Link parser and document in both directions before initializing.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password)

    extractable = document.is_extractable
    if not extractable:
        message = 'Text extraction is not allowed: %r' % fp
        raise PDFTextExtractionNotAllowed(message)

    page_interpreter = PDFPageInterpreter(rsrc, device)
    for position, pdf_page in enumerate(document.get_pages()):
        if (not pagenos) or (position in pagenos):
            page_interpreter.process_page(pdf_page)
            # The limit is only evaluated right after a processed page.
            if maxpages and maxpages <= position + 1:
                break
    return None
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
                caching=True, check_extractable=True):
    """Interpret the selected pages of the PDF read from ``fp``.

    Steps, in order: build a ``PDFParser`` on ``fp``; build a
    ``PDFDocument``; link the two; initialize the document with
    ``password``; optionally verify that extraction is permitted; then call
    ``PDFPageInterpreter(rsrcmgr, device).process_page`` on each selected
    page.

    Parameters:
        rsrcmgr: passed unchanged to ``PDFPageInterpreter``.
        device: passed unchanged to ``PDFPageInterpreter``.
        fp: file object given to ``PDFParser``.
        pagenos: optional container of zero-based page indexes; when it is
            empty or None every page is selected.
        maxpages: when non-zero, stop after processing a page whose position
            in the document (index + 1) has reached this value. The check
            is skipped for pages that were not selected.
        password: document password (empty string when there is none).
        caching: forwarded as ``PDFDocument(caching=...)``.
        check_extractable: when true, refuse documents whose
            ``is_extractable`` attribute is false.

    Raises:
        PDFTextExtractionNotAllowed: ``check_extractable`` is true and the
            document reports that it is not extractable.
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument(caching=caching)

    # Parser and document must know about each other before initialization.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)

    # An empty string is the password to supply when none is set.
    document.initialize(password)

    if check_extractable and not document.is_extractable:
        raise PDFTextExtractionNotAllowed(
            'Text extraction is not allowed: %r' % fp)

    page_interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page_index, pdf_page in enumerate(document.get_pages()):
        skip_page = pagenos and (page_index not in pagenos)
        if skip_page:
            continue
        page_interpreter.process_page(pdf_page)
        limit_reached = maxpages and maxpages <= page_index + 1
        if limit_reached:
            break