def convert(self, data):
    """Extract the text content of a PDF supplied as raw data.

    Args:
        data: The PDF file contents, wrapped in a ``StringIO`` for parsing.

    Returns:
        Whatever ``converter.get_text()`` yields after every page has been
        processed, or the empty string ``''`` when the document cannot be
        opened with an empty password or does not permit extraction.
    """
    # Wrap the raw PDF data in a file-like object for the parser.
    pdfdata = StringIO(data)
    try:
        # NOTE: the original author did not know why this call is needed;
        # it is kept unchanged.
        CMapDB.initialize('CMap', 'CDBCMap')

        # Create the resource manager and the converter that collects text.
        rsrc = PDFResourceManager()
        converter = TextConverter(rsrc)
        try:
            # Set up the parser. ``parser`` is not referenced again below;
            # presumably constructing it attaches ``doc`` to the data
            # stream -- confirm against the pdfminer version in use.
            doc = PDFDocument()
            parser = PDFParser(doc, pdfdata)

            try:
                # Only an empty password is attempted.
                doc.initialize('')
            except PDFPasswordIncorrect:
                return ''

            # Bail out if the document forbids text extraction.
            if not doc.is_extractable:
                return ''

            # Feed every page through the interpreter into the converter.
            interpreter = PDFPageInterpreter(rsrc, converter)
            for page in doc.get_pages():
                interpreter.process_page(page)
        finally:
            # Always release the converter, including on the early returns
            # and on errors raised while parsing or processing pages.
            converter.close()
    finally:
        # Always release the in-memory buffer as well.
        pdfdata.close()

    # As in the original flow, the text is read after both closes.
    return converter.get_text()
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
    """Run the pages of the PDF file *fname* through *device*.

    Args:
        rsrc: Resource manager handed to ``PDFPageInterpreter``.
        device: Output device handed to ``PDFPageInterpreter``; it is
            closed here once all requested pages have been processed.
        fname: Path of the PDF file, opened in binary mode.
        pagenos: Optional container of zero-based page indices. When
            truthy, pages whose index is not in it are skipped.
        maxpages: When non-zero, stop once a processed page's one-based
            position reaches this value. The limit is only checked after
            a page is actually processed, so pages skipped via *pagenos*
            never trigger it.
        password: Password passed to ``doc.initialize``.

    Returns:
        None.

    Raises:
        TextExtractionNotAllowed: If the password is rejected or the
            document is not marked as extractable.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; same behaviour.
    fp = open(fname, 'rb')
    try:
        # ``parser`` is not referenced again below; presumably constructing
        # it attaches ``doc`` to the file -- confirm against the pdfminer
        # version in use.
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
        except PDFPasswordIncorrect:
            raise TextExtractionNotAllowed('Incorrect password')
        if not doc.is_extractable:
            raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)

        interpreter = PDFPageInterpreter(rsrc, device)
        for (pageno, page) in enumerate(doc.get_pages()):
            if pagenos and (pageno not in pagenos):
                continue
            interpreter.process_page(page)
            if maxpages and maxpages <= pageno + 1:
                break
        # The device belongs to the caller, so it is closed only on
        # success, exactly as before.
        device.close()
    finally:
        # Close the file on every path, including the raises above.
        fp.close()
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
    """Interpret the selected pages of a PDF file and send them to *device*.

    Args:
        rsrc: Resource manager given to ``PDFPageInterpreter``.
        device: Output device given to ``PDFPageInterpreter``. It is closed
            by this function after the page loop finishes normally.
        fname: Path to the PDF file; it is opened with mode ``'rb'``.
        pagenos: Optional collection of zero-based page indices to process.
            A falsy value (the default ``None``) means every page.
        maxpages: If non-zero, the loop ends after processing a page whose
            index plus one is at least this value. Pages skipped because of
            *pagenos* are not tested against the limit.
        password: Password used to initialize the document.

    Returns:
        None.

    Raises:
        TextExtractionNotAllowed: When ``doc.initialize`` reports an
            incorrect password, or ``doc.is_extractable`` is false.
    """
    doc = PDFDocument()
    # Use ``open`` instead of the legacy ``file`` builtin (equivalent call).
    fp = open(fname, 'rb')
    try:
        # The parser object is not used afterwards; presumably creating it
        # binds ``doc`` to ``fp`` -- verify against the pdfminer release.
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
        except PDFPasswordIncorrect:
            raise TextExtractionNotAllowed('Incorrect password')
        if not doc.is_extractable:
            raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)

        interpreter = PDFPageInterpreter(rsrc, device)
        for (pageno, page) in enumerate(doc.get_pages()):
            if pagenos and (pageno not in pagenos):
                continue
            interpreter.process_page(page)
            if maxpages and maxpages <= pageno + 1:
                break
        # Closing the device stays on the success path only, matching the
        # previous behaviour; on failure it is left to the caller.
        device.close()
    finally:
        # Guarantee the file handle is released even when an exception
        # propagates out of this function.
        fp.close()