def getPDFInfoForTestString(self, filename, max_pages=11):
    """Return the in-memory extraction output for the leading pages of a PDF.

    Opens ``filename`` in binary mode, wires a ``PDFParser`` and a
    ``PDFDocument`` together, and runs each page through a
    ``PDFPageInterpreter`` backed by a ``TagExtractor2Memory`` device.
    The per-page results of ``process_page_to_mem`` are concatenated and
    returned.

    Args:
        filename: Path of the PDF file to read.
        max_pages: Maximum number of pages to process, counted from the
            start of the document. Defaults to 11, the limit that used to
            be hard-coded (page indices 0 through 10). ``None`` processes
            every page.

    Returns:
        The concatenation of ``process_page_to_mem`` results for the
        processed pages. Documents with fewer than ``max_pages`` pages
        yield the output for all of their pages (an empty string when the
        document has no pages) instead of ``None``.

    Raises:
        PDFTextExtractionNotAllowed: If ``doc.is_extractable`` is false.
    """
    # Local import keeps this block self-contained without touching the
    # file's top-level import section.
    from itertools import islice

    codec = 'utf-8'
    # The parser reads from the file object while pages are processed, so
    # all of the work stays inside the ``with`` block; the handle is closed
    # on both the normal and the exceptional path.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # NOTE(review): the empty string is presumably the document
        # password -- confirm against the PDFDocument API in use.
        doc.initialize('')
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = TagExtractor2Memory(rsrcmgr, codec=codec)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # islice stops after ``max_pages`` items without pulling a further
        # page from the generator, matching the early exit of the old loop.
        page_chunks = [
            interpreter.process_page_to_mem(page)
            for page in islice(doc.get_pages(), max_pages)
        ]

    return ''.join(page_chunks)