def get_paper_content(fname, pages=2, outdir="data"):
    """Render the leading pages of a PDF to an HTML file using pdfminer.

    The result is written to ``<outdir>/<name>.html``, where ``<name>`` is the
    base name of *fname* with every ``.pdf`` occurrence removed.

    Args:
        fname: Path of the PDF file to read.
        pages: Highest zero-based page index to convert.  Iteration stops once
            the index exceeds this value, so ``pages + 1`` pages are rendered
            (three with the default).
        outdir: Directory receiving the HTML file.  It is not created here.

    Returns:
        None.  If reading or rendering a page fails, *fname* is printed and
        whatever was converted up to that point is left in the output file.

    Raises:
        IOError/OSError: If the output or input file cannot be opened.
    """
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    basename = os.path.basename(fname).replace(".pdf", "")
    # The path always ends in ".html", so the HTML converter is the only
    # converter this function can ever select.
    outfile = os.path.join(outdir, basename + ".html")
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # A codec is passed to the converter, so it emits encoded bytes; the
    # output is therefore opened in binary mode.
    with open(outfile, 'wb') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.debug = True
                try:
                    page_iter = PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True)
                    for index, page in enumerate(page_iter):
                        if index > pages:
                            break
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
                except Exception:
                    # Best effort: report the offending file and give up on
                    # it; the surrounding blocks still release every handle.
                    print(fname)
        finally:
            device.close()
    return
import sys

from pdfminer.layout import LAParams
from pdfminer.converter import XMLConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

# Dump every page of the PDF named by the first command-line argument to
# standard output as XML.
laparams = LAParams()
imagewriter = None
codec = 'utf-8'
# The converter is given a codec and therefore writes encoded bytes; use the
# underlying binary buffer when stdout exposes one (Python 3), otherwise
# stdout itself.
outfp = getattr(sys.stdout, 'buffer', sys.stdout)
stripcontrol = True
pagenos = set()
fname = sys.argv[1]

rsrcmgr = PDFResourceManager(caching=True)
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                      imagewriter=imagewriter, stripcontrol=stripcontrol)
try:
    with open(fname, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.debug = 1
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
finally:
    device.close()
    # Flush rather than close: stdout belongs to the whole process and any
    # later output would fail on a closed stream.
    outfp.flush()
# Dump every page of the PDF named by the first command-line argument to
# standard output as XML.
laparams = LAParams()
imagewriter = None
codec = 'utf-8'
# The converter is given a codec and therefore writes encoded bytes; use the
# underlying binary buffer when stdout exposes one (Python 3), otherwise
# stdout itself.
outfp = getattr(sys.stdout, 'buffer', sys.stdout)
stripcontrol = True
pagenos = set()
fname = sys.argv[1]

rsrcmgr = PDFResourceManager(caching=True)
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                      imagewriter=imagewriter, stripcontrol=stripcontrol)
try:
    with open(fname, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.debug = 1
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
finally:
    device.close()
    # Flush rather than close: stdout belongs to the whole process and any
    # later output would fail on a closed stream.
    outfp.flush()