def process_pdf(in_path, out_path):
    """Processes a PDF and extracts its contents to HTML.

    Args:
        in_path: The full path to the source PDF file.
        out_path: The full path to the destination HTML file.
    """
    page_numbers = set()

    # ``open`` replaces the Python-2-only ``file`` builtin; the context
    # managers release both handles even when a page fails to parse.
    with open(in_path, 'rb') as in_file, open(out_path, 'w') as out_file:
        # Set up the resource manager, device, and interpreter
        res_mgr = PDFResourceManager()
        device = HTMLConverter(res_mgr, out_file, codec='utf-8',
                               laparams=LAParams(), imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(res_mgr, device)
            for page in PDFPage.get_pages(in_file, page_numbers, maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # The converter was handed out_file, so close it while that
            # handle is still open.
            device.close()
def pdf_to_html(scraped_pdf_data):
    """Render in-memory PDF data as HTML and return the converter output.

    The data is copied into a ``StringIO`` buffer, run through pdfminer's
    ``process_pdf`` with an ``HTMLConverter``, and whatever the converter
    wrote to its output buffer is returned.
    """
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams
    import StringIO

    # Source buffer holding the PDF, rewound so parsing starts at the beginning.
    source = StringIO.StringIO()
    source.write(scraped_pdf_data)
    source.seek(0)
    sink = StringIO.StringIO()

    manager = PDFResourceManager()
    layout = LAParams(char_margin=0.5, line_margin=0.5,
                      word_margin=0.3, boxes_flow=0)
    converter = HTMLConverter(manager, sink, layoutmode='normal', scale=2,
                              laparams=layout)
    process_pdf(manager, converter, source)
    converter.close()

    html = sink.getvalue()
    sink.close()
    source.close()
    return html
def pdf_para_html(self, path):
    """Convert the PDF file at ``path`` to HTML.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the converter's output buffer after all pages
        have been processed.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # ``with open`` (instead of the removed ``file`` builtin) closes the
        # PDF even if a page raises during interpretation.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # maxpages=0 is for all pages
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()

    # Named ``html`` rather than ``str`` to avoid shadowing the builtin.
    html = retstr.getvalue()
    retstr.close()
    return html
def convertPDF(fname, pages=None):
    """Convert a PDF file to HTML and return the converter output.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``; when falsy an empty set is passed.

    Returns:
        The contents of the in-memory output buffer.
    """
    pagenos = set(pages) if pages else set()
    caching = True
    rotation = 0

    outfp = StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                           layoutmode='normal', laparams=LAParams(),
                           imagewriter=None)
    try:
        # ``open`` replaces the Python-2-only ``file``; the context manager
        # releases the handle even if a page fails to process.
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password='',
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()

    text = outfp.getvalue()
    outfp.close()
    return text
def parse_html(file_name):
    """Convert the PDF ``file_name`` to HTML and return the collected output.

    The converter writes into a ``TextReciver`` object, and that object's
    ``text`` attribute is returned.
    NOTE(review): ``TextReciver`` is defined outside this block; presumably
    it accumulates everything written to it in ``.text`` -- confirm.
    """
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        # Only one file is ever processed, so the original one-element loop
        # is gone; ``with open`` guarantees the handle is closed on error.
        with open(file_name, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()
    return outfp.text
def to_html(self, fp):
    """Run ``fp`` through an HTMLConverter and return the generated output.

    The converter is configured from ``self.options`` (codec, scale,
    layoutmode, laparams) and shares ``self.resmgr``; the actual page
    processing is delegated to ``self._process``.
    """
    buf = StringIO.StringIO()
    opts = self.options
    converter = HTMLConverter(
        self.resmgr,
        buf,
        codec=opts.codec,
        scale=opts.scale,
        layoutmode=opts.layoutmode,
        laparams=opts.laparams,
        outdir=None,
    )
    self._process(fp, converter)
    converter.close()

    html = buf.getvalue()
    buf.close()
    return html
def transform_file(self, pdfpath):
    """Convert a PDF to HTML and wrap the result in a stub document dict.

    Args:
        pdfpath: Path of the PDF file to convert.

    Returns:
        On success, a dict with the HTML and its plain text under
        ``"markup"``, the text length under ``"__text_size"`` and a
        timestamp. On any failure, a dict with the error message under
        ``"error"`` and ``"__text_size"`` set to -1. Both variants mark
        ``workflow.is_stub`` as True.
    """
    try:
        self.LOGGER.debug(pdfpath)
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=self.laparams)
        try:
            # The handle and the converter used to stay open whenever an
            # exception was swallowed below; they are now always released.
            with open(pdfpath, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # NOTE check_extractable seems to allow overriding text
                # extraction locks
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=False):
                    interpreter.process_page(page)
        finally:
            device.close()

        # otherwise html is str at this point, not unicode
        html = retstr.getvalue().decode('utf8')
        retstr.close()

        soup = BeautifulSoup(html)
        text_size = len(soup.text)
        stub_data = {
            "markup": {
                "innerHTML": unicode(html),
                "innerText": unicode(soup.text)
            },
            "workflow": {
                "is_stub": True
            },
            "__text_size": text_size,  # __fields are ignored by kibana
            "timestamp": datetime.now()
        }
    except Exception as e:
        # Deliberate best-effort: any failure is reported inside the stub
        # instead of propagating to the caller.
        stub_data = {
            "error": str(e),
            "workflow": {
                "is_stub": True
            },
            "__text_size": -1
        }
    return stub_data
def __init__(self):
    """Initialise default options and build the text and HTML converters."""
    # debug option
    self.setdebug(0)

    # Page selection: only the first page (index 0) by default.
    self.pagenos = {0}
    self.pageno = 1
    self.password = ''
    self.maxpages = 0
    self.rotation = 0

    # Output configuration shared by both converters.
    self.outfp = stdmodel()
    self.codec = 'utf-8'
    self.showpageno = True
    self.scale = 1
    self.imagewriter = None
    self.laparams = LAParams()
    self.layoutmode = 'normal'

    # ResourceManager facilitates reuse of shared resources such as fonts
    # and images so that large objects are not allocated multiple times.
    # Caching is switched off here: the default (True) causes problems.
    self.caching = False
    self.rsrcmgr = PDFResourceManager(caching=self.caching)

    # Main converters for a pdf file: plain text and HTML, both writing to
    # the same output object.
    shared = dict(codec=self.codec, laparams=self.laparams,
                  imagewriter=self.imagewriter)
    self.device = TextConverter(self.rsrcmgr, self.outfp, **shared)
    self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp,
                                    scale=self.scale,
                                    layoutmode=self.layoutmode, **shared)
def convert_pdf(path):
    """Convert the PDF file at ``path`` to HTML and return the output.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the in-memory buffer the HTMLConverter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # ``open`` replaces the Python-2-only ``file``; the context manager
        # closes the PDF even if ``process_pdf`` raises.
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
    finally:
        device.close()

    # Named ``html`` rather than ``str`` to avoid shadowing the builtin.
    html = retstr.getvalue()
    retstr.close()
    return html
def extract_price_from_pdf(file_name):
    """Extract prices from a PDF, ordered by their vertical position.

    The PDF is converted to HTML, then every line that carries position
    information and ends with a price is collected together with the
    directly following price-only lines.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A tuple of prices (floats) sorted by position; an empty tuple when
        no price is found.
        NOTE(review): the original snippet stopped after building the
        position list and returned nothing; returning the sorted prices is
        an assumption about the intended result -- confirm with callers.
    """
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams(),
                           imagewriter=None)
    # Read the file
    try:
        with open(file_name, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), caching=True,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()

    # Find all lines that end with a price and include position
    # information. Also find all following lines that include prices
    # but no new location (shorter 100 characters).
    # Raw strings keep ``\.`` from being an invalid string escape.
    matches = re.finditer(r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
                          r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
                          outfp.text)
    price_re = re.compile(r'[0-9]{1,2}\.[0-9]{1,2}')
    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')
        # Extract the position information from the string
        pos_string = re.findall(r'(.*top:)([0-9]+)(px)', line_group[0])[0]
        ypos = pos_string[1]
        # Iterate over all lines and extract the price. Increment the
        # position slightly for each new line.
        # NOTE: the increment accumulates (ypos is updated in place), as in
        # the original code.
        for i, price_text in enumerate(line_group):
            # Search the reversed line so the LAST price on it is found.
            found = price_re.findall(price_text[::-1])
            if not found:
                continue
            price = float(found[0][::-1])
            ypos = int(ypos) + i
            pos_list.append((ypos, price))

    pos_list.sort()
    return tuple(price for _, price in pos_list)
def convert_pdf_to_html(path):
    """Convert the PDF file at ``path`` to HTML and return the output.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the in-memory buffer the HTMLConverter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # ``open`` replaces the Python-2-only ``file``; the context manager
        # closes the PDF even if a page fails to process.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()

    # Named ``html`` rather than ``str`` to avoid shadowing the builtin.
    html = retstr.getvalue()
    retstr.close()
    return html
def extract_price_from_pdf(file_name):
    """Extract prices from a PDF, ordered by their vertical position.

    The PDF is converted to HTML, then every line that carries position
    information and ends with a price is collected together with the
    directly following price-only lines.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A tuple of prices (floats) sorted by position; an empty tuple when
        no price is found (the original raised ``ValueError`` there).
    """
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        with open(file_name, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()

    # Raw strings keep ``\.`` from being an invalid string escape.
    matches = re.finditer(
        r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
        r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
        outfp.text)
    price_re = re.compile(r'[0-9]{1,2}\.[0-9]{1,2}')
    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')
        # Digits of the last "top:<n>px" on the first line of the group.
        ypos = re.findall(r'.*top:([0-9]+)px', line_group[0])[0]
        for i, price in enumerate(line_group):
            if price:
                # Search the reversed line so the LAST price on it is found.
                p = float(price_re.findall(price[::-1])[0][::-1])
                # NOTE: the offset accumulates (ypos is updated in place),
                # as in the original code.
                ypos = int(ypos) + i
                pos_list.append((ypos, p))

    pos_list.sort()
    return tuple(p for _, p in pos_list)
def get_html(self, path):
    """Pull HTML (instead of plain text) from a PDF file.

    Args:
        path: Path of the PDF; ``".pdf"`` is appended when the path does
            not already end with it.

    Returns:
        The contents of the in-memory buffer the HTMLConverter wrote to.
    """
    if not path.endswith(".pdf"):
        path = path + ".pdf"

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # ``open`` replaces the Python-2-only ``file``; the context manager
        # closes the PDF even if a page fails to process.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()

    result = retstr.getvalue()
    retstr.close()
    return result
def convert_pdf_to_html(url):
    """Download a PDF from ``url`` and return it converted to HTML.

    A HEAD request checks the content type first; the document is only
    downloaded and converted when it is served as ``application/pdf``.

    Args:
        url: Address of the document.

    Returns:
        The converter output, or ``None`` when the resource is not a PDF
        (including when the server sends no content-type header, which
        used to raise ``KeyError``).
        NOTE(review): the original layout was ambiguous about how much of
        the body sat under the content-type check; this version assumes
        all of it did -- confirm.
    """
    from StringIO import StringIO

    head = requests.head(url)
    if 'application/pdf' not in head.headers.get("content-type", ""):
        return None

    response = requests.get(url)
    # Cast to StringIO object
    memory_file = StringIO(response.content)
    # Create a PDF parser object associated with the StringIO object
    parser = PDFParser(memory_file)
    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
    finally:
        device.close()

    # Named ``html`` rather than ``str`` to avoid shadowing the builtin.
    html = retstr.getvalue()
    retstr.close()
    return html
class PDF2Txt:
    """Convert a PDF file to a text, XML or HTML file with pdfminer."""

    # Output types accepted by ``convert``.
    _SUPPORTED = ('text', 'xml', 'html')

    def __init__(self, pdffile, outfile, output_type='text'):
        """Store the paths and output type and create the resource manager.

        Args:
            pdffile: Path of the PDF file to read.
            outfile: Path of the file to write.
            output_type: One of ``'text'``, ``'xml'`` or ``'html'``.
        """
        # Switch off debug output on the pdfminer classes.
        PDFDocument.debug = 0
        PDFParser.debug = 0
        CMapDB.debug = 0
        PDFResourceManager.debug = 0
        PDFPageInterpreter.debug = 0
        PDFDevice.debug = 0
        self.rsrcmgr = PDFResourceManager(caching=True)
        self.outtype = output_type
        self.outfile = outfile
        self.pdffile = pdffile

    def _make_device(self, outfp):
        """Build the converter matching ``self.outtype`` writing to ``outfp``."""
        if self.outtype == 'text':
            return TextConverter(self.rsrcmgr, outfp, codec='utf-8',
                                 laparams=LAParams(), imagewriter=None)
        if self.outtype == 'xml':
            return XMLConverter(self.rsrcmgr, outfp, codec='utf-8',
                                laparams=LAParams(), imagewriter=None)
        return HTMLConverter(self.rsrcmgr, outfp, codec='utf-8', scale=1,
                             layoutmode='normal', laparams=LAParams(),
                             imagewriter=None)

    def convert(self):
        """Convert ``self.pdffile`` and write the result to ``self.outfile``.

        Prints a message and exits the process with status -1 when the
        output type is not supported.
        """
        # Validate before opening the output: the original opened (and thus
        # truncated) the output file first and leaked the handle on exit.
        if self.outtype not in self._SUPPORTED:
            print('Formato de salida no soportado')
            sys.exit(-1)

        with open(self.outfile, 'w') as outfp:
            self.device = self._make_device(outfp)
            try:
                with open(self.pdffile, 'rb') as fp:
                    interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
                    for page in PDFPage.get_pages(fp, set(), caching=True,
                                                  check_extractable=True):
                        page.rotate = (page.rotate) % 360
                        interpreter.process_page(page)
            finally:
                # Close the converter while the output file is still open.
                self.device.close()
        print("Archivo %s creado en base a %s" % (self.outfile, self.pdffile))
def reset(self,html=False):
    '''Rebuild the resource manager and one converter (avoids wrong judge).

    With ``html`` true the HTML converter is closed and recreated;
    otherwise the plain-text converter is.'''
    self.rsrcmgr = PDFResourceManager(caching=self.caching)

    # Important: main converter for the pdf file
    if not html:
        self.device.close()
        self.device = TextConverter(self.rsrcmgr, self.outfp,
                                    codec=self.codec,
                                    laparams=self.laparams,
                                    imagewriter=self.imagewriter)
        return

    self.htmldevice.close()
    self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp,
                                    codec=self.codec,
                                    scale=self.scale,
                                    layoutmode=self.layoutmode,
                                    laparams=self.laparams,
                                    imagewriter=self.imagewriter)
def translate(output, args):
    """Convert one PDF file and write the result to ``output``.

    The output format follows the extension of ``output``: ``.htm``/``.html``
    gives HTML, ``.xml`` gives XML, ``.tag`` gives tags, anything else gives
    plain text. When ``output`` is falsy the result goes to ``sys.stdout``.

    Args:
        output: Path of the file to write, or a falsy value for stdout.
        args: Path of the PDF file to read.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = output
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    outtype = 'text'
    if outfile:
        if outfile.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        # outtype is always one of the four values handled here, so the
        # original unreachable ``usage()`` fallback is gone.
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        try:
            with open(args, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close the process-wide stdout (the original did).
        if outfp is not sys.stdout:
            outfp.close()
    return
class PDFHandler(object):
    '''A PDF handler class to read PDF contents.

    Every reader accepts either a file name or an already opened file
    object/StringIO object (``fobj``). A passed-in ``fobj`` is rewound to
    position 0 after processing and is not closed.'''

    def __init__(self):
        '''Initialise default options and build the text and HTML converters'''
        # debug option
        self.setdebug(0)
        # only first page
        self.pagenos = set([0])
        self.pageno = 1
        self.outfp = stdmodel()
        self.codec = 'utf-8'
        self.showpageno = True
        self.scale = 1
        self.password = ''
        self.maxpages = 0
        self.rotation = 0
        self.imagewriter = None
        self.laparams = LAParams()
        self.layoutmode = 'normal'
        # ResourceManager facilitates reuse of shared resources such as fonts
        # and images so that large objects are not allocated multiple times.
        #### This will cause some problem when set to default True.
        self.caching = False
        self.rsrcmgr = PDFResourceManager(caching=self.caching)
        # Important Main converter for pdf file
        self.device = TextConverter(self.rsrcmgr, self.outfp,
                                    codec=self.codec, laparams=self.laparams,
                                    imagewriter=self.imagewriter)
        self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp,
                                        codec=self.codec, scale=self.scale,
                                        layoutmode=self.layoutmode,
                                        laparams=self.laparams,
                                        imagewriter=self.imagewriter)

    def reset(self, html=False):
        '''Reset can avoid wrong judge.

        Recreates the resource manager and either the HTML converter
        (``html`` true) or the text converter.'''
        self.rsrcmgr = PDFResourceManager(caching=self.caching)
        # Important Main converter for pdf file
        if html:
            self.htmldevice.close()
            self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp,
                                            codec=self.codec, scale=self.scale,
                                            layoutmode=self.layoutmode,
                                            laparams=self.laparams,
                                            imagewriter=self.imagewriter)
        else:
            self.device.close()
            self.device = TextConverter(self.rsrcmgr, self.outfp,
                                        codec=self.codec,
                                        laparams=self.laparams,
                                        imagewriter=self.imagewriter)

    def setdebug(self, value):
        '''Set Debug Information. Especially when init'''
        # The original ignored ``value`` and always stored 0.
        self.debug = value
        PDFResourceManager.debug = self.debug
        PDFPageInterpreter.debug = self.debug

    @staticmethod
    def _acquire(fname, fobj):
        '''Return ``fobj`` when given, otherwise open ``fname`` for reading'''
        return fobj if fobj else open(fname, 'rb')

    @staticmethod
    def _release(fp, fobj):
        '''Rewind a caller-owned ``fobj``; close a file opened by us'''
        if fobj:
            fp.seek(0)
        else:
            fp.close()

    def _render(self, fname, pagenos, html, fobj):
        '''Process the selected pages and return the collected output.

        Returns "" when anything goes wrong while reading the pages.'''
        fp = self._acquire(fname, fobj)
        try:
            device = self.htmldevice if html else self.device
            interpreter = PDFPageInterpreter(self.rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=self.maxpages,
                                          password=self.password,
                                          caching=self.caching,
                                          check_extractable=False):
                page.rotate = (page.rotate + self.rotation) % 360
                interpreter.process_page(page)
            return self.outfp.get()
        except Exception:
            # Deliberate best-effort: unreadable input yields an empty string.
            return ""
        finally:
            # Always clear the shared output object and release the input.
            self.outfp.reset()
            self._release(fp, fobj)

    def GetPageNumber(self, fname, fobj=None):
        '''Get total page number of PDF (0 when the PDF cannot be read)'''
        fp = self._acquire(fname, fobj)
        try:
            return sum(1 for _ in PDFPage.get_pages(fp, set(), maxpages=0,
                                                    password=self.password,
                                                    caching=self.caching,
                                                    check_extractable=False))
        except Exception as e:
            print(e)
            print("Error Reading PDF page number..")
            return 0
        finally:
            self._release(fp, fobj)

    def FastCheck(self, fname, fobj=None):
        '''Fast check whether has page one'''
        fp = self._acquire(fname, fobj)
        try:
            # Only attempt to reach the first page; one iteration is enough.
            for page in PDFPage.get_pages(fp, set([0]), maxpages=1,
                                          password=self.password,
                                          caching=self.caching,
                                          check_extractable=False):
                break
            return True
        except Exception:
            print("Error Reading PDF page number.. %s" % (fname,))
            return False
        finally:
            self._release(fp, fobj)

    def GetSinglePage(self, fname, pageno=1, html=False, fobj=None):
        '''Get Single Page contents of PDF, return string.

        ``pageno`` is 1-based; default first page'''
        return self._render(fname, set([pageno - 1]), html, fobj)

    def GetPages(self, fname, pagenos=(1,), html=False, fobj=None):
        '''Get Several Page contents of PDF, return string.

        ``pagenos`` holds 1-based page numbers; default first page'''
        return self._render(fname, set([i - 1 for i in pagenos]), html, fobj)

    def GetAllPages(self, fname, html=False, fobj=None):
        '''Get All Page contents of PDF, return string'''
        return self._render(fname, set(), html, fobj)
def pdf_gettext(filepath, reserve):
    """Extract text from the first and the last page of a PDF.

    Each of the two pages is rendered to HTML, saved to ``firstout.html`` /
    ``lastout.html`` in the current directory and handed to
    ``html_textparser`` together with the matching result list.

    Args:
        filepath: Path of the PDF file to read.
        reserve: Unused by this function.

    Returns:
        A ``(first, last)`` tuple of the lists filled by ``html_textparser``;
        ``last`` stays empty for a single-page document.

    Raises:
        Exception: If the document does not allow text extraction.
    """
    import tempfile

    password = ''
    codec = 'utf-8'
    caching = True
    firstout = 'firstout.html'
    lastout = 'lastout.html'
    first = []
    last = []

    def dump(buf, path):
        # Copy everything currently in the scratch buffer to ``path``.
        with open(path, 'w', encoding=codec) as f:
            buf.seek(0)
            f.write(buf.read())

    rsrcmgr = PDFResourceManager(caching=caching)
    with tempfile.TemporaryFile(mode='w+t', encoding=codec) as outfp:
        device = HTMLConverter(rsrcmgr, outfp, scale=1, layoutmode='normal',
                               laparams=LAParams())
        try:
            with open(filepath, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(caching=caching)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize(password)
                if not doc.is_extractable:
                    raise Exception('Text extraction is not allowed: %s'
                                    % filepath)

                interpreter = PDFPageInterpreter(rsrcmgr, device)
                firstpage = None
                lastpage = None
                for page in doc.get_pages():
                    if firstpage is None:
                        firstpage = page
                    else:
                        lastpage = page

                if firstpage:
                    interpreter.process_page(firstpage)
                    dump(outfp, firstout)
                    html_textparser(firstout, first)
                if lastpage:
                    # truncate() does not move the stream position, so rewind
                    # first; otherwise the next write lands past the end and
                    # the gap is filled with NUL bytes.
                    outfp.seek(0)
                    outfp.truncate()
                    interpreter.process_page(lastpage)
                    dump(outfp, lastout)
                    html_textparser(lastout, last)
        finally:
            device.close()
    return first, last
def readPDF2HTML(pdfFile, opts=None):
    """Read a PDF from a file object and return it converted to HTML.

    Args:
        pdfFile: Object with a ``read()`` method returning the PDF data.
        opts: Optional pdf2txt-style options, either an iterable of
            ``(flag, value)`` pairs or a mapping of flag to value.
            Honoured flags: ``-p`` (comma separated 1-based pages), ``-m``
            (max pages), ``-P`` (password), ``-n`` (no layout analysis),
            ``-A``, ``-V``, ``-M``, ``-L``, ``-W``, ``-F`` (layout
            parameters), ``-Y`` (layout mode), ``-c`` (codec) and ``-s``
            (scale). Other flags are ignored.

    Returns:
        The HTML produced by the converter, including the footer that the
        converter writes when it is closed.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Defaults are bound BEFORE the option loop: the original referenced
    # several of them inside the loop while they were still unbound, and
    # overwrote codec/pagenos afterwards so the options had no effect.
    password = ''
    pagenos = set()
    maxpages = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()

    # Iterating a dict yields only keys, which the original then unpacked
    # character by character; accept mappings explicitly.
    pairs = opts.items() if hasattr(opts, 'items') else (opts or ())
    for (k, v) in pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    # open a PDF file
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    try:
        # create a PDF parser object associated with the file object
        parser = PDFParser(fp)
        # create a PDF document object
        document = PDFDocument(parser, password)
        # check if document allows text extraction
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # create a PDF resource manager object that stores shared resources
        rsrcmgr = PDFResourceManager()
        # create a PDF device object
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams)
        try:
            # create a PDF interpreter object
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # process each page contained in the document
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password):
                interpreter.process_page(page)
        finally:
            # Close the device before reading the buffer: closing is what
            # writes the HTML footer, which the original output lacked.
            device.close()
    finally:
        fp.close()

    content = retstr.getvalue()
    retstr.close()
    return content
def convertPDF(outfile,pdfFile):
    """Convert one PDF file and write the result to ``outfile``.

    The output format follows the extension of ``outfile``: ``.htm``/``.html``
    gives HTML, ``.xml`` gives XML, ``.tag`` gives tags, anything else gives
    plain text. When ``outfile`` is falsy the result goes to ``sys.stdout``.

    Args:
        outfile: Path of the file to write, or a falsy value for stdout.
        pdfFile: Path of the PDF file to read.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    outtype = 'text'
    if outfile:
        if outfile.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        # outtype is always one of the four values handled here.
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
        try:
            with open(pdfFile, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close the process-wide stdout (the original did).
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and the
            rest are options followed by one or more PDF paths.

    Returns:
        100 after printing the usage message when the options are invalid,
        no input file is given or the output type is unsupported;
        ``None`` after a successful conversion.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    # Without an explicit -t, guess the type from the output extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject a bad -t before touching the output file; the original opened
    # (and truncated) it first and then leaked the handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close the process-wide stdout (the original did).
        if outfp is not sys.stdout:
            outfp.close()
    return
import sys, getopt


# Converts a PDF and returns its content (text or HTML).
def convert(case, fname, pages=None):
    """Convert a PDF file to plain text or HTML.

    Args:
        case: ``'text'`` for plain text or ``'HTML'`` for HTML output.
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``; when falsy an empty set is passed.

    Returns:
        The contents of the output buffer: an ``io.StringIO`` value for
        ``'text'``, an ``io.BytesIO`` value for ``'HTML'``. The buffer is
        read after the converter is closed, so output written on close is
        included.

    Raises:
        ValueError: If ``case`` is neither ``'text'`` nor ``'HTML'`` (the
            original failed later with an unbound-variable error).
    """
    if case not in ('text', 'HTML'):
        raise ValueError("unsupported case %r; expected 'text' or 'HTML'"
                         % (case,))

    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    codec = 'utf-8'
    caching = True

    if case == 'text':
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec,
                                  laparams=LAParams())
    else:
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec,
                                  laparams=LAParams())

    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        converter.close()

    convertedPDF = output.getvalue()
    output.close()
    return convertedPDF


def convert_pdf_to_txt(path_to_file):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, SGML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and the
            rest are options followed by one or more PDF paths.

    Returns:
        100 after printing the usage message when the options are invalid,
        no input file is given or the output type is unsupported;
        ``None`` after a successful conversion.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
               '[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
               '[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
        return 100

    try:
        # ``s:`` was missing although ``-s`` is handled below, and ``D:``
        # was listed twice.
        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:m:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # path option
    cmapdir = find_cmap_path()
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-C': cmapdir = v
        elif k == '-P': password = v
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-o': outfile = v
        elif k == '-s': scale = float(v)
        elif k == '-D': laparams.direction = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    CMapDB.initialize(cmapdir)
    rsrc = PDFResourceManager()

    # Without an explicit -t, guess the type from the output extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.sgml'):
                outtype = 'sgml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject a bad -t before touching the output file.
    if outtype not in ('text', 'sgml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'sgml':
            device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
                                   laparams=laparams)
        else:
            device = TagExtractor(rsrc, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages,
                                password=password)
        finally:
            device.close()
    finally:
        # The original never closed the output file; stdout is left open.
        if outfp is not sys.stdout:
            outfp.close()
    return
def ConvertPdf(pdfpath, outfp, opts=None):
    """Render the PDF at *pdfpath* into the already-open stream *outfp*.

    Args:
        pdfpath: Path of the PDF file to read.
        outfp: Writable file-like object handed to the converter. This
            function never calls ``outfp.close()``.
        opts: Optional pdf2txt-style options, either an iterable of
            ``(flag, value)`` pairs (the shape getopt returns) or a mapping
            of flag to value. ``-t`` selects txt (default), xml, html or
            tag. ``-o`` is accepted but has no effect because output always
            goes to *outfp*. Unknown flags are ignored.

    Returns:
        True once all selected pages have been processed.

    Raises:
        ValueError: If the requested output type is not one of
            txt, xml, html or tag.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    # A mutable default ({}) was shared between calls. Iterating a mapping
    # directly yields keys only, so '-t' used to unpack into ('-', 't') and
    # the option was silently dropped.
    if opts is None:
        opts = ()
    elif hasattr(opts, 'items'):
        opts = opts.items()

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    # Propagate the debug level to the pdfminer classes.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
    if outtype == 'txt':
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec,
                              laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        # Previously this fell through and failed later on an unbound
        # 'device' name.
        raise ValueError('unsupported output type: %r' % (outtype,))

    try:
        with open(pdfpath, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()
    return True
def decode_pdf(filename):
    """Extract title, authors, abstract and keywords from a paper's first page.

    The first page of the PDF is rendered to HTML, the HTML is split on
    ``<br>`` and every piece is passed to ``analyze``. The module-level
    parsing state is reset before the run and read back afterwards.
    NOTE(review): presumably ``analyze`` fills in ``title``, ``authors``,
    ``abstract`` and ``keywords``; confirm against its definition.

    Args:
        filename: Name of the PDF, relative to the paperminer ``papers``
            directory under ``basedir``.

    Returns:
        A list ``[title, authors, abstract, keywords]`` where the three
        strings have been decoded from UTF-8 and ``authors`` is returned
        as stored in the global.
    """
    global current_section
    global pre_section
    global pre_font_family
    global pre_font_size
    global title
    global authors
    global abstract
    global keywords

    # Reset the shared state before parsing.
    current_section = ""
    pre_section = TAG_BEGIN
    pre_font_family = ""
    pre_font_size = ""
    title = ""
    authors = set()
    abstract = ""
    keywords = ""

    path = basedir + "/static/demos/paperminer/papers/" + filename

    # layout parameters
    laparams = LAParams()
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    codec = 'utf-8'
    password = ""
    pagenos = set()
    # Only the first page is processed. maxpages enforces the limit; the
    # old manual counter was never incremented, so its break never fired.
    max_page = 1

    out = StringIO()
    try:
        with open(path, 'rb') as fp:
            device = HTMLConverter(rsrcmgr, out, codec=codec,
                                   laparams=laparams, imagewriter=None)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=max_page,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                device.close()
        # Read the buffer only after the device has been closed, as the
        # original did.
        str_value = out.getvalue()
    finally:
        out.close()

    # Loop through each line of the HTML.
    for line in str_value.split('<br>'):
        analyze(line)

    result = [
        title.decode('utf-8'),
        authors,
        abstract.decode('utf-8'),
        keywords.decode('utf-8'),
    ]
    return result
def main(argv):
    """Batch-convert PDFs matching one or more wildcard specs.

    Each command-line argument is expanded with ``glob`` and every match
    is written next to its source with an extension chosen by the output
    type (html -> .htm, tag -> .tag, text -> .txt, xml -> .xml).

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name; the
            rest are getopt-style options followed by file names or
            wildcard specs. ``-o`` is only consulted to infer the output
            type when ``-t`` is given as an empty string; nothing is
            written to that path.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no file spec is given, or the output type is unknown;
        otherwise None.
    """
    import getopt
    import os.path

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is '
              'either a file name or\na wildcard spec like\n*.pdf\n'
              'Enclose it with quotes if it contains a space\n\n'
              'Additional options are supported with named command line '
              'parameters as follows:')
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Validate up front: the extension lookup used to raise KeyError for an
    # unknown type before the usage() fallback could be reached.
    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    if outtype not in extensions:
        return usage()
    ext = '.' + extensions[outtype]

    # The -o file is deliberately not opened here. Every input gets its own
    # output file, so the old handle was truncated, never written and never
    # closed.
    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' +
              outtype + ' format')
        for pdf in matches:
            # splitext copes with suffixes that are not exactly four
            # characters long, unlike the former pdf[0:-4].
            target = os.path.splitext(pdf)[0] + ext
            print(target)
            with open(target, 'wb') as outfp:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr, outfp, codec=codec,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                          laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr, outfp, codec=codec,
                                           scale=scale,
                                           layoutmode=layoutmode,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(
                                fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    device.close()
    print('Done')
    return
def main(argv):
    """Convert one or more PDF files to text, HTML, XML or tag output.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name; the
            rest are getopt-style options followed by input PDF paths.
            ``-p`` takes comma-separated 1-based page numbers, which are
            stored 0-based. ``-c`` sets the encoding used when ``-o``
            names an output file.

    Returns:
        100 (after printing the usage message) when the options cannot be
        parsed, no input file is given, or the output type is unknown;
        otherwise None once every input has been processed.
    """
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Without -t, default to text unless the output file name implies
    # another format.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject a bad -t before the output file is created.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w', encoding=encoding) if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; callers may still print.
        if outfp is not sys.stdout:
            outfp.close()
    return
# The first statements on the line below are the tail of a helper whose `def`
# is not visible in this part of the file: it appends word.lower() for every
# item of `words` to `new_words` and returns that list.
#
# convert_to_html(case, fname, pages=None) builds the set of page numbers
# (empty when `pages` is falsy) and, when case == 'HTML', feeds every selected
# page of the file `fname` through an HTMLConverter that writes into an
# io.BytesIO buffer, then closes the input file, the converter and the buffer.
# NOTE(review): `convertedPDF` is read from the buffer before converter.close()
# runs and is not returned in the code visible here, and `codec` is assigned
# but never used. The function may be truncated; confirm against the full
# source before relying on it.
for word in words: new_word = word.lower() new_words.append(new_word) return new_words def convert_to_html(case, fname, pages=None): if not pages: pagenums = set() else: pagenums = set(pages) manager = PDFResourceManager() codec = 'utf-8' caching = True if case == 'HTML': output = io.BytesIO() converter = HTMLConverter(manager, output, laparams=LAParams()) interpreter = PDFPageInterpreter(manager, converter) infile = open(fname, 'rb') for page in PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True): interpreter.process_page(page) convertedPDF = output.getvalue() infile.close() converter.close() output.close()
def convert(argv):
    """Convert PDFs to text and write a Braille rendering of the result.

    Input files are read from ``pdfs/``. The converted output goes to
    ``inputs/<stem>.txt`` unless ``-o`` names another file, is then passed
    through ``brl.translate`` and ``brl.toUnicodeSymbols``, and the result
    is saved as ``results/<stem>-Braille.txt``. ``<stem>`` is the first
    input file name with spaces removed and its last four characters
    dropped.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name; the
            rest are getopt-style options followed by input file names.

    Returns:
        100 (after printing the usage message) when the options cannot be
        parsed, no input file is given, or the output type is unknown;
        otherwise None.
    """
    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # Name the outputs after the first input file. The old code read
    # sys.argv[1], which ignored the argv parameter and picked up an option
    # flag whenever options preceded the file name.
    stem = args[0].replace(' ', '')[:-4]
    default_outfile = 'inputs/' + stem + '.txt'

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = default_outfile
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject a bad -t before the output file is created.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open('pdfs/' + fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close what this function opened; stdout stays usable.
        if outfile:
            outfp.close()

    # Read back the file that was actually written. It differs from the
    # default path only when -o was given; an empty -o falls back to the
    # default path exactly as before.
    with open(outfile or default_outfile, "r") as converted:
        output = brl.translate(converted.read())

    # convert into Grade 2 Braille unicode
    x = brl.toUnicodeSymbols(output, flatten=True)

    # save to results folder in .txt format
    with open("results/" + stem + "-Braille.txt", "w") as text_file:
        text_file.write(x.encode(codec))