def pdf_to_html(path):
    """Render the PDF at *path* to HTML and return its font table.

    The document is converted in memory with pdfminer's ``HTMLConverter``
    and the resulting markup is passed to ``extract_font_table``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        Whatever ``extract_font_table`` returns for the generated markup.
    """
    manager = PDFResourceManager()
    retstr = BytesIO()
    layout = LAParams(all_texts=True)
    device = HTMLConverter(manager, retstr, laparams=layout)
    interpreter = PDFPageInterpreter(manager, device)

    # The context manager releases the PDF handle even when pdfminer raises
    # part-way through a damaged document.
    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, check_extractable=True):
            interpreter.process_page(page)

    # The markup is captured before device.close(), exactly as before, so
    # anything the converter emits on close never reaches the font parser.
    text = retstr.getvalue()
    device.close()
    retstr.close()

    return extract_font_table(text)
def convertPDF(fname, pages=None):
    """Render selected pages of a PDF file as a single HTML string.

    Args:
        fname: Path of the PDF file to convert.
        pages: Optional iterable of page numbers, passed to
            ``PDFPage.get_pages`` as a set; a falsy value selects all pages.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    pagenos = set(pages) if pages else set()
    caching = True
    rotation = 0

    outfp = StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                           layoutmode='normal', laparams=LAParams(),
                           imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() replaces the Python-2-only file(); the with-block closes the
    # handle even if a page fails to parse.
    with open(fname, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password='',
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)

    device.close()
    text = outfp.getvalue()
    outfp.close()
    return text
def convert_pdf_to_html(self):
    """Convert the repository's temporary PDF to HTML.

    Reads ``parliament/repository/temp.pdf`` below ``self.path`` and stores
    the converter output (the raw contents of the ``BytesIO`` buffer) in
    ``self.text``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block guarantees the temporary PDF is closed on every path.
    with open(self.path + 'parliament/repository/temp.pdf', 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    self.text = retstr.getvalue()
    retstr.close()
def pdf_to_text(path):
    """Extract the text of the PDF at *path* via an HTML intermediate.

    The PDF is rendered to HTML with pdfminer, parsed with BeautifulSoup,
    and the parsed document's ``.text`` is passed through ``clean_string``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The result of ``clean_string`` applied to the extracted text.
    """
    manager = PDFResourceManager()
    retstr = BytesIO()
    layout = LAParams(all_texts=True)
    device = HTMLConverter(manager, retstr, layoutmode='normal',
                           laparams=layout, imagewriter=None)
    interpreter = PDFPageInterpreter(manager, device)

    # Close the PDF handle even when a page fails to parse.
    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, check_extractable=True):
            interpreter.process_page(page)

    # Captured before device.close(), as before, so anything the converter
    # writes on close does not leak into the extracted text.
    text = retstr.getvalue()
    device.close()
    retstr.close()

    from BeautifulSoup import BeautifulSoup
    parsed_html = BeautifulSoup(text)
    return clean_string(parsed_html.text)
def convert_pdf_to_html(self, fname, pages=None, skip_first=True):
    """Convert a PDF file to HTML bytes, printing progress as it goes.

    Args:
        fname: Path of the PDF file to convert.
        pages: Optional iterable of page numbers, passed to
            ``PDFPage.get_pages`` as a set; a falsy value selects all pages.
        skip_first: When true, the first two pages yielded by pdfminer
            (loop indices 0 and 1) are not rendered.

    Returns:
        The UTF-8 encoded HTML document as ``bytes``.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    output = io.BytesIO()
    converter = HTMLConverter(manager, output, codec='utf-8',
                              laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # The with-block closes the input even if a page raises.
    with open(fname, 'rb') as infile:
        print('Processing Page # :', end=' ')
        page_iter = PDFPage.get_pages(infile, pagenums, caching=True,
                                      check_extractable=True)
        for i, page in enumerate(page_iter):
            if skip_first and i in (0, 1):
                continue
            print(i, end=',')
            interpreter.process_page(page)

    # Close the converter *before* reading the buffer: HTMLConverter writes
    # the closing markup of the document in close(), so reading first
    # returned a truncated HTML page.
    converter.close()
    convertedPDF = output.getvalue()
    output.close()
    return convertedPDF
def extract_pdf_page(filename):
    """Convert a PDF to HTML, save it under ``HTML_PATH`` and return it.

    The output is written to ``HTML_PATH/<stem>/<stem>.html`` where
    ``<stem>`` is the input file name without its extension; the folder is
    created when missing.

    Args:
        filename: Path of the PDF file to convert.

    Returns:
        The generated HTML document as ``str``.
    """
    input_file_name = Path(filename).stem

    # Paths for creating the output folder and file.
    output_file_folder = Path(HTML_PATH, input_file_name)
    output_file_folder.mkdir(parents=True, exist_ok=True)
    output_file_path = Path(output_file_folder, input_file_name + ".html")

    # HTMLConverter encodes its output when a codec is set (the default is
    # 'utf-8'), so the sink must accept bytes; a StringIO sink rejects them.
    # The buffer is decoded once at the end to keep returning text.
    buffer = io.BytesIO()
    rsrcmgr = PDFResourceManager()
    device = HTMLConverter(rsrcmgr, buffer, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    with open(filename, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    device.close()
    html = buffer.getvalue().decode('utf-8')
    buffer.close()

    with open(output_file_path, 'w', encoding="utf-8") as fd:
        fd.write(html)

    return html
def pdf_to_html(scraped_pdf_data):
    """Convert in-memory PDF data to an HTML string.

    Args:
        scraped_pdf_data: The raw contents of a PDF document.

    Returns:
        The HTML text produced by pdfminer's ``HTMLConverter``.
    """
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams
    import StringIO

    # Wrap the raw data in a file-like object positioned at its start.
    pdf_stream = StringIO.StringIO()
    pdf_stream.write(scraped_pdf_data)
    pdf_stream.seek(0)

    html_stream = StringIO.StringIO()
    resources = PDFResourceManager()
    layout = LAParams(char_margin=0.5, line_margin=0.5, word_margin=0.3,
                      boxes_flow=0)
    converter = HTMLConverter(resources, html_stream, layoutmode='normal',
                              scale=2, laparams=layout)

    process_pdf(resources, converter, pdf_stream)
    converter.close()

    html = html_stream.getvalue()
    html_stream.close()
    pdf_stream.close()
    return html
def pdfTotxt(filepath, outpath):
    """Convert the PDF at *filepath* to HTML written to *outpath*.

    Any exception raised during conversion is caught and reported on
    standard output; nothing is returned.

    Args:
        filepath: Path of the PDF file to read.
        outpath: Path of the output file to create or overwrite.
    """
    try:
        # Both handles are closed by the with-block, including on the
        # error path where they were previously left open.
        with open(filepath, 'rb') as fp, open(outpath, 'w') as outfp:
            # Resource manager that stores shared resources;
            # caching=False disables caching.
            rsrcmgr = PDFResourceManager(caching=False)
            # PDF device object that renders pages as HTML.
            laparams = LAParams()
            device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                                   laparams=laparams, imagewriter=None)
            # Interpreter that feeds page contents to the device.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0,
                                          password='', caching=False,
                                          check_extractable=True):
                page.rotate = page.rotate % 360
                interpreter.process_page(page)
            device.close()
    except Exception as e:
        # The message used to be printed with a literal "%s"; apply the
        # format so the exception text actually appears in it.
        print("Exception:%s" % (e,))
def html(self):
    """Convert the PDF at ``self.__filename`` to HTML and return the text.

    The HTML is rendered into a uniquely named file below ``cache/html/``
    (created on demand) and then read back as UTF-8 text. Conversion is
    best effort: if pdfminer fails part-way, whatever was written so far is
    still returned.

    Returns:
        The HTML text, or ``None`` when the source file does not exist.
    """
    if not os.path.isfile(self.__filename):
        return None

    output_file = 'cache/html/' + str(uuid.uuid4()) + '.html'
    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    rsrcmgr = PDFResourceManager()
    with open(output_file, 'wb') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                               laparams=LAParams(), layoutmode='normal',
                               text_colors={})
        with open(self.__filename, 'rb') as fp:
            # noinspection PyBroadException
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, None, maxpages=0):
                    interpreter.process_page(page)
            except Exception:
                # Deliberate best effort: keep the partial output. Only
                # Exception is caught so Ctrl-C and interpreter shutdown
                # are no longer swallowed.
                pass
        device.close()

    if not os.path.isfile(output_file):
        # Previously a placeholder ``True`` leaked out on this path.
        return None
    with open(output_file, "r", encoding='utf-8') as cached:
        return cached.read()
def parse_html(file_name):
    """Render the PDF *file_name* as HTML and return the collected text.

    The converter writes into a ``TextReciver`` sink; its ``text``
    attribute is returned once conversion has finished.

    Args:
        file_name: Path of the PDF file to convert.

    Returns:
        The ``text`` attribute of the ``TextReciver`` sink.
    """
    # Input options.
    password = ''
    pagenos = set()
    maxpages = 0
    # Output options.
    rotation = 0
    caching = True

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                           laparams=LAParams(), imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() replaces the Python-2-only file(); the with-block closes the
    # handle even if a page fails to parse.
    with open(file_name, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)

    device.close()
    return outfp.text
def process_pdf(in_path, out_path):
    """
    Processes a PDF and extracts its contents to HTML.

    Args:
        in_path: The full path to the source PDF file.
        out_path: The full path to the destination HTML file.
    """
    page_numbers = set()

    # Both handles are managed by the with-block so neither is left open
    # when pdfminer raises on a malformed document.
    with open(in_path, 'rb') as in_file, open(out_path, 'w') as out_file:
        # Set up the resource manager, device, and interpreter.
        res_mgr = PDFResourceManager()
        device = HTMLConverter(res_mgr, out_file, codec='utf-8',
                               laparams=LAParams(), imagewriter=None)
        interpreter = PDFPageInterpreter(res_mgr, device)

        for page in PDFPage.get_pages(in_file, page_numbers, maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

        # Close the device while the output file is still open.
        device.close()
    return
def convert_pdf_to_html(path):
    """Convert the PDF at *path* to an HTML string.

    Layout analysis uses ``char_margin=3.5`` and ``all_texts=True``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams(char_margin=3.5, all_texts=True)
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # maxpages=0 means "all pages". The with-block closes the PDF handle
    # even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    html = retstr.getvalue()  # renamed from ``str``: do not shadow builtin
    retstr.close()
    return html
def convert_pdf_to_html(path):
    """Convert the PDF at *path* to an HTML string.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The HTML document as ``str``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    device = HTMLConverter(rsrcmgr, retstr, codec=codec,
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # maxpages=0 means "all pages". The with-block closes the PDF handle
    # even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    html = retstr.getvalue()
    retstr.close()

    # str(bytes) on Python 3 yields the "b'...'" repr rather than the
    # document; decode instead. Where bytes *is* str (Python 2) the value
    # is returned unchanged, as before.
    if not isinstance(html, str):
        html = html.decode(codec)
    return html
def pdftohtml(page):
    """Render a single pdfminer page object as an HTML string.

    Images are not written to disk; each one is looked up in ``db`` by the
    MD5 hash of its raw stream data and replaced by the name
    ``<tabname>.<id>`` of the stored record, or ``"undefined"`` when the
    image has no stream or is unknown to ``db``.

    Args:
        page: The page object to pass to ``PDFPageInterpreter``.

    Returns:
        The generated HTML decoded as UTF-8 text.
    """
    output = BytesIO()
    manager = PDFResourceManager()

    class imagewriter(object):
        """Image sink that maps images to records already held in ``db``."""

        @staticmethod
        def export_image(img):
            if not img.stream:
                return "undefined"
            imhash = md5(img.stream.get_rawdata()).hexdigest()
            imgobj = db.get_imgbyhash(imhash)
            # Compare by value: ``is not`` on a string literal only worked
            # by accident of string interning.
            if imgobj != "undefined":
                return imgobj["tabname"] + "." + str(imgobj["id"])
            return "undefined"

    converter = HTMLConverter(manager, output, laparams=LAParams(),
                              imagewriter=imagewriter)
    interpreter = PDFPageInterpreter(manager, converter)
    interpreter.process_page(page)
    converter.close()

    text = output.getvalue().decode("utf-8")
    output.close()
    return text
def read_pages(self, path, html=False, laparams=None, maxpages=0,
               page_numbers=None, password="", scale=1.0, rotation=0,
               layoutmode='normal', output_dir=None, strip_control=False,
               debug=False, disable_caching=False, **kwargs):
    """Return the content of each page of a PDF as a list of strings.

    Every page is rendered with its own converter, so each list element is
    a self-contained text (or HTML) rendering of one page.

    Args:
        path: Path of the PDF file to read.
        html: Render pages with ``HTMLConverter`` instead of
            ``TextConverter``.
        laparams: Layout parameters passed to the converter.
        maxpages: Maximum number of pages to read; 0 reads all of them.
        page_numbers: Optional iterable of page numbers to restrict the
            output to; ``None`` or empty selects all pages.
        password: Password passed to ``PDFPage.get_pages``.
        scale: Scale passed to ``HTMLConverter``.
        rotation: Degrees added to each page's rotation (modulo 360).
        layoutmode: Layout mode passed to ``HTMLConverter``.
        output_dir, strip_control, debug, **kwargs: Accepted but not used
            by this implementation.
        disable_caching: When true, pdfminer resource caching is disabled.

    Returns:
        A list with one string per processed page.
    """
    # These arguments used to be accepted and then silently ignored
    # (hard-coded caching=True, pagenos=None, maxpages=0). With the default
    # arguments the calls below are identical to the previous ones.
    caching = not disable_caching
    pagenos = set(page_numbers) if page_numbers else None

    rsrcmgr = PDFResourceManager(caching=caching)
    pages = []
    with open(path, "rb") as f:
        for page in PDFPage.get_pages(f, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            text = StringIO()
            if html:
                device = HTMLConverter(rsrcmgr, text, codec=None,
                                       scale=scale, layoutmode=layoutmode,
                                       laparams=laparams)
            else:
                device = TextConverter(rsrcmgr, text, codec=None,
                                       laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            # Close the device before reading the buffer so that anything
            # the converter writes on close (the HTML closing markup) is
            # part of the page's output.
            device.close()
            pages.append(text.getvalue())
            text.close()
    return pages
def readPDF(pdfFile):
    """Convert an already opened PDF to HTML, logging progress stages.

    Args:
        pdfFile: File object of the PDF, passed to ``PDFPage.get_pages``.
            It is not closed here.

    Returns:
        The contents of the ``BytesIO`` buffer the converter wrote to.
    """
    resources = PDFResourceManager()
    html_buffer = BytesIO()

    print("stage1")
    html_device = HTMLConverter(resources, html_buffer, codec='utf-8',
                                laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, html_device)

    print("stage2")
    all_pages = PDFPage.get_pages(pdfFile, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
    for pdf_page in all_pages:
        page_interpreter.process_page(pdf_page)
    html_device.close()

    print("stage3")
    convertedPDF = html_buffer.getvalue()

    print("stage4")
    html_buffer.close()
    return convertedPDF
def pdf_para_html(self, path):
    """Convert the PDF at *path* to an HTML string.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # maxpages=0 means "all pages". The with-block closes the PDF handle
    # even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    html = retstr.getvalue()  # renamed from ``str``: do not shadow builtin
    retstr.close()
    return html
def pdf_to_text(path):
    """Convert the PDF at *path* to HTML, save it and return it.

    The generated document is also written to ``Output.txt`` in the
    current working directory.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the ``BytesIO`` buffer the converter wrote to.
    """
    manager = PDFResourceManager(caching=True)
    retstr = BytesIO()
    device = HTMLConverter(manager, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, device)

    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    text = retstr.getvalue()
    retstr.close()

    # Write the bytes as they are: str(bytes) on Python 3 produced the
    # "b'...'" repr instead of the document itself.
    with open("Output.txt", "wb") as text_file:
        text_file.write(text)
    return text
def convertPDFToHTMLPage(bookPath):
    """Convert a PDF to an HTML file stored next to it.

    The output path is *bookPath* with its extension replaced by
    ``.html``; the path is reported on standard output when done.

    Args:
        bookPath: Path of the PDF file to convert.
    """
    import os

    rotation = 0

    # Derive the output name from the extension only. A plain
    # replace('.pdf', '.html') left the path unchanged for names such as
    # "BOOK.PDF", so the input itself was opened for writing and truncated.
    outfile = os.path.splitext(bookPath)[0] + '.html'

    rsrcmgr = PDFResourceManager()
    # Both handles are released by the with-block, even on error.
    with open(outfile, 'w') as outfp, open(bookPath, 'rb') as fp:
        device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                               layoutmode='normal', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, password="",
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        device.close()

    # Same output as the former print statement, in a form that is valid
    # on both Python 2 and Python 3.
    print("HTML output written to :  %s" % (outfile,))
def convert(fp):
    """Render the PDF read from *fp* as HTML.

    Args:
        fp: File object of the PDF, passed to ``process_pdf``. It is not
            closed here.

    Returns:
        The contents of the ``StringIO2`` buffer the converter wrote to.
    """
    resources = PDFResourceManager(caching=False)

    html_buffer = StringIO2()
    html_buffer.encoding = 'utf-8'

    converter = HTMLConverter(resources, html_buffer, scale=1,
                              layoutmode='normal', laparams=LAParams(),
                              outdir=None, debug=False)
    process_pdf(resources, converter, fp, set(), maxpages=0, password='',
                caching=False, check_extractable=True)
    converter.close()

    return html_buffer.getvalue()
def convertPDF(fname, pages=None):
    """Render selected pages of a PDF file as a single HTML string.

    Args:
        fname: Path of the PDF file to convert.
        pages: Optional iterable of page numbers, passed to
            ``PDFPage.get_pages`` as a set; a falsy value selects all pages.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    pagenos = set(pages) if pages else set()
    caching = True
    rotation = 0

    outfp = StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                           layoutmode='normal', laparams=LAParams(),
                           imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() replaces the Python-2-only file(); the with-block closes the
    # handle even if a page fails to parse.
    with open(fname, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password='',
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)

    device.close()
    text = outfp.getvalue()
    outfp.close()
    return text
def readText(self, path, outtype='text', opts=None):
    """Convert the PDF at *path* to HTML and print it to standard output.

    Args:
        path: Filesystem path of the PDF to read.
        outtype: Accepted for compatibility; the output is always produced
            with ``HTMLConverter``.
        opts: Optional pdf2txt-style options, either an iterable of
            ``(flag, value)`` pairs or a mapping of flag to value.
            Honoured flags: ``-p`` (comma-separated 1-based page numbers),
            ``-m`` (max pages), ``-P`` (password), ``-n`` (disable layout
            analysis), ``-A``, ``-V``, ``-M``, ``-L``, ``-W``, ``-F``
            (layout parameters) and ``-c`` (codec). Other flags are
            ignored.

    Returns:
        None. The layout parameters and the HTML are printed.
    """
    pagenos = set()
    maxpages = 0
    # ``password`` used to be defined only when -P was supplied, so every
    # call without it failed with a NameError at process_pdf().
    password = ''
    codec = 'utf-8'
    laparams = LAParams()

    if opts is None:
        opts = ()
    elif hasattr(opts, 'items'):
        # Iterating a mapping directly would yield bare keys.
        opts = opts.items()

    for (k, v) in opts:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-c':
            codec = v
        # -d, -o, -Y, -O, -t and -s only ever set locals that were never
        # read again (and -d crashed on an undefined counter), so they are
        # now simply skipped.

    print(laparams)

    CMapDB.debug = self.debug
    PDFResourceManager.debug = self.debug
    PDFPageInterpreter.debug = self.debug
    PDFDevice.debug = self.debug

    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)

    # The with-block closes the PDF handle even if conversion fails.
    with open(path, 'rb') as fp:
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    password=password, check_extractable=True)

    device.close()
    print(outfp.getvalue())
    outfp.close()
    return
def convert_pdf_to_html(input, output="temp.html"):
    """
    Convert a PDF file to HTML.

    :param input: PDF File to be converted,
    :param output: output filename, default is temp.html; a falsy value
        sends the HTML to standard output instead
    :return: doesn't return anything
    """
    debug = 0
    password = b''
    pagenos = set()
    maxpages = 0
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # HTMLConverter encodes its output when a codec is set (and 'utf-8' is
    # its default), so it needs a binary sink; the former text-mode file
    # rejected the bytes it was given.
    if output:
        outfp = open(output, 'wb')
    else:
        outfp = getattr(sys.stdout, 'buffer', sys.stdout)

    try:
        device = HTMLConverter(rsrcmgr, outfp, codec=encoding, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
        with open(input, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        device.close()
    finally:
        if output:
            outfp.close()
        else:
            # Never close the process-wide stdout; just flush our output.
            outfp.flush()
    return
def lerPDF(arquivo):
    """Render the PDF read from *arquivo* as HTML.

    Args:
        arquivo: File object of the PDF, passed to ``process_pdf``. It is
            not closed here.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    resource_manager = PDFResourceManager()
    html_buffer = StringIO()
    converter = HTMLConverter(resource_manager, html_buffer,
                              laparams=LAParams())

    process_pdf(resource_manager, converter, arquivo)
    converter.close()

    html = html_buffer.getvalue()
    html_buffer.close()
    return html
def to_html(self, fp):
    """Render the PDF read from *fp* as HTML and return the markup.

    The converter is configured from ``self.options`` (codec, scale,
    layoutmode, laparams) and driven by ``self._process``.
    """
    html_buffer = StringIO.StringIO()
    converter = HTMLConverter(
        self.resmgr,
        html_buffer,
        codec=self.options.codec,
        scale=self.options.scale,
        layoutmode=self.options.layoutmode,
        laparams=self.options.laparams,
        outdir=None,
    )
    self._process(fp, converter)
    converter.close()

    markup = html_buffer.getvalue()
    html_buffer.close()
    return markup
def to_html(self, fp):
    """Convert the PDF stream *fp* to an HTML string.

    Uses ``self.resmgr`` as resource manager, takes the converter settings
    from ``self.options`` and delegates page processing to
    ``self._process``.
    """
    sink = StringIO.StringIO()
    html_device = HTMLConverter(self.resmgr, sink,
                                codec=self.options.codec,
                                scale=self.options.scale,
                                layoutmode=self.options.layoutmode,
                                laparams=self.options.laparams,
                                outdir=None)

    self._process(fp, html_device)
    html_device.close()

    html_text = sink.getvalue()
    sink.close()
    return html_text
def transform_file(self, pdfpath):
    """Convert a PDF to a stub record holding its HTML and plain text.

    Args:
        pdfpath: Filesystem path of the PDF to read.

    Returns:
        A dict with ``markup`` (``innerHTML`` / ``innerText``),
        ``workflow``, ``__text_size`` and ``timestamp`` on success. If
        anything raises, a dict with ``error``, ``workflow`` and
        ``__text_size`` set to -1 is returned instead.
    """
    try:
        self.LOGGER.debug(pdfpath)
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=self.laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # The with-block closes the PDF even when a page fails; previously
        # the handle stayed open whenever the except branch was taken.
        with open(pdfpath, 'rb') as fp:
            # NOTE check_extractable seems to allow overriding text
            # extraction locks
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=False):
                interpreter.process_page(page)

        device.close()
        html = retstr.getvalue()
        # otherwise html is str at this point, not unicode
        html = html.decode('utf8')
        retstr.close()

        soup = BeautifulSoup(html)
        text_size = len(soup.text)
        stub_data = {
            "markup": {
                "innerHTML": unicode(html),
                "innerText": unicode(soup.text)
            },
            "workflow": {
                "is_stub": True
            },
            "__text_size": text_size,  # __fields are ignored by kibana
            "timestamp": datetime.now()
        }
    except Exception as e:
        stub_data = {
            "error": str(e),
            "workflow": {
                "is_stub": True
            },
            "__text_size": -1
        }
    return stub_data
def convert_pdf(path):
    """Convert the PDF at *path* to an HTML string.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())

    # open() replaces the Python-2-only file(); the with-block closes the
    # handle even if conversion fails.
    with open(path, 'rb') as fp:
        process_pdf(rsrcmgr, device, fp)

    device.close()
    html = retstr.getvalue()  # renamed from ``str``: do not shadow builtin
    retstr.close()
    return html
def extract_price_from_pdf(file_name):
    """Extract the prices found in a PDF, ordered by vertical position.

    The PDF is rendered to HTML into a ``TextReciver`` sink. Lines that
    carry position information and end with a price are located with a
    regular expression, together with directly following ``<br>`` lines
    (at most 100 characters before the price) that carry a price but no
    new position.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A tuple of prices (floats) sorted by their ``top:`` pixel position;
        empty when no price is found.
    """
    pagenos = set()
    rotation = 0

    rsrcmgr = PDFResourceManager(caching=True)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                           laparams=LAParams(), imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Read the file; the with-block closes it even if a page fails.
    with open(file_name, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, caching=True,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()

    # Raw strings keep the patterns identical for the regex engine while
    # avoiding invalid-escape warnings for "\.".
    matches = re.finditer(r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
                          r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
                          outfp.text)
    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')
        # Extract the position information from the first line.
        ypos = re.findall(r'(.*top:)([0-9]+)(px)', line_group[0])[0][1]
        # Iterate over all lines and extract the price. The position is
        # incremented slightly for each new line so order is preserved.
        for i, price_text in enumerate(line_group):
            # Searching the reversed line yields the last price on it.
            price = float(re.findall(r'[0-9]{1,2}\.[0-9]{1,2}',
                                     price_text[::-1])[0][::-1])
            ypos = int(ypos) + i
            pos_list.append((ypos, price))

    # The collected positions used to be discarded; return the prices in
    # page order.
    pos_list.sort()
    return tuple(price for _, price in pos_list)
def get_html_agenda_pdfminer(agendaloc):
    """Convert a PDF agenda to HTML using pdfminer.

    pdfminer doesn't give very clean output, so this is optional and the
    pdfminer imports are done inside the function.
    Probably better: pdftohtml -c -s -i -noframes abc.pdf abc.html

    Args:
        agendaloc: Path of a local PDF file. If no such file exists, the
            value is fetched with ``requests.get`` instead.

    Returns:
        The contents of the ``BytesIO`` buffer the converter wrote to.
    """
    # Function-scope imports keep pdfminer an optional dependency. The
    # former "except UnboundLocalError" trick fired on every call anyway,
    # because these names are local to the function.
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import HTMLConverter

    try:
        fh = open(agendaloc, 'rb')
    except FileNotFoundError:
        response = requests.get(agendaloc, stream=True)
        # response.raw supposedly gives a file handle,
        # but it's not seekable and pdfminer needs to seek.
        fh = io.BytesIO(response.content)

    resource_manager = PDFResourceManager()
    # The fake file object needs to be BytesIO for HTMLConverter.
    fake_file_handle = io.BytesIO()
    converter = HTMLConverter(resource_manager, fake_file_handle,
                              laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    # Both real files and BytesIO are context managers; the input is
    # released even if a page fails to parse.
    with fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)

    # Close the converter before reading the buffer: HTMLConverter writes
    # the closing markup of the document in close().
    converter.close()
    text = fake_file_handle.getvalue()
    fake_file_handle.close()
    return text
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF and save it next to the PDF as ``.txt``.

    The PDF is rendered to HTML with pdfminer, reduced to plain text with
    ``html_text.extract_text`` and written to *pdf_path* with its
    extension replaced by ``.txt``. Nothing is returned.

    Args:
        pdf_path: Path of the PDF file to read.
    """
    import os

    resource_manager = PDFResourceManager()
    fake_file_handle = io.BytesIO()
    converter = HTMLConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)

    # Captured before converter.close(), as before, so anything the
    # converter writes on close does not end up in the extracted text.
    text = fake_file_handle.getvalue().decode()
    converter.close()
    fake_file_handle.close()

    text = html_text.extract_text(text)

    # Derive the output name from the extension only. With
    # replace('.pdf', '.txt') a path without a lowercase ".pdf" stayed
    # unchanged and the PDF itself was overwritten with the text.
    txt_path = os.path.splitext(pdf_path)[0] + '.txt'
    with open(txt_path, 'w') as out:
        out.write(text)
def extract_text_from_pdf(pdf_path):
    """Convert the PDF at *pdf_path* to HTML bytes.

    As a side effect, the module-level global ``path`` is set to
    *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The HTML document as ``bytes``, or ``None`` if the converter
        produced no output.
    """
    global path
    path = pdf_path

    resource_manager = PDFResourceManager()
    fake_file_handle = io.BytesIO()
    converter = HTMLConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)

    # Close the converter before reading the buffer: HTMLConverter writes
    # the closing markup of the document in close(), so reading first
    # returned a truncated page.
    converter.close()
    text = fake_file_handle.getvalue()
    fake_file_handle.close()

    if text:
        return text
    return None
def pdf_to_html(path):
    """Convert the PDF at *path* to an HTML document.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the ``BytesIO`` buffer the converter wrote to.
    """
    manager = PDFResourceManager()
    retstr = BytesIO()
    layout = LAParams(all_texts=True)
    device = HTMLConverter(manager, retstr, laparams=layout)
    interpreter = PDFPageInterpreter(manager, device)

    # The with-block releases the PDF handle even when a page fails.
    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, check_extractable=True):
            interpreter.process_page(page)

    # Close the device before reading the buffer: HTMLConverter writes the
    # closing markup of the document in close(), so reading first returned
    # a truncated page.
    device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
def get_soup(codebook_path: str):
    """Parse a PDF codebook into a BeautifulSoup document.

    The PDF is rendered to HTML with pdfminer and parsed with the
    ``html.parser`` backend.

    Args:
        codebook_path: Path of the PDF file to read.

    Returns:
        The parsed ``BeautifulSoup`` object, or ``None`` when the
        converter produced no output.
    """
    resource_manager = PDFResourceManager()
    # HTMLConverter encodes its output when a codec is set (the default is
    # 'utf-8'), so the sink must accept bytes; a StringIO sink rejects
    # them. The buffer is decoded once below.
    file_handle = io.BytesIO()
    converter = HTMLConverter(resource_manager, file_handle, codec='utf-8')
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    # The with-block closes the PDF; no explicit close() is needed.
    with open(codebook_path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file):
            page_interpreter.process_page(page)

    # Captured before converter.close(), as before, so the parsed document
    # contains only what was emitted while processing pages.
    text = file_handle.getvalue().decode('utf-8')
    converter.close()
    file_handle.close()

    if text:
        return BeautifulSoup(text, features='html.parser')
    return None
def convert(i, doc_list, path1, path2):
    """Convert one PDF of a batch to an HTML file.

    Args:
        i: Index into *doc_list* of the document to convert.
        doc_list: Sequence of PDF file names.
        path1: Prefix (directory, including trailing separator) that is
            prepended to the PDF file name.
        path2: Prefix that is prepended to the HTML file name.

    Returns:
        None. Progress messages are printed before and after conversion.
    """
    filePDF = doc_list[i]
    print("Working with doc: {}".format(filePDF))

    # NOTE(review): this replaces every "pdf" substring of the name, not
    # just the extension ("pdf_report.pdf" -> "html_report.html"). Kept
    # as is because existing outputs are named this way.
    fileHTML = filePDF.replace('pdf', 'html')
    pathin = path1 + filePDF
    pathout = path2 + fileHTML

    manager = PDFResourceManager()
    # BytesIO is used because the converter emits encoded bytes.
    output = io.BytesIO()
    converter = HTMLConverter(manager, output, codec='utf-8',
                              laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # Process each page contained in the document; the with-block closes
    # the input even if a page fails.
    with open(pathin, 'rb') as infile:
        for page in PDFPage.get_pages(infile, set(), caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    # Close the converter before reading the buffer: HTMLConverter writes
    # the closing markup of the document in close().
    converter.close()
    convertedPDF = output.getvalue()
    output.close()

    with open(pathout, "wb") as fileConverted:
        fileConverted.write(convertedPDF)

    print("Done with file: {} numbered: {}".format(fileHTML, i))
    return
def convert_pdf_to_html(path):
    """Convert the PDF at *path* to an HTML string.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() replaces the Python-2-only file(); the with-block closes the
    # handle even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    html = retstr.getvalue()  # renamed from ``str``: do not shadow builtin
    retstr.close()
    return html
def convert_pdf_to_txt(self):
    """Convert every file in ``self.allfiles`` to HTML text.

    File names are resolved relative to ``self.path``. For each document
    its name, the length of the result and its first characters are
    printed.

    Returns:
        A list with one UTF-8 decoded HTML string per input file, in the
        order of ``self.allfiles``.
    """
    pdf_folder = Path(self.path)
    list_txt = list()

    for files in self.allfiles:
        rsrcmgr = PDFResourceManager()
        retstr = BytesIO()
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        print(files)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # The with-block closes each PDF even when one of them fails, so
        # a bad file in a large batch no longer leaks a handle.
        with open(pdf_folder / files, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)

        # Captured before device.close(), as before.
        text = retstr.getvalue().decode('utf-8')
        print("Longueur du doc : ")
        print(len(text))
        print("Début du doc : ")
        print(text[1:100])
        list_txt.append(text)
        print(len(list_txt))

        device.close()
        retstr.close()

    return list_txt
def convert(fname, pages=None):
    """Render selected pages of a PDF file as a single HTML string.

    Args:
        fname: Path of the PDF file to convert.
        pages: Optional iterable of page numbers, passed to
            ``PDFPage.get_pages`` as a set; a falsy value selects all pages.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = HTMLConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # open() replaces the Python-2-only file(); the with-block closes the
    # handle even if a page fails to parse.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)

    converter.close()
    text = output.getvalue()
    # The original wrote ``output.close`` without parentheses, which only
    # referenced the method and never closed the buffer.
    output.close()
    return text
def parse(in_stream, out_stream):
    """Convert the PDF file *in_stream* to the HTML file *out_stream*.

    Despite their names both arguments are paths: they are opened with
    ``io.open``.

    Args:
        in_stream: Path of the PDF file to read.
        out_stream: Path of the HTML file to create or overwrite.

    Returns:
        *out_stream*, unchanged.
    """
    debug = False
    # Input options.
    password = ''
    pagenos = set()
    maxpages = 0
    # Output options.
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    if debug:
        set_debug_logging()

    rsrcmgr = PDFResourceManager(caching=caching)

    # Both files are managed by with-blocks so neither handle is leaked
    # when conversion raises.
    with io.open(out_stream, 'wt', encoding=codec,
                 errors='ignore') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               outdir=outdir, debug=debug)
        with io.open(in_stream, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, caching=caching,
                        check_extractable=True)
        # Close the device while the output file is still open.
        device.close()

    return out_stream
def convert_pdf_to_html(path, save=True):
    """Convert the PDF at *path* to HTML, optionally saving it to disk.

    Args:
        path: Filesystem path of the PDF to read.
        save: When true, write the HTML to ``get_html_path(path)`` and
            return that path; otherwise return the HTML itself.

    Returns:
        The save path or the HTML text (see *save*). ``None`` when the
        PDF raises ``PDFSyntaxError`` or when fewer than two pages were
        processed; a message is printed in both cases.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfparser import PDFSyntaxError

    # works with PDFMiner version 20140328
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec="utf-8",
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    pagecount = 0
    # The with-block closes the PDF on every path, including the early
    # return and exceptions other than PDFSyntaxError.
    with open(path, "rb") as fp:
        # maxpages=0 ensures all pages are processed.
        pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
        try:
            for page in pages:
                pagecount += 1
                interpreter.process_page(page)
        except PDFSyntaxError:
            print("Invalid PDF %s" % (path,))
            return

    device.close()
    text = retstr.getvalue()
    retstr.close()

    # NOTE(review): this also rejects single-page documents — confirm
    # that is intended.
    if pagecount < 2:
        print("No content! %s" % (path,))
        return

    if save:
        savepath = get_html_path(path)
        # Explicitly closed so the HTML is flushed to disk before the
        # path is handed back to the caller.
        with open(savepath, "w") as out:
            out.write(text)
        return savepath
    return text
def extract_price_from_pdf(file_name):
    """Extract the prices found in a PDF, ordered by vertical position.

    The PDF is rendered to HTML into a ``TextReciver`` sink. Lines that
    carry position information and end with a price are located with a
    regular expression, together with directly following ``<br>`` lines
    that carry a price but no new position.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A tuple of prices (floats) sorted by their ``top:`` pixel position;
        empty when no price is found.
    """
    # Input options.
    password = ''
    pagenos = set()
    maxpages = 0
    # Output options.
    rotation = 0
    caching = True

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                           laparams=LAParams(), imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block closes the PDF even if a page fails to parse.
    with open(file_name, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()

    # Raw strings keep the patterns identical for the regex engine while
    # avoiding invalid-escape warnings for "\.".
    matches = re.finditer(r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
                          r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
                          outfp.text)
    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')
        # Number directly in front of "px" in the last "top:<n>px" of the
        # first line; found by searching the reversed match.
        top_decl = re.findall(r'.*top:[0-9]+px', line_group[0])[0]
        ypos = re.findall(r'[0-9]+', top_decl[::-1])[0][::-1]
        for i, price in enumerate(line_group):
            if len(price):
                # Searching the reversed line yields the last price on it.
                p = float(re.findall(r'[0-9]{1,2}\.[0-9]{1,2}',
                                     price[::-1])[0][::-1])
                # Nudge the position per line so order is preserved.
                ypos = int(ypos) + i
                pos_list.append((ypos, p))

    pos_list.sort()
    # Equivalent to the second element of zip(*pos_list), but also
    # well-defined (an empty tuple) when no price was found.
    price_list = tuple(p for _, p in pos_list)
    return price_list
def get_html(self, path):
    """Return the HTML rendering of a PDF instead of its plain text.

    Args:
        path: Path of the PDF file; ``.pdf`` is appended when the path
            does not already end with it.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    if path[-4:] != ".pdf":
        path = path + ".pdf"

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() replaces the Python-2-only file(); the with-block closes the
    # handle even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    result = retstr.getvalue()
    retstr.close()
    return result
def convert_pdf_to_html(url):
    """Download a PDF from *url* and convert it to an HTML string.

    A HEAD request is issued first; the document is downloaded and
    converted only when the response's content type contains
    ``application/pdf``.

    Args:
        url: Address of the document.

    Returns:
        The HTML text, or ``None`` when the URL does not serve a PDF.
    """
    r = requests.head(url)
    # .get() avoids a KeyError when the server sends no Content-Type
    # header; such responses are treated as "not a PDF".
    if 'application/pdf' not in r.headers.get("content-type", ""):
        return None

    r = requests.get(url)

    # Wrap the downloaded content in a file-like object.
    from StringIO import StringIO
    memory_file = StringIO(r.content)

    # Parser bound to the in-memory file, and the document object that
    # stores the document structure.
    parser = PDFParser(memory_file)
    document = PDFDocument(parser)

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)

    device.close()
    html = retstr.getvalue()  # renamed from ``str``: do not shadow builtin
    retstr.close()
    return html
class PDF2Txt:
    """Convert a PDF file to a text, XML or HTML file with pdfminer."""

    def __init__(self, pdffile, outfile, output_type='text'):
        """Store the conversion settings and silence pdfminer debugging.

        Args:
            pdffile: Path of the PDF file to read.
            outfile: Path of the output file to create.
            output_type: One of ``'text'``, ``'xml'`` or ``'html'``.
        """
        PDFDocument.debug = 0
        PDFParser.debug = 0
        CMapDB.debug = 0
        PDFResourceManager.debug = 0
        PDFPageInterpreter.debug = 0
        PDFDevice.debug = 0
        self.rsrcmgr = PDFResourceManager(caching=True)
        self.outtype = output_type
        self.outfile = outfile
        self.pdffile = pdffile

    def convert(self):
        """Run the conversion and write the result to ``self.outfile``.

        Prints a message and exits the process with status -1 when the
        output type is not supported.
        """
        # Validate before touching the output file: it used to be created
        # (or truncated) and left open even for unsupported formats.
        if self.outtype not in ('text', 'xml', 'html'):
            print('Formato de salida no soportado')
            sys.exit(-1)

        # Both handles are closed by the with-block, even on error.
        with open(self.outfile, 'w') as outfp, \
                open(self.pdffile, 'rb') as fp:
            if self.outtype == 'text':
                self.device = TextConverter(self.rsrcmgr, outfp,
                                            codec='utf-8',
                                            laparams=LAParams(),
                                            imagewriter=None)
            elif self.outtype == 'xml':
                self.device = XMLConverter(self.rsrcmgr, outfp,
                                           codec='utf-8',
                                           laparams=LAParams(),
                                           imagewriter=None)
            else:
                self.device = HTMLConverter(self.rsrcmgr, outfp,
                                            codec='utf-8', scale=1,
                                            layoutmode='normal',
                                            laparams=LAParams(),
                                            imagewriter=None)

            interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
            pagenos = set()
            for page in PDFPage.get_pages(fp, pagenos, caching=True,
                                          check_extractable=True):
                page.rotate = (page.rotate) % 360
                interpreter.process_page(page)
            # Close the device while the output file is still open.
            self.device.close()

        print("Archivo %s creado en base a %s"
              % (self.outfile, self.pdffile))
class PDFHandler(object):
    """Read page counts and page contents from PDF files with pdfminer.

    Every reading method accepts either a file name or an already opened
    file-like object (``fobj``, e.g. a StringIO).  A passed-in ``fobj`` is
    never closed; it is rewound to offset 0 after processing instead.
    """

    def __init__(self):
        # debug option
        self.setdebug(0)
        # Default page selection: only the first page (zero-based index 0).
        self.pagenos = set([0])
        self.pageno = 1
        self.outfp = stdmodel()
        self.codec = 'utf-8'
        self.showpageno = True
        self.scale = 1
        self.password = ''
        self.maxpages = 0
        self.rotation = 0
        self.imagewriter = None
        self.laparams = LAParams()
        self.layoutmode = 'normal'
        # ResourceManager facilitates reuse of shared resources such as fonts
        # and images so that large objects are not allocated multiple times.
        # Caching is disabled because the original author noted problems
        # when it was left at its default of True.
        self.caching = False
        self.rsrcmgr = PDFResourceManager(caching=self.caching)
        # Main converters for PDF content: plain text and HTML.
        self.device = self._new_text_device()
        self.htmldevice = self._new_html_device()

    def _new_text_device(self):
        """Build a TextConverter from the current settings."""
        return TextConverter(self.rsrcmgr, self.outfp, codec=self.codec,
                             laparams=self.laparams,
                             imagewriter=self.imagewriter)

    def _new_html_device(self):
        """Build an HTMLConverter from the current settings."""
        return HTMLConverter(self.rsrcmgr, self.outfp, codec=self.codec,
                             scale=self.scale, layoutmode=self.layoutmode,
                             laparams=self.laparams,
                             imagewriter=self.imagewriter)

    def _open(self, fname, fobj):
        """Return ``fobj`` when it is given, else open ``fname`` as binary."""
        if fobj:
            return fobj
        return open(fname, 'rb')

    @staticmethod
    def _release(fp, fobj):
        """Rewind a caller-supplied file object; close a file opened here."""
        if fobj:
            fp.seek(0)
        else:
            fp.close()

    def _extract(self, fname, pages, html, fobj):
        """Convert the selected pages and return the collected output.

        ``pages`` is an iterable of 1-based page numbers, or ``None`` for all
        pages.  Returns ``""`` when anything goes wrong during conversion.
        """
        fp = self._open(fname, fobj)
        try:
            # An empty set is what the "all pages" method passes to pdfminer.
            if pages is None:
                wanted = set()
            else:
                wanted = set(i - 1 for i in pages)
            device = self.htmldevice if html else self.device
            interpreter = PDFPageInterpreter(self.rsrcmgr, device)
            for page in PDFPage.get_pages(fp, wanted,
                                          maxpages=self.maxpages,
                                          password=self.password,
                                          caching=self.caching,
                                          check_extractable=False):
                page.rotate = (page.rotate + self.rotation) % 360
                interpreter.process_page(page)
            return self.outfp.get()
        except Exception:
            # Best effort: callers receive an empty string on any failure.
            return ""
        finally:
            # Always clear the shared output buffer and release the input.
            try:
                self.outfp.reset()
            finally:
                self._release(fp, fobj)

    def reset(self, html=False):
        '''Recreate the resource manager and one converter.

        With ``html`` true the HTML converter is closed and rebuilt,
        otherwise the text converter.'''
        self.rsrcmgr = PDFResourceManager(caching=self.caching)
        if html:
            self.htmldevice.close()
            self.htmldevice = self._new_html_device()
        else:
            self.device.close()
            self.device = self._new_text_device()

    def setdebug(self, value):
        '''Set the debug level, especially during initialisation.

        The level is stored on the instance and on the PDFResourceManager and
        PDFPageInterpreter classes.  Debug flags of PDFDocument, PDFParser,
        CMapDB and PDFDevice are not modified here.'''
        # The original always stored 0 here and ignored ``value``.
        self.debug = value
        PDFResourceManager.debug = self.debug
        PDFPageInterpreter.debug = self.debug

    def GetPageNumber(self, fname, fobj=None):
        '''Get the total number of pages of a PDF; 0 when it cannot be read.'''
        fp = self._open(fname, fobj)
        try:
            pageno = 0
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password=self.password,
                                          caching=self.caching,
                                          check_extractable=False):
                pageno += 1
            return pageno
        except Exception as e:
            print(e)
            print("Error Reading PDF page number..")
            return 0
        finally:
            self._release(fp, fobj)

    def FastCheck(self, fname, fobj=None):
        '''Fast check: True when reading the first page raises no error.'''
        fp = self._open(fname, fobj)
        try:
            for page in PDFPage.get_pages(fp, set([0]), maxpages=1,
                                          password=self.password,
                                          caching=self.caching,
                                          check_extractable=False):
                break
            return True
        except Exception:
            print("Error Reading PDF page number.. %s" % (fname,))
            return False
        finally:
            self._release(fp, fobj)

    def GetSinglePage(self, fname, pageno=1, html=False, fobj=None):
        '''Get the contents of a single page (1-based, default first page).

        Returns the converted string, or "" on failure.'''
        return self._extract(fname, [pageno], html, fobj)

    def GetPages(self, fname, pagenos=(1,), html=False, fobj=None):
        '''Get the contents of several pages (1-based, default first page).

        Returns the converted string, or "" on failure.'''
        return self._extract(fname, pagenos, html, fobj)

    def GetAllPages(self, fname, html=False, fobj=None):
        '''Get the contents of all pages.

        Returns the converted string, or "" on failure.'''
        return self._extract(fname, None, html, fobj)
def readPDF2HTML(pdfFile, opts=None):
    """Convert an opened PDF file to HTML and return the HTML content.

    Args:
        pdfFile: File-like object whose ``read()`` returns the PDF data.
        opts: Optional iterable of ``(flag, value)`` pairs (a mapping of
            flag to value is accepted as well).  Flags that are honoured:
            ``-p`` comma-separated 1-based page numbers, ``-m`` maximum
            number of pages, ``-P`` password, ``-n`` disable layout
            analysis, ``-A``/``-V``/``-M``/``-L``/``-W``/``-F`` layout
            parameters, ``-Y`` layout mode, ``-c`` codec, ``-s`` scale.
            ``-d``, ``-o``, ``-O`` and ``-t`` are accepted and ignored.

    Returns:
        The content collected from the HTML converter.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # Defaults are set before the options are parsed so that an option can
    # override them (previously codec and pagenos were reset afterwards and
    # '-d' / '-p' touched names that were not defined yet).
    pagenos = set()
    maxpages = 0
    password = ''
    codec = 'utf-8'
    scale = 1
    layoutmode = 'normal'
    laparams = LAParams()

    if opts is None:
        pairs = ()
    elif hasattr(opts, 'items'):
        # Iterating a mapping directly would yield only its keys.
        pairs = opts.items()
    else:
        pairs = opts

    float_layout_flags = {
        '-M': 'char_margin',
        '-L': 'line_margin',
        '-W': 'word_margin',
        '-F': 'boxes_flow',
    }
    for k, v in pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            laparams = None
        elif k == '-Y':
            layoutmode = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
        elif laparams is not None:
            # Layout flags are skipped once '-n' has disabled layout analysis.
            if k == '-A':
                laparams.all_texts = True
            elif k == '-V':
                laparams.detect_vertical = True
            elif k in float_layout_flags:
                setattr(laparams, float_layout_flags[k], float(v))

    # Keep the PDF data and the generated HTML in memory.
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    try:
        # Build the document first to check that extraction is allowed.
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # The resource manager stores shared resources.
        rsrcmgr = PDFResourceManager()
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each selected page of the document.
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password):
            interpreter.process_page(page)

        # Close the device before reading the buffer so that anything the
        # converter writes on close is part of the returned content.
        device.close()
        return retstr.getvalue()
    finally:
        fp.close()
        retstr.close()
def pdf_gettext(filepath, reserve):
    """Extract the text of the first and the last page of a PDF.

    Each of the two pages is rendered to HTML, written to ``firstout.html``
    or ``lastout.html`` in the current working directory, and that file is
    then handed to ``html_textparser`` together with the result list.

    Args:
        filepath: Path of the PDF file to read.
        reserve: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A ``(first, last)`` tuple of the lists passed to ``html_textparser``.
        ``last`` stays empty when the document has fewer than two pages.

    Raises:
        Exception: If the document does not allow text extraction.
    """
    import tempfile

    codec = 'utf-8'
    firstout = 'firstout.html'
    lastout = 'lastout.html'
    first = []
    last = []

    rsrcmgr = PDFResourceManager(caching=True)
    with tempfile.TemporaryFile(mode='w+t', encoding=codec) as outfp:
        device = HTMLConverter(rsrcmgr, outfp, scale=1, layoutmode='normal',
                               laparams=LAParams())
        try:
            with open(filepath, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(caching=True)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize('')
                if not doc.is_extractable:
                    raise Exception(
                        'Text extraction is not allowed: %s' % filepath)

                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # Walk all pages once, remembering the first and the last.
                firstpage = None
                lastpage = None
                for page in doc.get_pages():
                    if firstpage is None:
                        firstpage = page
                    else:
                        lastpage = page

                def dump_page(page, outname, sink):
                    # Render one page, copy the buffer to outname, parse it.
                    interpreter.process_page(page)
                    outfp.seek(0)
                    with open(outname, 'w', encoding=codec) as f:
                        f.write(outfp.read())
                    html_textparser(outname, sink)

                if firstpage is not None:
                    # The buffer is not cleared first, so whatever the
                    # converter wrote on construction stays in firstout.
                    dump_page(firstpage, firstout, first)
                if lastpage is not None:
                    # truncate() does not move the stream position; without
                    # the seek the next write would land at the old offset
                    # and the gap would be filled with NUL characters.
                    outfp.seek(0)
                    outfp.truncate()
                    dump_page(lastpage, lastout, last)
        finally:
            # Close the device while the temporary file is still open.
            device.close()
    return first, last