def convert(url, pages=None):
    """Download the PDF at *url* and return the text pdfminer extracts from it.

    Args:
        url: Address of the PDF document; must be a ``basestring``.
        pages: Optional list of page numbers forwarded to
            ``PDFPage.get_pages``. ``None`` or an empty list selects all pages.

    Returns:
        The contents of the output buffer filled by ``TextConverter``
        (configured with ``codec='utf-8'``).

    Raises:
        AssertionError: If *url* is not a string, or *pages* is neither
            ``None`` nor a list.
    """
    assert isinstance(url, basestring)
    assert pages is None or isinstance(pages, list)

    rscmng = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rscmng, retstr, codec='utf-8', laparams=LAParams())
    web_page = None
    fp = None
    try:
        web_page = urllib2.urlopen(urllib2.Request(url))
        # The whole download is buffered in memory and that buffer is what
        # gets handed to pdfminer.
        fp = StringIO(web_page.read())
        interpreter = PDFPageInterpreter(rscmng, device)
        pdf_pages = PDFPage.get_pages(
            fp,
            set(pages or ()),
            maxpages=0,
            password='',
            caching=True,
            check_extractable=True,
        )
        for page in pdf_pages:
            interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Release everything even when the download or the parsing fails.
        if fp is not None:
            fp.close()
        if web_page is not None:
            web_page.close()
        device.close()
        retstr.close()
def run(path):
    """Parse the PDF at *path* into a ``Book`` holding one ``Page`` per PDF page.

    Progress and the elapsed ``time.clock()`` time are printed to stdout.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The populated ``Book``; each appended ``Page`` has its ``text``
        attribute set to the text produced for that page.
    """
    print("Calling parser :%s" % path)
    t0 = time.clock()
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    book = Book()
    consumed = 0  # length of the shared buffer before the current page
    try:
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                page_tmp = Page()
                interpreter.process_page(page)
                whole = retstr.getvalue()
                # Keep only what this page appended, minus its final
                # character (presumably the page separator -- TODO confirm).
                page_tmp.text = whole[consumed:-1]
                consumed = len(whole)
                book.pages.append(page_tmp)
    finally:
        device.close()
        retstr.close()
    print("Parsing in: %s" % (time.clock() - t0))
    return book
def Parse(self):
    """Load the raw layout data for ``self.pdfFileName`` and build the result.

    A pickle cache named ``<pdf basename>.cache`` inside ``parseCacheDir`` is
    used when it exists, is non-empty and is newer than the PDF. Otherwise the
    first page of the PDF is analysed with pdfminer and the resulting
    ``self.RawData`` is written to that cache file.

    Returns:
        A tuple ``(self.effectiveFrom, self.__getResult())``.
    """
    # First check whether a cache exists and whether it is recent enough.
    if not os.path.exists(parseCacheDir):
        os.makedirs(parseCacheDir)
    cacheFile = os.path.join(
        parseCacheDir, os.path.basename(self.pdfFileName) + '.cache')
    foundCache = (os.path.isfile(cacheFile)
                  and os.path.getsize(cacheFile) > 0
                  and os.path.getmtime(cacheFile)
                  > os.path.getmtime(self.pdfFileName))

    if foundCache:
        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # safe while the cache directory is writable by trusted users alone.
        with open(cacheFile, 'rb') as fp:
            self.RawData = pickle.load(fp)
    else:
        with open(self.pdfFileName, 'rb') as fp:
            for page in PDFPage.get_pages(fp, None, maxpages=1):
                rsrcmgr = PDFResourceManager()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                layout = device.get_result()
                self.__readobj(layout._objs)
                # Flip every category using the page height (bbox[3]).
                for category in self.RawData.values():
                    self.__reverseYaxis(category, layout.bbox[3])
        with open(cacheFile, 'wb') as cacheFp:
            pickle.dump(self.RawData, cacheFp)

    self.__calculateBoundary()
    self.__assignCharsAndLinesToCell()
    self.__processCells()
    return (self.effectiveFrom, self.__getResult())
def parse_pdf_pdfminer(self, f, fpath):
    """Extract text page by page from the open PDF *f* and parse each page.

    The handler's header is printed first, every page's text is passed to
    ``self.parse_page`` together with its 1-based page number, and the
    handler's footer is printed last. Any ordinary exception is reported via
    ``self.handler.print_error``; ``KeyboardInterrupt`` and ``SystemExit``
    propagate.
    """
    try:
        layout_params = LAParams()
        layout_params.all_texts = True
        resources = PDFResourceManager()
        wanted_pages = set()

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)
        all_pages = PDFPage.get_pages(f, wanted_pages, check_extractable=True)
        for page_num, page in enumerate(all_pages, 1):
            # A fresh buffer and converter per page keeps page texts separate.
            page_buffer = StringIO()
            converter = TextConverter(resources, page_buffer,
                                      laparams=layout_params)
            PDFPageInterpreter(resources, converter).process_page(page)
            data = page_buffer.getvalue()
            page_buffer.close()
            self.parse_page(fpath, data, page_num)
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
def pdf_to_text(pdfname):
    """Return the text of the PDF file *pdfname*, encoded by pdfminer as ASCII.

    Args:
        pdfname: Filesystem path of the PDF to read.

    Returns:
        The contents of the output buffer after every page was processed.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from cStringIO import StringIO

    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    codec = 'ascii'  # the alternative considered here was 'utf-8'
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text; the context manager closes the file even on failure.
    with open(pdfname, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # Get text from StringIO
    text = sio.getvalue()

    # Cleanup
    device.close()
    sio.close()
    return text
def pdfconvert(infullpath, file, outfullpath, pages=None):
    """Write the text of a PDF to a ``.txt`` file and render it to ``.jpg``.

    Args:
        infullpath: Path of the PDF to read.
        file: Not used by this function; kept for caller compatibility.
        outfullpath: Output path; its extension is replaced by ``.txt`` for
            the extracted text and by ``.jpg`` for the rendered image.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``; a falsy value selects every page.
    """
    # Handle PDF
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    with open(infullpath, 'rb') as pdffile:
        for page in PDFPage.get_pages(pdffile, pagenums):
            interpreter.process_page(page)
    converter.close()

    base = os.path.splitext(outfullpath)[0]
    jpgfile = base + '.jpg'
    txtfile = base + '.txt'
    # NOTE(review): the previous code called string.replace(txtfile, ...) for
    # ' ', '(' and ')' but discarded the results, so the name was never
    # changed. Those no-ops are removed here; decide whether sanitising the
    # file name is actually wanted before re-introducing it.

    text = output.getvalue()
    output.close()  # was `output.close` (attribute access, never called)

    with open(txtfile, 'w') as temp:
        temp.write(text)

    # NOTE(review): the command is built by string concatenation and run
    # through the shell; a path containing '"' breaks the quoting.
    imagemagick_string = 'convert ' + '"' + infullpath + '" "' + jpgfile + '"'
    os.system(imagemagick_string)
def pdf2xml(infile):
    '''
    Convert the PDF file handle *infile* to an XML string via pdfminer.

    The handle is closed before returning, and every newline is removed
    from the generated XML.
    '''
    xml_buffer = StringIO()

    # Empirically determined...
    layout_params = LAParams()
    layout_params.char_margin = 0.4

    # See pdf2txt.py
    resources = PDFResourceManager(caching=False)
    converter = XMLConverter(resources, xml_buffer, codec='utf-8',
                             laparams=layout_params)
    interpreter = PDFPageInterpreter(resources, converter)

    if not page_api:
        # Fall back to the process_pdf helper when the page API is not in use.
        process_pdf(resources, converter, infile, set())
    else:
        for page in PDFPage.get_pages(infile, set()):
            interpreter.process_page(page)

    infile.close()
    xml_text = xml_buffer.getvalue()
    return xml_text.replace("\n", "")
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """
    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager, return_string, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(pdf_filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set()):
            interpreter.process_page(page)

    device.close()
    extracted_text = return_string.getvalue()
    return_string.close()
    return extracted_text
def extract_text(self):
    """Extract the text of ``self.local_file`` and store it in ``self.text``.

    The PDF is read fully into memory, processed page by page with a
    pdfminer ``TextConverter`` (``codec='utf-8'``), and the resulting bytes
    are decoded as UTF-8 before being assigned to ``self.text``.
    """
    # Read through a context manager so the handle is closed immediately
    # (the previous `file(...).read()` never closed it).
    with open(self.local_file, 'rb') as pdf_file:
        pdf_stream = io.BytesIO(pdf_file.read())

    resource_manager = PDFResourceManager(caching=True)
    output_stream = io.BytesIO()
    device = TextConverter(
        resource_manager,
        output_stream,
        codec='utf-8',
        laparams=LAParams(),
    )
    interpreter = PDFPageInterpreter(resource_manager, device)
    pages = PDFPage.get_pages(
        pdf_stream,
        set(),
        maxpages=0,
        caching=True,
        check_extractable=True,
    )
    for page in pages:
        interpreter.process_page(page)

    self.text = output_stream.getvalue().decode('utf8')
def pdfconvert(infullpath, file, infolder, pages=None):
    """Write the text of a PDF into the corpus folder and render it to JPEG.

    Args:
        infullpath: Path of the PDF to read.
        file: Base name used for both output files.
        infolder: Prefix prepended to ``file`` to build the ``.jpg`` path.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``; a falsy value selects every page.

    Returns:
        The path of the ``.jpg`` file passed to the ``convert`` command.
    """
    # Handle PDF
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    with open(infullpath, 'rb') as pdffile:
        for page in PDFPage.get_pages(pdffile, pagenums):
            interpreter.process_page(page)
    converter.close()

    jpgfile = infolder + str(file) + '.jpg'
    # corpuspath / corpusfolder are module-level settings.
    txtfile = corpuspath + corpusfolder + '/' + file + '.txt'

    text = output.getvalue()
    output.close()  # was `output.close` (attribute access, never called)

    with open(txtfile, 'w') as temp:
        temp.write(text)

    # NOTE(review): the command is built by string concatenation and run
    # through the shell; a path containing '"' breaks the quoting.
    imagemagick_string = 'convert ' + '"' + infullpath + '" "' + jpgfile + '"'
    os.system(imagemagick_string)
    return jpgfile
def pdf_from_url_to_txt(url, maxpages=0):
    """Download the PDF at *url* and return its text as extracted by pdfminer.

    Args:
        url: Address of the PDF document.
        maxpages: Forwarded to ``PDFPage.get_pages`` as ``maxpages``
            (default ``0``).

    Returns:
        The contents of the output buffer filled by ``TextConverter``
        (configured with ``codec='utf-8'``).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())

    # Open the url provided as an argument and read the whole content; the
    # response is closed explicitly (it was previously left open).
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        payload = response.read()
    finally:
        response.close()

    # Wrap the downloaded data in a file-like object for pdfminer.
    fp = StringIO(payload)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                      password="", caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    finally:
        fp.close()

    device.close()
    string = retstr.getvalue()
    retstr.close()
    return string
def get_pdf_text(path):
    """
    Read a PDF file and return the ``pages`` dict handed to ``TextConverter``.

    The dict is created empty here and passed to the converter through its
    ``pages=`` keyword together with ``showpageno=True``; presumably the
    converter in use fills it with text keyed by page number (TODO confirm
    against the converter implementation).

    http://stackoverflow.com/a/20905381
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # change to utf-8 if the text comes out garbled
    codec = 'ascii'
    pages = {}
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams(),
                           showpageno=True, pages=pages)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    retstr.close()
    return pages
def convert_pdf_to_txt(path):
    """Return the text of the PDF at *path*, limited to ``maxpages=120``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the output buffer filled by ``TextConverter``
        (configured with ``codec='utf-8'``).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=120, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    # Capture the text first so that a failing cleanup cannot lose it.
    text = retstr.getvalue()
    try:
        device.close()
        retstr.close()
    except Exception:
        # Cleanup stays best-effort, as before, but no longer swallows
        # KeyboardInterrupt/SystemExit the way the bare `except:` did.
        pass
    return text
def convert_pdf_to_txt(path):
    """
    Converts PDF to text using the pdfminer library

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the output buffer filled by ``TextConverter``
        (configured with ``codec="utf-8"``).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(path, "rb") as file_handle:
        pages = PDFPage.get_pages(
            file_handle,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in pages:
            interpreter.process_page(page)

    text = retstr.getvalue()
    device.close()
    retstr.close()
    return text
def convert_pdf_to_txt(path):
    """Return the text of the PDF at *path* as extracted by pdfminer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the output buffer filled by ``TextConverter``
        (configured with ``codec='utf-8'``).
    """
    ## TAKEN FROM STACK OVERFLOW
    ## see... http://www.unixuser.org/~euske/python/pdfminer/programming.html for tutorial
    ## Also see... https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()

    # Read text from pages
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The context manager closes the PDF even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()  # renamed from `str`, which shadowed the builtin
    device.close()
    retstr.close()
    return text
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from pdf file.
    <PDFMiner even though more low-level but pretty good tool to read pdfs>

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- a text extracted from pdf

    """
    # initializing objects
    res_manager = PDFResourceManager()
    strio = StringIO()
    device = TextConverter(res_manager, strio, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(res_manager, device)

    # 'rb' mode reads the PDF as binary; the context manager closes the file
    # even if a page fails to parse.
    with open(pdf, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    # finishing up
    device.close()
    text = strio.getvalue()
    strio.close()
    return text
def get_layout(path):
    '''Return a list with one pdfminer layout result per page of *path*.

    Each element is whatever ``PDFPageAggregator.get_result()`` yields after
    the corresponding page was processed, in page order.
    '''
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    layout = []

    # The context manager closes the PDF even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
            layout.append(device.get_result())

    device.close()
    return layout
def convert_pdf_to_txt(self, path):
    """
    A very simple conversion function
    which returns text for parsing from PDF.

    path = The path to the file

    Returns the extracted text, or an empty string when anything goes
    wrong; in that case the failure is reported through ``self.logger``.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the PDF even if a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
        device.close()
        retstr.close()
        return text
    except Exception as e:
        # Log before returning: the log call used to sit after the `return`
        # and was therefore never executed.
        self.logger.error(
            "Failed to PDF to text: " + str(e))
        return ""
def convert_pdf_to_txt(path, output):
    """Extract the text of the PDF at *path*, write it to *output*, return it.

    Args:
        path: Filesystem path of the PDF to read.
        output: Path of the file the text is written to (opened in ``'wb'``).

    Returns:
        The contents of the output buffer filled by ``TextConverter``
        (configured with ``codec='utf-8'``).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Context managers close both files even when an error occurs.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()
    device.close()
    retstr.close()

    with open(output, 'wb') as f:
        f.write(text)
    return text
def pdf2txt(path):
    '''
    Converts a given PDF to plain text in UTF8.

    Returns the extracted text, or None when the conversion fails for any
    ordinary reason (missing file, unparsable PDF, ...).
    '''
    try:
        rsrcMgr = PDFResourceManager()
        retStr = StringIO()
        device = TextConverter(rsrcMgr, retStr, codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcMgr, device)
        # The context manager closes the PDF even if a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        device.close()
        text = retStr.getvalue()
        retStr.close()
        return text
    except Exception:
        # `except Exception` (instead of a bare `except:`) lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* into a sibling ``.txt`` file.

    The output path is *path* with its extension replaced by ``.txt``; it is
    printed to stdout before the file is written. Nothing is returned.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Context managers close both files even when an error occurs.
    with open(path, "rb") as fp:
        pages = PDFPage.get_pages(
            fp,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in pages:
            interpreter.process_page(page)

    text = retstr.getvalue()
    device.close()
    retstr.close()

    outputFile = os.path.splitext(path)[0] + ".txt"
    print(outputFile)
    with open(outputFile, "w") as ff:
        ff.write(text)
def convert_pdf_to_text(pdf_path):
    """
    Given a path to a local PDF file, this function extracts text from it.

    A "Page N" progress indicator is printed to stdout while pages are
    processed. In the returned text every newline, tab, carriage return and
    form feed is replaced by a single space.
    """
    resource_manager = PDFResourceManager()
    output = StringIO.StringIO()
    laparams = LAParams(detect_vertical=True)
    device = TextConverter(
        resource_manager,
        output,
        codec='utf-8',
        laparams=laparams
    )
    interpreter = PDFPageInterpreter(resource_manager, device)

    # The context manager closes the PDF (it used to be left open).
    with open(pdf_path, 'rb') as file_handler:
        for idx, page in enumerate(PDFPage.get_pages(file_handler)):
            # '\r' rewinds the cursor so the counter overwrites itself.
            print("Page " + str(idx + 1), end='\r')
            sys.stdout.flush()
            interpreter.process_page(page)
    print()

    data = output.getvalue()
    for separator in ('\n', '\t', '\r', '\x0c'):
        data = data.replace(separator, ' ')
    return data
def pdf_to_text(pdf):
    """Extract the text of *pdf*, print it, and return it.

    *pdf* is either a path (a ``str``, which is opened in binary mode) or
    an already open file-like object; in both cases the stream is closed
    before returning. The result is the raw content of a ``BytesIO`` buffer
    filled by ``TextConverter`` with ``codec='utf-8'``.
    """
    # output option
    extra_rotation = 0
    use_cache = True

    resources = PDFResourceManager(caching=use_cache)
    out_buffer = BytesIO()
    converter = TextConverter(resources, out_buffer, codec='utf-8',
                              laparams=LAParams())

    stream = open(pdf, 'rb') if isinstance(pdf, str) else pdf
    interpreter = PDFPageInterpreter(resources, converter)
    page_iter = PDFPage.get_pages(stream, set(), maxpages=0,
                                  caching=use_cache, check_extractable=True)
    for page in page_iter:
        page.rotate = (page.rotate + extra_rotation) % 360
        interpreter.process_page(page)
    stream.close()
    converter.close()

    result = out_buffer.getvalue()
    print(result)
    return result
def extract_pdf_page(filename, page_number_or_numbers):
    """Extract the requested page(s) of a PDF file as XML using PDFMiner.

    ``page_number_or_numbers`` is either a single page number or an iterable
    of page numbers. The XML produced by ``XMLConverter`` (``codec='utf-8'``)
    is returned.
    """
    # This code adapted from pdf2txt.py which is part of PDFMiner.
    # Here's the command line version of the code below --
    #   pdf2txt.py -p 1 -o expected.xml sample.pdf
    wanted = (page_number_or_numbers
              if is_iterable(page_number_or_numbers)
              else [page_number_or_numbers])

    xml_buffer = StringIO.StringIO()
    layout_params = LAParams()
    resources = PDFResourceManager()
    converter = XMLConverter(resources, xml_buffer, codec='utf-8',
                             laparams=layout_params)

    with open(filename, 'rb') as pdf_file:
        interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.get_pages(pdf_file, wanted):
            interpreter.process_page(page)

    converter.close()
    xml = xml_buffer.getvalue()
    xml_buffer.close()
    return xml
def __convert(self, ifile, ofile=None):
    """Convert the PDF *ifile* to text using the options stored on ``self``.

    Args:
        ifile: Path of the PDF to read.
        ofile: Optional path of the text file to write. When ``None`` the
            text is collected in memory instead.

    Returns:
        The extracted text when *ofile* is ``None``; otherwise ``None``.
        ``PDFException`` and ``MemoryError`` raised while processing pages
        are reported on stdout and whatever was extracted so far is kept.
    """
    # Both streams are closed on every exit path, including unexpected errors.
    with open(ifile, 'rb') as fp:
        outfp = StringIO.StringIO() if ofile is None else open(ofile, 'wb')
        try:
            rsrcmgr = PDFResourceManager(caching=self.caching)
            device = TextConverter(rsrcmgr, outfp, codec=self.codec,
                                   laparams=self.laparams,
                                   imagewriter=self.imagewriter)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            try:
                for page in PDFPage.get_pages(fp, self.pagenos,
                                              maxpages=self.maxpages,
                                              password=self.password,
                                              caching=self.caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + self.rotation) % 360
                    interpreter.process_page(page)
            except (PDFException, MemoryError) as e:
                print("Could not extract text {0}".format(e))
            device.close()
            return outfp.getvalue() if ofile is None else None
        finally:
            outfp.close()
def pdf_to_txt(fichero_pdf, fichero_txt):
    """Write the text of the PDF *fichero_pdf* into the file *fichero_txt*.

    Args:
        fichero_pdf: Path of the PDF to read.
        fichero_txt: Path of the text file to create (opened in ``'w'``).

    Returns:
        Always ``1``.
    """
    # Settings used for the conversion
    password = ''
    pagenos = set()
    maxpages = 0
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    # Set up the resource manager
    rsrcmgr = PDFResourceManager(caching=caching)

    # Create the output file and attach the device that converts into it;
    # the context managers close both files even when an error occurs.
    with open(fichero_txt, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)

        # Interpret every page of the PDF through the device
        with open(fichero_pdf, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)

        # Close the device before the output file is closed
        device.close()
    return 1
def convert(fname, pages=None):
    """Return the text of the PDF *fname* as extracted by pdfminer.

    Args:
        fname: Filesystem path of the PDF to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``; a falsy value selects every page.

    Returns:
        The contents of the output buffer filled by ``TextConverter``.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # The context manager closes the PDF even if a page fails to parse.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)

    converter.close()
    text = output.getvalue()
    output.close()
    return text
def pdf2txt(self, lowerBorder=-1, upperBorder=-1):
    """
    Returns the plain text of the document.
    If lowerBorder is an int number > -1, only page
    referring to this number will be returned.
    If lowerBorder and upperBorder are >-1 and
    upperBorder > lowerBoder, the pages referring
    to that range will be returned.
    If upperBorder is the string "max" and lowerBorder > -1, every page
    from lowerBorder up to the end of the document is returned.

    Page numbers are compared against the index produced by ``enumerate``
    over the pages, i.e. they appear to be zero-based (TODO confirm).

    Raises:
        ValueError: If only upperBorder is given or lowerBorder > upperBorder.
    """
    # Resolve the page selection first so that an invalid request fails
    # before the PDF is opened (the file handle used to leak in that case).
    if (lowerBorder == -1 and upperBorder == -1) or \
            (lowerBorder > -1 and upperBorder == "max"):
        pagenos = set()
    elif lowerBorder > -1 and upperBorder == -1:
        # extract only a single page
        pagenos = set(range(lowerBorder, lowerBorder + 1))
    elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
        raise ValueError("illegal parameter passed")
    else:
        pagenos = set(range(lowerBorder, upperBorder + 1))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    with open(self.filename, 'rb') as fp:
        pages = PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                  caching=True, check_extractable=True)
        for pageno, page in enumerate(pages):
            # In "max" mode all pages are iterated; skip the leading ones.
            if pageno < lowerBorder and upperBorder == "max":
                continue
            interpreter.process_page(page)

    device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.decode('utf-8')
def pdf_to_txt(path, lowerBorder=-1, upperBorder=-1):
    """Return the text of the PDF at *path*, optionally limited to a page range.

    Args:
        path: Filesystem path of the PDF to read.
        lowerBorder: First page number of the range, or ``-1`` for no range.
        upperBorder: Last page number of the range (inclusive), or ``-1``
            for no range.

    Returns:
        The extracted text decoded from UTF-8. When both borders are ``-1``
        every page is included.

    Raises:
        ValueError: If exactly one border is ``-1`` or
            ``lowerBorder > upperBorder``.
    """
    # Resolve the page selection first so that an invalid request fails
    # before the PDF is opened (the file handle used to leak in that case).
    if lowerBorder == -1 and upperBorder == -1:
        pagenos = set()
    elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
        raise ValueError("illegal parameter passed")
    else:
        pagenos = set(range(lowerBorder, upperBorder + 1))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.decode('utf-8')
def convert_pdf_to_txt(path):
    """
    This function converts a .pdf file to text
    @path: file path to .pdf document
    from: http://stackoverflow.com/questions/26494211/
          extracting-text-from-a-pdf-file-using-pdfminer-in-python/26495057#26495057

    Returns the contents of the output buffer filled by ``TextConverter``
    (configured with ``codec='utf-8'``).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()
    device.close()
    retstr.close()
    return text
def conv_pdf2txt(path, pages=None):
    """Return the text of the PDF at *path*, encoded by pdfminer as ASCII.

    Args:
        path: Filesystem path of the PDF to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``; a falsy value selects every page.

    Returns:
        The contents of the output buffer filled by ``TextConverter``.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    codec = 'ascii'  # or 'utf-8'
    device = TextConverter(manager, output, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(path, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums, caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    txt = output.getvalue()
    device.close()
    output.close()
    return txt
def convert(fname, pages=None):
    """Return the text of the PDF *fname* as extracted by pdfminer.

    Args:
        fname: Filesystem path of the PDF to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``; a falsy value selects every page.

    Returns:
        The contents of the output buffer filled by ``TextConverter``.

    Raises:
        ValueError: If processing the pages fails for any ordinary reason.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # The context manager closes the PDF on success and on failure alike.
    with open(fname, 'rb') as infile:
        try:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        except Exception:
            # `except Exception` (instead of a bare `except:`) keeps
            # KeyboardInterrupt/SystemExit from being turned into ValueError.
            raise ValueError('cannot convert pdf to text')

    converter.close()
    text = output.getvalue()
    output.close()  # was `output.close` (attribute access, never called)
    return text
def convert_pdf_to_txt(path):
    """Return the text of the PDF at *path* as extracted by pdfminer.

    The ``TextConverter`` is created without an explicit ``codec``, so the
    library's default applies.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the output buffer filled by ``TextConverter``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()
    device.close()
    retstr.close()
    return text
def get_text(path):
    """Return the text of the PDF at *path* split on the separator ``'. '``.

    An empty list is returned when no text was extracted.
    """
    resources = PDFResourceManager()
    text_buffer = io.StringIO()
    text_converter = TextConverter(resources, text_buffer)
    interpreter = PDFPageInterpreter(resources, text_converter)

    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = text_buffer.getvalue()

    text_converter.close()
    text_buffer.close()

    # the returned list holding the text pieces
    if not text:
        return list()
    return text.split('. ')
def get_blurb():
    """Pick a random page of a random PDF under ``/pdfs`` and return a blurb.

    Returns:
        A tuple ``(link, text)`` where ``link`` is the chosen file path with
        ``'pdfs'`` replaced by ``'view'`` plus ``'#page=<index>'`` and
        ``text`` is the first 100 characters extracted from that page.
        ``('', '')`` is returned when there is no PDF, or the chosen PDF has
        no pages; a note is written to stderr in both cases.

    Raises:
        AssertionError: If the chosen document is not extractable.
    """
    pdfs = glob.glob('/pdfs/*')
    if not pdfs:
        sys.stderr.write('NO PDFS\n')
        return '', ''

    pdf = random.choice(pdfs)
    sys.stderr.write('pdf: %s\n' % pdf)
    with open(pdf, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        assert document.is_extractable
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = list(PDFPage.get_pages(f))
        if not pages:
            sys.stderr.write('NO PAGES\n')
            return '', ''
        # randrange excludes the upper bound; randint(0, len(pages)) could
        # return len(pages) and raise IndexError below.
        pnum = random.randrange(len(pages))
        interpreter.process_page(pages[pnum])
        txt = retstr.getvalue()
    return pdf.replace('pdfs', 'view') + '#page=' + str(pnum), txt[:100]
def convertPDFToText(path='test.pdf'):
    """Return the text of a PDF file as extracted by pdfminer.

    Args:
        path: Filesystem path of the PDF to read. Defaults to ``'test.pdf'``,
            the file this function was previously hard-wired to.

    Returns:
        The contents of the output buffer filled by ``TextConverter``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    device.close()
    string = retstr.getvalue()
    retstr.close()
    return string
def do_import(self, results, filepath):
    """Extract the text of the PDF *filepath* into ``results.investigation``.

    Every page is converted with pdfminer (``all_texts`` enabled,
    ``codec='utf-8'``) into one shared buffer, with a newline written after
    each page. The buffer content is then stored through
    ``results.investigation.update(import_text=...)``.
    """
    laparams = LAParams()
    laparams.all_texts = True
    rsrcmgr = PDFResourceManager()
    buff = StringIO()
    try:
        # One converter/interpreter pair serves all pages; they all write
        # into the same buffer anyway.
        device = TextConverter(rsrcmgr, buff, codec='utf-8',
                               laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the PDF even if a page fails to parse.
        with open(filepath, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                interpreter.process_page(page)
                buff.write("\n")
        results.investigation.update(import_text=buff.getvalue())
    finally:
        buff.close()
def convert_pdf(target_fn):
    '''
    Read the PDF file target_fn and return its text, decoded from UTF-8.

    The layout analysis runs with all_texts and detect_vertical enabled.
    '''
    layout_params = LAParams()
    layout_params.detect_vertical = True
    layout_params.all_texts = True

    resources = PDFResourceManager(caching=True)
    text_buffer = StringIO.StringIO()
    converter = TextConverter(resources, text_buffer, codec='utf-8',
                              laparams=layout_params, imagewriter=None)
    interpreter = PDFPageInterpreter(resources, converter)

    with open(target_fn, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file):
            interpreter.process_page(page)
    converter.close()

    # Rewind so the whole buffer is read back from the start.
    text_buffer.seek(0)
    raw = text_buffer.read()
    return raw.decode('utf-8')
def pdfparser(path: str) -> list:
    """
    Parse pdf file to list contains content grouped in tuples (xcor, ycor, text)

    Only ``LTTextBox`` layout objects are collected; for each one the tuple
    holds ``bbox[0]``, ``bbox[3]`` and the box's text.

    :param path: pdf file path
    :return: pdf content. list with tuples: (x cor, y cor, text).
        NOTE: when ``allowed_file(path)`` is false the string
        ``'Incorrect file.'`` is returned instead of a list.
    """
    if not allowed_file(path):
        return 'Incorrect file.'

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    page_content = []

    # The context manager closes the PDF (it used to be left open). Pages
    # are produced lazily, so they must be consumed inside this block.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            layout = device.get_result()
            page_content.extend(
                (lobj.bbox[0], lobj.bbox[3], lobj.get_text())
                for lobj in layout
                if isinstance(lobj, LTTextBox)
            )
    return page_content
def pdf_to_text(pdfname):
    """Extract per-page text of a PDF into a DataFrame with page numbers.

    Each page is converted separately and stripped of every character
    outside ``[A-Za-z0-9 \\n]``. Numbering starts at the first page whose
    last whitespace-separated token is all digits, longer than four
    characters and equal to 1 or 2 as an integer; that value becomes the
    page number and later pages count up from it. Pages before that point
    are skipped.

    Args:
        pdfname: Filesystem path of the PDF to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'page'`` and ``'text'``
        (empty when numbering never starts).
    """
    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()

    i = 0           # current page number; 0 means numbering has not started
    flag = False    # set once the starting page number was detected
    rows = []       # collected as a list; DataFrame.append was quadratic

    # Extract text; the context manager closes the PDF even on failure.
    with open(pdfname, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            sio = StringIO()
            device = TextConverter(rsrcmgr, sio, codec=codec,
                                   laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            text = re.sub('[^A-Za-z0-9 \n]+', '', sio.getvalue())
            # Close per page: closing only after the loop raised NameError
            # for a PDF without pages.
            device.close()
            sio.close()

            tokenized = text.split()
            if tokenized and tokenized[-1].isdigit() \
                    and len(tokenized[-1]) > 4:
                page_number = int(tokenized[-1])
                if not flag and page_number in (1, 2):
                    i = page_number
                    flag = True

            if i > 0:
                rows.append({'page': i, 'text': text})
                i += 1

    # dtype=object keeps the column types of the previous append-based build.
    return pd.DataFrame(rows, columns=['page', 'text'], dtype=object)
def pdf_extractor3(path, vectors=False):
    """Extract the text of every page of the PDF at *path*.

    Args:
        path: Filesystem path of the PDF to read.
        vectors: When true, additionally compute
            ``vectorizer(text, lang=detect(text))`` for every page.

    Returns:
        ``(Classified, creator, paragraph_repo)`` or, when *vectors* is
        true, ``(Classified, creator, paragraph_repo, vector)``.
        ``Classified`` is always ``False`` and ``creator`` always
        ``"Unknown"``; ``paragraph_repo`` and ``vector`` are dicts keyed by
        the page number as a string, starting at ``'1'``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    creator = "Unknown"
    Classified = False
    paragraph_repo = {}
    vector = {}

    # The context manager closes the PDF even if a page fails to parse.
    with open(path, 'rb') as fp:
        pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
        for current_page_number, page in enumerate(pages, 1):
            interpreter.process_page(page)
            text = retstr.getvalue()
            # truncate() does not move the stream position, so rewind first;
            # otherwise the next page is written after a gap that io.StringIO
            # fills with NUL characters.
            retstr.seek(0)
            retstr.truncate(0)
            # Still strip NULs in case the document itself contains them.
            text = re.sub(u'(\u0000)', "", text)
            key = str(current_page_number)
            paragraph_repo[key] = text
            if vectors:
                vector[key] = vectorizer(text, lang=detect(text))

    device.close()
    retstr.close()
    if vectors:
        return Classified, creator, paragraph_repo, vector
    return Classified, creator, paragraph_repo
def get_page_analysis(infile, pageno, pscript5_mode):
    """Run the ``TextPositionTracker`` over one page and return its result.

    ``infile`` is opened in binary mode, the page selected by ``pageno`` is
    processed, and ``dev.get_result()`` is returned. When ``pscript5_mode``
    is true, three ``PDFType3Font`` metric methods are patched with their
    PScript5 variants for the duration of the processing.
    ``PDFTextExtractionNotAllowed`` is re-raised as ``EncryptedPdfError``.
    """
    rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)

    # Workaround for https://github.com/pdfminer/pdfminer.six/issues/395
    # applies only to versions before 20200402.
    disable_boxes_flow = 2 if pdfminer.__version__ < '20200402' else None

    layout_params = LAParams(
        all_texts=True,
        detect_vertical=True,
        boxes_flow=disable_boxes_flow,
    )
    dev = TextPositionTracker(rman, laparams=layout_params)
    interp = pdfminer.pdfinterp.PDFPageInterpreter(rman, dev)

    patcher = None
    if pscript5_mode:
        patcher = patch.multiple(
            'pdfminer.pdffont.PDFType3Font',
            spec=True,
            get_ascent=PDFType3Font__PScript5_get_ascent,
            get_descent=PDFType3Font__PScript5_get_descent,
            get_height=PDFType3Font__PScript5_get_height,
        )
        patcher.start()

    try:
        with Path(infile).open('rb') as pdf_file:
            page_iter = PDFPage.get_pages(pdf_file, pagenos=[pageno],
                                          maxpages=0)
            interp.process_page(next(page_iter))
    except PDFTextExtractionNotAllowed as e:
        raise EncryptedPdfError() from e
    finally:
        # Always undo the font patch, whether or not processing succeeded.
        if patcher is not None:
            patcher.stop()

    return dev.get_result()
def convert(fname, pages=None):
    """Return the text of the PDF *fname*, skipping pages with a huge media box.

    A page is processed only when the product of its media box entries
    ``[2]`` and ``[3]`` does not exceed ``settings.PDF_MAX_MEDIABOX_PIXELS``.
    A line is printed to stdout for every processed and every skipped page.

    Args:
        fname: Filesystem path of the PDF to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``; a falsy value selects every page.

    Returns:
        The contents of the output buffer filled by ``TextConverter``.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # The context manager closes the PDF even if a page fails to parse.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            # For some reason PDFMiner chokes on pages that have a single,
            # but detailed image in them. Work around by skipping pages with
            # a large media box
            mediabox = getattr(page, 'mediabox', [0, 0, 0, 0])
            try:
                mediabox_pixels = mediabox[2] * mediabox[3]
            except IndexError:
                mediabox_pixels = 0

            if mediabox_pixels <= settings.PDF_MAX_MEDIABOX_PIXELS:
                print("Processing page %s" % (page, ))
                interpreter.process_page(page)
            else:
                print("Skipped page %s" % (page, ))

    converter.close()
    text = output.getvalue()
    output.close()
    return text
def convert_pdf(input_file, format='text', codec='utf-8'):
    """Convert PDF file to text or html.

    Args:
        input_file (str): Input PDF file.
        format (str): Format text or html.
        codec (str): Codec for encode the text.

    Returns:
        str: Return text or html from PDF file.

    Raises:
        ValueError: If ``format`` is neither ``'text'`` nor ``'html'``.
    """
    # Reject unknown formats up front; this used to surface later as an
    # UnboundLocalError on `converter`.
    if format not in ('text', 'html'):
        raise ValueError(
            "unsupported format %r (expected 'text' or 'html')" % (format,))

    manager = PDFResourceManager()
    output = BytesIO()
    laparams = LAParams()
    converter_class = TextConverter if format == 'text' else HTMLConverter
    converter = converter_class(manager, output, codec=codec,
                                laparams=laparams)

    with open(input_file, 'rb') as f1:
        interpreter = PDFPageInterpreter(manager, converter)
        for page in PDFPage.get_pages(f1, caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    converter.close()
    text = output.getvalue()
    output.close()
    # Decode with the codec the converter encoded with, not the default.
    return text.decode(codec)
def main(argv):
    """Convert the PDF named by ``argv[1]`` to text.

    The extracted text is written to a file named ``argv[1] + '.txt'``.

    Args:
        argv: Argument list; only ``argv[1]`` (the PDF path) is used.

    Returns:
        None.
    """
    # Output file name; only a single document is handled here, so only
    # argv[1] is used.
    outfile = argv[1] + '.txt'
    args = [argv[1]]

    debug = 0
    pagenos = set()
    password = ''
    maxpages = 0
    rotation = 0
    codec = 'utf-8'  # output encoding
    caching = True
    imagewriter = None
    laparams = LAParams()

    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # ``open`` replaces the Python-2-only ``file`` builtin; the ``with``
    # blocks close both files even if conversion fails.
    with open(outfile, 'w') as outfp:
        # PDF conversion device
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    # Process the content of every page in the document.
                    for page in PDFPage.get_pages(
                            fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    return
def parse_pdf(path, print_lines=False):
    """Extract the text of a PDF and return it split into lines.

    Args:
        path: Path of the PDF file to read.
        print_lines: When true, print every line prefixed by its
            zero-based index.

    Returns:
        list: The extracted text split on ``"\\n"``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even if parsing failed.
        device.close()
        retstr.close()

    lines = text.split("\n")
    if print_lines:
        # print line numbers
        for index, line in enumerate(lines):
            print(index, line)
    return lines
def Data(path):
    """Extract text page by page from several PDFs and store it in MongoDB.

    For every page, the extracted text is passed through ``clean``; pages
    whose cleaned text has more than 10 whitespace-separated words are
    inserted into ``db_client.local.PDFData`` with the text under ``'data'``
    and the lower-cased part of the path before the first ``'_'`` under
    ``'keyword'``.

    Args:
        path: Iterable of PDF file paths (despite the singular name).
    """
    for pdf_path in path:
        # Count pages on a dedicated handle that is closed right away.
        with open(pdf_path, "rb") as counter_fp:
            num_of_pages = PyPDF2.PdfFileReader(counter_fp).getNumPages()

        keyword = pdf_path.split('_')[0].lower()
        with open(pdf_path, 'rb') as pdf_fp:
            for page_index in range(num_of_pages):
                manager = PDFResourceManager()
                buffer = StringIO()
                converter = TextConverter(manager, buffer, codec='utf-8',
                                          laparams=LAParams())
                try:
                    interpreter = PDFPageInterpreter(manager, converter)
                    for page in PDFPage.get_pages(
                            pdf_fp, {page_index}, maxpages=0, password="",
                            caching=True, check_extractable=True):
                        interpreter.process_page(page)
                    text = buffer.getvalue()
                finally:
                    converter.close()
                    buffer.close()

                text = clean(text)
                # An empty string has zero words, so this also covers the
                # original explicit ``text != ''`` check.
                if len(text.split()) > 10:
                    db_client.local.PDFData.insert_one({
                        'data': text,
                        'keyword': keyword,
                    })
def pdf2txt(path):
    """Extract the text of a PDF and save it as a ``.txt`` file in the CWD.

    The output file is named after the PDF's base name with its extension
    replaced by ``.txt`` and is created in the current working directory.
    The base name, the output name and the output path are printed.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``open`` replaces the Python-2-only ``file`` builtin.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    name = os.path.basename(path)
    print(name)
    # splitext handles any extension length; the previous ``name[:-4]``
    # only worked for three-letter extensions such as ".pdf".
    name_txt = os.path.splitext(name)[0] + '.txt'
    print(name_txt)
    out_path = os.path.join(os.path.normpath(os.getcwd()), name_txt)
    print(out_path)

    with open(os.path.normpath(out_path), 'w') as text_file:
        text_file.write(text)
    return text
def convert_pdf_to_txt(self):
    """Append the text of every ``.pdf`` entry in ``self.listings`` to ``self.text``.

    Each PDF's text is appended to ``self.text`` preceded by a single
    space. Entries that do not end in ``".pdf"`` are ignored. A failure on
    one entry prints a message and moves on to the next entry.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for path in list(self.listings):
            try:
                if path.endswith(".pdf"):  # Checks if this is a pdf file
                    with open(path, 'rb') as fp:
                        for page in PDFPage.get_pages(
                                fp, set(), maxpages=0, password="",
                                caching=True, check_extractable=True):
                            interpreter.process_page(page)
                    self.text = self.text + " " + retstr.getvalue()
            except Exception:
                # Best effort: report and continue, but let
                # KeyboardInterrupt / SystemExit propagate.
                print(
                    "Error encountered when trying to parse PDF as text, skipping "
                    + path + "...")
            finally:
                # The buffer is shared between files; clear it so the next
                # PDF does not re-append the text of the previous ones.
                retstr.seek(0)
                retstr.truncate(0)
    finally:
        device.close()
        retstr.close()
def extract_text_from_pdf(path, filename):
    """Yield the extracted text of a PDF one page at a time.

    Args:
        path: Leading part of the file location. It is concatenated
            directly with ``filename``; no separator is inserted.
        filename: Trailing part of the file location.

    Yields:
        str: The text produced by ``TextConverter`` for each page, in the
        order ``PDFPage.get_pages`` returns the pages.
    """
    # One resource manager is enough for the whole document; only the
    # converter and its buffer need to be fresh for each page.
    resource_manager = PDFResourceManager()
    with open(path + filename, 'rb') as fh:
        # iterate over all pages of PDF document
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            fake_file_handle = io.StringIO()
            converter = TextConverter(
                resource_manager,
                fake_file_handle,
                codec='utf-8',
                laparams=LAParams()
            )
            try:
                page_interpreter = PDFPageInterpreter(
                    resource_manager, converter
                )
                page_interpreter.process_page(page)
                yield fake_file_handle.getvalue()
            finally:
                # Runs even when the consumer stops iterating early or a
                # page fails, so the handles are always released.
                converter.close()
                fake_file_handle.close()
def pdf_to_txt(path: str) -> list:
    """Read a PDF file, parse it and return its text as a list of lines.

    Only the first page returned by ``PDFPage.get_pages`` is processed;
    the loop stops after one page.

    Args:
        path (str): Path of the PDF file.

    Returns:
        list: The parsed text split on newline characters.
    """
    resource_manager = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    # Setting this to True gives cleaner text extraction.
    laparams.detect_vertical = True
    device = TextConverter(resource_manager, retstr, codec,
                           laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(resource_manager, device)
        page_text = ''
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
                page_text = retstr.getvalue()
                break  # first page only
    finally:
        device.close()
        retstr.close()

    return page_text.split('\n')
def convert(self, fileName):
    """Extract the text of a PDF in the ``temp`` directory, then delete it.

    The file is looked up at ``<this file>/../../../../../temp/<fileName>``
    (made absolute). After successful extraction the PDF is removed.

    Args:
        fileName: Name of the PDF inside the ``temp`` directory.

    Returns:
        The extracted text.
    """
    logging.info("PDFConverter.convert STARTS")

    filename = os.path.abspath(__file__ + '/../../../../../temp/' + fileName)

    resourceManager = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(resourceManager, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(resourceManager, device)
        with open(filename, "rb") as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even if parsing failed.
        device.close()
        retstr.close()

    # As before, the source file is only removed after a successful parse.
    os.remove(filename)

    logging.info("PDFConverter.convert ENDS")
    return text
def extract_from_pdf(file, file_path):
    """Extract the text of the PDF at ``file_path`` using pdfminer.

    Args:
        file: Unused; kept so existing callers keep working.
        file_path: Path of the PDF file to read.

    Returns:
        str: The extracted text; an empty string if nothing was written
        to the output buffer.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(file_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        return fake_file_handle.getvalue()
    finally:
        # close open handles, also when a page fails to parse
        converter.close()
        fake_file_handle.close()
def extract_text_by_page(pdf_file, password='', page_numbers=None,
                         maxpages=0, caching=True, codec='utf-8',
                         laparams=None):
    """
    Parse and return the text contained in each page of a PDF file.

    Taken from
    https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py#L90-L123
    and adapted to return the text of each page separately as a dictionary
    obj.

    :param pdf_file: Either a file path or a file-like object for the PDF
        file to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
        Order and duplicates are irrelevant; ``None`` extracts every page.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a dict containing the text from each page (keys = page numbers)
    """
    if laparams is None:
        laparams = LAParams()

    text_by_page = {}
    with open_filename(pdf_file, "rb") as fp:
        # Forward ``caching`` so the documented parameter also governs the
        # resource manager, not just page parsing.
        rsrcmgr = PDFResourceManager(caching=caching)
        pages_iterable = PDFPage.get_pages(fp, page_numbers,
                                           maxpages=maxpages,
                                           password=password,
                                           caching=caching)

        if page_numbers is None:
            numbered_pages = enumerate(pages_iterable)
        else:
            # get_pages yields the selected pages in document order, each
            # at most once, so pair them with the sorted unique numbers
            # rather than with the caller's (possibly unsorted) list.
            numbered_pages = zip(sorted(set(page_numbers)), pages_iterable)

        for page_num, page in numbered_pages:
            with StringIO() as output_string:
                device = TextConverter(rsrcmgr, output_string, codec=codec,
                                       laparams=laparams)
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                    text_by_page[page_num] = output_string.getvalue()
                finally:
                    device.close()

    return text_by_page
def get_xml_data(self):
    """Store an XML representation of the PDF and collect font metrics.

    Reads ``self.pdffile``, writes the ``XMLConverter`` output to
    ``self.xmlfile`` and fills ``self.font_metrics`` with a
    ``{"bbox": ..., "descent": ...}`` entry per cached font, keyed by the
    font's ``fontname``.
    """
    rm = PDFResourceManager(caching=True,
                            font_correctors=self.font_correctors)
    laparams = LAParams()
    rotation = 0

    # Nested ``with`` blocks keep the original open order (output first,
    # then input) and close both files even when parsing fails.
    with open(self.xmlfile, "wb") as outfp:
        device = XMLConverter(rm, outfp, codec="UTF-8", laparams=laparams,
                              imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rm, device)
            with open(self.pdffile, "rb") as infile:
                for page in PDFPage.get_pages(infile, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)

                self.font_metrics = {}
                for font in list(rm._cached_fonts.values()):
                    try:
                        self.font_metrics[font.fontname] = {
                            "bbox": font.bbox,
                            "descent": font.descent
                        }
                    except AttributeError:
                        # Font lacks one of the expected attributes; dump
                        # what it does have to help diagnose it.
                        print((dir(font)))
        finally:
            # Close the device while the output file is still open.
            device.close()
def get_pdf_formatted_txt(path, pwd):
    """Extract the text from the PDF file.

    Parameters
    ----------
    path : string
        Filepath of the PDF.
    pwd : string
        Password for the encrypted PDF file.

    Returns
    -------
    text : string
        Extracted string from the PDF.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password=pwd, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Release the converter and buffer even if parsing failed.
        device.close()
        retstr.close()
def convert_pdf_to_txt(path):
    """Extract a PDF's text and return it as a list of lower-cased tokens.

    The text is split on newlines and commas and the fragments are
    de-duplicated through a ``set``. For every unique fragment the result
    contains the fragment itself followed by its pieces split on single
    spaces, all lower-cased. Fragment order follows set iteration order and
    is therefore not guaranteed.

    Args:
        path: Path of the PDF file to read.

    Returns:
        list: Lower-cased fragments and words as described above.
    """
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec,
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue().decode(codec)
    finally:
        device.close()
        retstr.close()

    # The former pattern "\n|\n\n|," is equivalent: its "\n\n" alternative
    # could never match because the single "\n" always matched first.
    fragments = set(re.split(r"\n|,", text))

    tokens = []
    for fragment in fragments:
        tokens.append(fragment)
        tokens.extend(fragment.split(' '))
    return [token.lower() for token in tokens]
def pdf_to_text(pdfname):
    """Return the text of the PDF at ``pdfname`` as extracted by pdfminer.

    Args:
        pdfname: Path of the PDF file to read.

    Returns:
        The text written by ``TextConverter`` for all pages.
    """
    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    device = TextConverter(rsrcmgr, sio, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Extract text
        with open(pdfname, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)

        # Get text from StringIO
        return sio.getvalue()
    finally:
        # Cleanup, also when extraction fails
        device.close()
        sio.close()
def request_pdf(url, case_id, court_name):
    """Download a PDF, save it, extract its text and save that as well.

    The PDF is stored under ``Data_Files/PDF_Files`` and the extracted text
    under ``Data_Files/Text_Files`` (both relative to
    ``module_directory + "/.."``), named
    ``<court_name>_<slugify(case_id)>`` with a ``.pdf`` / ``.txt`` suffix.

    Args:
        url: Address to fetch with a GET request (through ``proxy_dict``).
        case_id: Case identifier; used in log messages and file names.
        court_name: Court name; used as file-name prefix.

    Returns:
        str: The extracted text, or the string ``"NULL"`` when the response
        status is not 200, the response body contains "no data found"
        (case-insensitive), or any exception occurs.
    """
    try:
        response = requests.request("GET", url, proxies=proxy_dict)
        if response.status_code != 200:
            logging.error("Failed to get text file for: " + str(case_id))
            return "NULL"

        if "no data found" in response.text.lower():
            logging.error("No data for: " + str(case_id))
            return "NULL"

        base_name = court_name + "_" + slugify(case_id)
        pdf_path = (module_directory + "/../Data_Files/PDF_Files/"
                    + base_name + ".pdf")
        # Close the file before parsing it: reading while the writer was
        # still open could see a partially flushed PDF.
        with open(pdf_path, "wb") as pdf_out:
            pdf_out.write(response.content)

        pdf_manager = PDFResourceManager()
        string_io = StringIO()
        pdf_to_text = TextConverter(pdf_manager, string_io, codec='utf-8',
                                    laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(pdf_manager, pdf_to_text)
            with open(pdf_path, 'rb') as pdf_in:
                for page in PDFPage.get_pages(pdf_in):
                    interpreter.process_page(page)
            text_data = str(string_io.getvalue())
        finally:
            pdf_to_text.close()
            string_io.close()

        txt_path = (module_directory + "/../Data_Files/Text_Files/"
                    + base_name + ".txt")
        with open(txt_path, "w") as txt_out:
            txt_out.write(text_data)

        return text_data

    except Exception as e:
        # Boundary handler: log the failure and report it via "NULL".
        logging.error("Failed to get pdf file for: " + str(case_id)
                      + ". Error: %s", e)
        return "NULL"
def read_pdf(self, path):
    """Extract a PDF's text and normalise it to a single lower-cased line.

    Non-breaking spaces (``\\xa0``) become regular spaces, all runs of
    whitespace collapse to one space, the ``\\uf0b7`` character is removed
    and the result is lower-cased.

    Args:
        path: Path of the PDF file to read.

    Returns:
        str: The normalised text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even if parsing failed.
        device.close()
        retstr.close()

    text = " ".join(text.replace(u"\xa0", " ").strip().split())
    return text.replace('\uf0b7', '').lower()