def pdf2html(pdfPath, htmlPath):
    """Convert a PDF file to HTML, following pdfminer's ``pdf2txt`` tool.

    Args:
        pdfPath: Path of the PDF file to read.
        htmlPath: Path of the HTML file to write. It is written as UTF-8 and
            characters that cannot be encoded are dropped.
    """
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    # The input is opened first so that a missing PDF does not truncate an
    # existing output file.
    with io.open(pdfPath, 'rb') as fp, \
            io.open(htmlPath, 'wt', encoding='utf-8', errors='ignore') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, scale=1, layoutmode='normal',
                               laparams=LAParams(), outdir=None, debug=False)
        try:
            process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
                        caching=caching, check_extractable=True)
        finally:
            # The converter must be closed while its output file is still
            # open; the original never closed it at all.
            device.close()
def convertPDF(pdf_path, codec='ascii'):
    """Return the text of a PDF as it comes out raw from PDFMiner.

    Args:
        pdf_path: Path to a .pdf file. May also be a URL starting with
            'http', in which case the file is first downloaded to
            'temp.pdf' in the current directory.
        codec: Output codec passed to the converter ('ascii', 'utf-8', ...).

    Returns:
        The extracted text as a string.
    """
    if pdf_path[:4] == 'http':
        print('first downloading %s ...' % (pdf_path,))
        urllib.urlretrieve(pdf_path, 'temp.pdf')
        pdf_path = 'temp.pdf'

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    # ``with`` guarantees the PDF handle is released even if parsing fails.
    with open(pdf_path, 'rb') as fp:
        try:
            process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()

    # Named ``text`` rather than ``str`` so the builtin is not shadowed.
    text = retstr.getvalue()
    retstr.close()
    return text
def read_pdf(pdf):
    """Print the ``UNIT`` header lines of a fixed selection of units.

    The text of ``pdf`` (an open PDF file object) is extracted and split into
    lines. Lines starting with a form feed followed by ``'UNIT '`` are
    counted, and the header is printed when its running count is in the
    selection list. Nothing is returned.
    """
    # resource manager
    manager = PDFResourceManager()
    text_buffer = StringIO()
    # device
    converter = TextConverter(manager, text_buffer, laparams=LAParams())
    process_pdf(manager, converter, pdf)
    converter.close()
    content = text_buffer.getvalue()
    text_buffer.close()

    wanted_units = [1, 2, 3, 5, 7, 8, 9, 11, 12, 13]
    unit_header = '\x0cUNIT '
    seen_units = 0
    # Set while inside a wanted unit; nothing in this function reads it.
    flag = False

    # NOTE(review): the source was flattened onto one line, so the nesting of
    # the print call is reconstructed -- confirm it belongs inside the
    # "unit is wanted" branch.
    # Walk all lines of the extracted text.
    for line in str(content).split("\n"):
        if not line.startswith(unit_header):
            continue
        flag = False
        seen_units += 1
        if seen_units in wanted_units:
            flag = True
            print(line)
def pdf_to_text(pdf_string):
    """Extract the text of an in-memory PDF.

    :param pdf_string: The PDF file contents.
    :return: The extracted text decoded from UTF-8, or an empty unicode
        string when pdfminer raises ``PDFSyntaxError``.
    """
    resource_manager = PDFResourceManager(caching=True)
    text_buffer = StringIO.StringIO()
    converter = TextConverter(resource_manager, text_buffer, codec='utf-8',
                              laparams=LAParams())
    pdf_stream = StringIO.StringIO(pdf_string)
    try:
        process_pdf(resource_manager, converter, pdf_stream, set(),
                    check_extractable=False)
    except PDFSyntaxError:
        return u''
    converter.close()
    return text_buffer.getvalue().decode('utf-8')
def pdf_to_html(scraped_pdf_data):
    """Render raw PDF data as an HTML string.

    The data is loaded into an in-memory stream, converted with an
    ``HTMLConverter`` (layout mode 'normal', scale 2, custom margins) and the
    generated markup is returned.
    """
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams
    import StringIO

    pdf_stream = StringIO.StringIO()
    pdf_stream.write(scraped_pdf_data)
    pdf_stream.seek(0)
    html_stream = StringIO.StringIO()

    manager = PDFResourceManager()
    layout = LAParams(char_margin=0.5, line_margin=0.5, word_margin=0.3,
                      boxes_flow=0)
    converter = HTMLConverter(manager, html_stream, layoutmode='normal',
                              scale=2, laparams=layout)
    process_pdf(manager, converter, pdf_stream)
    converter.close()

    markup = html_stream.getvalue()
    html_stream.close()
    pdf_stream.close()
    return markup
def read_file():
    """Synchronise the Redis set ``whitelist_360`` with the entries of a PDF.

    The PDF at ``cmd_path + file_name`` is converted to text. Every non-empty
    line that contains a '.' and no lowercase letter is added to the set.
    Members that were in the set before the run but are not in the PDF are
    removed, and each removal is appended to the log at ``cmd_path_spider``.
    """
    read_360_ip = r.smembers("whitelist_360")
    # A set gives O(1) membership tests in the removal loop below.
    listed = set()
    # ``with`` closes the log file even when parsing or Redis raises.
    with open(cmd_path_spider, "at") as f:
        with open(cmd_path + file_name, "rb") as my_pdf:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
            process_pdf(rsrcmgr, device, my_pdf)
            device.close()
            content = retstr.getvalue()
            retstr.close()

        for line in str(content).split("\n"):
            if not line or '.' not in line:
                continue
            line = line.strip()
            if re.search('[a-z]', line):
                continue
            # Update redis.
            r.sadd("whitelist_360", line)
            listed.add(line)

        # NOTE(review): if the PDF yields no entries, every previous member
        # is removed -- confirm this is intended.
        for ip in read_360_ip:
            ip = ip.decode(encoding='utf-8')
            if ip not in listed:
                r.srem("whitelist_360", ip)
                f.write(now_time + ": " + "360spider: " + ip + "\n")
def pdfMine(fp):
    """Extract primer entries from a PDF.

    Input: file handle to a PDF file
    Output: a list with one entry per regex match. Each entry is the list of
        lines of the captured text (the match split on newlines), not a
        tuple. Presumably the lines hold the primer name and sequence --
        verify against a sample document.
    """
    str_io = StringIO.StringIO()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr, str_io, codec='utf-8', laparams=LAParams())
    try:
        process_pdf(rsrcmgr, device, fp, pagenos=set(), maxpages=0,
                    password='', caching=True, check_extractable=True)
    finally:
        device.close()
    pdf_string = str_io.getvalue()

    # A raw string avoids invalid-escape warnings; it matches exactly the
    # same text as the original pattern. It is compiled once, not per block.
    primer_re = re.compile(r'EA[\s\n\d\.\W]+(\D+[\w\d\s\S]+)\nUMO')
    return [
        match.split('\n')
        for block in pdf_string.split('VC00')
        for match in primer_re.findall(block)
    ]
def read_pdf(pdf):
    """Return the first line of a PDF's text that contains an '@'.

    Args:
        pdf: Open PDF file object passed to ``process_pdf``.

    Returns:
        The whole matching line, or ``None`` when no line contains '@' or
        when extraction fails (in which case 'file error' is printed).
    """
    try:
        # resource manager
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        # device
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        process_pdf(rsrcmgr, device, pdf)
        device.close()
        content = retstr.getvalue()
        retstr.close()

        # Scan all lines. The original also split each line into words and
        # searched them, but that code could never produce a result: a line
        # without '@' has no word containing '@'.
        for line in str(content).split("\n"):
            if '@' in line:
                return line
    except Exception:
        # ``Exception`` instead of a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print('file error')
    return None
def fetch_past_legislator(self, year, chamber, url, name):
    """Scrape one past legislator from a PDF and save it.

    The PDF at ``url`` (relative to www.legislature.mi.gov) is converted to
    ASCII text. The legislator's name, party letter and district number are
    pulled out with a regular expression and a ``Legislator`` is saved.
    Raises ``Exception`` when the expression finds nothing.
    """
    name = name.replace('\n','')
    whitespace_run = re.compile("\s+")
    name = whitespace_run.sub(' ', name)
    url = "http://www.legislature.mi.gov/%s" % url.replace('../', '')

    with self.urlopen_context(url) as the_pdf:
        # UGH! What a useful yet convoluted library.
        text_buffer = StringIO()
        manager = PDFResourceManager(CMapDB())
        converter = TextConverter(manager, text_buffer, codec='ascii',
                                  laparams=LAParams())
        process_pdf(manager, converter, StringIO(the_pdf), set())
        text_buffer.seek(0)
        text = text_buffer.read()

    # I should just add a pdf_context that wraps this :-\
    # NOTE(review): the source was flattened; whether the code below ran
    # inside the ``with`` block cannot be told from here.
    matches = re.findall(r'State\s+(?:Senator|Representative)\n(.*?)\n([R|D]).*?[\n]*(\d+)(?:st|nd|rd|th)', text)
    if not matches:
        print(text)
        raise Exception("Some fragile code broke.")

    found_name, party, district = matches[0]
    # The name parsed from the PDF replaces the one passed in.
    name = found_name
    (first, middle, last, suffix) = self.parse_name(name)
    leg = Legislator(year, chamber, district, name, first, last, middle,
                     party, suffix=suffix)
    self.save_legislator(leg)
def __call__(self, stream):
    """Extract text from input stream.

    Uses the instance's ``caching``, ``encoding``, ``laparams`` and
    ``password`` settings. When ``normalize_spaces`` is true, every run of
    spaces in the result is replaced by a single space.
    """
    # Prepare pdf extraction
    buffer = StringIO()
    manager = PDFResourceManager(caching=self.caching)
    converter = TextConverter(manager, buffer, codec=self.encoding,
                              laparams=self.laparams)

    # Extract text
    no_page_filter = set()
    process_pdf(manager, converter, stream, no_page_filter, maxpages=0,
                password=self.password, caching=self.caching,
                check_extractable=True)

    # Output
    extracted = buffer.getvalue()
    buffer.close()
    if not self.normalize_spaces:
        return extracted
    return re.sub(r' +', ' ', extracted)
def _convert_pdf_to_text(self, password=None):
    """Convert ``self.cvFile`` to a text file inside ``self.scratchDir``.

    The output name is the PDF's base name (extension removed) followed by
    the current Unix timestamp and ``.txt``. Its path is stored in
    ``self.cvTextFile``.

    Args:
        password: When not ``None``, stored in ``self.cvFilePasswd`` before
            converting. ``self.cvFilePasswd`` is always what is passed to
            pdfminer.

    Returns:
        Always 0.
    """
    input_pdf = self.cvFile
    if password is not None:
        self.cvFilePasswd = password

    # Page numbers 0..29 are requested and at most 30 pages are processed.
    pagenos = range(0, 30)
    maxpages = len(pagenos)
    codec = 'utf-8'
    laparams = LAParams()
    laparams.all_texts = True
    # NOTE(review): kept from the original; presumably meant for the
    # converter rather than LAParams -- confirm it has any effect here.
    laparams.showpageno = True

    # splitext also copes with names that have no extension; the previous
    # split/pop logic produced an empty base name for them.
    base_name = os.path.splitext(os.path.basename(input_pdf))[0]
    output_filename = os.path.join(
        self.scratchDir, base_name + str(int(time.time())) + ".txt")
    self.cvTextFile = output_filename

    rsrcmgr = PDFResourceManager()
    # Both handles are closed even if pdfminer raises.
    with open(output_filename, 'w') as outfp, open(input_pdf, 'rb') as fp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=self.cvFilePasswd, check_extractable=True)
        finally:
            device.close()
    return 0
def pdf_to_text(file_pointer):
    """Return the UTF-8 text extracted from an open PDF file object.

    Also sets the ``debug`` attribute of several pdfminer classes to 0.
    """
    # debug option
    debug = 0
    for component in (CMapDB, PDFResourceManager, PDFDocument, PDFParser,
                      PDFPageInterpreter, PDFDevice):
        component.debug = debug

    manager = PDFResourceManager()
    text_buffer = StringIO.StringIO()
    converter = TextConverter(manager, text_buffer, codec='utf-8',
                              laparams=LAParams())
    process_pdf(manager, converter, file_pointer, set(), maxpages=0,
                password='')

    text_string = text_buffer.getvalue()
    text_buffer.close()
    converter.close()
    return text_string
def convert_pdf(path):
    """Extract, serialize and tokenize the text of the PDF at ``path``.

    The raw text is pickled to 'corpus.pkl' and the tokenized text to
    'tokenized_corpus.pkl', both through ``serialize_object``.

    Returns:
        The tokenized text as produced by ``tonkenier``.
    """
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from cStringIO import StringIO

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    # The unused ``PDFParser(fp)`` was dropped; the PDF handle is now closed
    # even when parsing fails.
    with open(path, 'rb') as fp:
        try:
            process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
    text_str = retstr.getvalue()
    retstr.close()

    serialize_object(text_str, 'corpus.pkl')
    tokenized_text = tonkenier(text_str)
    serialize_object(tokenized_text, 'tokenized_corpus.pkl')
    return tokenized_text
def _convertpdf(self, filename):
    """Convert the PDF at ``filename`` to UTF-8 text.

    Returns:
        A ``(pdfstr, success)`` tuple. ``success`` is always True on return:
        the original error handling is disabled, so pdfminer and I/O errors
        propagate to the caller.
    """
    success = True
    if self.DEBUG:
        print("Converting PDF to text ...")

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    # ``with`` releases the PDF handle even when parsing raises.
    with open(filename, 'rb') as fp:
        try:
            process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
    pdfstr = retstr.getvalue()
    retstr.close()

    if self.DEBUG:
        print("PDF to text conversion complete.")
    return pdfstr, success
def read_pdf(pdf):
    """Return every lowercase word of 3 to 15 letters found in a PDF.

    ``pdf`` is an open PDF file object. Words are returned in document
    order, with duplicates kept.
    """
    # resource manager
    manager = PDFResourceManager()
    text_buffer = StringIO()
    # device
    converter = TextConverter(manager, text_buffer, laparams=LAParams())
    process_pdf(manager, converter, pdf)
    converter.close()
    content = text_buffer.getvalue()
    text_buffer.close()

    # Collect the matches line by line.
    return [
        word
        for text_line in str(content).split("\n")
        for word in re.findall(r'\b[a-z]{3,15}\b', text_line)
    ]
def _pdf_to_text(path):
    """Extract the text of the PDF at ``path`` as ASCII.

    Returns:
        ``(text, True)`` on success, ``("", False)`` when anything goes
        wrong. The original built this tuple but never returned it, so
        callers always received ``None``.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='ascii',
                               laparams=laparams)
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
        device.close()
        # Drop anything that is not plain ASCII.
        result = retstr.getvalue()
        txt = result.encode('ascii', 'ignore')
        retstr.close()
        return (txt, True)
    except Exception:
        # Best effort: a malformed PDF gives an empty result, not a crash.
        return ("", False)
def to_txt(pdf_path):
    """Return the text extracted from the PDF file at ``pdf_path``."""
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    # The original never closed the input file or the converter.
    with open(pdf_path, 'rb') as input_:
        try:
            process_pdf(manager, converter, input_)
        finally:
            converter.close()
    return output.getvalue()
def convertPDF(pdf_path, codec='ascii'):
    """
    Takes path to a PDF and returns the text inside it as string

    pdf_path: string indicating path to a .pdf file. Can also be a URL
        starting with 'http'; it is then downloaded to 'temp.pdf' in the
        current directory first.
    codec: can be 'ascii', 'utf-8', ...

    returns string of the pdf, as it comes out raw from PDFMiner
    """
    if pdf_path[:4] == 'http':
        print('first downloading %s ...' % (pdf_path, ))
        urllib.urlretrieve(pdf_path, 'temp.pdf')
        pdf_path = 'temp.pdf'

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    fp = open(pdf_path, 'rb')
    try:
        process_pdf(rsrcmgr, device, fp)
    finally:
        # Release the handle and the converter even when parsing fails.
        fp.close()
        device.close()

    # Not named ``str``: that would shadow the builtin.
    extracted = retstr.getvalue()
    retstr.close()
    return extracted
def readPDF(self, pdfFile):
    """Return the text of an open PDF file object.

    Prints ``pdfFile`` and sleeps five seconds before converting. If any
    exception occurs it is printed and ``None`` is returned.
    """
    try:
        print(pdfFile)
        time.sleep(5)

        manager = PDFResourceManager()   # resource manager
        text_buffer = StringIO()         # receives the extracted text
        converter = TextConverter(manager, text_buffer, laparams=LAParams())
        process_pdf(manager, converter, pdfFile)
        converter.close()

        return text_buffer.getvalue()
    except Exception as Ex:
        print(
            "While reading the file , there was an error in the function Readodf as :",
            Ex)
def getPdfContent(pdfFile):
    """Return the text extracted from the PDF file at path ``pdfFile``."""
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    input_ = open(pdfFile, 'rb')
    try:
        process_pdf(manager, converter, input_)
    finally:
        # Neither the file nor the converter was closed before.
        input_.close()
        converter.close()
    return output.getvalue()
def read_file(path):
    """Return the text of a .pdf, .doc or .docx file.

    Args:
        path: Path of the document. The extension decides how it is read
            and is matched case-insensitively.

    Returns:
        The document text, or ``None`` for any other extension.
    """
    filename, file_extension = os.path.splitext(path)
    # '.PDF' and '.Docx' should be handled like their lowercase forms.
    file_extension = file_extension.lower()

    if file_extension == '.pdf':
        with open(path, "rb") as pdf:
            # resource manager
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            # device
            device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
            process_pdf(rsrcmgr, device, pdf)
            device.close()
            content = retstr.getvalue()
            retstr.close()
            return str(content)

    if file_extension == '.doc':
        word = win32com.client.Dispatch("Word.Application")
        word.visible = False
        wb = word.Documents.Open(path)
        try:
            return word.ActiveDocument.Range().Text
        finally:
            # Close the document without saving so it does not stay open
            # (and locked) in the Word instance.
            wb.Close(False)

    if file_extension == '.docx':
        return textract.process(path).decode()

    return None
def convert(fp):
    """Convert an open PDF file object to HTML and return the markup.

    Output is collected in a ``StringIO2`` buffer whose ``encoding``
    attribute is set to 'utf-8'. Caching is disabled.
    """
    manager = PDFResourceManager(caching=False)
    html_buffer = StringIO2()
    html_buffer.encoding = 'utf-8'
    converter = HTMLConverter(manager, html_buffer, scale=1,
                              layoutmode='normal', laparams=LAParams(),
                              outdir=None, debug=False)
    process_pdf(manager, converter, fp, set(), maxpages=0, password='',
                caching=False, check_extractable=True)
    converter.close()
    return html_buffer.getvalue()
def pdf_to_text(pdf_file):
    """Return the text of an open PDF file object.

    Raises:
        ValueError: "Invalid PDF" when pdfminer raises ``PSEOF``,
            "Bad encryption" when it raises ``PDFEncryptionError``.
    """
    use_cache = True
    manager = PDFResourceManager(caching=use_cache)
    with io.StringIO() as buffer:
        converter = TextConverter(manager, buffer, laparams=LAParams())
        try:
            process_pdf(manager, converter, pdf_file, set(), maxpages=0,
                        password='', caching=use_cache,
                        check_extractable=True)
        except PSEOF:
            raise ValueError("Invalid PDF")
        except PDFEncryptionError:
            raise ValueError("Bad encryption")
        else:
            return buffer.getvalue()
def outToHtml(self, html):
    """Render this object's PDF as HTML into ``html`` and return ``html``.

    The PDF is read from the ``mybooks`` folder two directory levels above
    this module, at the relative location given by the PDF field's ``url``.

    Args:
        html: Writable object that receives the generated markup.

    Returns:
        The same ``html`` object.
    """
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    MEDIA_ROOT = os.path.join(BASE_DIR, 'mybooks\\')

    rsrcmgr = PDFResourceManager(caching=self.caching)
    device = HTMLConverter(rsrcmgr, html, codec=self.codec, scale=self.scale,
                           layoutmode=self.layoutmode,
                           laparams=self.laparams)
    # ``with`` releases the PDF handle even if pdfminer raises.
    with open(MEDIA_ROOT + str(self.__pdf.url), 'rb') as pdfFile:
        process_pdf(rsrcmgr, device, pdfFile, self.pagenos,
                    maxpages=self.maxpages, password=self.password,
                    caching=self.caching, check_extractable=True)
    # The original never closed the converter; pdfminer's own pdf2txt tool
    # closes it once all pages are processed.
    device.close()
    return html
def to_txt(infile: str, outfile: str) -> str:
    """
    Convert a pdf file to txt.

    :param infile: pdf file path;
    :param outfile: txt file path; written as UTF-8, characters that cannot
        be encoded are dropped;
    :return: txt file path;
    """
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    laparams = LAParams()
    laparams.word_margin = float(0)
    laparams.line_margin = float(1)

    # Both files are closed even when conversion fails.
    with io.open(infile, 'rb') as fp, \
            io.open(outfile, 'wt', encoding='utf-8', errors='ignore') as outfp:
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
                        caching=caching, check_extractable=True)
        finally:
            device.close()
    return outfile
def parse_pdf_to_txt(pdf_handle, write_file):
    """Convert an open PDF to XML and write it to ``write_file``.

    Despite the name, an ``XMLConverter`` is used. The output is written as
    UTF-8 with unencodable characters dropped, and vertical text detection
    is enabled.
    """
    encoding = 'utf-8'
    use_cache = True
    layout = LAParams()
    #laparams.all_texts = True
    layout.detect_vertical = True
    # The resource manager holds resources shared across pages.
    manager = PDFResourceManager(caching=use_cache)

    print("ready to open out file ........")
    with open(write_file, "wt", encoding=encoding, errors='ignore') as sink:
        converter = XMLConverter(manager, sink, laparams=layout)
        print("ready to converte pdf to xml ........")
        process_pdf(manager, converter, pdf_handle, set(), maxpages=0,
                    password='', caching=use_cache, check_extractable=True)
        converter.close()
def getTexts(self):
    """Extract the text of ``self.fname`` (gb2312 codec) into ``self.text``.

    The text is written to a temporary file and read back as bytes.

    Returns:
        The string "ok" on success, otherwise the exception that occurred
        (the exception is returned, not raised).
    """
    import tempfile

    tmp_path = None
    try:
        # A unique temporary file replaces the fixed 'temppdf.txt' in the
        # current directory, which collided between concurrent runs and
        # was left behind when conversion failed.
        fd, tmp_path = tempfile.mkstemp(suffix='.txt')
        caching = True
        rsrcmgr = PDFResourceManager(caching=caching)
        with os.fdopen(fd, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='gb2312',
                                   laparams=LAParams())
            with open(self.fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                            password='', caching=caching,
                            check_extractable=True)
            device.close()
        with open(tmp_path, 'rb') as infp:
            self.text = infp.read()
        return "ok"
    except Exception as e:
        return e
    finally:
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Cleanup is best effort and must not mask the result.
                pass
def dump(self, pdffilename):
    """Return the UTF-8 text of the PDF at ``pdffilename``.

    The file, the converter and the buffer are all closed even when
    processing raises; the exception then propagates.
    """
    text = None
    manager = PDFResourceManager()
    buffer = cStringIO.StringIO()
    try:
        converter = TextConverter(manager, buffer, codec='utf-8',
                                  laparams=LAParams())
        try:
            with open(pdffilename, 'rb') as pdf:
                process_pdf(manager, converter, pdf, set(), maxpages=0,
                            password='')
                text = buffer.getvalue()
        finally:
            converter.close()
    finally:
        buffer.close()
    return text
def getTexts(self):
    """Extract the text of ``self.fname`` (utf-8 codec) into ``self.text``.

    The text is written to a temporary file and read back as bytes.

    Returns:
        The string "ok" on success, otherwise the exception that occurred
        (the exception is returned, not raised).
    """
    import tempfile

    tmp_path = None
    try:
        # mkstemp gives a unique file; the fixed 'temppdf.txt' in the
        # working directory could collide between runs and was not removed
        # when conversion failed.
        fd, tmp_path = tempfile.mkstemp(suffix='.txt')
        caching = True
        rsrcmgr = PDFResourceManager(caching=caching)
        with os.fdopen(fd, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                                   laparams=LAParams())
            with open(self.fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                            password='', caching=caching,
                            check_extractable=True)
            device.close()
        with open(tmp_path, 'rb') as infp:
            self.text = infp.read()
        return "ok"
    except Exception as e:
        return e
    finally:
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; keep the "ok"/exception result.
                pass
def convert(src, des):
    """Extract invoice data from every PDF found under ``src``.

    Each ``*.pdf`` file whose name does not start with ``._`` is converted
    to text in ``des`` (overwritten for every PDF). That text is read back
    and its invoice number, date and due amount are printed.

    Args:
        src: Directory tree to walk.
        des: Path of the scratch text file. When falsy the text goes to
            stdout and no fields are extracted.
    """
    codec = 'utf-8'
    for root, dirs, files in os.walk(src):
        for file in files:
            if not file.endswith(".pdf") or file.startswith("._"):
                continue
            try:
                rsrcmgr = PDFResourceManager(caching=True)
                outfp = (open(des, 'wt', encoding=codec, errors='ignore')
                         if des else sys.stdout)
                try:
                    device = TextConverter(rsrcmgr, outfp)
                    with open(os.path.join(root, file), 'rb') as fp:
                        process_pdf(rsrcmgr, device, fp,
                                    check_extractable=True)
                    device.close()
                finally:
                    # Never close stdout, only a file opened here.
                    if des:
                        outfp.close()
                if not des:
                    # The text went to stdout; nothing to read back.
                    continue

                # Read back with the encoding used for writing, not the
                # platform default.
                with open(des, encoding=codec, errors='ignore') as written:
                    test = written.read()
                invoice = find_invoice_number(test)
                date = find_date(test)
                due_amount = find_amount(test)
                print("{ File Name: ", file, "Invoice Number: ", invoice,
                      "Invoice Date: ", date, "Due Amount: Rs ", due_amount,
                      "}")
            except Exception:
                # One bad PDF must not stop the walk; ``Exception`` lets
                # Ctrl-C through, unlike the previous bare except.
                print('An error occured.')
def get(self):
    """Serve a book's PDF text as a JSONP response.

    Request parameters read: ``ISBN`` (key name of the ``Books`` entity)
    and ``callback`` (name of the JavaScript function to call).

    The response body is ``callback({ "content" : [...]})`` where the array
    holds the extracted text split into lines.
    """
    import json

    self.response.headers['Content-Type'] = 'application/javascript'
    ISBN = self.request.get('ISBN')
    callback = self.request.get('callback')
    book = Books.get_by_key_name(ISBN)
    path = book.FilePath

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    # ``with`` closes the PDF even when parsing fails.
    with open(path, 'rb') as fp:
        process_pdf(rsrcmgr, device, fp)
    device.close()
    content = retstr.getvalue()
    retstr.close()

    # json.dumps escapes quotes, backslashes and control characters. The
    # previous plain concatenation produced broken JavaScript for any line
    # containing them.
    # NOTE(review): ``callback`` is echoed unvalidated; consider restricting
    # it to an identifier pattern.
    encoded_lines = ",\n".join(' ' + json.dumps(line)
                               for line in content.split('\n'))
    self.response.write(callback + '({ "content" : [')
    self.response.write(encoded_lines)
    self.response.write(']})')
def main(argv):
    """Write the text of ``test.pdf`` to standard output.

    Args:
        argv: Accepted for a conventional entry-point signature; it is not
            read.
    """
    import sys

    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # The original passed an empty ``str`` as the output target. A string
    # has no ``write`` method, so conversion could not work and the final
    # print showed nothing. Writing to stdout is what the commented-out
    # line in the original did.
    outfp = sys.stdout
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)

    fname = "test.pdf"
    with open(fname, 'rb') as fp:
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    password=password, caching=caching,
                    check_extractable=True)
    device.close()
    return
def pdf2xml(infile):
    '''
    Return a string of XML representation for given PDF file handle.

    Uses pdfminer to do the conversion, then strips every newline from the
    result. ``infile`` is closed before returning.
    '''
    xml_buffer = StringIO()

    # Empirically determined...
    layout = LAParams()
    layout.char_margin = 0.4

    # See pdf2txt.py
    manager = PDFResourceManager(caching=False)
    converter = XMLConverter(manager, xml_buffer, codec='utf-8',
                             laparams=layout)
    page_interpreter = PDFPageInterpreter(manager, converter)

    if not page_api:
        process_pdf(manager, converter, infile, set())
    else:
        for page in PDFPage.get_pages(infile, set()):
            page_interpreter.process_page(page)

    # NOTE(review): the converter is never closed here -- confirm callers
    # do not rely on anything it would emit on close.
    infile.close()
    return xml_buffer.getvalue().replace("\n", "")
def to_txt(pdf_path):
    """Return the text extracted from the PDF file at ``pdf_path``."""
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    # The input file used to stay open for the life of the process.
    with open(pdf_path, 'rb') as input_:
        process_pdf(manager, converter, input_)
    converter.close()
    return output.getvalue()
def extract_from_pdf(file_name):
    """Extract a title and keywords from the first page of a PDF.

    The title is the text of the first lines up to the first empty line
    among the first five lines. Keywords come from ``extract_key_words``
    applied to the page text with newlines replaced by spaces.

    Returns:
        ``{"title": str, "keywords": ...}``. On any processing error the
        error is passed to ``lError`` and
        ``{"title": "", "keywords": []}`` is returned.
    """
    # Disable logging, because pdfminer produces a lot of warnings. The
    # previous state is restored afterwards; the original left the root
    # logger disabled for the rest of the process.
    logger = logging.getLogger()
    was_disabled = logger.disabled
    logger.disabled = True
    try:
        # ``with`` closes the file on the error path as well.
        with open(file_name, "rb") as f:
            try:
                laparams = LAParams()
                rsrcmgr = PDFResourceManager(caching=True)
                out = io.StringIO()
                device = TextConverter(rsrcmgr, out, laparams=laparams)
                process_pdf(rsrcmgr, device, f, set(), maxpages=1,
                            check_extractable=True)
                s = unligaturify(str(out.getvalue()))
                out.close()
                tt = " ".join(s.replace("\n", " ").replace(" ", " ").split(" "))

                # Extract the title. When none of the first five lines is
                # empty, all five are used instead of failing.
                tmp = s.split("\n")[0:5]
                idx = tmp.index("") if "" in tmp else len(tmp)
                title = " ".join(tmp[0:idx])
                return {"title": title.strip(),
                        "keywords": extract_key_words(tt)}
            except Exception as e:
                lError(e)
                return {"title": "", "keywords": []}
    finally:
        logger.disabled = was_disabled
def __extract_extra__(request, item_id=None):
    """Fill ``other_search_text`` of items from the text of their PDFs.

    Args:
        request: Incoming request; the user must be authenticated.
        item_id: When given, only that item is processed, otherwise all.

    Items without a PDF, or that already have search text, are skipped.

    Returns:
        An ``HttpResponse`` naming the last item visited, a failure message
        if pdfminer raised ``AssertionError`` for an item (processing stops
        there), or a sign-in prompt for anonymous users.
    """
    if not request.user.is_authenticated():
        return HttpResponse('Please sign in first')

    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from cStringIO import StringIO

    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.line_margin = 0.3
    laparams.word_margin = 0.2
    codec = 'utf-8'
    caching = True

    if item_id:
        all_items = Item.objects.filter(id=item_id)
    else:
        all_items = Item.objects.all()

    # Pre-bound so an empty queryset no longer raises NameError at the
    # final response.
    item = None
    for item in all_items:
        # Don't extract if no PDF exists; or if we already have search text
        if not item.pdf_file or item.other_search_text:
            continue

        rsrcmgr = PDFResourceManager(caching=caching)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        fp = item.pdf_file.file
        try:
            process_pdf(rsrcmgr, device, fp, pagenos=set(), maxpages=0,
                        password='', caching=caching, check_extractable=True)
        except AssertionError:
            logger.warning('FAILED in completely PDF index "%s"' % item.title)
            return HttpResponse('FAILED in completely PDF index "%s"'
                                % item.title)
        else:
            logger.debug('Full PDF index of item "%s"' % item.title)
        finally:
            fp.close()
            device.close()

        item.other_search_text = outfp.getvalue()
        outfp.close()
        item.save()

    if item is None:
        return HttpResponse('No items to index')
    return HttpResponse('Full PDF indexed for item "%s"' % item.title)
def readPDF(bitfile):
    """Return the text extracted from a PDF given as raw bytes."""
    manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(manager, text_buffer, laparams=LAParams())
    pdf_stream = BytesIO(bitfile)
    process_pdf(manager, converter, pdf_stream)
    return text_buffer.getvalue()
def pdf2text(filename):
    """Return the text a ``TextExtractor`` device collects from a PDF.

    Args:
        filename: Path of the PDF file.

    Returns:
        The ``text`` attribute of the device after processing.
    """
    rsrcmgr = PDFResourceManager()
    device = TextExtractor(rsrcmgr)
    try:
        # File and device are both released when processing raises.
        with io.open(filename, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
    finally:
        device.close()
    return device.text
def __call__(self, rev, contenttype=None, arguments=None):
    """Convert the PDF revision ``rev`` and return the converter's result.

    ``contenttype`` and ``arguments`` are accepted but not used. The
    converter is closed whether or not processing succeeds.
    """
    manager = PDFResourceManager()
    converter = UnicodeConverter(manager, laparams=LAPARAMS)
    try:
        process_pdf(manager, converter, rev)
        result = converter.read_result()
    finally:
        converter.close()
    return result
def parse(self):
    """Convert ``self.filename`` to HTML and return it as BeautifulSoup.

    The open file, resource manager and converter are kept on the instance
    (``self.fp``, ``self.rsrcmgr``, ``self.device``). The markup is written
    to ``self.outfp`` and then read back from its start. Conversion options
    come from names defined outside this method.
    """
    self.fp = open(self.filename, 'rb')
    self.rsrcmgr = PDFResourceManager(caching=caching)
    self.device = HTMLConverter(
        self.rsrcmgr,
        self.outfp,
        codec=codec,
        scale=scale,
        layoutmode=layoutmode,
        laparams=laparams,
        outdir=outdir,
    )
    process_pdf(
        self.rsrcmgr,
        self.device,
        self.fp,
        pagenos,
        maxpages=maxpages,
        password=password,
        caching=caching,
        check_extractable=True,
    )
    self.outfp.seek(0)
    markup_lines = self.outfp.readlines()
    markup = "".join(markup_lines)
    return BeautifulSoup.BeautifulSoup(markup)
def extract_content(fp, encoding):
    """Return the text of an open PDF file object as UTF-8 bytes.

    NOTE(review): ``encoding`` is not used; the result is always encoded as
    UTF-8 -- confirm that is intended.
    """
    text_buffer = StringIO()  # not BytesIO
    manager = PDFResourceManager(caching=True)
    converter = TextConverter(manager, text_buffer)
    process_pdf(manager, converter, fp, set())
    converter.close()
    extracted = text_buffer.getvalue()
    return extracted.encode('utf-8')
def get_pdf_io(pdfname, logger=None):
    """Convert a PDF to text and return it as a ``StringIO``.

    Args:
        pdfname: Path of the PDF file.
        logger: Optional logger. When ``None`` one is created with
            ``createLog(logname="util")``.

    Returns:
        A ``StringIO`` holding the UTF-8 text, positioned at its start.
    """
    if logger is None:
        logger = createLog(logname="util")

    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams

    # debug option
    debug = 0
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = StringIO()
    device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
    # ``with`` closes the PDF even when pdfminer raises.
    with open(pdfname, 'rb') as fp:
        try:
            process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
                        caching=caching, check_extractable=True)
        finally:
            device.close()

    # Rewind so the caller can read from the beginning.
    outfp.seek(0)
    return outfp
def _process(self, fp, device):
    """Run pdfminer over ``fp`` with ``device``, using ``self.options``.

    Page numbers, page limit, password and caching come from
    ``self.options``. The extractable check is always on.
    """
    options = self.options
    process_pdf(
        self.resmgr,
        device,
        fp,
        options.pagenos,
        maxpages=options.maxpages,
        password=options.password,
        caching=options.caching,
        check_extractable=True,
    )
def pdf2txt(fp, pagenos=None, caching=True, codec='utf-8', password=''):
    """Return the text of an open PDF file object.

    Args:
        fp: Open PDF file object.
        pagenos: Set of page numbers to extract; ``None`` (the default) or
            an empty set applies no page filter.
        caching: Passed to the resource manager and to ``process_pdf``.
        codec: Output codec for the converter.
        password: Password of the document.

    Returns:
        The extracted text.
    """
    # A fresh set per call replaces the shared mutable default argument.
    if pagenos is None:
        pagenos = set()
    outfp = cStringIO.StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams())
    try:
        process_pdf(rsrcmgr, device, fp, pagenos, password=password,
                    caching=caching, check_extractable=True)
    finally:
        device.close()
    return outfp.getvalue()
def to_xml(self, filename):
    """Convert ``<filename>.pdf`` to ``<filename>.xml``.

    Args:
        filename: Path without extension; '.pdf' is appended for the input
            and '.xml' for the output.
    """
    rsrc = PDFResourceManager()
    # The original wrote ``src.close``, ``out.close`` and
    # ``converter.close`` without parentheses, so none of them was ever
    # called. The converter is now closed while the output file is still
    # open, and both files are closed by ``with``.
    with open(filename + ".pdf", 'rb') as src, \
            open(filename + ".xml", 'w') as out:
        converter = XMLConverter(rsrc, out, codec='utf-8',
                                 laparams=LAParams())
        try:
            # An empty set replaces the original ``0``; both are falsy and
            # select all pages.
            process_pdf(rsrc, converter, src, set(), maxpages=0, password='')
        finally:
            converter.close()
def run(self):
    """Convert every file in ``self._args`` with the configured output type.

    When ``self._outtype`` is unset it defaults to "text". If this module
    is being run as a script, the type is then taken from the extension of
    ``self._outfile`` (.htm/.html, .xml, .tag).

    As a script the output goes to ``self._outfile`` or stdout and nothing
    is returned. Otherwise the output is collected in memory and returned
    as a string. An unknown output type returns the result of ``usage()``.
    """
    rsrcmgr = PDFResourceManager(caching=self._caching)
    if not self._outtype:
        self._outtype = "text"
        # Only guess the type from the file name when run as a script.
        if __name__ == "__main__":
            if self._outfile:
                if self._outfile.endswith(".htm") or self._outfile.endswith(".html"):
                    self._outtype = "html"
                elif self._outfile.endswith(".xml"):
                    self._outtype = "xml"
                elif self._outfile.endswith(".tag"):
                    self._outtype = "tag"
    # Choose the output stream: a file or stdout for scripts, an in-memory
    # buffer when imported.
    if __name__ == "__main__":
        if self._outfile:
            outfp = file(self._outfile, "w")
        else:
            outfp = sys.stdout
    else:
        from cStringIO import StringIO
        outfp = StringIO()
    # Pick the converter matching the output type.
    if self._outtype == "text":
        device = TextConverter(rsrcmgr, outfp, codec=self._codec, laparams=self._laparams)
    elif self._outtype == "xml":
        device = XMLConverter(rsrcmgr, outfp, codec=self._codec, laparams=self._laparams)
    elif self._outtype == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=self._codec,
            scale=self._scale,
            layoutmode=self._layoutmode,
            laparams=self._laparams,
        )
    elif self._outtype == "tag":
        device = TagExtractor(rsrcmgr, outfp, codec=self._codec)
    else:
        return usage()
    # All input files are fed through the same device.
    for fname in self._args:
        fp = file(fname, "rb")
        process_pdf(
            rsrcmgr,
            device,
            fp,
            self._pagenos,
            maxpages=self._maxpages,
            password=self._password,
            caching=self._caching,
            check_extractable=True,
        )
        fp.close()
    device.close()
    if __name__ == "__main__":
        # NOTE(review): this also closes sys.stdout when no output file was
        # given -- confirm that is acceptable.
        outfp.close()
    else:
        return outfp.getvalue()
def parse_pdf(self, test_parse=False):
    """Parse a PDF and store its text lines in ``self.file_array``.

    ``self.source_file`` is converted to UTF-8 text in ``self.text_file``.
    That file is read back line by line and its last line is dropped.

    Args:
        test_parse: When true, the intermediate text file is kept on disk.

    Raises:
        DTPOFileError: If either file cannot be opened or pdfminer fails.
    """
    dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

    # input options
    pagenos = set()
    maxpages = 0
    # output option
    codec = "utf-8"
    caching = True
    laparams = LAParams()
    laparams.char_margin = 8.0
    laparams.word_margin = 2.0
    rsrcmgr = PDFResourceManager(caching=caching)

    try:
        outfp = open(self.text_file, "w")
    except IOError as io_error:
        raise DTPOFileError(self.text_file, 0, str(io_error))

    # The nested try/finally blocks make sure both files are closed on
    # every error path; previously they leaked whenever anything raised.
    try:
        try:
            fp = open(self.source_file, "rb")
        except IOError as io_error:
            raise DTPOFileError(self.source_file, 0, str(io_error))
        try:
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            caching=caching, check_extractable=True)
            finally:
                device.close()
        except PDFException as pdf_error:
            message = "Failed to parse file {0} -> {1}".format(
                self.source_file, str(pdf_error))
            raise DTPOFileError(self.source_file, 0, message)
        except Exception as exception:
            message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(
                type(exception), str(exception))
            raise DTPOFileError(self.source_file, 0, message)
        finally:
            fp.close()
    finally:
        outfp.close()

    # Got the PDF converted - now get it into an array
    with open(self.text_file) as text_file:
        self.file_array = list(text_file)

    # Remove the last entry - it's always '\x0c'
    if len(self.file_array) > 0:
        del self.file_array[-1]

    # Remove the outfile
    if not test_parse:
        os.remove(self.text_file)
def readPDF(pdfFile):
    """Return the text extracted from an open PDF file object."""
    manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(manager, text_buffer, laparams=LAParams())

    process_pdf(manager, converter, pdfFile)
    converter.close()

    extracted = text_buffer.getvalue()
    text_buffer.close()
    return extracted
def _process(self, fp, device):
    """Feed ``fp`` through ``device`` with the settings in ``self.options``.

    ``pagenos``, ``maxpages``, ``password`` and ``caching`` are read from
    ``self.options``; ``check_extractable`` is always True.
    """
    settings = self.options
    keyword_options = dict(
        maxpages=settings.maxpages,
        password=settings.password,
        caching=settings.caching,
        check_extractable=True,
    )
    process_pdf(self.resmgr, device, fp, settings.pagenos, **keyword_options)
def pdf_text(filename):
    """Return the text of the first page of the PDF at ``filename``.

    Returns:
        The extracted text, or "" when pdfminer raises ``PDFSyntaxError``,
        ``PDFTextExtractionNotAllowed`` or ``PSEOF``.
    """
    try:
        text = io.StringIO()
        rsrc = PDFResourceManager()
        device = TextConverter(rsrc, text, laparams=LAParams())
        # The file used to be opened inline and was never closed.
        with open(filename, 'rb') as pdf_file:
            process_pdf(rsrc, device, pdf_file, None, maxpages=1, password='')
        device.close()
        return text.getvalue()
    except (PDFSyntaxError, PDFTextExtractionNotAllowed, PSEOF):
        return ""
def decodepdf(fp, debug = False):
    """Return the text of an open PDF file object.

    Log records of level WARNING and below are suppressed while pdfminer
    runs.

    Args:
        fp: Open PDF file object.
        debug: When true, print a timestamp before and after processing.
    """
    with StringIO() as outfp:
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, outfp)
        logging.disable(logging.WARNING)
        try:
            if debug:
                print("processing pdf begin ({0})".format(timestr()))
            process_pdf(rsrcmgr, device, fp)
            if debug:
                print("processing pdf ended ({0})".format(timestr()))
        finally:
            # Re-enable logging even when processing raises; before, an
            # exception left warnings disabled for the whole process.
            logging.disable(logging.NOTSET)
        return outfp.getvalue()
def get_pdf_content(path):
    """Return the text of the PDF at ``path``, encoded with cp1252.

    When pdfminer raises ``PDFSyntaxError`` or
    ``PDFTextExtractionNotAllowed`` a message is printed and whatever text
    was produced up to that point is returned.
    """
    laparams = LAParams()
    rsrc = PDFResourceManager()
    outfp = StringIO()
    try:
        #TODO: detect the encoding of the PDF
        device = TextConverter(rsrc, outfp, codec="cp1252", laparams=laparams)
        # A PDF is binary: open it in 'rb' and close it afterwards. The
        # previous ``codecs.open(path)`` was never closed.
        with open(path, 'rb') as pdf_file:
            process_pdf(rsrc, device, pdf_file)
    except (PDFSyntaxError, PDFTextExtractionNotAllowed):
        print("Error processing PDF file: " + path)
    return outfp.getvalue()
def convert(data):
    """Return the text extracted from PDF content held in a string."""
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from StringIO import StringIO

    source = StringIO(data)
    sink = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, sink)
    process_pdf(manager, converter, source)

    # Rewind, then read everything that was written.
    sink.seek(0)
    return sink.read()