def getPageLayouts(f1):
    """Extract the layout of every page of the PDF at ``fpath``.

    The file named by ``fpath`` is opened and the document is initialised
    with ``pss_wd``; both names are resolved outside this function.

    NOTE(review): the ``f1`` argument is never read. It is rebound by the
    ``with`` statement below, so the file parsed is always ``fpath``.
    Confirm with callers whether ``f1`` was meant to be the input.

    Returns:
        A list with one ``device.get_result()`` value per page. The list is
        empty when the document does not allow extraction.

    Raises:
        IOError: if opening or reading the file fails.
    """
    # Bound up front so a non-extractable document returns [] instead of
    # failing with UnboundLocalError at the return statement.
    page_layouts = []
    try:
        # The parser and doc pair form a "pipe" of sorts.
        with open(fpath, 'rb') as f1:
            parser = PDFParser(f1)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)
            # Can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in doc.get_pages():
                    # Original author's note: this presumably reads from
                    # the open file on demand rather than from stored data.
                    interpreter.process_page(page)
                    # Receive the LTPage object for the page.
                    page_layouts.append(device.get_result())
    except IOError:
        # Call syntax is valid on both Python 2 and Python 3.
        raise IOError("issue with loading file, please try again")
    finally:
        # Kept from the original: after a successful open this is a no-op on
        # the already-closed file; if open() failed it closes the argument.
        f1.close()
    return page_layouts
def load( self, open_file ):
    """Parse an already-open PDF and collect its text page by page.

    ``self.fields`` and ``self.text`` are reset. Every page is then
    interpreted; pages with annotations are handed to
    ``self._build_annotations`` and the value of ``self._get_text(device)``
    is stored in ``self.text`` under the 1-based page number.

    Args:
        open_file: file object handed to ``PDFParser``.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    self.fields = {}
    self.text = {}

    # Wire parser and document together; an empty password is supplied.
    document = PDFDocument()
    pdf_parser = PDFParser(open_file)
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')

    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Shared resources, layout aggregator and the interpreter driving it.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    for page_number, page in enumerate(document.get_pages(), 1):
        page_interpreter.process_page(page)
        if page.annots:
            self._build_annotations(page)
        self.text[page_number] = self._get_text(aggregator)
def create_pages(self):
    """Save one ``Page`` row for every parsed page of ``self.pdf_file``.

    The document is initialised with an empty password. When it allows
    extraction, each item produced by ``self._parse_pages`` is converted
    with ``smart_unicode`` and saved as a ``Page`` belonging to
    ``self.document``, numbered from 1. Nothing is returned.
    """
    from public_project.models import Page

    # Connect a parser on the stored file to a fresh document object.
    document = PDFDocument()
    pdf_parser = PDFParser(self.pdf_file)
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')

    if not document.is_extractable:
        return

    for number, page_text in enumerate(self._parse_pages(document), 1):
        Page(
            document=self.document,
            number=number,
            content=smart_unicode(page_text, encoding='utf-8',
                                  strings_only=False, errors='strict'),
        ).save()
def parse(self, path):
    """Convert the PDF at *path* to text and wrap it in a ``Sample``.

    Args:
        path: path of a PDF file.

    Returns:
        ``Sample(path, None, text)`` where ``text`` is the output of a
        ``TextConverter`` run over every page.

    Raises:
        NotImplementedError: if *path* is a directory.
    """
    # Reject directories before any resource is allocated.
    if os.path.isdir(path):
        raise NotImplementedError()

    out = StringIO.StringIO()
    try:
        rsrc = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)
        try:
            # PDFs are binary: open with 'rb' and let the context manager
            # close the handle even when parsing fails.
            with open(path, 'rb') as fp:
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()
                interpreter = PDFPageInterpreter(rsrc, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
        finally:
            device.close()
        return Sample(path, None, out.getvalue())
    finally:
        out.close()
def WithPdf(self, pdfdoc, password, fn, *args):
    """Open the PDF at *pdfdoc*, apply *fn* to it and return the result.

    Args:
        pdfdoc: path of the PDF file.
        password: when truthy, stored in ``self.password`` before use;
            otherwise the existing ``self.password`` is used.
        fn: callable invoked as ``fn(doc, *args)``.
        *args: extra positional arguments for *fn*.

    Returns:
        The value returned by *fn*, or ``None`` when the document does not
        allow extraction or an ``IOError`` occurs.
    """
    result = None
    try:
        # The context manager closes the file even if parsing or fn raises.
        with open(pdfdoc, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            if password:
                self.password = password
            doc.initialize(self.password)
            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # Deliberate best effort: a missing or unreadable file yields None.
        pass
    return result
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Dump selected parts of the PDF *fname* to *outfp*.

    Args:
        outfp: writable stream receiving the dump.
        fname: path of the PDF file.
        objids: object ids to dump with ``dumpxml``; may be empty.
        pagenos: 0-based page indexes to dump; may be empty.
        password: password passed to ``doc.initialize``.
        dumpall: when true, every object is dumped with ``dumpallobjs``.
        codec: forwarded to the dump helpers. When set, the content streams
            of the selected pages are dumped instead of the page attributes.

    When nothing is selected the trailers are dumped. A trailing newline is
    written unless *codec* is ``'raw'`` or ``'binary'``.
    """
    doc = PDFDocument()
    # open() works on Python 2 and 3; the context manager closes the handle
    # even when a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def pdf_to_text(filename):
    """Return the text of the PDF *filename* with page delimiters.

    Each page is wrapped in ``START PAGE <i>`` / ``END PAGE <i>`` lines,
    with ``i`` counted from 0. The page count is printed before conversion.

    Args:
        filename: path of the PDF file.

    Returns:
        The accumulated converter output as a string.
    """
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter  # <-- changed
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = TextConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    try:
        # The context manager closes the file even when a page fails.
        with open(filename, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            interpreter = PDFPageInterpreter(rsrc, device)

            # Enumerate the pages once and reuse the list for both the count
            # and the conversion loop.
            pages = list(doc.get_pages())
            print("There are: " + str(len(pages)) + " pages")

            for i, page in enumerate(pages):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()
    return outfp.getvalue()
def getData(self):
    """Load the trailer ``Info`` metadata of ``self.fname``.

    On success the resolved info object is stored in both ``self.metadata``
    and ``self.raw``.

    Returns:
        ``"error"`` when the document cannot be initialised,
        ``"Empty metadata"`` when ``self.raw`` is ``None`` afterwards,
        ``"ok"`` otherwise, or the caught exception object when reading the
        trailers fails.
    """
    doc = PDFDocument()
    fp = open(self.fname, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(self.password)
        except Exception:
            return "error"
        parser.close()
    finally:
        # The handle is released on the "error" path as well.
        fp.close()

    try:
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                # Only trailers that carry an Info entry update the stored
                # metadata; trailers without one are skipped.
                info = resolve1(info_ref)
                self.metadata = info
                self.raw = info
        if self.raw is None:
            return "Empty metadata"
        return "ok"
    except Exception as e:
        # Callers receive the exception object as the status value.
        return e
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Interpret the pages of an open PDF with the given device.

    Args:
        rsrcmgr: resource manager handed to ``PDFPageInterpreter``.
        device: device handed to ``PDFPageInterpreter``.
        fp: open file object of the PDF.
        pagenums: optional collection of 0-based page indexes; when truthy
            only those pages are processed.
        maxpages: when truthy, processing stops after a processed page
            whose 1-based number is at least this value.
        password: password passed to ``doc.initialize``.

    Returns:
        A dict mapping every 0-based page index to its page object,
        including pages that were not processed.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # If no password is set, an empty string is given.
    doc.initialize(password)
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = dict(enumerate(doc.get_pages()))
    # Walk the indexes in ascending order: the maxpages cut-off depends on
    # document order, which plain dict iteration does not guarantee on every
    # interpreter. This also avoids the Python 2-only iteritems().
    for num in sorted(pages):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(pages[num])
        if maxpages and maxpages <= num + 1:
            break
    return pages
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the ``info`` metadata of a local or remote PDF.

    Args:
        fileOrUrl: file path, or an ``http://`` / ``https://`` URL whose
            body is downloaded into memory first.
        textmode: when true, print one ``<fileOrUrl>:<name>=<value>`` line
            per metadata entry; otherwise print the info value after
            *prefix*, unwrapping a one-element list.
        prefix: text printed before the value in non-text mode. It is
            replaced by ``fileOrUrl + ':'`` when the externally defined
            ``args`` holds more than one item.
        basicauth: optional value appended to ``'Basic '`` in an
            ``Authorization`` header for URL downloads.

    NOTE(review): ``args`` is not a parameter; it is resolved outside this
    function.
    """
    if len(args) > 1:
        prefix = fileOrUrl + ':'

    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')

    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        # Released even when the document cannot be parsed.
        fp.close()

    if textmode:
        for obj in doc.info:
            for (name, val) in obj.items():
                print('{0}:{1}={2}'.format(fileOrUrl, name, val))
    else:
        val = doc.info
        if isinstance(val, list) and len(val) == 1:
            val = val[0]
        print(prefix + str(val))
def ParseAllPages(self, filepath):
    """Run the PDF interpreter over every page of *filepath*.

    The path is remembered in ``self.filepath``. Pages are processed with a
    plain ``PDFDevice``; nothing is returned.

    Args:
        filepath: path of the PDF file.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    self.filepath = filepath
    # The context manager closes the file on every exit path, including the
    # extraction-not-allowed error.
    with open(filepath, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password is set, so an empty string is given.
        password = ""
        doc.initialize(password)
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
def pdf_to_csv(filename):
    """Convert the PDF *filename* with ``CsvConverter`` and return the text.

    Each page is wrapped in ``START PAGE <i>`` / ``END PAGE <i>`` lines,
    with ``i`` counted from 0.

    Args:
        filename: path of the PDF file.

    Returns:
        The accumulated converter output as a string.
    """
    # The following is a remix of the convert() function in the
    # pdfminer/tools/pdf2text module.
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # The original author's test documents are utf-8.
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    try:
        # The context manager closes the file even when a page fails.
        with open(filename, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()
    return outfp.getvalue()
def read_invoice_pdfminer3k(pdfFile):
    """Extract client and invoice number from an invoice PDF and record them.

    The file is read from ``invoice_path``. The text of all ``LTTextBox`` /
    ``LTTextLine`` objects is concatenated, the client is cut out of that
    text and the invoice number out of the file name with ``extract_info``.
    Both are printed and passed to ``write_excel``. The delimiters
    (``client_start`` and friends) and ``invoice_path`` are resolved outside
    this function.

    Args:
        pdfFile: file name of the invoice inside ``invoice_path``.
    """
    # os.path.join inserts the platform separator, replacing the hard-coded
    # backslash. The context manager closes the file.
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect fragments and join once instead of repeated "+=".
        fragments = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    fragments.append(lt_obj.get_text())
    invoice_text = "".join(fragments)

    # Extract client info from the text extracted from the PDF.
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract the invoice number from the PDF file name.
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Hand both values to the function that writes the Excel file.
    write_excel(client, invoice_no)
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Write the outline entries of the PDF *fname* to *outfp*.

    One line is written per outline entry: the ``repr`` of
    ``(level, title, dest, pageno)``. ``pageno`` is the 0-based index of the
    target page, or ``None`` when no target could be determined.

    Args:
        outfp: writable stream receiving the lines.
        fname: path of the PDF file.
        objids, pagenos, dumpall, codec: accepted but not used here.
        password: password passed to ``doc.initialize``.
    """
    doc = PDFDocument()
    # open() works on Python 2 and 3; the context manager closes the handle
    # even when a lookup raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map page object ids to their position in the document.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
def getPDFMetadata(path):
    """Return the metadata of the PDF at *path* as a dict.

    The first entry of ``doc.info`` is the base. When the catalog has a
    ``Metadata`` stream, its content converted with ``xmp_to_dict`` is
    merged on top.

    Previously the merge was attempted on the ``doc.info`` list itself, so
    it always failed silently and a document without info raised
    ``IndexError``.

    Args:
        path: path of the PDF file.

    Returns:
        A new dict. It is empty when the document has no info entry and no
        usable XMP metadata.
    """
    # The context manager closes the file on every exit path.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        # Copy so the merge below does not mutate the document's own dict.
        result = dict(doc.info[0]) if doc.info else {}
        if 'Metadata' in doc.catalog:
            # get_data() yields the raw stream payload, not a mapping, so
            # only its parsed form can be merged.
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            try:
                result.update(xmp_to_dict(metadata))
            except Exception:
                # Best effort: unparsable XMP leaves the info entries as-is.
                pass
    return result
def pdf_function(pdf_doc, password='', *args, **kwargs):
    """Open the PDF at *pdf_doc* and apply ``function`` to the document.

    ``function`` is not a parameter; it is resolved from the surrounding
    scope and called as ``function(doc, *args, **kwargs)``.

    Args:
        pdf_doc: path of the PDF file.
        password: password passed to ``doc.initialize``.

    Returns:
        The value returned by ``function``, or ``None`` when the document
        does not allow extraction or an ``IOError`` occurs.
    """
    result = None
    try:
        # The context manager closes the file even if parsing or the
        # wrapped function raises.
        with open(pdf_doc, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            if doc.is_extractable:
                result = function(doc, *args, **kwargs)
    except IOError:
        # Deliberate best effort: a missing or unreadable file yields None.
        pass
    return result
def convert_pdf_to_txt(path):
    """Return the text of the PDF at *path*.

    Every page is fed through a ``TextConverter`` writing into an in-memory
    buffer; the buffer content is returned after the converter and the
    buffer have been closed.

    Args:
        path: path of the PDF file.
    """
    resources = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(resources, buffer, laparams=LAParams())

    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        page_interpreter = PDFPageInterpreter(resources, converter)
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)

        extracted = buffer.getvalue()
        converter.close()
        buffer.close()
    return extracted
def pdf_isvalid(filelike):
    """Return True if *filelike* holds a valid, extractable PDF.

    The stream must start with ``PDF_MAGIC`` and initialise as a document
    that allows extraction. A ``PDFException`` during parsing is logged as
    a warning and reported as invalid.

    The stream is rewound to offset 0 on every exit path. Previously it was
    left after the header when the magic bytes did not match.

    Args:
        filelike: seekable file-like object.

    Returns:
        bool
    """
    logger = logging.getLogger()
    filelike.seek(0)
    try:
        if filelike.read(len(PDF_MAGIC)) != PDF_MAGIC:
            return False
        filelike.seek(0)
        parser = PDFParser(filelike)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        return bool(doc.is_extractable)
    except PDFException as excobj:
        logger.warning("pdf has valid header but, still not valid pdf, exception was %r", excobj)
        return False
    finally:
        filelike.seek(0)
def get_metadata(self):
    """Collect metadata from the info field and from XMP.

    Every entry of ``doc.info`` (older PDFs) is added to a ``Metadata``
    object. When the catalog has a ``Metadata`` stream (newer PDFs), the
    ``xap``, ``pdf`` and ``dc`` sections of its XMP dict are added too.
    The result is stored in ``self.metadata``.

    Returns:
        The populated ``Metadata`` object.
    """
    # The context manager closes the file even when parsing fails.
    with open(self.path, 'rb') as file_pointer:
        parser = PDFParser(file_pointer)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        metadata = Metadata()
        for i in doc.info:
            metadata.add(i)

        if 'Metadata' in doc.catalog:
            xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
            xmp_dict = xmp_to_dict(xmp_metadata)
            # Only the most useful sections are added.
            if "xap" in xmp_dict:
                metadata.add(xmp_dict["xap"])
            if "pdf" in xmp_dict:
                metadata.add(xmp_dict["pdf"])
            if "dc" in xmp_dict:
                metadata.add(xmp_dict["dc"], metadataType="dc")

    self.metadata = metadata
    return metadata
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to lay out the pages of *fh*.

    Args:
        fh: open file object of the PDF.

    Returns:
        ``(doc, interpreter, device)``: the initialised document, a page
        interpreter, and the ``PDFPageAggregator`` it feeds. The aggregator
        is configured with ``word_margin = 0.0``.

    Raises:
        ValueError: if the document does not allow extraction.
    """
    parser = PDFParser(fh)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # No password is set, so an empty string is given.
    doc.initialize("")
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")

    rsrcmgr = PDFResourceManager()
    # The plain PDFDevice/interpreter pair the original created first was
    # overwritten before use, so only the aggregator pair is built.
    laparams = LAParams()
    laparams.word_margin = 0.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def parse_pdf(pdf_url):
    """Download a PDF and return its text boxes split into lines.

    Args:
        pdf_url: URL of the PDF.

    Returns:
        A list with one entry per page. Each entry is a list holding, for
        every non-blank ``LTTextBox`` / ``LTTextLine`` on the page, the
        list of its text lines.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(pdf_url) as response:
        memory_file = io.BytesIO(response.read())

    parser = PDFParser(memory_file)
    doc = PDFDocument()
    # The original author noted occasional warnings here for faulty PDFs.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    for page in doc.get_pages():
        interpreter.process_page(page)
        page_boxes = []
        for lt_obj in device.get_result():
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # Fetch the text once instead of three times per object.
                text = lt_obj.get_text()
                if text.strip():
                    page_boxes.append(text.splitlines())
        ret.append(page_boxes)
    return ret
def get_toc(self):
    """Return the normalised titles of the selected outline level.

    When the document has an info entry with a ``Title``, it is normalised
    and stored in ``self.title``. The level to keep is chosen by
    ``self.get_level1``.

    Returns:
        A list of ``normalize_toc_item(title)`` values for every outline
        entry at the selected level, or ``None`` when the outlines cannot
        be read or no level can be selected.
    """
    # Everything that reads the document stays inside the with-block, which
    # closes the file afterwards.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        # Title
        if doc.info:
            metadict = doc.info[0]
            if 'Title' in metadict:
                self.title = normalize_title(metadict['Title'])

        # Level 1 of the table of contents
        try:
            outlines = doc.get_outlines()
            select_level = self.get_level1(outlines)
        except Exception:
            return None

        # The outlines are requested again for the second pass, as in the
        # original.
        return [normalize_toc_item(title)
                for (level, title, dest, a, se) in doc.get_outlines()
                if level == select_level]
class PdfSerializer(object):
    """Extract the text of one PDF file and optionally save it as ``.txt``."""

    def __init__(self, filename):
        """Open *filename* and initialise the document with an empty password.

        The file handle is kept because ``getString`` requests the pages
        later; call ``close()`` to release it.
        """
        self.__filename = filename
        self.__fp = open(self.__filename, 'rb')
        try:
            parser = PDFParser(self.__fp)
            self.__doc = PDFDocument()
            parser.set_document(self.__doc)
            self.__doc.set_parser(parser)
            self.__doc.initialize('')
        except Exception:
            # Do not leak the handle when the document cannot be set up.
            self.__fp.close()
            raise

    def close(self):
        """Close the underlying PDF file."""
        self.__fp.close()

    def writeToTxt(self):
        """Write the extracted text next to the PDF with a ``.txt`` extension.

        Characters outside ASCII are replaced with ``?``.
        """
        import os

        text = self.getString()
        # Swap only the extension. The previous str.replace(".pdf", ".txt")
        # also rewrote directory names and, when the name had no lowercase
        # ".pdf", wrote the text over the PDF itself.
        txt_path = os.path.splitext(self.__filename)[0] + ".txt"
        with open(txt_path, "w") as txtFile:
            txtFile.write(text.encode('ascii', 'replace').decode("utf-8"))

    def getString(self):
        """Return the text of all pages produced by a ``TextConverter``."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every element that is text based (LTText).

    Args:
        path: path of the PDF file.
        j: job object whose ``iter_with_progress`` wraps the page loop.

    Returns:
        ``(pages, all_elements)``. ``pages`` holds one
        ``Page(width, height)`` per page. ``all_elements`` holds the
        elements created from the text boxes of all pages, each tagged with
        its page index in ``page`` and its position on that page in
        ``order``.
    """
    pages = []
    all_elements = []
    # The context manager closes the file once all pages are processed.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Dump selected parts of the PDF *fname* to *outfp*.

    Args:
        outfp: writable stream receiving the dump.
        fname: path of the PDF file.
        objids: object ids to dump; may be empty. Streams are written as
            raw data for ``codec == 'raw'``, as decoded data for
            ``codec == 'binary'``, and through ``dumpxml`` otherwise.
        pagenos: 0-based page indexes whose attributes are dumped.
        password: password passed to ``doc.initialize``.
        dumpall: when true, every object is dumped with ``dumpallobjs``.
        codec: see *objids*; also forwarded to the dump helpers.

    When nothing is selected the trailers are dumped. A trailing newline is
    written unless *codec* is ``'raw'`` or ``'binary'``.
    """
    doc = PDFDocument()
    # open() works on Python 2 and 3; the context manager closes the handle
    # even when a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                if isinstance(obj, PDFStream) and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif isinstance(obj, PDFStream) and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
class PDFController(object):
    """Hold a PDF file, its parsed layout and a simple text lookup."""

    def __init__(self, fd=None, password=''):
        """Create the pdfminer helpers and optionally open *fd*.

        Args:
            fd: file-like object or path; when truthy it is opened at once.
            password: password used later by ``parse``.
        """
        self.fd = fd
        self.password = password
        self.parsed = False
        self.document = PDFDocument()
        self.laparams = LAParams()
        self.rsrcmgr = PDFResourceManager()
        self.device = PDFPageAggregator(self.rsrcmgr, laparams=self.laparams)
        self.layout = []
        if fd:
            self.open(fd, password)

    def open(self, fd, password=''):
        """Remember *password* and use *fd*, opening it when it is a path."""
        self.password = password
        # PDFs are binary, so a path is opened with 'rb'.
        self.fd = fd if hasattr(fd, 'read') else open(fd, 'rb')

    def close(self):
        """Close the current file, if any, and reset the parsed flag."""
        fd = getattr(self, 'fd', None)
        if fd is not None:
            fd.close()
        self.fd = None
        self.parsed = False

    def parse(self):
        """Initialise the document and compute the layout once.

        Raises:
            PDFTextExtractionNotAllowed: if the document forbids
                extraction; the file is closed first.
        """
        parser = PDFParser(self.fd)
        parser.set_document(self.document)
        self.document.set_parser(parser)
        self.document.initialize(self.password)
        if not self.document.is_extractable:
            self.close()
            raise PDFTextExtractionNotAllowed
        if not self.layout:
            self.layout = self._get_layout()
        self.parsed = True

    def _get_layout(self):
        """Process every page and return the device result of the last one.

        NOTE(review): each page overwrites the previous result, so only the
        final page's layout is returned. ``lookup_term`` iterates that
        single layout, so this is left unchanged.
        """
        layout = []
        interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        for page in self.document.get_pages():
            interpreter.process_page(page)
            layout = self.device.get_result()
        return layout

    def lookup_term(self, term, ignore_case=True):
        """Return the indexes of layout items whose text contains *term*.

        Items without a ``get_text`` method are skipped. With *ignore_case*
        both the term and the item text are lower-cased before comparing.
        """
        needle = term.lower() if ignore_case else term
        indexes = []
        for i, item in enumerate(self.layout):
            if not hasattr(item, 'get_text'):
                continue
            text = item.get_text()
            if ignore_case:
                text = text.lower()
            if needle in text:
                indexes.append(i)
        return indexes

    def __del__(self):
        # Tolerate instances whose file was never opened or already closed.
        fd = getattr(self, 'fd', None)
        if fd is not None:
            fd.close()

    def __repr__(self):
        if self.fd:
            # File-like objects without a name fall back to the object.
            source = 'Open file "%s"' % (getattr(self.fd, 'name', self.fd),)
        else:
            source = 'No file opened'
        state = 'parsed' if self.parsed else 'not parsed'
        return '<PDFController> %s, %s' % (source, state)
def pdf2csv(fp):
    """Draw the rectangle/line grid of every page of a PDF into PNG files.

    For each page, the x and y extents of all ``LTRect`` / ``LTLine``
    objects are collected, de-duplicated, sorted and passed through
    ``filterclose``. Every cell between neighbouring coordinates that are
    more than 5 units apart is outlined on a 1-bit image, which is saved as
    ``out%s.png`` (``%s`` being the 0-based page index) in the current
    directory. The coordinate lists and the page size are printed.

    Args:
        fp: open file object of the PDF.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # No password is set, so an empty string is given.
    doc.initialize('')
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()

        # hlines collects x coordinates, vlines collects y coordinates
        # flipped so the origin is at the top of the image.
        hlines = []
        vlines = []
        for item in layout:
            if type(item) not in (LTRect, LTLine):
                continue
            hlines.append(int(item.x0))
            hlines.append(int(item.x1))
            vlines.append(int(layout.height - item.y0))
            vlines.append(int(layout.height - item.y1))
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        print(hlines)
        print(vlines)
        print((layout.width, layout.height))

        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        for top, bottom in zip(vlines, vlines[1:]):
            if bottom - top <= 5:
                continue
            for left, right in zip(hlines, hlines[1:]):
                if right - left <= 5:
                    continue
                draw.rectangle([(int(left), int(top)), (int(right), int(bottom))],
                               outline=1)
        del draw

        # A separate name keeps the input parameter fp from being rebound;
        # the context manager closes the PNG even if saving fails.
        with open("out%s.png" % pageno, 'wb') as png_file:
            im.save(png_file, "PNG")
def open_pdf(filename, password=''):
    """Open *filename* and return its initialised ``PDFDocument``.

    The file handle is not closed here; it stays attached to the parser of
    the returned document.

    Args:
        filename: path of the PDF file.
        password: password passed to ``initialize``.
    """
    pdf_file = open(filename, 'rb')
    pdf_parser = PDFParser(pdf_file)
    document = PDFDocument(caching=True)

    # Link both directions before initialising.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password)
    return document
def pdf2txt(pdf_file_name):
    """Return the text of all text boxes and lines in a PDF.

    The text of every ``LTTextBox`` / ``LTTextLine`` is concatenated across
    all pages. The previous code built the string from a single page only.

    Args:
        pdf_file_name: path of the PDF file.

    Returns:
        The extracted text, or ``None`` when the file cannot be opened or
        the document cannot be initialised. Both failures are logged.
    """
    # Open the PDF file in binary read mode.
    try:
        fp = open(pdf_file_name, 'rb')
    except Exception:
        # Log the failure and give up on this file.
        logging.info("WARNING found while opening the PDF file '" + pdf_file_name + "' of the format Textbox")
        logging.warning(traceback.format_exc())
        return

    # The context manager closes the file on every exit path below.
    with fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Supply the password here if the PDF is protected.
        try:
            doc.initialize('')
        except Exception:
            logging.info("WARNING found while opening the PDF file '" + pdf_file_name + "' of the format Textbox")
            logging.warning(traceback.format_exc())
            return

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        fragments = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Receive the LTPage object for the page.
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    fragments.append(lt_obj.get_text())

    # The text is handed back to the caller as one string.
    return "".join(fragments)
def GetTOC(self, doc, *args):
    """Return the outlines of the PDF at ``self.filepath``.

    The *doc* argument and *args* are accepted but not used: a fresh
    document is parsed from ``self.filepath`` with ``self.password``.

    NOTE(review): the file handle is left open, presumably because the
    outlines are read after this method returns. Confirm before adding a
    close.
    """
    pdf_file = open(self.filepath, 'rb')
    pdf_parser = PDFParser(pdf_file)
    document = PDFDocument()

    # Link both directions before initialising.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(self.password)
    return document.get_outlines()
def parse():
    """Append the text of every horizontal text box of a PDF to ``./1.txt``.

    The PDF is read from ``path``, which is resolved outside this function.
    Each text is also printed.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Open in binary read mode; the context manager closes the file.
    with open(path, 'rb') as fp:
        # Create a PDF parser from the file object.
        parser = PDFParser(fp)
        # Create a PDF document.
        doc = PDFDocument()
        # Connect the parser and the document object.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise; no password is supplied.
        doc.initialize()

        # Check whether the document allows text conversion.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout aggregator device.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Page interpreter.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once in append mode, not once per text box.
        with open(r'./1.txt', 'a') as f:
            # Process one page at a time.
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Receive the LTPage object of this page. It holds the
                # objects parsed from the page; only LTTextBoxHorizontal
                # items are written out.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
def readPDF(path, topath):
    """Append the text of every horizontal text box of a PDF to *topath*.

    Args:
        path: path of the PDF file.
        topath: path of the text file, opened in append mode.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Open the PDF in binary mode; the context manager closes it.
    with open(path, "rb") as pdf_handle:
        # Create a PDF parser.
        parser = PDFParser(pdf_handle)
        # Create a PDF document.
        pdfFile = PDFDocument()
        # Connect the parser and the document object in both directions.
        # The document-to-parser link was missing before.
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialise; no password is supplied.
        pdfFile.initialize()

        # Check whether the document allows text conversion.
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager (the misspelled class name raised NameError).
        manager = PDFResourceManager()
        # Layout aggregator device.
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        # Page interpreter.
        interpreter = PDFPageInterpreter(manager, device)

        # Open the output once; the name no longer shadows the input handle.
        with open(topath, "a") as out:
            # Process one page at a time.
            for page in pdfFile.get_pages():
                # The misspelled method name raised AttributeError.
                interpreter.process_page(page)
                # Handle the page layout.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        # Write the text of each box followed by a newline.
                        text = x.get_text()
                        out.write(text + "\n")
def pdfparse(url, name):
    """Download a PDF, save it in the working directory and return its text.

    The file is stored as ``<first dot-separated part of name>.pdf``. The
    HTTP session ``s`` and ``generate_user_agent`` are resolved outside
    this function.

    Args:
        url: URL of the PDF.
        name: name used to derive the local file name.

    Returns:
        The concatenated text of all layout objects that provide text, each
        stripped of leading and trailing newlines.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    res = s.get(url, headers={"user-agent": generate_user_agent()})
    # os.path.join uses the platform separator, not a hard-coded backslash.
    path1 = os.path.join(os.getcwd(), "%s.pdf" % name.split(".")[0])
    with open(path1, 'wb') as f:
        f.write(res.content)

    # The file stays open for the whole extraction. It was previously
    # closed before the document was initialised and its pages read.
    with open(path1, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout aggregator device.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Page interpreter.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pieces = []
        # Process one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Receive the LTPage object of this page.
            layout = device.get_result()
            for x in layout:
                # The original comment lists non-text objects (LTFigure,
                # LTImage) among the layout items; skip items that do not
                # provide get_text.
                get_text = getattr(x, 'get_text', None)
                if get_text is None:
                    continue
                results = get_text()
                if results:
                    pieces.append(results.strip('\n'))
    return ''.join(pieces)
def readPDF(self, path, callback=None, toPath=''):
    """Read the horizontal text boxes of a PDF and pass their text on.

    Only text is read; images are not extracted.

    Args:
        path: path of the PDF file.
        callback: optional callable invoked with the text of each box; when
            it is ``None`` the text is printed.
        toPath: when not empty, no text is delivered and a fixed message is
            printed for each box instead.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Open the PDF in binary read mode; the context manager closes it.
    with open(path, 'rb') as f:
        parser = PDFParser(f)         # create a PDF parser
        pdfFile = PDFDocument()       # create the PDF document
        parser.set_document(pdfFile)  # link the document to the parser
        pdfFile.set_parser(parser)    # link the parser to the document
        pdfFile.initialize('')        # supply the (empty) password

        # Check whether the document allows text conversion.
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager.
        manager = PDFResourceManager()
        # Layout aggregator device.
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        # Page interpreter.
        interpreter = PDFPageInterpreter(manager, device)

        # Process one page at a time.
        for page in pdfFile.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                # Only LTTextBoxHorizontal items are read.
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                if toPath == '':
                    # Deliver the text of each box.
                    text = x.get_text()
                    if callback is not None:
                        callback(text)
                    else:
                        print(text)
                else:
                    # File output branch: only a message is printed.
                    print('将PDF数据写入文件')
def noimgpdf_change_word(self, _path):
    """Return the text of an image-free PDF, local or online.

    Args:
        _path: path of a local PDF, or a URL containing ``http://www``,
            which is downloaded with a random ``User-Agent`` taken from
            ``self.user_agent``.

    Returns:
        The concatenated text of all ``LTTextBoxHorizontal`` objects, or
        ``None`` when anything fails. That includes a document that does
        not allow extraction.
    """
    import io

    try:
        if 'http://www' in _path:
            request = Request(
                url=_path,
                headers={'User-Agent': random.choice(self.user_agent)})
            response = urlopen(request)  # open the online PDF
            try:
                # The HTTP response is a forward-only stream; buffer it in
                # memory so the parser can seek within it.
                fp = io.BytesIO(response.read())
            finally:
                response.close()
        else:
            fp = open(_path, 'rb')  # open the local PDF

        try:
            praser_pdf = PDFParser(fp)
            doc = PDFDocument()
            praser_pdf.set_document(doc)
            doc.set_parser(praser_pdf)
            doc.initialize()
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            pieces = []
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for out in layout:
                    if isinstance(out, LTTextBoxHorizontal):
                        pieces.append(out.get_text())
            return ''.join(pieces)
        finally:
            fp.close()
    except Exception:
        # Deliberate best effort: any failure is reported as None.
        return None
def process(path):
    """Count category words in the text of the PDF at *path*.

    The text of every ``LTTextBoxHorizontal`` is lower-cased and split on
    whitespace. Each word is passed to ``count_word`` together with each of
    the word lists ``auditor``, ``currency``, ``datesand``, ``generic``,
    ``genericlong``, ``geographic`` and ``names``, which are resolved
    outside this function.

    Args:
        path: path of the PDF file.

    Returns:
        ``[aud, cur, dat, gen, genlong, geo, nam]``: the summed counts in
        the order of the word lists above.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    aud = cur = dat = gen = genlong = geo = nam = 0
    # The file stays open while pages are read. It was previously closed
    # right after initialisation, before any page was requested.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                for part in x.get_text().lower().split():
                    aud += count_word(part, auditor)
                    cur += count_word(part, currency)
                    dat += count_word(part, datesand)
                    gen += count_word(part, generic)
                    genlong += count_word(part, genericlong)
                    geo += count_word(part, geographic)
                    nam += count_word(part, names)
    return [aud, cur, dat, gen, genlong, geo, nam]
def parsePDF(pdfPath, pdfPwd='', imgFolderPath='/tmp', saveImgs=False):
    """Process the pages of a PDF and write their text to ``text.txt``.

    *imgFolderPath* is created when missing. When the document allows
    extraction, the items returned by ``parsePages`` are written to
    ``<imgFolderPath>/text.txt`` (UTF-8). Nothing is returned.

    Args:
        pdfPath: path of the PDF file.
        pdfPwd: password passed to ``doc.initialize``.
        imgFolderPath: output folder, also forwarded to ``parsePages``.
        saveImgs: forwarded to ``parsePages``.

    An ``IOError`` (for example a missing file) is ignored.
    """
    if not os.path.exists(imgFolderPath):
        os.makedirs(imgFolderPath)

    try:
        # Open the PDF; the context manager closes it on every exit path.
        with open(pdfPath, 'rb') as fp:
            # Create the parser and the document, then link them.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # Initialise with the password.
            doc.initialize(pdfPwd)

            if doc.is_extractable:
                # Process the document and store the text.
                text = parsePages(doc, imgFolderPath, saveImgs=saveImgs)
                with open('{0}/text.txt'.format(imgFolderPath), 'w', encoding='utf-8') as f:
                    f.writelines(text)
    except IOError:
        # Deliberate best effort: the file does not exist or cannot be read.
        pass
def parse(pdf_path, txt_path):
    """Append the text of every horizontal text box of a PDF to *txt_path*.

    Each text is also printed.

    Args:
        pdf_path: path of the PDF file.
        txt_path: path of the text file, opened in append mode.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction; a
            short message is printed first.
    """
    # The context manager closes the PDF on every exit path.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            print('走了')
            raise PDFTextExtractionNotAllowed

        mgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(mgr, device)

        # Open the output once in append mode, not once per text box.
        with open(txt_path, 'a') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + "\n")
def parse(path):
    """Append the text of a PDF's horizontal text boxes to ``./res/<name>.txt``.

    ``<name>`` is the base name of *path* without its extension. Each box's
    text is printed and appended (UTF-8) followed by a newline.

    Args:
        path: Path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # The output files are kept in the "res" directory, so that directory
    # must exist before this function is called.
    # The name is derived from the `path` argument rather than from
    # module-level state, so the function works for any caller.
    res_name = './res/' + os.path.splitext(os.path.basename(path))[0] + '.txt'

    with open(path, 'rb') as fp:
        # Create the parser and the document and link them to each other.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        # Check whether the document permits text conversion.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # Open the output once per page instead of once per text box.
            with open(res_name, 'a', encoding='utf-8') as out:
                for box in boxes:
                    results = box.get_text()
                    print(results)
                    out.write(results + '\n')
def parse(inpath, outpath):
    """Run ``parse_section`` on the layout of every page of a PDF.

    Before parsing, ``TMPDIR`` is removed and recreated and *outpath* is
    removed, using the file's ``remove`` helper.

    Args:
        inpath: Path of the PDF file to read.
        outpath: Output path; cleared first, then passed to ``parse_section``
            together with each page layout.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    remove(TMPDIR)  # clear the temporary directory
    os.mkdir(TMPDIR)
    remove(outpath)  # clear the output file

    # `with` releases the PDF handle on every exit path.
    with open(inpath, 'rb') as fp:
        parser = PDFParser(fp)  # PDF document parser
        doc = PDFDocument()  # PDF document object
        parser.set_document(doc)  # link parser and document
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:  # is text conversion permitted?
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for idx, page in enumerate(doc.get_pages()):
            interpreter.process_page(page)
            layout = device.get_result()
            print("parse", idx)
            parse_section(layout, outpath)
def pdf2txt(path):
    """Return the concatenated text of all text boxes and lines in a PDF.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The ``get_text()`` output of every ``LTTextBox`` / ``LTTextLine``
        found on the pages, joined in page order without separators.
    """
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        # Default layout parameters. The previous loop that copied overrides
        # out of locals() never found any of the option names there, so it
        # had no effect and was dropped.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        chunks = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    return ''.join(chunks)
def extract_text_from_pdf(self):
    """Extract the text of pages ``page_beg``..``page_end`` of the PDF.

    Reads ``self.filepath_in + '/' + self.nom_fichier``. When
    ``self.page_end`` is 0 it is set to ``self.page_beg`` so that a single
    page is extracted. Page numbers are 1-based.

    Returns:
        A string in which each page contributes the header
        ``"EXTRACTION DE LA PAGE <n>"`` plus a blank line, the text of its
        ``LTTextBox`` / ``LTTextLine`` objects, and a trailing newline.

    Raises:
        IndexError: If the requested range goes past the last page.
    """
    if self.page_end == 0:
        self.page_end = self.page_beg

    # `with` releases the PDF handle on every exit path.
    with open(self.filepath_in + '/' + self.nom_fichier, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # 2.0 by default: two chars closer than this are considered
        # contiguous and get grouped into one.
        laparams.char_margin = 4.0
        # 0.1 by default: a space is inserted when the distance between two
        # words is greater than this value.
        laparams.word_margin = 0.3
        # 0.5 by default: lines closer than this are grouped together.
        laparams.line_margin = 0.5
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = list(doc.get_pages())
        parts = []
        for i in range(self.page_beg - 1, self.page_end):
            page = pages[i]
            parts.append("EXTRACTION DE LA PAGE " + str(i + 1) + "\n\n")
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    parts.append(lt_obj.get_text())
            parts.append("\n")
    return ''.join(parts)
def parsePDF(pathPDF, pathText, fname):
    """Write the text of ``pathPDF/fname`` to a ``.txt`` file in *pathText*.

    The output file has the same base name as *fname* with the extension
    replaced by ``.txt``. It is created (or truncated) before the PDF is
    opened. After each page a line of 100 ``=`` characters is written.

    Args:
        pathPDF: Directory containing the PDF.
        pathText: Directory that receives the text file.
        fname: File name of the PDF.
    """
    # splitext replaces the extension whatever its length, where slicing off
    # the last four characters only worked for three-letter extensions.
    out_path = os.path.splitext(os.path.join(pathText, fname))[0] + '.txt'

    # Both handles are closed on exit, so the output is always flushed.
    with open(out_path, 'w', encoding='utf-8') as outfile, \
            open(os.path.join(pathPDF, fname), 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    outfile.write(lt_obj.get_text())
            outfile.write('=' * 100 + '\n')
def parsePDFByURLandTokenize_PDFMiner(url):
    """Download a PDF, extract its text with pdfminer and tokenize it.

    Args:
        url: URL of the PDF document.

    Returns:
        The result of ``word_tokenize`` applied to the concatenated text of
        all ``LTTextBox`` / ``LTTextLine`` objects, or None if the download
        produced no data object.
    """
    # `with` closes the HTTP response once the body has been read.
    with urllib.request.urlopen(url) as response:
        data = response.read()
    if data is None:
        return None

    parser = PDFParser(io.BytesIO(data))
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')

    rsrcmgr = PDFResourceManager()
    # Default layout analysis parameters, so text is extracted with
    # whitespace. The previous loop that copied overrides out of locals()
    # never found any of the option names there and was dropped.
    laparams = pdfminer.layout.LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    chunks = []
    for page in doc.get_pages():
        interpreter.process_page(page)
        for lt_obj in device.get_result():
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                chunks.append(lt_obj.get_text())
    return word_tokenize(''.join(chunks))
def Pdf_generation_TF(f, qaStatus=False):
    """Extract words (or raw sentences) from a PDF and compute term frequency.

    Args:
        f: Open binary file object of the PDF.
        qaStatus: When true, return the extracted sentence fragments instead
            of the term-frequency result.

    Returns:
        If *qaStatus* is true, the list of fragments obtained by splitting a
        slice of each text object's ``str()`` form on '.'. Otherwise the
        result of ``calc_TF(words, number_of_words)``, where the words are
        the whitespace-separated tokens of every text object.
    """
    parser = PDFParser(f)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    all_sentences = []
    native_words = []
    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        for lt_obj in device.get_result():
            if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                continue
            # get_text() is read once; the former discarded .encode("utf8")
            # call had no effect and was removed.
            native_words.extend(lt_obj.get_text().split())

            raw_sentence = str(lt_obj)
            # Cut at the last literal backslash-n of the string form.
            end_point = raw_sentence.rfind("\\n") - 1
            # NOTE(review): the fixed offset 55 presumably skips the object
            # header of the string form - confirm against pdfminer's repr.
            raw_sentence = raw_sentence[55:end_point]
            raw_sentence = raw_sentence.replace('\\n', '').replace('\\s', '')
            all_sentences.extend(raw_sentence.split('.'))

    if qaStatus:
        print('from PDF ')
        return all_sentences
    # The word count always equals the number of collected words.
    return calc_TF(native_words, len(native_words))
def _make_pages(self, fp):
    """Build the pdfminer objects needed to process the pages of *fp*.

    Args:
        fp: Open binary file object of the PDF.

    Returns:
        A tuple ``(device, interpreter, pages, rsrcmgr)``: the page
        aggregator, the page interpreter bound to it, the list of all pages
        of the document, and the shared resource manager.
    """
    # Parser bound to the file, linked both ways with the document that
    # stores the document structure.
    parser = PDFParser(fp)
    document = PDFDocument()
    parser.set_document(document)
    document.set_parser(parser)
    # Initialize with an empty password.
    document.initialize("")

    # Resource manager shared by the aggregator and the interpreter.
    rsrcmgr = PDFResourceManager()
    # Page aggregator using default layout-analysis parameters.
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return device, interpreter, list(document.get_pages()), rsrcmgr
def getTextFromFirstPage(filename):
    """Return the text found on the first page of a PDF.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The concatenated ``get_text()`` output of every ``LTTextBox`` /
        ``LTTextLine`` on the first page; '' if the document has no pages.
    """
    chunks = []
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            chunks.extend(
                lt_obj.get_text()
                for lt_obj in device.get_result()
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
            break  # only the first page is wanted
    return ''.join(chunks)
def convert_pdf_2_text(path, name):
    """Append the horizontal text boxes of ``path + name`` to a ``.txt`` file.

    The output file is ``path + name[:-4] + ".txt"``. Each box's text is
    printed and appended followed by a newline.

    Args:
        path: Directory prefix; concatenated directly with *name*, so it
            must end with a path separator.
        name: File name of the PDF; its last four characters are replaced
            by ".txt" to build the output name.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    txt_path = path + name[:-4] + ".txt"
    # The PDF handle used to be created inline and never closed.
    with open(path + name, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # Open the output once per page instead of once per text box.
            with open(txt_path, 'a') as out:
                for box in boxes:
                    results = box.get_text()
                    print(results)
                    out.write(results + "\n")
def parse(pdf_path=None,
          out_path='/Users/liamtheron/Desktop/Deloiite/test.txt'):
    """Append the text of a PDF's horizontal text boxes to a text file.

    Called without arguments it behaves as before: the PDF is taken from the
    module-level ``path`` variable and the text is appended to the
    historical hard-coded output file.

    Args:
        pdf_path: Path of the PDF to read; defaults to the module-level
            ``path``.
        out_path: Text file to append to. Each box's text is followed by a
            newline.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    if pdf_path is None:
        pdf_path = path  # backward-compatible fallback to the global

    # `with` releases the PDF handle on every exit path.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # Open the output once per page instead of once per text box.
            with open(out_path, 'a') as out:
                for box in boxes:
                    out.write(box.get_text())
                    out.write('\n')
def parse(file_name, target_name):
    """Translate the text of a PDF's horizontal text boxes into a text file.

    For every page the 1-based page number is printed. The text of each
    ``LTTextBoxHorizontal`` is passed through ``translate`` and appended to
    *target_name* followed by a newline.

    Args:
        file_name: Path of the PDF file to read.
        target_name: Path of the text file to append to.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # `with` releases the PDF handle on every exit path.
    with open(file_name, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_number, page in enumerate(doc.get_pages(), start=1):
            print('page: ' + str(page_number))
            interpreter.process_page(page)
            # The layout is an LTPage holding the objects parsed from this
            # page (typically LTTextBox, LTFigure, LTImage,
            # LTTextBoxHorizontal, ...); only horizontal text boxes are used.
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # Open the output once per page instead of once per text box.
            with open(target_name, 'a') as out:
                for box in boxes:
                    translate_text = translate(box.get_text())
                    out.write(translate_text + '\n')
def readPDF(path, topath):
    """Append the text of a PDF's horizontal text boxes to a text file.

    Args:
        path: Path of the PDF file to read.
        topath: Path of the text file to append to. Each box's text is
            followed by a newline.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Open the PDF in binary mode; `with` closes it on every exit path.
    with open(path, 'rb') as pdf_handle:
        # Create the parser and the document and link them to each other.
        parser = PDFParser(pdf_handle)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialize without a password.
        pdfFile.initialize()
        # Check whether the document permits text conversion.
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        device = PDFPageAggregator(manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)
        # Process the document one page at a time.
        for page in pdfFile.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # The body used to reference `toPath`, which is not the
            # parameter name; write to the `topath` argument.
            with open(topath, 'a') as out:
                for box in boxes:
                    out.write(box.get_text() + "\n")
def readPDF(path, toPath):
    """Append the text of a PDF's horizontal text boxes to a UTF-8 text file.

    Args:
        path: Path of the PDF file to read.
        toPath: Path of the text file to append to. Each box's text is
            followed by a newline.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Open the PDF in binary mode.
    with open(path, "rb") as pdf_handle:
        # Parser and document, linked to each other.
        parser = PDFParser(pdf_handle)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialize without a password.
        pdfFile.initialize()

        # Refuse documents that do not permit text conversion.
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, aggregating device and interpreter.
        manager = PDFResourceManager()
        device = PDFPageAggregator(manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)

        # Handle one page per iteration.
        for page in pdfFile.get_pages():
            interpreter.process_page(page)
            for element in device.get_result():
                if not isinstance(element, LTTextBoxHorizontal):
                    continue
                with open(toPath, 'a', encoding='utf-8') as out:
                    out.write(element.get_text() + "\n")
def parse(read_path):
    """Open a PDF and build the objects needed to process its pages.

    Args:
        read_path: Path of the PDF file to read.

    Returns:
        A tuple ``(doc, interpreter, device)``. The underlying file is left
        open on success because it is not closed here and the returned
        objects are meant to be used by the caller afterwards.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    fp = open(read_path, 'rb')  # binary read mode
    try:
        # Create a parser for the file and a document, then link them.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize without a password.
        doc.initialize()
        # Check whether the document permits text conversion.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources, aggregating device and
        # page interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
    except Exception:
        # Nothing is handed back to the caller on failure, so the handle
        # would otherwise leak.
        fp.close()
        raise
    return doc, interpreter, device
def pdf_to_string(pdf_file):
    """Return the concatenated text of all text boxes and lines in a PDF.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The ``get_text()`` output of every ``LTTextBox`` / ``LTTextLine``,
        joined in page order without separators.
    """
    chunks = []
    # `with` releases the PDF handle, which was previously never closed.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.line_margin = 0.3
        laparams.word_margin = 0.3
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    return ''.join(chunks)
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None):
    """Write the outline (bookmark) entries of a PDF to *outfp*.

    One line is written per outline entry: the ``repr`` of the tuple
    ``(level, title, dest, pageno)``, where ``pageno`` is the 0-based index
    of the destination page, or None when the entry has no destination.

    Args:
        outfp: Writable text stream receiving the output.
        fname: Path of the PDF file to read.
        objids: Unused by this function.
        pagenos: Unused by this function.
        password: Password used to initialize the document.
        dumpall: Unused by this function.
        codec: Unused by this function.
    """
    # open() replaces the Python-2-only file() builtin; `with` and `finally`
    # make sure the file and the parser are closed even on errors.
    with open(fname, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            # Map each page's object id to its position in the document.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    # Resolve a named destination to its definition.
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
def parse_pdf(path):
    """Return the value that follows the "准考证号:" label in a PDF.

    The horizontal text boxes are scanned in page order. For the first box
    whose first five characters equal the label, the remaining text is
    returned with all newlines removed.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The text after the label, or None if no box starts with it.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # `with` releases the PDF handle on every exit path, including the
    # early return below.
    with open(path, 'rb') as fp:  # binary read mode
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text()
                if results[:5] == "准考证号:":
                    return results[5:].replace("\n", "")
    return None
def load_file_text(self, import_file):
    """Import one odt, docx, epub, pdf, html/htm or plain-text file.

    The text is extracted according to the file extension. If that yields
    nothing, the file is read as plain text. On success a new row is
    inserted into the ``source`` table, the entry is appended to
    ``self.source`` and a message is appended to ``self.parent_textEdit``.
    Failures and duplicate file names are reported through a warning dialog
    and nothing is imported.

    Args:
        import_file: Path of the file to import; the part after the last
            '/' is used as the source name.

    Returns:
        None.
    """
    text = ""
    lower_name = import_file.lower()

    # Import from odt
    if lower_name.endswith(".odt"):
        text = self.convert_odt_to_text(import_file)

    # Import from docx
    if lower_name.endswith(".docx"):
        document = opendocx(import_file)
        text = "\n".join(getdocumenttext(document))

    # Import from epub
    if lower_name.endswith(".epub"):
        book = epub.read_epub(import_file)
        for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            body = d.get_body_content().decode('utf-8')
            text += html_to_text(body) + "\n"

    # Import from pdf
    if lower_name.endswith('.pdf'):
        # `with` closes the PDF, which was previously left open.
        with open(import_file, 'rb') as fp:  # read binary mode
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # potential error with encrypted PDF
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            chunks = []
            for page in doc.get_pages():
                interpreter.process_page(page)
                for lt_obj in device.get_result():
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        chunks.append(lt_obj.get_text())
            text += "".join(chunks)

    # Import from html
    if lower_name.endswith((".html", ".htm")):
        with open(import_file, "r") as sourcefile:
            text = html_to_text(sourcefile.read())
        # No per-line errors are tracked for HTML, so the former
        # unconditional "0 lines not imported" warning was removed.

    # Try importing as a plain text file.
    if text == "":
        try:
            # Appending a line to a string cannot fail, so the per-line
            # error counter that used to live here never fired; read errors
            # are reported by the handler below.
            with open(import_file, "r") as sourcefile:
                text = sourcefile.read()
        except Exception as e:
            QtWidgets.QMessageBox.warning(None, 'Warning',
                "Cannot import " + str(import_file) + "\n" + str(e))
            return
        # Strip a leading byte-order mark (associated with notepad files).
        # The BOM is a single character once decoded.
        if text.startswith("\ufeff"):
            text = text[1:]

    # Import of text file did not work. No exception object exists at this
    # point, so only the file name is reported.
    if text == "":
        QtWidgets.QMessageBox.warning(None, 'Warning',
            "Cannot import " + str(import_file))
        return

    # Final checks: check for duplicated filename and update model, widget
    # and database.
    filename = import_file.split("/")[-1]
    if any(d['name'] == filename for d in self.source):
        QtWidgets.QMessageBox.warning(None, 'Duplicate file',
            "Duplicate filename.\nFile not imported")
        return

    entry = {'name': filename, 'id': -1, 'fulltext': text, 'mediapath': None,
             'memo': "", 'owner': self.settings['codername'],
             'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
    cur = self.settings['conn'].cursor()
    cur.execute("insert into source(name,fulltext,mediapath,memo,owner,date) values(?,?,?,?,?,?)",
        (entry['name'], entry['fulltext'], entry['mediapath'], entry['memo'],
         entry['owner'], entry['date']))
    self.settings['conn'].commit()
    cur.execute("select last_insert_rowid()")
    entry['id'] = cur.fetchone()[0]
    self.parent_textEdit.append(entry['name'] + " imported.")
    self.source.append(entry)
'''
Read a PDF document with pdfminer3k
'''
import os

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams

IN_PUT_PATH = '肖申克的救赎.pdf'
OUT_PUT = '肖申克的救赎.txt'

# Open the PDF once; `with` closes it when the script is done.
with open(IN_PUT_PATH, 'rb') as fp:
    parser = PDFParser(fp)  # parser bound to the PDF file
    doc = PDFDocument()  # document object that stores the structure
    parser.set_document(doc)  # link parser and document
    doc.set_parser(parser)
    doc.initialize('')  # initialize the document without a password

    resource = PDFResourceManager()  # stores shared resources
    laparam = LAParams()  # layout analysis parameters
    device = PDFPageAggregator(resource, laparams=laparam)  # page aggregator
    interpreter = PDFPageInterpreter(resource, device)  # page interpreter

    # The output used to be reopened for every text object and never
    # closed; it is now opened once and closed when the script is done.
    with open(OUT_PUT, 'a+') as f:
        for page in doc.get_pages():  # iterate over the document's pages
            interpreter.process_page(page)  # interpret the page content
            layout = device.get_result()  # fetch the aggregated layout
            for out in layout:
                if hasattr(out, 'get_text'):
                    print(out.get_text())
                    f.write(out.get_text())
def parsePdf(fp): '''解析PDF文本,并保存到TXT文件中''' # text_path = "./file02.pdf" # r = preview_pdf(s, contractNo=contractNo) # # try: # if r.status_code == 404 : # print("404") # return "文件格式为非pdf" # elif r.status_code == 500: # print("500") # return "文件格式同非pdf格式" # # except Exception as message: # # with open(text_path, mode="wb+") as f: # f.write(r.content) # f.close() # # return text_path # fp = open(real_path, 'rb') # 用文件对象创建一个PDF文档分析器 parser = PDFParser(fp) # 创建一个PDF文档 doc = PDFDocument() # 连接分析器,与文档对象 parser.set_document(doc) doc.set_parser(parser) #提供初始化密码,如果没有密码,就创建一个空的字符串 doc.initialize() #检测文档是否提供txt转换,不提供就忽略 if not doc.is_extractable: raise PDFTextExtractionNotAllowed else: #创建PDF,资源管理器,来共享资源 rsrcmgr = PDFResourceManager() #创建一个PDF设备对象 laparams = LAParams() device = PDFPageAggregator(rsrcmgr, laparams=laparams) #创建一个PDF解释其对象 interpreter = PDFPageInterpreter(rsrcmgr, device) #循环遍历列表,每次处理一个page内容 # doc.get_pages() 获取page列表 for page in doc.get_pages(): interpreter.process_page(page) #接受该页面的LTPage对象 layout = device.get_result() # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象 # 一般包括LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等等 # 想要获取文本就获得对象的text属性, for x in layout: if (isinstance(x, LTTextBoxHorizontal)): # with open(r'2.txt','a') as f: results = x.get_text() print(results) # f.write(results +"\n") print(type(results)) print("ces") return results
def read_pdf(resume_file_path, txt_output_path):
    """Extract, clean and store the horizontal text boxes of a PDF.

    An existing *txt_output_path* is deleted first. The text of each
    ``LTTextBoxHorizontal`` is passed through ``bad_code_clean`` and
    ``space_process``, then handed to ``recursive_process_UnicodeEncodeError``
    together with the output file opened in append mode, and the result is
    printed. If interpreting a page raises ``KeyError``, the error is
    printed, ``read_failed_logs`` is called and processing stops.

    Args:
        resume_file_path: Path of the PDF file to read.
        txt_output_path: Path of the text file handed to the writer helper.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # `with` closes the PDF on every exit path; it used to stay open when
    # an exception escaped.
    with open(resume_file_path, 'rb') as resume_file:  # binary read mode
        # Create a parser for the file and a document, then link them.
        parser = PDFParser(resume_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize without a password.
        doc.initialize()
        # Check whether the document permits text conversion.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources, aggregating device and
        # page interpreter.
        resource_manager = PDFResourceManager()
        device = PDFPageAggregator(resource_manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, device)

        # Start from a clean output file.
        if os.path.exists(txt_output_path):
            os.remove(txt_output_path)
            print('exist and remove')

        # Handle one page per iteration.
        for page in doc.get_pages():
            try:
                interpreter.process_page(page)
            except KeyError:
                error_info_string = str(sys.exc_info())
                print(resume_file_path + ":" + error_info_string)
                read_failed_logs(resume_file_path)
                break
            # The layout is an LTPage holding the objects parsed from this
            # page (LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ...).
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                with open(txt_output_path, 'a') as f:
                    results = x.get_text()
                    all_lines_string = bad_code_collection_read()
                    results = bad_code_clean(all_lines_string, results)
                    results = space_process(results)
                    results = recursive_process_UnicodeEncodeError(f, results)
                    print(results)