def getPageLayouts(f1):
    '''Extract the layout object of every page of a PDF.

    NOTE(review): the argument ``f1`` is never read. The PDF is opened from
    the name ``fpath`` and initialised with the password ``pss_wd``; neither
    is defined inside this function -- presumably module-level names, confirm
    against the rest of the file.

    Returns a list with one ``device.get_result()`` value per page. The list
    is empty when the document does not allow text extraction.

    Raises IOError if the file cannot be opened or read.
    '''
    # Bound up-front so the final return is valid even when the document is
    # not extractable.
    page_layouts = []
    try:
        # The parser and doc pair act as a "pipe" of sorts.
        with open(fpath, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)
            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Pages are processed while the file is still open because
                # they appear to be parsed on demand rather than up-front.
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        # The ``with`` block has already closed the file; no extra close()
        # is attempted so nothing can mask this error.
        raise IOError("issue with loading file, please try again")
    return page_layouts
class Pdf(object):
    """Wrapper around a pdfminer document built from an open PDF file."""

    def __init__(self, pdf_file):
        """Attach a parser to ``pdf_file`` and initialise the document.

        ``pdf_file`` is handed straight to ``PDFParser``.
        """
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # initialize() has to be *called*, and only once the parser is
        # attached; an empty password is supplied.
        self._doc.initialize('')

    @property
    def pages(self):
        """Number of pages yielded by the document."""
        return sum(1 for _ in self._doc.get_pages())

    def to_text(self):
        """Run every page through a ``TextConverter`` and return the text.

        The converter output is decoded as UTF-8, ignoring undecodable bytes.
        """
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
            interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
def pdf_to_text(filename):
    """Convert the PDF at ``filename`` to text with page markers.

    Each page's output is wrapped in ``START PAGE n`` / ``END PAGE n`` lines
    (``n`` counted from zero). The page count is printed to stdout.

    Returns the accumulated converter output.
    """
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = TextConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    doc = PDFDocument()
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        interpreter = PDFPageInterpreter(rsrc, device)
        # Collect the pages once so counting them does not require a second
        # pass over the document.
        pages = list(doc.get_pages())
        print("There are: " + str(len(pages)) + " pages")
        for i, page in enumerate(pages):
            outfp.write("START PAGE %d\n" % i)
            if page is not None:
                interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)
        device.close()
    finally:
        # Close the PDF even when parsing or conversion fails.
        fp.close()
    return outfp.getvalue()
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
            codec=None):
    """Dump selected parts of the PDF ``fname`` to ``outfp``.

    * ``objids``: each object is written as raw stream data
      (``codec == 'raw'``), decoded stream data (``codec == 'binary'``) or
      via ``dumpxml``.
    * ``pagenos``: the attributes of each listed page (zero-based position
      in ``doc.get_pages()``) are written via ``dumpxml``.
    * ``dumpall``: every object is written via ``dumpallobjs``.
    * none of the above: the trailers are written via ``dumptrailers``.

    A trailing newline is written unless ``codec`` is 'raw' or 'binary'.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        # Constructing the parser is what links ``doc`` to the file here;
        # the parser object itself is not used afterwards.
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                is_stream = isinstance(obj, PDFStream)
                if is_stream and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif is_stream and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        # Release the file even if dumping raises.
        fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def load(self, open_file):
    """Populate ``self.text`` from the PDF in ``open_file``.

    ``self.fields`` and ``self.text`` are reset to empty dicts first. For
    each page, ``self._build_annotations(page)`` is called when the page has
    annotations, and the value of ``self._get_text(device)`` is stored in
    ``self.text`` under the 1-based page number.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    self.fields = {}
    self.text = {}

    # Wire the parser and the document together; an empty password is used.
    pdf_parser = PDFParser(open_file)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')

    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Shared resources, layout parameters, aggregator and interpreter.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    for page_number, pdf_page in enumerate(document.get_pages(), 1):
        page_interpreter.process_page(pdf_page)
        if pdf_page.annots:
            self._build_annotations(pdf_page)
        self.text[page_number] = self._get_text(aggregator)
def parse(self, path):
    """Convert the PDF at ``path`` to text and wrap it in a ``Sample``.

    Returns ``Sample(path, None, text)`` where ``text`` is the converter
    output.

    Raises NotImplementedError when ``path`` is a directory.
    """
    # Directory
    if os.path.isdir(path):
        raise NotImplementedError()

    # File
    out = StringIO.StringIO()
    rsrc = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0
    device = TextConverter(rsrc, out, codec=codec, laparams=laparams)

    # PDFs are binary data: open in 'rb' and always close the handle.
    with open(path, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        interpreter = PDFPageInterpreter(rsrc, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
        device.close()

    sample = Sample(path, None, out.getvalue())
    out.close()
    return sample
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
            codec=None):
    """Dump selected parts of the PDF ``fname`` to ``outfp``.

    * ``objids``: each object is written via ``dumpxml``.
    * ``pagenos``: for each listed page (zero-based position in
      ``doc.get_pages()``), the page's content streams are written when
      ``codec`` is set, otherwise the page attributes.
    * ``dumpall``: every object is written via ``dumpallobjs``.
    * none of the above: the trailers are written via ``dumptrailers``.

    A trailing newline is written unless ``codec`` is 'raw' or 'binary'.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        # Release the file even if dumping raises.
        fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def extractContent(file):
    """Convert the PDF at path ``file`` to text and return it.

    Progress messages ("extractContent", "page=<n>", "EOF") are printed to
    stdout.

    NOTE(review): ``doc.initialize()`` is not called and the extractability
    check is disabled; left as found -- confirm this is intentional.
    """
    print("extractContent")
    fp = open(file, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        rsrcmgr = PDFResourceManager()
        codec = 'UTF-8'
        laparams = LAParams()
        outfp = StringIO.StringIO()
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for i, page in enumerate(doc.get_pages()):
            print("page=" + str(i))
            if page is not None:
                interpreter.process_page(page)
        print("EOF")
        device.close()
    finally:
        # Close the PDF even when a page fails to convert.
        fp.close()
    return outfp.getvalue()
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally save it as ``.txt``."""

    def __init__(self, filename):
        """Open ``filename`` and initialise the document with an empty password.

        The file stays open because pages are read later, in ``getString``;
        call ``close()`` when done.
        """
        self.__filename = filename
        self.__fp = open(self.__filename, 'rb')
        try:
            parser = PDFParser(self.__fp)
            self.__doc = PDFDocument()
            parser.set_document(self.__doc)
            self.__doc.set_parser(parser)
            self.__doc.initialize('')
        except Exception:
            # Do not leak the handle when the PDF cannot be set up.
            self.close()
            raise

    def close(self):
        """Close the underlying PDF file; safe to call more than once."""
        if self.__fp is not None:
            self.__fp.close()
            self.__fp = None

    def writeToTxt(self):
        """Write the extracted text next to the PDF, with a ``.txt`` extension.

        Only the final extension is swapped, so directory names containing
        ".pdf" are untouched and a file without a lowercase ".pdf" extension
        is never overwritten by its own text.
        """
        import os

        text = self.getString()
        txt_path = os.path.splitext(self.__filename)[0] + ".txt"
        with open(txt_path, "w") as txtFile:
            txtFile.write(text.encode('ascii', 'replace').decode("utf-8"))

    def getString(self):
        """Run every page through a ``TextConverter`` and return its output."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path``.

    The document is initialised with an empty password and every page is
    run through a ``TextConverter`` writing into an in-memory buffer.
    """
    resources = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resources, text_buffer, laparams=LAParams())

    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        page_interpreter = PDFPageInterpreter(resources, converter)
        # Process each page contained in the document.
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
        extracted = text_buffer.getvalue()

    converter.close()
    text_buffer.close()
    return extracted
def ParseAllPages(self, filepath):
    """Run every page of the PDF at ``filepath`` through an interpreter.

    ``self.filepath`` is set to ``filepath``. Pages are processed with a
    plain ``PDFDevice``; nothing is returned.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    self.filepath = filepath
    # The ``with`` block guarantees the file is closed, including on error.
    with open(filepath, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument()
        # Connect the parser and document objects.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        password = ""
        doc.initialize(password)
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
def parse_pdf(pdf_url):
    """Download the PDF at ``pdf_url`` and return its text, page by page.

    Returns a list with one entry per page. Each entry is a list holding,
    for every non-blank text box or text line on that page, the list of
    lines from ``get_text().splitlines()``.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(pdf_url) as response:
        remote_file = response.read()
    memory_file = io.BytesIO(remote_file)

    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)  # Warning sometimes, error in pdf?
    doc.set_parser(parser)
    doc.initialize('')

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for page in doc.get_pages():
        page_lines = []
        ret.append(page_lines)
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                text = lt_obj.get_text()
                if text.strip():
                    page_lines.append(text.splitlines())
    return ret
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None):
    """Write the outline of the PDF ``fname`` to ``outfp``.

    One line is written per outline entry: the ``repr`` of
    ``(level, title, dest, pageno)``, where ``pageno`` is the zero-based
    position of the target page or ``None`` when no target was resolved.

    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    parser = None
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        # Map each page's object id to its position in the document.
        pages = dict((page.pageid, pageno) for (pageno, page)
                     in enumerate(doc.get_pages()))
        for (level, title, dest, a, se) in doc.get_outlines():
            pageno = None
            if dest:
                dest = resolve1(doc.lookup_name('Dests', dest))
                if isinstance(dest, dict):
                    dest = dest['D']
                pageno = pages[dest[0].objid]
            elif a:
                action = a.resolve()
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                        dest = action['D']
                        pageno = pages[dest[0].objid]
            outfp.write(repr((level, title, dest, pageno)) + '\n')
    finally:
        # Release the parser and file even when the outline cannot be read.
        if parser is not None:
            parser.close()
        fp.close()
    return
def pdf_to_csv(filename):
    """Run the PDF at ``filename`` through ``CsvConverter``.

    Each page's output is wrapped in ``START PAGE n`` / ``END PAGE n`` lines
    (``n`` counted from zero). Returns the accumulated converter output.
    """
    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because the test documents are utf-8 (it is also the default).
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    doc = PDFDocument()
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)
        for i, page in enumerate(doc.get_pages()):
            outfp.write("START PAGE %d\n" % i)
            if page is not None:
                interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)
        device.close()
    finally:
        # Close the PDF even when conversion fails.
        fp.close()
    return outfp.getvalue()
def read_invoice_pdfminer3k(pdfFile):
    """Extract client and invoice number for one invoice and record them.

    The PDF ``pdfFile`` is read from the directory ``invoice_path``. The
    client is extracted from the PDF text, the invoice number from the file
    name; both are printed and passed to ``write_excel``.

    ``invoice_path``, ``client_start``/``client_end``,
    ``invoice_start``/``invoice_end``, ``extract_info`` and ``write_excel``
    are defined outside this function.
    """
    text_parts = []
    # Let os.path.join pick the separator instead of hard-coding "\\".
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
    invoice_text = "".join(text_parts)

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
def parse_pdf_pdfminer(self, f, fpath):
    """Convert each page of the PDF ``f`` to text and hand it to ``parse_page``.

    ``self.handler.print_header(fpath)`` is called before the first page and
    ``self.handler.print_footer(fpath)`` after the last. Each page's text is
    passed to ``self.parse_page`` as UTF-8 bytes with a 1-based page number.
    Any exception other than KeyboardInterrupt/SystemExit is reported via
    ``self.handler.print_error`` instead of propagating.
    """
    try:
        layout_params = LAParams()
        layout_params.all_texts = True
        resources = PDFResourceManager()

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)

        pdf_parser = PDFParser(f)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)

        for page_num, pdf_page in enumerate(document.get_pages(), 1):
            # A fresh buffer, converter and interpreter are used per page.
            page_buffer = StringIO()
            converter = TextConverter(resources, page_buffer,
                                      laparams=layout_params)
            PDFPageInterpreter(resources, converter).process_page(pdf_page)
            page_text = page_buffer.getvalue()
            self.parse_page(fpath, bytes(page_text, 'UTF-8'), page_num)
            page_buffer.close()

        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
def extract_text_elements_from_pdf(path, j=nulljob):
    """Opens a PDF and extract every element that is text based (LTText).

    ``j`` is a job object whose ``iter_with_progress`` wraps the page
    iteration.

    Returns ``(pages, all_elements)``: one ``Page(width, height)`` per PDF
    page, and every element with its ``page`` (zero-based page number) and
    ``order`` (index within its page) attributes set.
    """
    pages = []
    all_elements = []
    # The ``with`` block guarantees the file is closed, including on error.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5,
                            heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Run the pages of the PDF in ``fp`` through ``device``.

    Args:
        rsrcmgr: resource manager passed to the page interpreter.
        device: device that receives the interpreted pages.
        fp: open PDF file object.
        pagenums: optional collection of zero-based page numbers to process;
            when empty/None every page is eligible.
        maxpages: stop after processing a page whose 1-based number is
            greater than or equal to this value (0/None disables the limit).
        password: document password (empty string if none).

    Returns:
        dict mapping zero-based page number to page object, for all pages.

    Raises:
        PDFTextExtractionNotAllowed: the document forbids text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed(
            'Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    pages = dict(enumerate(doc.get_pages()))
    # Walk the pages in ascending order explicitly: the maxpages cut-off
    # below only makes sense if pages are visited in document order, which
    # plain dict iteration does not promise.
    for num in sorted(pages):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(pages[num])
        if maxpages and maxpages <= num + 1:
            break
    return pages
class PDFController(object):
    """Holds a PDF file, parses it on demand and searches its layout."""

    def __init__(self, fd=None, password=''):
        """Create the pdfminer helpers and, if ``fd`` is given, open it.

        ``fd`` may be a file-like object (anything with ``read``) or a path.
        """
        self.fd = fd
        self.password = password
        self.parsed = False
        self.document = PDFDocument()
        self.laparams = LAParams()
        self.rsrcmgr = PDFResourceManager()
        self.device = PDFPageAggregator(self.rsrcmgr, laparams=self.laparams)
        self.layout = []
        if fd:
            self.open(fd, password)

    def open(self, fd, password=''):
        """Remember ``password`` and the file; paths are opened for reading."""
        self.password = password
        # PDFs are binary data, so paths are opened in 'rb'.
        self.fd = fd if hasattr(fd, 'read') else open(fd, 'rb')

    def close(self):
        """Close the file (if any) and mark the controller as not parsed."""
        if self.fd:
            self.fd.close()
            self.fd = None
        self.parsed = False

    def parse(self):
        """Initialise the document and compute the layout if not yet set.

        Raises PDFTextExtractionNotAllowed (after closing the file) when the
        document forbids extraction.
        """
        parser = PDFParser(self.fd)
        parser.set_document(self.document)
        self.document.set_parser(parser)
        self.document.initialize(self.password)
        if not self.document.is_extractable:
            self.fd.close()
            raise PDFTextExtractionNotAllowed
        if not self.layout:
            self.layout = self._get_layout()
        self.parsed = True

    def _get_layout(self):
        """Process every page and return the result for the last one.

        Returns an empty list when the document has no pages.
        NOTE(review): earlier pages' layouts are discarded -- confirm that
        only the last page is wanted.
        """
        layout = []
        interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        for page in self.document.get_pages():
            interpreter.process_page(page)
            layout = self.device.get_result()
        return layout

    def lookup_term(self, term, ignore_case=True):
        """Return the indexes of layout items whose text contains ``term``.

        Items without a ``get_text`` method are skipped (but still counted
        in the indexes). Matching is case-insensitive unless ``ignore_case``
        is false.
        """
        needle = term.lower() if ignore_case else term
        indexes = []
        for i, item in enumerate(self.layout):
            if not hasattr(item, 'get_text'):
                continue
            text = item.get_text()
            if ignore_case:
                text = text.lower()
            if needle in text:
                indexes.append(i)
        return indexes

    def __del__(self):
        # ``fd`` is None after close() and may be missing entirely if
        # __init__ failed early; only close a handle that is really there.
        fd = getattr(self, 'fd', None)
        if fd:
            fd.close()

    def __repr__(self):
        return '<PDFController> %s, %s' % (
            'Open file "%s"' % self.fd.name if self.fd else 'No file opened',
            'not parsed' if not self.parsed else 'parsed')
def pdf2csv(fp):
    """Draw the table grid of each page of the PDF in ``fp`` to a PNG.

    For every page, the x and y coordinates of the rectangle and line
    objects are collected, reduced with ``filterclose``, printed, and used
    to draw one outlined box per cell more than 5 units wide and tall. The
    image is saved as ``out<pageno>.png`` (zero-based page number) in the
    current directory.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()

        hlines = []
        vlines = []
        for item in layout:
            # Exact type match, as before: only LTRect and LTLine count.
            if type(item) not in (LTRect, LTLine):
                continue
            hlines.append(int(item.x0))
            hlines.append(int(item.x1))
            # Flip y so that 0 is the top of the image.
            vlines.append(int(layout.height - item.y0))
            vlines.append(int(layout.height - item.y1))
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        print(hlines)
        print(vlines)
        print((layout.width, layout.height))

        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        for top, bottom in zip(vlines, vlines[1:]):
            if bottom - top <= 5:
                continue
            for left, right in zip(hlines, hlines[1:]):
                if right - left <= 5:
                    continue
                draw.rectangle([(int(left), int(top)),
                                (int(right), int(bottom))], outline=1)
        del draw

        # A separate name keeps the input ``fp`` intact, and ``with`` closes
        # the PNG even if saving fails.
        with open("out%s.png" % pageno, 'wb') as png_file:
            im.save(png_file, "PNG")
def pdf2txt(pdf_file_name):
    """Return the text of the PDF ``pdf_file_name`` as one string.

    Text is gathered from the text boxes and text lines of every page, in
    page order. A document without pages yields an empty string.

    Returns None (after logging the traceback) when the file cannot be
    opened or the document cannot be initialised.
    """
    # Open the pdf file in read-bytes mode.
    try:
        fp = open(pdf_file_name, 'rb')
    except Exception:
        # log the error or warning in logfile
        logging.info("WARNING found while opening the PDF file '" + pdf_file_name + "' of the format Textbox")
        logging.warning(traceback.format_exc())
        return

    try:
        # create a parser object which is associated with the file object
        parser = PDFParser(fp)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument()
        # connect the parser and document objects
        parser.set_document(doc)
        doc.set_parser(parser)

        # supply the password here, if the PDF is protected
        try:
            doc.initialize('')
        except Exception:
            # log the error or warning in logfile
            logging.info("WARNING found while opening the PDF file '" + pdf_file_name + "' of the format Textbox")
            logging.warning(traceback.format_exc())
            return

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collected across all pages so no page's text is dropped.
        text_parts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
        # The extracted text is returned to the caller as a single string.
        return "".join(text_parts)
    finally:
        # Close the PDF on every exit path.
        fp.close()
def pdf2csv(pdf):
    """Write the table cells of the PDF at path ``pdf`` to stdout as CSV.

    For every page, the coordinates of the ``LTRect`` objects are reduced
    with ``filterclose`` to grid lines. Each pair of neighbouring horizontal
    grid lines more than 10 units apart produces one CSV row; each cell is
    the whitespace-normalised text returned by ``get_region`` for that cell.
    Progress and debug information is printed to stdout as well.
    """
    fp = open(pdf, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        writer = UnicodeWriter(sys.stdout)

        for pageno, page in enumerate(doc.get_pages()):
            print("traitement page %s" % pageno)
            interpreter.process_page(page)
            layout = device.get_result()
            hlines = []
            vlines = []
            print(layout)
            for item in layout:
                # Exact type match, as before: only LTRect counts.
                if type(item) is not LTRect:
                    continue
                hlines.append(int(item.x0))
                hlines.append(int(item.x1))
                # Flip y so that 0 is the top of the page.
                vlines.append(int(layout.height - item.y0))
                vlines.append(int(layout.height - item.y1))
            print(hlines)
            print(vlines)
            hlines = filterclose(sorted(set(hlines)))
            vlines = filterclose(sorted(set(vlines)))

            for top, bottom in zip(vlines, vlines[1:]):
                if bottom - top <= 10:
                    continue
                row = []
                for left, right in zip(hlines, hlines[1:]):
                    if right - left <= 10:
                        continue
                    # get_region is given the 1-based page number and the
                    # cell shrunk by one unit on the left and right.
                    cell = get_region(pdf, pageno + 1, left + 1, top,
                                      right - 1, bottom)
                    row.append(' '.join(cell.split()))
                writer.writerow(row)
    finally:
        # Close the PDF even when a page fails.
        fp.close()
def parse_pdf(self):
    """Parse ``self.raw_pdf`` into a fresh ``Report``.

    Pages are handled in reverse document order. The sheet type is
    ``Sheet2`` when any page contains the text "Job Start Date:", otherwise
    ``Sheet1``. Every page layout is then added to the sheet with a running
    vertical offset, and the report extracts its data from the sheet.

    Any exception is logged with its traceback through ``self.logger``.
    Returns None when the logger reports an error, else ``self.report``.
    """
    self.report = Report(self.logger)

    stream = StringIO(self.raw_pdf)
    pdf_parser = PDFParser(stream)
    document = PDFDocument()
    pdf_parser.set_document(document)

    try:
        document.set_parser(pdf_parser)
        document.initialize('')
        if not document.is_extractable:
            raise RuntimeError("PDFTextExtractionNotAllowed")

        resources = PDFResourceManager()
        layout_params = LAParams(
            char_margin=0.01,  # default 1.0
            word_margin=0.2,   # default 0.2
            line_margin=0.3,   # default 0.3
            line_overlap=0.5   # default 0.5
        )
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        interpreter = PDFPageInterpreter(resources, aggregator)

        reversed_pages = list(document.get_pages())
        reversed_pages.reverse()

        # Detect pdf format
        has_job_start = False
        for pdf_page in reversed_pages:
            interpreter.process_page(pdf_page)
            if self.find_pdf_text(aggregator.get_result(), "Job Start Date:"):
                has_job_start = True
                break
        sheet = Sheet2() if has_job_start else Sheet1()

        y_offset = 0
        for pdf_page in reversed_pages:
            interpreter.process_page(pdf_page)
            page_layout = aggregator.get_result()
            sheet.add_ltcontainer(page_layout, y_offset)
            y_offset += page_layout.y1

        self.report.extract_data(sheet)
    except Exception:
        self.logger.error('%s' % traceback.format_exc())

    if self.logger.has_error():
        return None
    return self.report
class PDFScraper(object):
    """Run a PDF file through ``converterClass`` and return the output.

    Subclasses customise the conversion by overriding ``isLineStart``,
    ``cleanTerm``, ``preProcessLine`` and ``postProcess``, which are handed
    to (or applied after) the converter.
    """

    converterClass = TabbedConverter

    def __init__(self, filename, skipStartsWith=None, skipIn=None):
        """Build the converter and interpreter for ``filename``.

        ``skipStartsWith`` and ``skipIn`` default to empty lists and are
        forwarded to the converter as ``skip_startswith`` / ``skip_in``.
        """
        self.filename = filename
        rsrc = PDFResourceManager()
        self.outfp = StringIO()
        self.converter = self.converterClass(
            rsrc,
            self.outfp,
            codec="utf-8",
            laparams=LAParams(),
            skip_startswith=skipStartsWith or [],
            skip_in=skipIn or [],
            isLineStart=self.isLineStart,
            cleanTerm=self.cleanTerm,
            preProcessLine=self.preProcessLine,
        )
        self.interpreter = PDFPageInterpreter(rsrc, self.converter)

    def isLineStart(self, line):
        """Hook passed to the converter; this default always returns False."""
        return False

    def cleanTerm(self, line):
        """Hook passed to the converter; this default returns ``line`` as is."""
        return line

    def preProcessLine(self, line):
        """Hook passed to the converter; this default returns ``line`` as is."""
        return line

    def prepare(self):
        """Open the source file and initialise the document."""
        self.doc = PDFDocument()
        self.source = open(self.filename, "rb")
        parser = PDFParser(self.source)
        parser.set_document(self.doc)
        self.doc.set_parser(parser)
        self.doc.initialize("")

    def finish(self):
        """Close the converter and the source file."""
        self.converter.close()
        self.source.close()

    def postProcess(self):
        """Return the accumulated converter output."""
        return self.outfp.getvalue()

    def run(self):
        """Convert every page and return ``postProcess()``."""
        self.prepare()
        try:
            for page in self.doc.get_pages():
                if page is not None:
                    self.interpreter.process_page(page)
        finally:
            # Release the converter and the file even if a page fails.
            self.finish()
        return self.postProcess()
def pdf_page_count(filelike):
    ''' Return the number of pages of the PDF document in ``filelike``.

    ``filelike`` is rewound to position 0 before parsing and again after
    counting.
    '''
    filelike.seek(0)

    pdf_parser = PDFParser(filelike)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')

    page_total = 0
    for _page in document.get_pages():
        page_total += 1

    filelike.seek(0)
    return page_total
class PDF(list):
    """List of per-page values produced by interpreting a PDF file.

    Each element is whatever ``interpreter.process_page(page)`` returns for
    that page; ``text()`` joins the elements, so they are expected to be
    strings.
    """

    def __init__(self, file, password='', just_text=1, check_extractable=True,
                 char_margin=1.0, line_margin=0.1, word_margin=0.1):
        """Parse ``file`` and append one entry per page.

        Args:
            file: open PDF file object, handed to ``PDFParser``.
            password: document password.
            just_text: when true, parsing helpers are dropped afterwards
                (see ``_cleanup``).
            check_extractable: when true, pages are only processed if the
                document allows extraction.
            char_margin, line_margin, word_margin: forwarded to ``LAParams``.
        """
        # Always defined, so callers can read ``metadata`` even when the
        # document turned out not to be extractable.
        self.metadata = None
        self.parser = PDFParser(file)
        self.laparams = LAParams(char_margin=char_margin,
                                 line_margin=line_margin,
                                 word_margin=word_margin)

        if PYTHON_3:
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize(password)
        else:
            self.doc = PDFDocument(self.parser, password)

        if not check_extractable or self.doc.is_extractable:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=StringIO(),
                                        laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.resmgr, self.device)

            if PYTHON_3:
                page_generator = self.doc.get_pages()
            else:
                page_generator = PDFPage.create_pages(self.doc)

            for page in page_generator:
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info

        if just_text:
            self._cleanup()

    def _cleanup(self):
        """
        Frees lots of non-textual information, such as the fonts
        and images and the objects that were needed to parse the
        PDF.
        """
        self.device = None
        self.doc = None
        self.parser = None
        self.resmgr = None
        self.interpreter = None

    def text(self, clean=True):
        """
        Returns the text of the PDF as a single string.
        Options:

          :clean:
            Removes misc cruft, like lots of whitespace.
        """
        joined = ''.join(self)
        if clean:
            return utils.normalise_whitespace(joined.replace('\n', ' '))
        return joined
def getTableOfContents(path, pageNum):
    """Return ``getParsedPage(doc, pageNum)`` for the PDF at ``path``.

    ``pageNum`` is the zero-based position of the page in the document.
    Returns None when the document has no such page.

    NOTE(review): ``doc.initialize(...)`` is not called here (it was
    commented out); left as found -- confirm this is intentional.
    NOTE(review): the file is closed when this function returns, which
    assumes ``getParsedPage`` finishes reading before it returns -- confirm.
    """
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        for pageNumber, _page in enumerate(doc.get_pages()):
            if pageNumber == pageNum:
                return getParsedPage(doc, pageNum)
    return None
def pdf_to_text(filename):
    """Convert the PDF at ``filename`` to text, one output line per row.

    A custom converter groups the ``LTChar`` children of each page by
    ``int(-y)`` of their bounding box, orders the characters of each group
    by x, and writes each group as a line.

    Returns the accumulated converter output.
    """
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class Converter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            """Write the page's characters line by line.

            Returns the last line written, or an empty string when the page
            has no characters.
            """
            from collections import defaultdict
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    lines[int(-y)][x] = child._text.encode(self.codec)

            # Bound before the loop so a page without characters does not
            # fail on the return below.
            a = ""
            for y in sorted(lines):
                line = lines[y]
                a = "".join(line[x] for x in sorted(line))
                self.outfp.write(a)
                self.outfp.write("\n")
            return a

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because the test documents are utf-8 (it is also the default).
    device = Converter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    doc = PDFDocument()
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)
        for page in doc.get_pages():
            if page is not None:
                interpreter.process_page(page)
        device.close()
    finally:
        # Close the PDF even when conversion fails.
        fp.close()
    return outfp.getvalue()
def get_pdf_num_page(self, pdf):
    """Return the number of pages of the PDF at path ``pdf``.

    Returns None when ``pdf`` does not exist.
    """
    if not os.path.exists(pdf):
        return None
    # The ``with`` block guarantees the file is closed.
    with open(pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Count lazily instead of materialising every page object.
        return sum(1 for _ in doc.get_pages())
def data_extraction(filename):
    """Extract the ridership columns from ``<filename>.pdf``.

    Each horizontal text box is matched on its first line (the column
    header); the box's lines become that column's values. The numeric
    columns have commas removed and each value stripped.

    Returns a dict with the keys 'date', 'day', 'red_line_people',
    'orange_line_people' and 'total_people'; columns that were not found
    are empty lists. A document without pages yields an empty dict.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBoxHorizontal

    # Header text -> result key. Values of the "plain" columns are kept as
    # split; values of the "numeric" columns lose their commas.
    plain_headers = {'營運日': 'date', '星期': 'day'}
    numeric_headers = {
        '紅線運量(人次)': 'red_line_people',
        '橘線運量(人次)': 'orange_line_people',
        '總運量(人次)': 'total_people',
    }
    column_order = ('date', 'day', 'red_line_people',
                    'orange_line_people', 'total_people')
    columns = dict((key, []) for key in column_order)
    data_cols = {}

    with open(filename + '.pdf', 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for item in layout:
                # Exact type match, as before.
                if type(item) is not LTTextBoxHorizontal:
                    continue
                # Collapse blank lines so every line is one value.
                text = re.sub(r'\n\s*\n', '\n', item.get_text()).strip()
                first_value = str(text.split('\n')[0]).strip()
                if first_value in plain_headers:
                    columns[plain_headers[first_value]] = text.split('\n')
                elif first_value in numeric_headers:
                    columns[numeric_headers[first_value]] = [
                        v.strip() for v in text.replace(',', '').split('\n')]
            # Snapshot after each page; the result stays {} when there are
            # no pages at all.
            data_cols = dict((key, columns[key]) for key in column_order)
    return data_cols
def parseCRF(strFilePath, strFileName, boolControlVersion, strVersion):
    """Parse a CRF PDF into a DataFrame of page questions.

    Every page layout is passed to ``createPageQuestions`` together with
    its 1-based page number, ``boolControlVersion`` and ``strVersion``; the
    returned items of all pages are collected into one ``pd.DataFrame``.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    import os

    listCRF = []
    # Let os.path.join pick the separator instead of hard-coding "\\", and
    # let ``with`` close the file.
    with open(os.path.join(strFilePath, strFileName), 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_number, page in enumerate(doc.get_pages(), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            listCRF.extend(createPageQuestions(
                layout, page_number, boolControlVersion, strVersion))
    return pd.DataFrame(listCRF)
class PDF2Word:
    """Convert the horizontal text boxes of a PDF into a Word document."""

    def __init__(self, pdf_path):
        """Open ``pdf_path`` and initialise the PDF document.

        The file stays open because pages are read later, in
        ``pdf_to_word``; call ``close()`` when done.
        """
        # Open the PDF in binary read mode.
        self._fp = open(pdf_path, 'rb')
        try:
            # Create a PDF parser from the file object.
            parser = PDFParser(self._fp)
            # Create a PDF document.
            self.doc = PDFDocument()
            # Connect the parser and the document object.
            parser.set_document(self.doc)
            self.doc.set_parser(parser)
            # Initialise the document (no password supplied).
            self.doc.initialize()
        except Exception:
            # Do not leak the handle when the PDF cannot be set up.
            self.close()
            raise

    def close(self):
        """Close the underlying PDF file; safe to call more than once."""
        if self._fp is not None:
            self._fp.close()
            self._fp = None

    # PDF to Word
    def pdf_to_word(self, sve_path):
        """Write one paragraph per horizontal text box to ``sve_path``.

        Raises PDFTextExtractionNotAllowed when the document forbids
        extraction.
        """
        # Abort when the document does not allow text extraction.
        if not self.doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # Page aggregator device.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Page interpreter.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # The Word document that receives the text.
        document = Document()
        # Handle one page per iteration.
        for page in self.doc.get_pages():
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    document.add_paragraph(x.get_text())
        document.save(sve_path)
def process(path):
    """Count dictionary-category words in the PDF at ``path``.

    Every whitespace-separated token of every horizontal text box
    (lower-cased) is tested with ``count_word`` against the word lists
    ``word``, ``negative``, ``positive``, ``uncertainty``, ``litigious``,
    ``constraining``, ``superfluous`` and ``interesting``, all defined
    outside this function.

    Returns ``[master, total, nega, posi, unce, liti, cons, supe, inte]``.
    NOTE(review): ``total`` adds ``len()`` of each text box's string, i.e.
    characters rather than tokens -- confirm that is intended.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    nega = posi = unce = liti = cons = supe = inte = master = total = 0
    # The file must stay open until every page has been processed, so the
    # whole extraction runs inside the ``with`` block.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text().lower()
                tokens = results.split()
                total += len(results)
                for part in tokens:
                    if count_word(part, word) > 0:
                        master += 1
                    if count_word(part, negative):
                        nega += 1
                    if count_word(part, positive):
                        posi += 1
                    if count_word(part, uncertainty):
                        unce += 1
                    if count_word(part, litigious):
                        liti += 1
                    if count_word(part, constraining):
                        cons += 1
                    if count_word(part, superfluous):
                        supe += 1
                    if count_word(part, interesting):
                        inte += 1
    return [master, total, nega, posi, unce, liti, cons, supe, inte]
def readPdf(path, toPath):
    """Append the text of every horizontal text box in ``path`` to ``toPath``.

    Each text block is echoed to stdout and written as UTF-8 followed by a
    newline. The output file is opened once, in append mode, so it is created
    even when the PDF contains no text box.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    with open(path, "rb") as pdf_fp:
        parser = PDFParser(pdf_fp)
        pdfFile = PDFDocument()
        # Parser and document have to reference each other.
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        pdfFile.initialize()
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        device = PDFPageAggregator(manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)
        with open(toPath, "a", encoding="utf-8") as out_fp:
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                for x in device.get_result():
                    if isinstance(x, LTTextBoxHorizontal):
                        text = x.get_text()
                        print(text)
                        out_fp.write(text + "\n")
def parse(pdf_path='test.pdf', txt_path='test.txt'):
    """Print and append to ``txt_path`` the text of every object in ``pdf_path``.

    Any layout object exposing ``get_text`` is used. A ``UnicodeEncodeError``
    raised while handling a page is reported and the rest of that page is
    skipped. ``txt_path`` is opened once in append mode.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    with open(pdf_path, 'rb') as fn:
        parser = PDFParser(fn)
        # NOTE(review): the parser is passed to the constructor AND linked via
        # set_document/set_parser below; kept exactly as written - confirm
        # which pdfminer API variant is intended.
        doc = PDFDocument(parser)
        parser.set_document(doc)
        doc.set_parser(parser)
        # An empty string means "no password".
        doc.initialize("")
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource, device)
        with open(txt_path, 'a') as out_fp:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                try:
                    for out in layout:
                        if hasattr(out, "get_text"):
                            print(out.get_text())
                            out_fp.write(out.get_text() + '\n')
                except UnicodeEncodeError as ue:
                    print("异常:" + str(ue))
def get_text_from_pdf(filename):
    """Return the stripped text of every text box in ``filename + ".pdf"``.

    The text of the boxes is concatenated without a separator.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBox
    from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

    path = filename + ".pdf"
    pieces = []
    # Context manager: the handle used to be opened inline and never closed.
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if isinstance(x, LTTextBox):
                    pieces.append(x.get_text().strip())
    return ''.join(pieces)
def readPDF(path, toPath=r'pdf.txt'):
    """Append the text of every horizontal text box in ``path`` to ``toPath``.

    Args:
        path: PDF file to read.
        toPath: UTF-8 text file to append to; defaults to ``pdf.txt``, the
            previously hard-coded name. It is opened once, so it is created
            even when the PDF contains no text box.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    with open(path, "rb") as pdf_fp:
        parser = PDFParser(pdf_fp)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        pdfFile.initialize()
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        device = PDFPageAggregator(manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)
        with open(toPath, 'a', encoding='utf-8') as out_fp:
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                # Only horizontal text boxes are written.
                for x in device.get_result():
                    if isinstance(x, LTTextBoxHorizontal):
                        out_fp.write(x.get_text() + "\n")
def pdfparse(self, url):
    """Download the PDF at ``url`` and return its text.

    The request goes through the module-level ``s`` object with a generated
    user agent. Text is taken from every layout object that has ``get_text``,
    with leading/trailing newlines stripped, and concatenated.

    Returns:
        str or None: the text, or ``None`` when ``url`` is falsy or any step
        fails (the exception is printed, not raised).
    """
    try:
        if url:
            res = s.get(url, headers={"user-agent": generate_user_agent()})
            # The in-memory buffer is released even if parsing fails.
            with BytesIO(res.content) as f:
                praser = PDFParser(f)
                doc = PDFDocument()
                praser.set_document(doc)
                doc.set_parser(praser)
                doc.initialize()
                if not doc.is_extractable:
                    raise PDFTextExtractionNotAllowed

                rsrcmgr = PDFResourceManager()
                device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pieces = []
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    for x in device.get_result():
                        if hasattr(x, 'get_text'):
                            results = x.get_text()
                            if results:
                                pieces.append(results.strip('\n'))
                return ''.join(pieces)
    except Exception as e:
        # Deliberate best-effort: report the problem and return None.
        print(e)
def pdf_to_csv(filename):
    """Return the characters of ``filename`` as ';'-separated lines, page by page.

    Every page is framed by ``START PAGE n`` / ``END PAGE n`` marker lines.
    Characters are grouped into lines by the integer part of their negated
    y coordinate and ordered by x within a line.
    """
    from collections import defaultdict

    class CsvConverter(TextConverter):
        """TextConverter that writes one ';'-joined line per row of characters."""

        def end_page(self, i):
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    lines[int(-y)][x] = child._text.encode(self.codec)
            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # The remainder follows convert() from pdfminer's pdf2txt tool.
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    try:
        # try/finally + with: file and device are released on errors as well.
        with open(filename, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()
    return outfp.getvalue()
def parse(pdf_path='Django-日志配置.md.pdf', docx_path='demo1.docx'):
    """Add the text of ``pdf_path`` to the module-level ``document`` and save it.

    Every layout object exposing ``get_text`` becomes one paragraph with the
    ``ListBullet`` style; non-breaking spaces (``\\xa0``) are replaced by plain
    spaces. The defaults are the previously hard-coded file names.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    with open(pdf_path, 'rb') as fn:
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # An empty string means "no password".
        doc.initialize("")
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for out in device.get_result():
                if hasattr(out, "get_text"):
                    content = out.get_text().replace(u'\xa0', u' ')
                    document.add_paragraph(content, style='ListBullet')
    document.save(docx_path)
def process_pdf(filePath):
    """Return the invoice number found in the PDF at ``filePath``.

    The text of all horizontal text boxes is concatenated and searched for
    ``INVOICE NO. <digits>``.

    Returns:
        str or None: the digit string, or ``None`` when nothing matches.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    pieces = []
    with open(filePath, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr=rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr=rsrcmgr, device=device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if isinstance(x, LTTextBoxHorizontal):
                    pieces.append(x.get_text())

    # Raw string: same pattern, without the invalid "\d" escape warning.
    invoice_no = re.search(r"INVOICE NO. (\d+)", ''.join(pieces))
    if invoice_no is None:
        return None
    return invoice_no.group(1)
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Write the outline (bookmarks) of the PDF ``fname`` to ``outfp`` as XML.

    Each entry becomes an ``<outline>`` element with its level and title, plus
    ``<dest>`` and ``<pageno>`` children when they can be resolved. Nothing is
    written when the document has no outlines. ``objids``, ``pagenos``,
    ``dumpall`` and ``codec`` are accepted but not used here.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    try:
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        # Page object id -> zero-based page number.
        pages = {page.pageid: pageno
                 for (pageno, page) in enumerate(doc.get_pages())}

        def resolve_dest(dest):
            # Named destinations (str or PSLiteral) are looked up first.
            if isinstance(dest, str):
                dest = resolve1(doc.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(doc.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            return dest

        try:
            outlines = doc.get_outlines()
            outfp.write('<outlines>\n')
            for (level, title, dest, a, se) in outlines:
                pageno = None
                if dest:
                    dest = resolve_dest(dest)
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = resolve_dest(action['D'])
                            pageno = pages[dest[0].objid]
                s = e(title).encode('utf-8', 'xmlcharrefreplace')
                outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                if dest is not None:
                    outfp.write('<dest>')
                    dumpxml(outfp, dest)
                    outfp.write('</dest>\n')
                if pageno is not None:
                    outfp.write('<pageno>%r</pageno>\n' % pageno)
                outfp.write('</outline>\n')
            outfp.write('</outlines>\n')
        except PDFNoOutlines:
            pass
    finally:
        # Released even when an unexpected exception escapes.
        parser.close()
        fp.close()
def pages_from_pdf(path, **laparams):
    """Return the layout object of every page of the PDF at ``path``.

    Args:
        path: PDF file to read.
        **laparams: keyword arguments forwarded to ``LAParams``. ``all_texts``
            defaults to ``True`` and may now be overridden; passing it used to
            raise a duplicate-keyword ``TypeError``.

    Returns:
        list: one ``device.get_result()`` value per page, in page order.
    """
    laparams.setdefault('all_texts', True)
    result = []
    # NOTE(review): the file is closed once all pages are aggregated; this
    # assumes the returned layouts do not read from it afterwards - confirm.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams(**laparams))
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            result.append(device.get_result())
    return result
def pdf2txt(pdfname, txtname):
    """Write the text of the PDF ``pdfname`` to the text file ``txtname``.

    The stripped text of each text box or text line is written followed by a
    single space and echoed to stdout. Figures are only reported. Any error is
    printed rather than raised.

    Returns:
        bool: ``True`` when at least one non-empty piece of text was written.
    """
    btxt = False
    try:
        # Context managers close both files; the previous ``fp.closed`` and
        # ``fptxt.closed`` only read an attribute and closed nothing.
        with open(pdfname, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Report the file being converted on the console.
            print("pdf2txt %s..." % pdfname)
            with open(txtname, 'w') as fptxt:
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    for lt_obj in device.get_result():
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            text = lt_obj.get_text().strip()
                            # Test before appending the separator; the old
                            # check ran after it and was therefore always true.
                            if text:
                                btxt = True
                                spagetxt = text + " "
                                fptxt.write(spagetxt)
                                print("Palabra", spagetxt)
                        elif isinstance(lt_obj, LTFigure):
                            print("LTFigure, pte implementar!")
            print("end")
    except Exception as e:
        print("Error: %s" % (e))
    return btxt
def parser_pdf_file(pdf_file_path):
    """Count pattern hits and characters in the text boxes of a PDF.

    Every horizontal text box is stripped, printed, and tested against the
    module-level ``d_pattern`` and ``x_pattern``; a box counts once per
    pattern that matches it at least once.

    Returns:
        tuple: ``(d_count, word_count)``, or ``(x_count, word_count)`` when
        ``d_count`` is zero. ``word_count`` is the summed length of the
        stripped text.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    d_count = 0
    x_count = 0
    word_count = 0
    with open(pdf_file_path, 'rb') as read_pdf:
        parser_pdf = PDFParser(read_pdf)
        # NOTE(review): the parser is passed to the constructor AND linked via
        # set_document/set_parser below; kept exactly as written - confirm
        # which pdfminer API variant is intended.
        pdf_document = PDFDocument(parser_pdf)
        parser_pdf.set_document(pdf_document)
        pdf_document.set_parser(parser_pdf)
        pdf_document.initialize()
        if not pdf_document.is_extractable:
            raise PDFTextExtractionNotAllowed

        pdf_manager = PDFResourceManager()
        pdf_device = PDFPageAggregator(pdf_manager, laparams=LAParams())
        pdf_interpreter = PDFPageInterpreter(pdf_manager, pdf_device)
        for each_page in pdf_document.get_pages():
            pdf_interpreter.process_page(each_page)
            for each_info in pdf_device.get_result():
                if isinstance(each_info, LTTextBoxHorizontal):
                    result = each_info.get_text().strip()
                    word_count += len(result)
                    if d_pattern.findall(result):
                        d_count += 1
                    if x_pattern.findall(result):
                        x_count += 1
                    print(result)
                    print("======")

    if d_count == 0:
        return x_count, word_count
    return d_count, word_count
def process_pdf(filePath):
    """Extract the table preceding ``TOTAL INTERNATIONAL`` into a DataFrame.

    The text of all horizontal text boxes is concatenated. The section that
    starts at a line of ``=`` characters and ends at ``TOTAL INTERNATIONAL``
    is split into rows, and each row into whitespace-separated cells.
    Columns 3 to 12 are converted to ``float64``. The number of rows found is
    printed.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
        AttributeError: if the section markers are not found, because the
            search result is then ``None``.
    """
    pieces = []
    with open(filePath, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr=rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr=rsrcmgr, device=device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if isinstance(x, LTTextBoxHorizontal):
                    pieces.append(x.get_text())

    result = ''.join(pieces)
    text = re.search("===================\n(.*\n)+ TOTAL INTERNATIONAL", result)
    result = text.group()
    # Raw strings below: equivalent patterns, no invalid-escape warnings.
    data = re.findall(r' (\d+.+?\d+)\n', result)
    print(len(data))
    rows = [re.findall(r'\S+', line) for line in data]
    df = DataFrame(data=rows)
    for i in df.columns[3:13]:
        df[i] = df[i].astype('float64')
    return df
def readPdf(self, path, callback=None, topath = ""):
    """Feed the text of every horizontal text box in ``path`` to ``callback``.

    Args:
        path: PDF file to read.
        callback: called with the text of each box when ``topath`` is empty;
            when it is ``None`` a placeholder message is printed instead.
        topath: when non-empty only a placeholder message is printed per box;
            writing to the file is not implemented.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    with open(path, "rb") as f:
        parser = PDFParser(f)
        pdfFile = PDFDocument()
        # Parser and document have to reference each other.
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # An empty string means "no password".
        pdfFile.initialize("")
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manage = PDFResourceManager()
        device = PDFPageAggregator(manage, laparams=LAParams())
        interpreter = PDFPageInterpreter(manage, device)
        for page in pdfFile.get_pages():
            # Fixed typos: was ``progcess_page`` / ``get_reault``.
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                # Fixed: the parameter is ``topath``; ``toPath`` was undefined.
                if topath == "":
                    text = x.get_text()
                    if callback is not None:
                        callback(text)
                    else:
                        print("处理文件")
                else:
                    print("写文件 toPath 写入文件的路径")
def pdf_read(file):
    """Pass the text of every horizontal text box in ``file`` to ``common_handle``.

    Requires the ``pdfminer3k`` distribution (``pip install pdfminer3k``).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LTTextBoxHorizontal, LAParams
    from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

    # Context manager: the handle was previously never closed.
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if isinstance(x, LTTextBoxHorizontal):
                    common_handle(x.get_text())
def pdf_page_content(pdf_file, page_num):
    '''
    Yield the horizontal text boxes of one page of a PDF.

    This is a generator: nothing, including argument validation, runs until
    the first item is requested. The file is closed when the generator is
    exhausted or closed.

    :param pdf_file: path of the PDF file
    :param page_num: 1-based page number
    :return: generator of LTTextBoxHorizontal objects
    :raises ValueError: if page_num is not positive
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    '''
    if page_num <= 0:
        raise ValueError(
            'page_num must be more than zero, but the number your given is %s'
            % page_num)

    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmagr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmagr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmagr, device)
        for index, page in enumerate(doc.get_pages()):
            if index != page_num - 1:
                continue
            interpreter.process_page(page)
            for x in device.get_result():
                if isinstance(x, LTTextBoxHorizontal):
                    yield x
            # Only one page can match, so stop scanning the remaining pages.
            break
def parse(page_index=47):
    '''Print the non-blank horizontal text boxes of one page of ``text_path``.

    ``text_path`` is a module-level name. ``page_index`` is the zero-based
    index of the page; it defaults to 47, the previously hard-coded value.
    Nothing is written to disk.

    Raises PDFTextExtractionNotAllowed if the document forbids extraction and
    IndexError if the document has no page at ``page_index``.
    '''
    with open(text_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        page = list(doc.get_pages())[page_index]
        interpreter.process_page(page)
        for x in device.get_result():
            if isinstance(x, LTTextBoxHorizontal):
                results = x.get_text().strip()
                if results:
                    print(results)
def _pdf_to_text(self, pdf_path, text_path):
    """
    Extract the text of a PDF with the PdfMiner library and store it.

    The text of every text box or text line is concatenated, stripped, and
    written to ``text_path``. Failures are logged as warnings, not raised.

    :param pdf_path: path to the input PDF
    :param text_path: path to the output text
    :return: True if some text was extracted, False otherwise
    """
    text = ''
    num_pages = 0
    doc = PDFDocument()
    res_mgr = PDFResourceManager()
    device = PDFPageAggregator(res_mgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(res_mgr, device)
    try:
        pieces = []
        with open(pdf_path, 'rb') as fp:
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            for page in doc.get_pages():
                self._logger.debug('Processing page {}'.format(num_pages + 1))
                interpreter.process_page(page)
                for lt_obj in device.get_result():
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        pieces.append(lt_obj.get_text())
                num_pages += 1
        text = ''.join(pieces)
        self._logger.info('Done, extracted {} pages'.format(num_pages))
        self._logger.debug('Storing result in {}'.format(text_path))
        with open(text_path, 'w') as text_fp:
            text_fp.write(text.strip())
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        self._logger.warning(
            'Extracting text from {} failed'.format(pdf_path))
        return False
    finally:
        # Close resources before exiting.
        device.close()
    # A real bool, as documented (this used to return ``len(text)``).
    return bool(text)
def parse_pdf(url):
    """
    Extract the text of the PDF referenced by ``url``.

    ``get_pdf(url)`` is called first and the local file ``1.pdf`` is then
    parsed (presumably ``get_pdf`` saves the download there - confirm).
    Newlines are removed and runs of whitespace collapse to a single space.

    :param url: location handed to ``get_pdf``
    :return: the normalised text, or "" when extraction is not allowed
    """
    get_pdf(url)
    pieces = []
    with open('1.pdf', 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()
        if not doc.is_extractable:
            return ""

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if isinstance(x, LTTextBoxHorizontal):
                    pieces.append(x.get_text())

    content = "".join(pieces)
    return " ".join(content.replace("\n", "").strip().split())
def onePdfToTxt(filepath, outpath):
    """Write the text of the PDF ``filepath`` to the UTF-8 file ``outpath``.

    The text of every layout object exposing ``get_text`` is printed and
    written followed by a newline. Any error, including a document that
    forbids extraction, is printed instead of raised. Returns ``None``.
    """
    try:
        # Both files are closed even when parsing fails half-way.
        with open(filepath, 'rb') as fp, \
                open(outpath, 'w', encoding='utf-8') as outfp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # An empty string means "no password".
            doc.initialize("")
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            resource = PDFResourceManager()
            device = PDFPageAggregator(resource, laparams=LAParams())
            interpreter = PDFPageInterpreter(resource, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
                for out in device.get_result():
                    if hasattr(out, "get_text"):
                        text = out.get_text()
                        print(text)
                        outfp.write(text + '\n')
    except Exception as e:
        print(e)
def parse(pdf_path):
    """Return page numbers selected by comparing the first two text boxes of
    consecutive pages.

    For every page the text of the horizontal text boxes with ``index`` 0 and
    1 is remembered together with the values from the previous page. From the
    second page on:

    * the previous page number is appended when box 1 differs from the
      previous page's box 1;
    * the current page number is appended when box 1 equals the previous
      page's box 0.

    Page numbers are 1-based.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # NOTE(review): the nesting of the two comparisons at the end of the page
    # loop could not be recovered from the flattened source; they are treated
    # as independent checks here - confirm against the original layout.
    page_list = []
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        num_page = 0
        text0_now, text0_last, text1_now, text1_last = "", "", "", ""
        for page in doc.get_pages():
            num_page += 1
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    if num_page == 1:
                        # First page: there is no previous value to keep.
                        if x.index == 0:
                            text0_now = x.get_text()
                        if x.index == 1:
                            text1_now = x.get_text()
                    else:
                        if x.index == 0:
                            text0_last = text0_now
                            text0_now = x.get_text()
                        if x.index == 1:
                            text1_last = text1_now
                            text1_now = x.get_text()
            if num_page != 1:
                if text1_now != text1_last:
                    page_list.append(num_page - 1)  # last page
                if text1_now == text0_last:
                    page_list.append(num_page)  # now page
    return page_list
def readPdf(path, toPath):
    """Append the text of every horizontal text box in ``path`` to ``toPath``.

    Each text block is echoed to stdout and written followed by a newline.
    The output file is opened once, in append mode, so it is created even
    when the PDF contains no text box.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    with open(path, "rb") as pdf_fp:
        parser = PDFParser(pdf_fp)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        # The parser is attached before initialize(), as in the other readers
        # in this file; the two calls used to be in the opposite order.
        pdfFile.set_parser(parser)
        pdfFile.initialize()
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        device = PDFPageAggregator(manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)
        with open(toPath, "a") as out_fp:
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                for x in device.get_result():
                    if isinstance(x, LTTextBoxHorizontal):
                        # Fixed: ``str1`` was used without being assigned.
                        str1 = x.get_text()
                        print(str1)
                        out_fp.write(str1 + "\n")
def get_pdf_content(pdf_path):
    """Return the cleaned text of the PDF at ``pdf_path``, one block per line.

    The text of every horizontal text box is passed through
    ``replace_invisible``. After each page, a trailing all-digit block
    (taken to be the page number) is replaced by a single space. Finally,
    empty blocks and every block equal to the first one are dropped.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # NOTE(review): the page-number trim is applied once per page; the
    # flattened source does not show whether it originally ran per page or
    # once after the loop - confirm.
    print(pdf_path)
    content_recoder = []
    with open(pdf_path, "rb") as file:
        parser = PDFParser(file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if isinstance(x, LTTextBoxHorizontal):
                    content_recoder.append(replace_invisible(x.get_text()))
            # Explicit guard in place of the former bare ``except: pass``.
            last = content_recoder[-1] if content_recoder else None
            if isinstance(last, str) and last.isdigit():
                content_recoder[-1] = " "

    if content_recoder:
        first = content_recoder[0]
        content_recoder = [c for c in content_recoder if c and c != first]
    return "\n".join(content_recoder)
def convert(infile):
    """Return the concatenated text of every text box and text line in ``infile``.

    ``infile`` is a binary file object positioned on a PDF; it is not closed
    here. The document is initialised with an empty password.
    """
    pdf_parser = PDFParser(infile)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    pieces = []
    for page in document.get_pages():
        page_interpreter.process_page(page)
        pieces.extend(
            item.get_text()
            for item in aggregator.get_result()
            if isinstance(item, (LTTextBox, LTTextLine))
        )
    return ''.join(pieces)
def parse(out_path=r'new.txt'):
    '''Print the text of ``text_path`` and append it to ``out_path``.

    ``text_path`` is a module-level name. Only horizontal text boxes are used
    and their text is written unchanged. ``out_path`` defaults to ``new.txt``,
    the previously hard-coded name; it is opened once in append mode, so it is
    created even when the PDF contains no text box.

    Raises PDFTextExtractionNotAllowed if the document forbids extraction.
    '''
    with open(text_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Parser and document have to reference each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rm = PDFResourceManager()
        device = PDFPageAggregator(rm, laparams=LAParams())
        interpreter = PDFPageInterpreter(rm, device)
        with open(out_path, 'a') as out_fp:
            for page in doc.get_pages():
                interpreter.process_page(page)
                for x in device.get_result():
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        out_fp.write(results)
def parse(path, name, out_dir=r"C:\Users\Administrator\Desktop\mission\indexnu\doing\txt"):
    """Print the text of the PDF ``path`` and append it to ``<out_dir>/<name>.txt``.

    Args:
        path: PDF file to read.
        name: base name of the output file (``.txt`` is appended).
        out_dir: directory of the output file; defaults to the previously
            hard-coded location.

    Only horizontal text boxes are used. Characters the output encoding
    cannot represent are dropped (``errors="ignore"``). The output file is
    opened once in append mode, so it is created even when the PDF contains
    no text box.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    outpath = os.path.join(out_dir, name + ".txt")
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the file; the former trailing f.close()
        # was redundant and is gone.
        with open(outpath, 'a', errors="ignore") as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                for x in device.get_result():
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results)
def extract_papaername(self, path):
    """Guess the title of the paper stored in the PDF at ``path``.

    The text of every horizontal text box (newlines removed) is collected; a
    page that fails to process is reported and skipped. Among the first ten
    blocks, the title is the block following the first one that contains
    '文章编号', or the very first block when none does. If that candidate has
    fewer than four non-space characters, the next block is used when there
    is one.

    Returns:
        str or None: the title with '\\uf02a' and '*' removed, or ``None``
        when no text was extracted or no candidate block exists.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    contents = []
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            try:
                interpreter.process_page(page)
                for x in device.get_result():
                    if isinstance(x, LTTextBoxHorizontal):
                        contents.append(x.get_text().replace('\n', ''))
            except Exception as e:
                # Best effort: a broken page must not abort the document.
                print(e)
                print('document error...')

    if not contents:
        return None

    head = contents[:10]
    title_indx = 0
    for indx, line in enumerate(head):
        if '文章编号' in line:
            title_indx = indx + 1
            break
    # Bounds checks: these lookups used to raise IndexError when the marker
    # was the last of the inspected blocks.
    if title_indx >= len(head):
        return None
    title = head[title_indx]
    if len(title.replace(' ', '')) < 4 and title_indx + 1 < len(head):
        title = head[title_indx + 1]
    return title.replace('\uf02a', '').replace('*', '')