def getPageLayouts(f1):
    '''Extract the layout of every page of a PDF and return them as a list.

    NOTE(review): the ``f1`` argument is never read. The file that is parsed
    is the one named by ``fpath`` and it is unlocked with ``pss_wd``; neither
    name is defined in this function, so both are presumably module-level
    values -- confirm against the rest of the module.

    Returns:
        A list holding one ``device.get_result()`` layout per page. The list
        is empty when the document does not allow text extraction.

    Raises:
        IOError: if the file cannot be opened or read.
    '''
    # Bound up front so the final ``return`` cannot hit an unbound name when
    # the document is not extractable.
    page_layouts = []
    try:
        # The parser and doc pair act as a "pipe" of sorts.
        with open(fpath, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)
            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in doc.get_pages():
                    # Presumably pages are parsed from the open file on
                    # demand rather than from data loaded up front, which is
                    # why all processing stays inside the ``with`` block.
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        raise IOError("issue with loading file, please try again")
    # The ``with`` block already closed the file; the caller's ``f1`` is
    # deliberately left untouched.
    return page_layouts
def parsePDFtoTXT(pdf_path):
    """Append the text of every page of a PDF to ``pdfoutput.txt``.

    For each page the layout object is printed, then ``str(layout)`` followed
    by the text of every ``LTTextBoxHorizontal`` on the page is appended to
    ``pdfoutput.txt`` (UTF-8).

    Args:
        pdf_path: path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Keep the PDF open for the whole extraction and close it afterwards,
    # including when an exception is raised.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument()
        parser.set_document(document)
        document.set_parser(parser)
        document.initialize()
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in document.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            print(layout)
            pieces = [str(layout)]
            pieces.extend(
                x.get_text() for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            )
            # Opened per page so no output file is created for a document
            # without pages.
            with open('pdfoutput.txt', 'a', encoding='utf-8') as f:
                f.write(''.join(pieces))
def readPdf(self, path, callback=None, toPath=""):
    """Read the horizontal text boxes of a PDF and print, forward or save them.

    Args:
        path: path of the PDF file to read.
        callback: optional callable invoked with the text of each text box;
            only used when ``toPath`` is empty.
        toPath: when non-empty, the text of each text box plus a newline is
            appended to this file instead of being passed to ``callback``.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    with open(path, "rb") as pdf_stream:
        parser = PDFParser(pdf_stream)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        # pdfminer's documented order is set_parser() and then initialize();
        # the calls used to be the other way round.
        pdfFile.set_parser(parser)
        pdfFile.initialize()
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)
        for page in pdfFile.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                text = x.get_text()
                if toPath == "":
                    # Hand the text of each box to the caller, if requested.
                    if callback is not None:
                        callback(text)
                    print(text)
                else:
                    print(text)
                    with open(toPath, "a") as out_file:
                        out_file.write(text + "\n")
def process(path):
    """Accumulate ``count_word`` results over the text of a PDF.

    The lower-cased text of every ``LTTextBoxHorizontal`` is passed to
    ``count_word`` once for each of the word collections ``negative``,
    ``positive``, ``uncertainty``, ``litigious``, ``constraining``,
    ``superfluous`` and ``interesting`` (none of them defined in this
    function).

    Args:
        path: path of the PDF file to read.

    Returns:
        A list of seven totals, in the order listed above.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    word_lists = (negative, positive, uncertainty, litigious,
                  constraining, superfluous, interesting)
    totals = [0] * len(word_lists)
    # The file used to be closed right after initialize(), before any page
    # was read; it now stays open until all pages have been processed.
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = x.get_text().lower()
                    for index, words in enumerate(word_lists):
                        totals[index] += count_word(results, words)
    return totals
def get_abstract(path):
    """Return the first long text box at or after a mention of "abstract".

    Text boxes are scanned in document order. Once a box containing the word
    "abstract" (case-insensitive) has been seen, the first box longer than
    500 characters is returned with hyphenated line breaks and newlines
    removed.

    Args:
        path: path of the PDF file to read.

    Returns:
        The cleaned-up text, or an empty string when nothing matched or the
        document does not allow text extraction.
    """
    abstract = ""
    # ``with`` closes the file on the early return as well.
    with open(path, mode="rb") as fr:
        praser = PDFParser(fr)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()
        if not doc.is_extractable:
            return abstract

        seen_keyword = False
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text()
                if "abstract" in results.lower():
                    seen_keyword = True
                if seen_keyword and len(results) > 500:
                    # Re-join words split across lines, then drop newlines.
                    return results.replace("-\n", "").replace("\n", "")
    return abstract
def parse(oldpath, filepath):
    """Return the horizontal text boxes found on the first page of a PDF.

    Args:
        oldpath: directory containing the file.
        filepath: file name, joined onto ``oldpath``.

    Returns:
        A list with the text of each ``LTTextBoxHorizontal`` on the first
        page (newlines stripped from both ends), or ``False`` when anything
        goes wrong. Every failure, including a missing file or a document
        that forbids extraction, is printed and reported as ``False``.
    """
    try:
        filepath1 = os.path.join(oldpath, filepath)
        # ``with`` guarantees the file is closed on success and on failure.
        with open(filepath1, 'rb') as fp:
            praser_pdf = PDFParser(fp)
            doc = PDFDocument()
            praser_pdf.set_document(doc)
            doc.set_parser(praser_pdf)
            doc.initialize()
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Only the first page is inspected.
            page = next(doc.get_pages())
            interpreter.process_page(page)
            layout = device.get_result()
            return [
                out.get_text().strip("\n")
                for out in layout
                if isinstance(out, LTTextBoxHorizontal)
            ]
    except Exception as e:
        print("a", str(e))
        return False
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
            codec=None):
    """Dump selected parts of a PDF file to ``outfp``.

    Args:
        outfp: writable file-like object receiving the dump.
        fname: path of the PDF file.
        objids: object ids to dump with ``dumpxml``.
        pagenos: zero-based page numbers to dump.
        password: password passed to ``doc.initialize``.
        dumpall: when true, every object is dumped with ``dumpallobjs``.
        codec: passed through to the dump helpers. For selected pages a
            truthy codec dumps the page content streams, otherwise the page
            attributes are dumped.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
    trailers are dumped instead.
    """
    doc = PDFDocument()
    # ``with`` closes the file even when one of the dump helpers raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
def ParseAllPages(self, filepath):
    """Run the page interpreter over every page of a PDF file.

    Stores ``filepath`` on ``self.filepath`` and feeds each page through a
    ``PDFPageInterpreter`` attached to a plain ``PDFDevice``.

    Args:
        filepath: path of the PDF file to process.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    self.filepath = filepath
    # Open the PDF file; ``with`` closes it once every page was processed or
    # as soon as an error occurs.
    with open(filepath, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument()
        # Connect the parser and document objects.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        password = ""
        doc.initialize(password)
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
def parse(path):
    """Write the horizontal text boxes of a PDF to a text file.

    ``set_path(path)`` supplies two paths: the first is opened as the PDF,
    the second is the output file. Nothing is done when the output file
    already exists. Otherwise the text of every ``LTTextBoxHorizontal`` is
    appended to it (UTF-8), each followed by a newline.

    Args:
        path: value handed to ``set_path``; it is also printed.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    print(path)
    pdf_path, txt_path = set_path(path)
    if os.path.exists(txt_path):
        return
    # Keep the PDF open only for the duration of the extraction.
    with open(pdf_path, "rb") as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Opened lazily so that a PDF without text boxes does
                    # not leave an empty output file behind.
                    with open(txt_path, "a", encoding="utf-8") as f:
                        f.write(x.get_text() + "\n")
def harvest_file(self, path):
    """Extract the title and the text content of a PDF file.

    Args:
        path: path of the PDF file.

    Returns:
        A dict with the keys ``'title'``, ``'content'`` and ``'kind'``
        (always ``'PDF'``). The title is an empty string when the document
        carries no title.
    """
    with open(path, 'rb') as fp:
        # FIXME: how do we know which encoding to use? Should we
        # use 'chardet' to detect it?
        encoding = 'utf-8'
        parser = PDFParser(fp)
        if HAS_PDFMINER_3K:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
        else:
            doc = PDFDocument(parser)
        # ``doc.info`` is empty for documents without an Info dictionary;
        # treat that like a missing title instead of raising IndexError.
        info = doc.info[0] if doc.info else {}
        title = info.get('Title', '')
        if isinstance(title, PDFObjRef):
            title = title.resolve()
        if isinstance(title, bytes):  # This may not be necessary with pdfminer3k.
            try:
                title = title.decode(encoding)
            except UnicodeDecodeError:
                logger.warning('Could not correctly decode title of "%s".', path)
                title = title.decode(encoding, 'ignore')
        # Rewind so the content extraction starts from the beginning.
        fp.seek(0)
        content = extract_content(fp, encoding).strip()
    try:
        content = content.decode(encoding)
    except UnicodeDecodeError:
        logger.warning('Could not correctly decode content of "%s".', path)
        content = content.decode(encoding, 'ignore')
    return {
        'title': title,
        'content': content,
        'kind': 'PDF',
    }
def pdf_to_text(filename):
    """Convert a PDF file to text with page markers.

    Each page is wrapped in ``START PAGE n`` / ``END PAGE n`` lines
    (zero-based). The number of pages is printed before conversion starts.

    Args:
        filename: path of the PDF file.

    Returns:
        The contents of the output buffer after all pages were converted.
    """
    # NOTE(review): cStringIO only exists on Python 2 -- confirm the target
    # interpreter before porting this import.
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter  # <-- changed
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = TextConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    doc = PDFDocument()
    with open(filename, 'rb') as fp:
        try:
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            interpreter = PDFPageInterpreter(rsrc, device)
            # Materialise the pages once instead of walking the page tree
            # twice (once for the count, once for the conversion).
            pages = list(doc.get_pages())
            print("There are: " + str(len(pages)) + " pages")
            for i, page in enumerate(pages):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            # Release the converter even when a page fails to parse.
            device.close()
    return outfp.getvalue()
def extractContent(file):
    """Convert a PDF file to text and report progress on stdout.

    Prints ``extractContent`` at the start, ``page=<n>`` for every page and
    ``EOF`` once all pages were handled.

    Args:
        file: path of the PDF file.

    Returns:
        The contents of the output buffer after all pages were converted.
    """
    print("extractContent")
    rsrcmgr = PDFResourceManager()
    codec = 'UTF-8'
    laparams = LAParams()
    outfp = StringIO.StringIO()
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    with open(file, 'rb') as fp:
        try:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # NOTE(review): the document is neither initialized nor checked
            # with is_extractable here -- confirm that this is intended.
            for i, page in enumerate(doc.get_pages()):
                print("page=" + str(i))
                if page is not None:
                    interpreter.process_page(page)
            print("EOF")
        finally:
            # Release the converter even when a page fails to parse.
            device.close()
    return outfp.getvalue()
def extractText(file_name):
    """Extract the text of a PDF file.

    The layout analysis uses ``char_margin`` and ``word_margin`` of 1.0.

    Args:
        file_name: path of the PDF file.

    Returns:
        The concatenated text of every ``LTTextBox`` and ``LTTextLine``
        found in the page layouts, in document order.
    """
    chunks = []
    # ``with`` closes the file, which used to stay open forever.
    with open(file_name, 'rb') as connection:
        parser = PDFParser(connection)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            chunks.extend(
                lt_obj.get_text() for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
    # Joining once avoids repeated string concatenation.
    return ''.join(chunks)
def text(self) -> List[Tuple[int, str]]:
    """Return the text of each page of the PDF at ``self._path``.

    Returns:
        One ``(page_index, page_text)`` tuple per page. ``page_text`` is the
        stripped text of every ``LTTextBoxHorizontal`` on the page joined
        without a separator.

    Raises:
        Exception: if the document does not allow text extraction.
    """
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LTTextBoxHorizontal, LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfparser import PDFParser, PDFDocument

    with open(self._path, 'rb') as stream:
        document = PDFDocument()
        pdf_parser = PDFParser(stream)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize()
        if not document.is_extractable:
            raise Exception('The Pdf text extraction is not allowed when procesing ' + self._path)

        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        pages_text = []
        page_index = 0
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
            fragments = []
            for element in aggregator.get_result():
                if isinstance(element, LTTextBoxHorizontal):
                    fragments.append(element.get_text().strip())
            pages_text.append((page_index, ''.join(fragments)))
            page_index += 1
        return pages_text
def fetch_pdf_urls(file_name):
    """Collect the URIs of the link annotations in a PDF file.

    Args:
        file_name: path of the PDF file.

    Returns:
        A list with every non-empty ``/A /URI`` value found in the page
        annotations, or an empty string when the file cannot be processed
        (the error is logged).
    """
    try:
        links = []
        # ``with`` closes the file on the error path as well.
        with open(file_name, 'rb') as file_pointer:
            parser = PDFParser(file_pointer)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            # Pages are exposed through get_pages(); the document object
            # itself is not iterated.
            for page in doc.get_pages():
                if 'Annots' not in page.attrs:
                    continue
                link_object_list = page.attrs['Annots']
                # Due to implementation of pdfminer the link_object_list can
                # either be the list directly or a PDF Object reference
                if not isinstance(link_object_list, list):
                    link_object_list = link_object_list.resolve()
                for link_object in link_object_list:
                    if not isinstance(link_object, dict):
                        link_object = link_object.resolve()
                    try:
                        uri = link_object['A']['URI']
                    except (KeyError, TypeError):
                        # Not a URI link; one such annotation must not
                        # discard the links that were already collected.
                        continue
                    if uri:
                        links.append(uri)
        return links
    except Exception as e:
        logging.error('Error while fetching URLs : ' + str(e))
        return ''
def getPDFMetadata(path):
    """Return the metadata of a PDF file as a dict.

    The result starts from the first entry of ``doc.info`` and is then
    updated with whatever ``xmp_to_dict`` returns for the catalog's
    ``Metadata`` stream, when there is one.

    Previously ``result`` was the ``doc.info`` list itself, so both
    ``update`` calls always failed silently and a document without Info
    raised ``IndexError``. The raw-XMP update was dropped because updating
    a dict with the raw stream data cannot succeed.

    Args:
        path: path of the PDF file.

    Returns:
        A new dict; empty when the document has neither Info nor usable XMP
        metadata.
    """
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        # Copy so the document's own Info dictionary is not mutated.
        result = dict(doc.info[0]) if doc.info else {}
        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            try:
                result.update(xmp_to_dict(metadata))
            except Exception:
                # XMP is best-effort: fall back to the Info entries alone.
                pass
    return result
def readPdf(dir_and_name, pdf_file, num):
    """Copy a PDF into ``result/`` when its text mentions Airbus.

    For every page, the horizontal text boxes are scanned until one contains
    ``'Airbus'`` or ``'airbus'``; the file at ``dir_and_name`` is then copied
    to ``result/<num>.pdf``, ``num`` is incremented and the scan moves on to
    the next page.

    Args:
        dir_and_name: path of the PDF on disk (source of the copy).
        pdf_file: open file object of the same PDF, used for parsing.
        num: number used for the next copy.

    Returns:
        ``num`` after all increments.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    pdf_parser = PDFParser(pdf_file)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize()
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)
    for pdf_page in document.get_pages():
        page_interpreter.process_page(pdf_page)
        for element in aggregator.get_result():
            if not isinstance(element, LTTextBoxHorizontal):
                continue
            if 'Airbus' in element.get_text() or 'airbus' in element.get_text():
                target = 'result/' + str(num) + '.pdf'
                shutil.copyfile(dir_and_name, target)
                num += 1
                # Stop scanning this page after the first hit.
                break
    return num
def create_pages(self):
    """Create and save one ``Page`` per parsed page of ``self.pdf_file``.

    Pages are numbered from 1 in the order returned by
    ``self._parse_pages``. Nothing is saved when the document does not allow
    text extraction.
    """
    from public_project.models import Page

    # Wire the parser and the document together, then unlock the document
    # with an empty password.
    pdf_parser = PDFParser(self.pdf_file)
    pdf_document = PDFDocument()
    pdf_parser.set_document(pdf_document)
    pdf_document.set_parser(pdf_parser)
    pdf_document.initialize('')

    if not pdf_document.is_extractable:
        return

    parsed_pages = self._parse_pages(pdf_document)
    for number, raw_content in enumerate(parsed_pages, 1):
        content = smart_unicode(raw_content, encoding='utf-8',
                                strings_only=False, errors='strict')
        Page(document=self.document, number=number, content=content).save()
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Feed the pages of a PDF through an interpreter bound to ``device``.

    Args:
        rsrcmgr: resource manager shared with ``device``.
        device: device that receives the interpreted pages.
        fp: open binary file object of the PDF.
        pagenums: optional collection of zero-based page numbers; when
            given, pages not listed are skipped.
        maxpages: processing stops after the page with number
            ``maxpages - 1``; a falsy value disables the limit.
        password: document password (empty string when there is none).

    Returns:
        A dict mapping each zero-based page number to its page object. It
        contains every page of the document, not only the processed ones.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    pages = dict(enumerate(doc.get_pages()))
    # Walk the pages in ascending order explicitly: the maxpages cut-off
    # depends on it, and ``dict.iteritems`` does not exist on Python 3.
    for num in sorted(pages):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(pages[num])
        if maxpages and maxpages <= num + 1:
            break
    return pages
class Pdf(object):
    """Thin wrapper around a pdfminer document built from an open file."""

    def __init__(self, pdf_file):
        """Parse ``pdf_file``, an open binary file object of a PDF.

        The file must stay open for as long as the instance is used,
        because ``pages`` and ``to_text`` read the document afterwards.
        """
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # ``initialize`` used to be referenced without being called, and
        # before the parser was attached, so the document was never
        # initialized.
        self._doc.initialize()

    @property
    def pages(self):
        """Number of pages in the document."""
        return len(tuple(self._doc.get_pages()))

    def to_text(self):
        """Return the text of the whole document.

        Layout analysis runs with vertical text detection, ``all_texts``
        enabled and a word margin of 0.4. Bytes that are not valid UTF-8 are
        dropped from the result.
        """
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
            interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
def parse_pdf(pdf_url):
    """Download a PDF and return its text grouped by page and text object.

    Args:
        pdf_url: URL of the PDF; the whole file is read into memory.

    Returns:
        A list with one entry per page. Each entry is a list holding, for
        every ``LTTextBox`` / ``LTTextLine`` with non-blank text, the list
        of its text lines.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(pdf_url) as response:
        memory_file = io.BytesIO(response.read())

    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)
    # Warning sometimes, error in pdf?
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        page_entries = []
        for lt_obj in device.get_result():
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                text = lt_obj.get_text()
                if text.strip():
                    page_entries.append(text.splitlines())
        ret.append(page_entries)
    return ret
def parse():
    """Print the text of ``20150623043633273.pdf`` and append it to ``2015g.txt``.

    The text of every ``LTTextBoxHorizontal`` is printed and appended to the
    output file, each followed by a newline.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    with open('20150623043633273.pdf', 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Append mode: the file used to be opened read-only,
                    # which made the write() below fail.
                    with open(r'2015g.txt', 'a') as f:
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
def pdf_isvalid(filelike):
    '''
    returns True if valid pdf, else False

    A stream counts as valid when it starts with PDF_MAGIC, can be parsed
    and initialized with an empty password, and allows text extraction.
    The stream is rewound to offset 0 before returning on every path,
    including a header mismatch.

    @param filelike: filelike object, seekable
    '''
    logger = logging.getLogger()
    filelike.seek(0)
    try:
        if filelike.read(len(PDF_MAGIC)) != PDF_MAGIC:
            return False
        filelike.seek(0)
        try:
            parser = PDFParser(filelike)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            return bool(doc.is_extractable)
        except PDFException as excobj:
            logger.warning(
                "pdf has valid header but, still not valid pdf, exception was %r",
                excobj)
            return False
    finally:
        # Leave the stream where the caller handed it over.
        filelike.seek(0)
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None):
    """Write the outline (bookmarks) of a PDF file to ``outfp``.

    One line is written per outline entry: the ``repr`` of the tuple
    ``(level, title, dest, pageno)``. ``pageno`` is the zero-based number of
    the target page, or ``None`` when the entry has neither a destination
    nor a ``/GoTo`` action with a destination.

    Args:
        outfp: writable file-like object.
        fname: path of the PDF file.
        objids, pagenos, dumpall, codec: unused in this function.
        password: password passed to ``doc.initialize``.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python 2 only ``file`` builtin; ``with`` closes
    # the file even when the outline cannot be read.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map page object ids to zero-based page numbers.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
def get_metadata(self):
    """Returns metadata from both the info field (older PDFs) and XMP
    (newer PDFs).

    Every entry of ``doc.info`` is added to a fresh ``Metadata`` object.
    When the catalog has a ``Metadata`` stream, its ``xap``, ``pdf`` and
    ``dc`` sections are added as well (``dc`` with ``metadataType="dc"``).
    The result is also stored on ``self.metadata``.

    Return format is a .modules.metadata.Metadata object
    """
    # ``with`` closes the file even when parsing the metadata fails.
    with open(self.path, 'rb') as file_pointer:
        parser = PDFParser(file_pointer)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        metadata = Metadata()
        for info_entry in doc.info:
            metadata.add(info_entry)
        if 'Metadata' in doc.catalog:
            xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
            xmp_dict = xmp_to_dict(xmp_metadata)
            # Let's add only the most useful ones.
            for namespace in ("xap", "pdf"):
                if namespace in xmp_dict:
                    metadata.add(xmp_dict[namespace])
            if "dc" in xmp_dict:
                metadata.add(xmp_dict["dc"], metadataType="dc")
    self.metadata = metadata
    return metadata
def load(self, open_file):
    """Parse a PDF and collect its annotations and per-page text.

    Resets ``self.fields`` and ``self.text``. For every page,
    ``self._build_annotations`` is called when the page has annotations,
    and the result of ``self._get_text`` is stored in ``self.text`` under
    the one-based page number.

    Args:
        open_file: open binary file object of the PDF.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    self.fields = {}
    self.text = {}

    # Bind parser and document to each other; unlock with an empty password.
    pdf_parser = PDFParser(open_file)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # A layout-aggregating device driven by a page interpreter.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    for page_number, pdf_page in enumerate(document.get_pages(), 1):
        page_interpreter.process_page(pdf_page)
        if pdf_page.annots:
            self._build_annotations(pdf_page)
        self.text[page_number] = self._get_text(aggregator)
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the Info metadata of a local or remote PDF.

    Args:
        fileOrUrl: path of a PDF file, or an ``http://`` / ``https://`` URL
            that is downloaded into memory first.
        textmode: when true, print one ``<fileOrUrl>:<name>=<value>`` line
            per metadata entry; otherwise print ``prefix`` followed by the
            metadata (unwrapped when ``doc.info`` holds a single entry).
        prefix: text printed before the metadata in non-text mode. It is
            replaced by ``fileOrUrl + ':'`` when ``args`` has more than one
            element.
        basicauth: value sent after ``Basic `` in the Authorization header.

    NOTE(review): ``args`` is not defined in this function; presumably a
    module-level list of command-line arguments -- confirm.
    """
    if len(args) > 1:
        prefix = fileOrUrl + ':'
    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        # Close the input even when the document cannot be parsed.
        fp.close()
    if textmode:
        for obj in doc.info:
            for (name, val) in obj.items():
                print('{0}:{1}={2}'.format(fileOrUrl, name, val))
    else:
        val = doc.info
        if isinstance(val, list) and len(val) == 1:
            val = val[0]
        print(prefix + str(val))
def getData(self):
    """Load the Info dictionary of ``self.fname`` into ``self.metadata``.

    The trailer of every cross-reference table is inspected. Each Info
    dictionary found is resolved and stored on ``self.metadata`` and
    ``self.raw``, so the last one wins.

    Returns:
        ``"error"`` when the document cannot be parsed or initialized,
        ``"Empty metadata"`` when ``self.raw`` is ``None`` afterwards,
        ``"ok"`` otherwise. If reading the trailers raises, the exception
        object itself is returned.
    """
    doc = PDFDocument()
    fp = open(self.fname, 'rb')
    parser = PDFParser(fp)
    try:
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(self.password)
        except Exception:
            return "error"
        try:
            for xref in doc.xrefs:
                info_ref = xref.trailer.get('Info')
                if info_ref:
                    info = resolve1(info_ref)
                    self.metadata = info
                    self.raw = info
            if self.raw is None:
                return "Empty metadata"
            return "ok"
        except Exception as e:
            return e
    finally:
        # Close only after the trailers were read, and on every exit path;
        # the "error" return used to leave the file open.
        parser.close()
        fp.close()
def get_toc(self):
    """Return the outline titles at the level chosen by ``self.get_level1``.

    When the document has Info metadata containing a ``Title``,
    ``self.title`` is set to its normalized form first.

    Returns:
        A list of outline titles passed through ``normalize_toc_item``, or
        ``None`` when the outline cannot be read.
    """
    # ``with`` closes the file on every return path.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        # title
        if doc.info:
            metadict = doc.info[0]
            if 'Title' in metadict:
                self.title = normalize_title(metadict['Title'])
        # level 1 of toc
        try:
            outlines = doc.get_outlines()
            toc = list()
            select_level = self.get_level1(outlines)
        except Exception:
            return None
        # The outline is requested a second time instead of reusing
        # ``outlines`` from above.
        for (level, title, dest, a, se) in doc.get_outlines():
            if level == select_level:
                toc.append(normalize_toc_item(title))
        return toc
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to analyse the layout of a PDF.

    Args:
        fh: open binary file object of the PDF.

    Returns:
        A ``(doc, interpreter, device)`` tuple: the initialized document, a
        page interpreter and the ``PDFPageAggregator`` it feeds (layout
        analysis with ``word_margin`` 0.0).

    Raises:
        ValueError: if the document does not allow text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object and the interpreter driving it.
    # (A plain PDFDevice and a second interpreter used to be built first and
    # were immediately overwritten.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def parse_pdf_pdfminer(self, f, fpath):
    """Extract the text of each page of a PDF and hand it to ``parse_page``.

    The handler's header is printed first, then every page is converted with
    a fresh ``TextConverter`` and passed to ``self.parse_page`` as UTF-8
    bytes together with its one-based page number; finally the footer is
    printed. Any exception other than ``KeyboardInterrupt`` / ``SystemExit``
    is reported through ``self.handler.print_error``.

    Args:
        f: open binary file object of the PDF.
        fpath: path reported to the handler and to ``parse_page``.
    """
    try:
        layout_params = LAParams()
        layout_params.all_texts = True
        resources = PDFResourceManager()
        pagenos = set()

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)

        pdf_parser = PDFParser(f)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)

        for page_number, pdf_page in enumerate(document.get_pages(), 1):
            buffer = StringIO()
            converter = TextConverter(resources, buffer, laparams=layout_params)
            PDFPageInterpreter(resources, converter).process_page(pdf_page)
            page_text = buffer.getvalue()
            self.parse_page(fpath, bytes(page_text, 'UTF-8'), page_number)
            buffer.close()

        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as error:
        self.handler.print_error(fpath, error)
def WithPdf(self, pdfdoc, password, fn, *args):
    """Open the pdf document, and apply the function, returning the results.

    Args:
        pdfdoc: path of the PDF file.
        password: when truthy, stored on ``self.password`` before use;
            otherwise the existing ``self.password`` is used.
        fn: callable invoked as ``fn(doc, *args)`` when the document allows
            text extraction.
        *args: extra positional arguments for ``fn``.

    Returns:
        The result of ``fn``, or ``None`` when the document is not
        extractable or an ``IOError`` occurred (for example a missing file).
    """
    result = None
    try:
        # ``with`` closes the file even when initialization or ``fn`` raise.
        with open(pdfdoc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument()
            # connect the parser and document objects
            parser.set_document(doc)
            doc.set_parser(parser)
            # supply the password for initialization
            if password:
                self.password = password
            doc.initialize(self.password)
            if doc.is_extractable:
                # apply the function and return the result
                result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or similar problem
        pass
    return result
def extract_text_elements_from_pdf(path, j=nulljob):
    """Opens a PDF and extract every element that is text based (LTText).

    Args:
        path: path of the PDF file.
        j: job object whose ``iter_with_progress`` is used to iterate over
            the pages while reporting progress.

    Returns:
        A ``(pages, all_elements)`` tuple. ``pages`` holds one ``Page``
        (built from layout width and height) per page; ``all_elements``
        holds the elements of all pages, each tagged with its page number
        and its position on that page.
    """
    pages = []
    all_elements = []
    # ``with`` closes the file, which used to stay open forever.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5,
                            heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialised up front so the pages are handed over as a list.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
def read_invoice_pdfminer3k(pdfFile):
    """Extract client and invoice number of an invoice PDF and record them.

    The file is read from the ``invoice_path`` directory. The client is
    taken from the PDF text between ``client_start`` and ``client_end``, the
    invoice number from the file name between ``invoice_start`` and
    ``invoice_end`` (all defined outside this function). Both values are
    printed and passed to ``write_excel``.

    Args:
        pdfFile: file name of the invoice inside ``invoice_path``.
    """
    text_parts = []
    # os.path.join picks the right separator for the platform; the path used
    # to be glued together with a hard-coded backslash.
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            text_parts.extend(
                lt_obj.get_text() for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
    invoice_text = "".join(text_parts)
    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)
    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)
    # Pass the client info and invoice no to the method which writes to
    # excel file
    write_excel(client, invoice_no)
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally save it as ``.txt``."""

    def __init__(self, filename):
        """Open and initialize the PDF at ``filename``.

        The file stays open for the lifetime of the object because the pages
        are read later by ``getString``; call ``close`` when done.
        """
        self.__filename = filename
        self.__fp = open(self.__filename, 'rb')
        parser = PDFParser(self.__fp)
        self.__doc = PDFDocument()
        parser.set_document(self.__doc)
        self.__doc.set_parser(parser)
        self.__doc.initialize('')

    def close(self):
        """Close the underlying PDF file."""
        self.__fp.close()

    def writeToTxt(self):
        """Write the extracted text next to the PDF, with a ``.txt`` suffix.

        Characters outside ASCII are replaced by ``?``.
        """
        import os

        text = self.getString()
        # Swap only the extension. The former ``replace(".pdf", ".txt")``
        # returned the source path unchanged for names such as "x.PDF", so
        # the PDF itself was overwritten.
        base, ext = os.path.splitext(self.__filename)
        if ext.lower() == ".pdf":
            txt_path = base + ".txt"
        else:
            txt_path = self.__filename + ".txt"
        with open(txt_path, "w") as txtFile:
            txtFile.write(text.encode('ascii', 'replace').decode("utf-8"))

    def getString(self):
        """Return the text of all pages as one string."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
def pdf_function(pdf_doc, password='', *args, **kwargs):
    """Open a PDF and apply ``function`` to its document object.

    NOTE(review): ``function`` is not defined here; presumably it comes from
    an enclosing scope (for example a decorator) -- confirm.

    Args:
        pdf_doc: path of the PDF file.
        password: password used to initialize the document.
        *args, **kwargs: forwarded to ``function`` after the document.

    Returns:
        The result of ``function(doc, *args, **kwargs)``, or ``None`` when
        the document is not extractable or an ``IOError`` occurred (for
        example a missing file).
    """
    result = None
    try:
        # ``with`` closes the file even when initialization or ``function``
        # raise.
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument()
            # connect the parser and document objects
            parser.set_document(doc)
            doc.set_parser(parser)
            # supply the password for initialization
            doc.initialize(password)
            if doc.is_extractable:
                # apply the function and return the result
                result = function(doc, *args, **kwargs)
    except IOError:
        # the file doesn't exist or similar problem
        pass
    return result
def getData(fileName):
    """Print selected Info entries (author, company, ...) of a PDF file.

    The trailer of every cross-reference table is inspected; the last Info
    dictionary found is used. For each of ``Author``, ``Company``,
    ``Producer`` and ``Creator`` present in it, one line is printed.

    Args:
        fileName: path of the PDF file.

    Returns:
        ``"error"`` when the document cannot be parsed,
        ``"Empty metadata"`` when no Info dictionary was found, the
        exception object when reading the metadata raised, and ``None``
        after the entries were printed.
    """
    doc = PDFDocument()
    fp = open(fileName, 'rb')
    parser = PDFParser(fp)
    try:
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
        except Exception:
            return "error"
        try:
            # Bound up front so a document without Info is reported as
            # empty instead of failing on an unbound name.
            metadata = None
            for xref in doc.xrefs:
                info_ref = xref.trailer.get('Info')
                if info_ref:
                    metadata = resolve1(info_ref)
            if metadata is None:
                return "Empty metadata"
            for key in ('Author', 'Company', 'Producer', 'Creator'):
                if key in metadata:
                    print(key + " " + metadata[key])
        except Exception as e:
            print("\t [x] Error in PDF extractor")
            return e
    finally:
        # Close only after the trailers were read, and on every exit path.
        parser.close()
        fp.close()
def convert_pdf_to_txt(path):
    """Return the text of the PDF file at ``path``.

    All pages are rendered through a ``TextConverter`` into an in-memory
    buffer whose contents are returned; converter and buffer are closed
    before returning.
    """
    resources = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(resources, buffer, laparams=LAParams())

    with open(path, 'rb') as pdf_stream:
        pdf_parser = PDFParser(pdf_stream)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        page_interpreter = PDFPageInterpreter(resources, converter)
        # Render every page into the buffer.
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
        extracted = buffer.getvalue()

    converter.close()
    buffer.close()
    return extracted
def parse(self, path):
    """Convert the PDF file at ``path`` into a ``Sample``.

    Layout analysis uses ``char_margin`` 2.0, ``line_margin`` 2.0 and
    ``word_margin`` 0.0; the text is produced with the ``utf-8`` codec.

    Args:
        path: path of a PDF file.

    Returns:
        ``Sample(path, None, text)`` where ``text`` is the converted
        document.

    Raises:
        NotImplementedError: if ``path`` is a directory.
    """
    # Directory
    if os.path.isdir(path):
        raise NotImplementedError()

    # File
    out = StringIO.StringIO()
    rsrc = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0
    device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
    # Binary mode: a PDF is not text. ``open`` replaces the Python 2 only
    # ``file`` builtin and ``with`` closes the file afterwards.
    with open(path, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        interpreter = PDFPageInterpreter(rsrc, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
    device.close()
    sample = Sample(path, None, out.getvalue())
    out.close()
    return sample
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
            codec=None):
    """Dump selected parts of a PDF file to ``outfp``.

    Args:
        outfp: writable file-like object receiving the dump.
        fname: path of the PDF file.
        objids: object ids to dump with ``dumpxml``.
        pagenos: zero-based page numbers to dump.
        password: password passed to ``doc.initialize``.
        dumpall: when true, every object is dumped with ``dumpallobjs``.
        codec: passed through to the dump helpers. For selected pages a
            truthy codec dumps the page content streams, otherwise the page
            attributes are dumped.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
    trailers are dumped instead.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python 2 only ``file`` builtin; ``with`` closes
    # the file even when one of the dump helpers raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def pdf_to_csv(filename):
    """Convert a PDF file with ``CsvConverter`` and return the output.

    Each page is wrapped in ``START PAGE n`` / ``END PAGE n`` lines
    (zero-based).

    Args:
        filename: path of the PDF file.

    Returns:
        The contents of the output buffer after all pages were converted.
    """
    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # because my test documents are utf-8 (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    doc = PDFDocument()
    with open(filename, 'rb') as fp:
        try:
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            # Release the converter even when a page fails to parse.
            device.close()
    return outfp.getvalue()
def pdf2csv(fp):
    """Render the grid spanned by the ruling lines of each page to a PNG.

    For every page, the x and y extents of all ``LTRect`` / ``LTLine``
    objects are collected (y measured from the top of the page), reduced
    with ``filterclose`` and printed together with the page size. Every
    cell between neighbouring coordinates that are more than 5 units apart
    in both directions is outlined on a 1-bit image, which is saved as
    ``out<pageno>.png``.

    Args:
        fp: open binary file object of the PDF.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        hlines = []
        vlines = []
        for item in layout:
            # Exact type match on purpose: subclasses are not considered.
            if type(item) not in (LTRect, LTLine):
                continue
            hlines.append(int(item.x0))
            hlines.append(int(item.x1))
            vlines.append(int(layout.height - item.y0))
            vlines.append(int(layout.height - item.y1))
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        print(hlines)
        print(vlines)
        print((layout.width, layout.height))
        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        for row in range(len(vlines) - 1):
            if not vlines[row + 1] - vlines[row] > 5:
                continue
            for col in range(len(hlines) - 1):
                if not hlines[col + 1] - hlines[col] > 5:
                    continue
                draw.rectangle(
                    [(int(hlines[col]), int(vlines[row])),
                     (int(hlines[col + 1]), int(vlines[row + 1]))],
                    outline=1)
        del draw
        # A separate name keeps the ``fp`` parameter (the PDF being read)
        # from being rebound to the output file.
        with open("out%s.png" % pageno, 'wb') as png_file:
            im.save(png_file, "PNG")
def parse(path):
    """Parse a PDF into a ``pdf`` paper object, one box/line at a time.

    Each page starts a new page on the paper object and records the size
    returned by ``getPageSize``. Every horizontal text box starts a new box,
    and every line inside it starts a new line that is handed to
    ``paper.divideWord``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The populated ``pdf`` object.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Open in binary mode; the ``with`` block closes the file on every exit
    # path (the original never closed it).
    with open(path, 'rb') as fp:
        # Create a parser for the file and an empty document, then link them.
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)

        # Initialise without a password.
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregating device, interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        paper = pdf()

        # Handle one page per iteration.
        for page in doc.get_pages():
            print(page)
            paper.newPage()
            size = getPageSize(page)
            paper.setSize(size)
            interpreter.process_page(page)
            # LTPage object holding the layout items parsed from this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    paper.newBox()
                    for l in x:
                        paper.newLine()
                        paper.divideWord(l)
        return paper
def MapFactory(map_path):
    """Build the map object matching the image embedded in a PDF file.

    The PDF object at index ``_PDF_OBJ_INDEX_`` must be a ``PDFStream`` with
    ``Width``, ``Height`` and ``ColorSpace`` entries. Its dimensions select one
    of the ``MapA*`` classes. When the stream reports a height of 1, the data
    and the real height come from ``_ProcessWeirdPDF`` instead.

    Args:
        map_path: Path of the PDF file holding the map image.

    Returns:
        An instance of the matching map class, or ``None`` when the file
        cannot be opened or parsed, the expected stream is missing or
        incomplete, or no map class has the image's dimensions.
    """
    # ``open`` instead of the Python 2-only ``file`` builtin. Failure stays
    # best-effort (``None``), but no longer swallows KeyboardInterrupt or
    # SystemExit as the bare ``except:`` did.
    try:
        map_file = open(map_path, "rb")
    except Exception:
        return None

    # Keep the file open until the map object is built, then close it on
    # every exit path (the original leaked the handle).
    with map_file:
        document = PDFDocument()
        try:
            parser = PDFParser(map_file)
            parser.set_document(document)
            document.set_parser(parser)
            document.initialize("")
        except Exception:
            return None

        obj = document.getobj(_PDF_OBJ_INDEX_)
        if not obj or not isinstance(obj, PDFStream):
            return None
        for key in ("Width", "Height", "ColorSpace"):
            if key not in obj:
                return None

        width = obj["Width"]
        height = obj["Height"]

        if height == 1:
            # "Weird" PDFs report a one-pixel-high stream; the helper
            # supplies both the pixel data and the real height.
            data, height = _ProcessWeirdPDF(document)
        else:
            data = obj.get_data()

        # Same candidates, in the same order, as the original elif chain.
        candidates = (
            MapA4Portrait, MapA4Landscape,
            MapA3Portrait, MapA3Landscape,
            MapA2Portrait, MapA2Landscape,
            MapA1Portrait, MapA1Landscape,
        )
        for map_class in candidates:
            if width == map_class.WIDTH and height == map_class.HEIGHT:
                return map_class(_MakePPMImage(width, height, data), map_path)
        return None
def get_doc_pages(filename):
    """Open a PDF and return the result of ``doc.get_pages()``.

    The file is deliberately left open on success: the returned pages are
    produced from a document that is still attached to the parser reading
    this file. If setting the document up fails, the file is closed before
    the exception propagates.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        Whatever ``PDFDocument.get_pages()`` returns for the document.
    """
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        return doc.get_pages()
    except BaseException:
        # Do not leak the handle when the document cannot be set up.
        fp.close()
        raise
def parsePDFfile(filepath):
    """Build a ``PDFDocument`` for a PDF file and return it.

    The file is closed before returning, and now also when construction of
    the parser or document raises (previously the handle leaked).

    NOTE(review): the document is returned after its backing file has been
    closed, and it is both constructed with the parser and then linked again
    through ``set_document`` / ``set_parser``. Whether the returned object is
    still usable depends on the pdfminer version in use; confirm with callers.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The ``PDFDocument`` built from the file.
    """
    with open(filepath, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        doc.set_parser(parser)
    return doc
def read_pdf(pdf, fileName):
    """Extract two labelled fields from the first page of a PDF.

    Only the first page is examined. The text of every horizontal text box is
    collected; for each box containing one of the two field labels, the value
    is taken either from the middle line of that box (when the box holds
    exactly two newlines) or from the following box. As soon as two values
    have been found, ``[fileName, value1, value2]`` is appended to the
    module-level ``input`` list.

    Args:
        pdf: Binary file object for the PDF document.
        fileName: Name recorded as the first element of the appended row.
    """
    # Create the parser and document, link them, and initialise with an
    # empty password.
    parser = PDFParser(pdf)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize("")

    # Resource manager, layout parameters, aggregator and page interpreter.
    resource = PDFResourceManager()
    laparam = LAParams()
    device = PDFPageAggregator(resource, laparams=laparam)
    interpreter = PDFPageInterpreter(resource, device)

    for page in doc.get_pages():
        interpreter.process_page(page)
        # LTPage object holding the layout items parsed from this page.
        layout = device.get_result()

        lines = []
        for box in layout:
            if isinstance(box, LTTextBoxHorizontal):
                # Round-trip through the escape codecs so text that arrives
                # as literal \uXXXX sequences is turned back into characters.
                text = box.get_text().encode('raw_unicode_escape').decode(
                    'unicode_escape')
                print(box)
                lines.append(text)

        info = [fileName]
        for pos, text in enumerate(lines):
            if '经营者姓名' in text or '身份证号码' in text:
                if text.count('\n') == 2:
                    # Label and value share a box: take the middle line.
                    matches = re.findall(".*\n(.*)\n.*", text)
                    if matches:
                        print('过滤取数据 = ' + matches[0])
                        info.append(matches[0])
                elif pos + 1 < len(lines):
                    # Otherwise the value is the next text box.
                    print('取下一个下标数据 = ' + lines[pos + 1].strip())
                    info.append(lines[pos + 1].strip())
            if len(info) == 3:
                input.append(info)
                break

        # Only the first page carries the fields; stop instead of walking the
        # remaining pages without using them.
        break
def from_pdf_to_txt(read_file, write_file, page_start=0, page_end=0):
    """Append the text of a page range of a PDF to a UTF-8 text file.

    Each horizontal text box is written followed by a newline. The output
    file is opened in append mode, once per page that contains text.

    :param read_file: str. Path of the input file (a ``.pdf`` file).
    :param write_file: str. Path of the output file (a ``.txt`` file).
    :param page_start: int. Zero-based index of the first page to convert.
    :param page_end: int. Index one past the last page; ``0`` means "through
        the last page".
    :return: None
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # The ``with`` block closes the source PDF on every exit path; the
    # original only closed it when no exception was raised.
    with open(read_file, 'rb') as origin_pdf_file:
        # Create the parser and document and link them to each other.
        parser = PDFParser(origin_pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise without a password.
        doc.initialize()

        # Fail loudly when the PDF does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, aggregating device and page interpreter.
        srcmgr = PDFResourceManager()
        device = PDFPageAggregator(srcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(srcmgr, device)

        pages = list(doc.get_pages())
        if page_end == 0:
            page_end = len(pages)

        for i in range(page_start, page_end):
            interpreter.process_page(pages[i])
            # LTPage object holding the layout items parsed from this page.
            layout = device.get_result()
            chunks = [
                x.get_text() + '\n'
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ]
            # Open the output once per page rather than once per text box.
            # As before, nothing is created when there is no text to write.
            if chunks:
                with open(write_file, 'a', encoding='utf-8') as f:
                    f.writelines(chunks)
def parsePDF(pdfFile):
    """Collect the text of every horizontal text box in a PDF.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        A list whose first element is ``'.'`` followed by one string per
        horizontal text box, in page order; or ``None`` when the document
        does not allow text extraction.
    """
    # The ``with`` block closes the file on every exit path (the original
    # never closed it).
    with open(pdfFile, 'rb') as fp:
        # Create the parser and document and link them to each other.
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)

        # Result list; it is seeded with a single '.' entry.
        textlist = ['.']

        # Initialise without a password.
        doc.initialize()

        # Documents that forbid extraction yield None rather than raising.
        if not doc.is_extractable:
            return None

        # Resource manager, layout parameters, aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # LTPage object holding the layout items parsed from the page.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        textlist.append(x.get_text())
        finally:
            # Release the device even when a page fails to parse.
            device.close()
        return textlist
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None):
    """Write a PDF's outline (bookmarks) to ``outfp`` as XML.

    Emits an ``<outlines>`` element containing one ``<outline>`` per entry,
    with its level and title, the resolved destination (when there is one)
    and the zero-based page number it points to (when it can be determined).
    Nothing is written after the point where ``PDFNoOutlines`` is raised.

    Args:
        outfp: Writable file-like object receiving the XML.
        fname: Path of the PDF file to read.
        objids: Unused here; kept for a signature parallel to other dumpers.
        pagenos: Unused here.
        password: Password used to initialise the document.
        dumpall: Unused here.
        codec: Unused here.
    """
    doc = PDFDocument()
    # ``open`` works on Python 2 and 3; the ``file`` builtin is Python 2 only.
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page's object id to its zero-based page number.
            pages = {page.pageid: pageno
                     for (pageno, page) in enumerate(doc.get_pages())}

            def resolve_dest(dest):
                # Named destinations (string or literal) are looked up in the
                # document; dictionary destinations keep the target under 'D'.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        # No direct destination: follow a GoTo action if any.
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if (subtype and repr(subtype) == '/GoTo'
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write(
                        '<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            # Close the parser and the file on failure too, not only on the
            # normal path.
            parser.close()
    finally:
        fp.close()
    return
def pdf_is_text(file_path):
    """Guess whether a PDF carries real text rather than scanned images.

    The characters in the horizontal text boxes of the first three pages are
    counted per page. The PDF is treated as text-based when at least one of
    those pages holds 50 characters or more.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        ``True`` when the largest of the three per-page counts is at least
        50; ``False`` otherwise, and ``False`` whenever any exception occurs.
    """
    # NOTE(review): the original carried a "TODO return False" marker at this
    # point; confirm whether an unconditional early return was intended.
    try:
        with open(file_path, 'rb') as pdf_handle:
            # Create the parser and document and link them to each other.
            praser = PDFParser(pdf_handle)
            doc = PDFDocument()
            praser.set_document(doc)
            doc.set_parser(praser)
            # Initialise without a password.
            doc.initialize()

            # Encrypted documents are re-read from 'fitz_decrypt.pdf'.
            # NOTE(review): the code that produced that file is disabled, and
            # this handle is closed again before any page is read — confirm.
            if doc.encryption:
                with open('fitz_decrypt.pdf', 'rb') as decrypted:
                    praser = PDFParser(decrypted)
                    doc = PDFDocument()
                    praser.set_document(doc)
                    doc.set_parser(praser)
                    doc.initialize()

            # Resource manager, layout parameters, aggregator and interpreter.
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Character count for each of the first three pages.
            first_three = [0, 0, 0]
            for index, page in enumerate(doc.get_pages()):
                if index >= 3:
                    break
                interpreter.process_page(page)
                # LTPage object holding the layout items parsed from the page.
                layout = device.get_result()
                for item in layout:
                    if isinstance(item, LTTextBoxHorizontal):
                        box_text = item.get_text()
                        print(box_text)
                        first_three[index] += len(box_text)

            # Fewer than 50 characters on every sampled page means an
            # image-only PDF.
            return max(first_three) >= 50
    except Exception:
        return False
def getpaperPDFtitle(paperpdfpath):
    """Return the text of the second text box on a paper's first page.

    Horizontal text boxes on the first page are printed as they are visited.
    The second one is taken as the title: a multi-line box is flattened with
    a space after each line, and a single-line box is used as is.

    Args:
        paperpdfpath: Path of the PDF file to read.

    Returns:
        The title string (empty when no second text box is found). A title
        longer than 255 characters is cut down to its first 32 characters.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    linecount = 0  # number of text boxes seen so far
    strtitle = ''

    # The ``with`` block closes the file on every exit path (the original
    # never closed it).
    with open(paperpdfpath, 'rb') as fp:
        # Create the parser and document and link them to each other.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise without a password.
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    linecontent = x.get_text()
                    print(linecontent)
                    linecount = linecount + 1
                    # The title is usually the second text box.
                    if linecount == 2:
                        strtitlelist = linecontent.splitlines()
                        if len(strtitlelist) > 1:
                            strtitle += "".join(
                                part + " " for part in strtitlelist)
                        elif len(strtitlelist) == 1:
                            strtitle = strtitle + strtitlelist[0]
                        else:
                            strtitle = ''
                        break
            # Only the first page is inspected.
            break

    print(strtitle)
    # NOTE(review): over-long titles are truncated to 32 characters, not 255;
    # kept as-is — confirm the intended limit.
    if len(strtitle) > 255:
        return strtitle[:32]
    return strtitle
def get_name(urls, name, time, names):
    """Extract a downloaded PDF's text, save it as .txt and store it in the DB.

    Reads ``D:\\PDF_530\\<names>.pdf``, appends every horizontal text box
    (each followed by a newline) to ``D:\\Pdf_text\\<names>.txt``, and, when
    ``urls`` is not yet cached, marks it as cached and saves a news record
    with the title, time and extracted content.

    Args:
        urls: Source URL; used as the cache key and stored as ``NEWS_URL``.
        name: Title stored as ``NEWS_TITLE``.
        time: Time string; square brackets are stripped before storing.
        names: Base file name (without extension) of the PDF and the .txt.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Backslashes are escaped explicitly; the original literals relied on the
    # invalid escape sequence ``\P``. The resulting strings are unchanged.
    path1 = 'D:\\PDF_530\\'
    path = 'D:\\Pdf_text\\'

    pieces = []
    # The ``with`` block closes the PDF on every exit path (the original
    # never closed it).
    with open(path1 + names + '.pdf', 'rb') as fp:
        # Create the parser and document and link them to each other.
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        # Initialise without a password.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the layout items parsed from this page.
            layout = device.get_result()
            page_texts = [
                x.get_text()
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ]
            # Open the output once per page rather than once per text box.
            # As before, nothing is created when there is no text to write.
            if page_texts:
                with open(path + names + '.txt', 'a') as f:
                    f.writelines(text + '\n' for text in page_texts)
                pieces.extend(page_texts)

    # Join once instead of growing a string inside the loop.
    content = "".join(pieces)

    print(urls)
    if KBoxBrUtils.getCache(urls) == 0:
        KBoxBrUtils.setCache(urls)
        print(name)
        times = time.replace('[', '').replace(']', '')
        print(times)
        insert = KBoxBrUtils.saveToDB("DT_NEWS_STANDARD_DATA", [{
            "NEWS_TITLE": name,
            "NEWS_TIME": times,
            "NEWS_CONTENT": content,
            "NEWS_FROM": "深交所",
            "NEWS_URL": urls,
        }])
        print(insert)
def convert_pdf_to_txt(_path):
    """Download a PDF from a URL and return the text of its text boxes.

    Args:
        _path: URL of the PDF document; requested with the module's
            ``USER_AGENT`` as the User-Agent header.

    Returns:
        The concatenated text of every horizontal text box, in page order.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    request = Request(url=_path, headers={'User-Agent': USER_AGENT})
    # Open the online PDF document.
    # NOTE(review): the parser is handed the HTTP response object directly;
    # confirm the pdfminer version in use accepts a non-seekable stream.
    fp = urlopen(request)
    try:
        # Create the parser and document and link them to each other.
        praser_pdf = PDFParser(fp)
        doc = PDFDocument()
        praser_pdf.set_document(doc)
        doc.set_parser(praser_pdf)

        # Initialise without a password.
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect into a list (and avoid naming it ``str``, which shadowed
        # the builtin).
        pieces = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the layout items parsed from this page.
            layout = device.get_result()
            for out in layout:
                # Only horizontal text boxes contribute text.
                if isinstance(out, LTTextBoxHorizontal):
                    pieces.append(out.get_text())
        return "".join(pieces)
    finally:
        # The original ``fp.close()`` sat after ``return`` and never ran.
        fp.close()
def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples.

    Args:
        filename: Path of the PDF file containing the form.

    Returns:
        One ``load_fields`` result per entry of the catalog's
        ``AcroForm`` -> ``Fields`` array.
    """
    with open(filename, 'rb') as pdf_handle:
        pdf_parser = PDFParser(pdf_handle)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize()

        # Resolve the form dictionary first, then each field reference.
        acro_form = resolve1(document.catalog['AcroForm'])
        loaded = []
        for field_ref in acro_form['Fields']:
            loaded.append(load_fields(resolve1(field_ref)))
        return loaded
def open_pdf(filepath):
    """
    Read in a PDF file, create a PDFMiner document object and return it.

    The file is left open on success because the returned document stays
    attached to the parser reading it. If setting the document up fails, the
    file is closed before the exception propagates.

    Args:
        filepath: Path of the PDF file to open.

    Returns:
        The initialised ``PDFDocument`` (empty password).
    """
    fp = open(filepath, 'rb')  # Open the file
    try:
        parser = PDFParser(fp)  # Create the parser
        doc = PDFDocument()  # Create the document object
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
    except BaseException:
        # Do not leak the handle when the document cannot be set up.
        fp.close()
        raise
    return doc
def parse1(pdf_path=r'D:\1\002.pdf', txt_path=r'D:\1\test.txt'):
    """Print the text of a PDF and append it to a UTF-8 text file.

    Every layout item that has a ``get_text`` method is printed and written
    to the output file followed by a newline.

    Args:
        pdf_path: PDF file to read. Defaults to the previously hard-coded
            location.
        txt_path: Text file to append to. Defaults to the previously
            hard-coded location.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # The ``with`` block closes the PDF on every exit path (the original
    # never closed it).
    with open(pdf_path, 'rb') as fn:
        # Create the parser and document and link them to each other.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise with an empty password.
        doc.initialize("")

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregator and interpreter.
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the layout items parsed from this page.
            layout = device.get_result()
            # Keep only the items that expose text.
            texts = [out.get_text() for out in layout
                     if hasattr(out, "get_text")]
            for text in texts:
                print(text)
            # Open the output once per page rather than once per item.
            # As before, nothing is created when there is no text to write.
            if texts:
                with open(txt_path, 'a', encoding='utf-8') as f:
                    f.writelines(text + '\n' for text in texts)
def translate(self):
    """Read the PDF's text, translate it and write both versions out.

    For every non-empty horizontal text box, the stripped text (with CRLF
    sequences removed) is passed to ``translate_func``. The original text and
    then the translation (or a failure marker) are passed to ``self.write``.
    A running counter is printed and the loop pauses two seconds after each
    translated box. Completion, or the inability to extract text, is reported
    through ``Logger``.
    """
    # The ``with`` block closes the PDF on every exit path, including the
    # early return below (the original never closed it).
    with open(self.fullPath, 'rb') as fp:
        # Create the parser and document and link them to each other.
        praser_pdf = PDFParser(fp)
        doc_pdf = PDFDocument()
        praser_pdf.set_document(doc_pdf)
        doc_pdf.set_parser(praser_pdf)

        # Initialise without a password.
        doc_pdf.initialize()

        # Without extractable text there is nothing to translate.
        if not doc_pdf.is_extractable:
            Logger().write(self.fileName + '未能提取有效的文本,停止翻译。')
            return

        # Resource manager, layout parameters, aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        i = 0  # number of text boxes translated so far
        for page in doc_pdf.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the layout items parsed from this page.
            layout = device.get_result()
            for out in layout:
                # Only horizontal text boxes carry text to translate.
                if not isinstance(out, LTTextBoxHorizontal):
                    continue
                content = out.get_text().strip()
                if not content:
                    continue
                to_trans_content = content.replace("\r\n", "")
                ret = translate_func(to_trans_content)
                trans = ret if ret else '翻译失败'
                self.write(content)
                self.write(trans)
                i += 1
                print(i, end=' ', flush=True)
                # Pause between calls to the translation function.
                time.sleep(2)

    Logger().write(self.fileName + '翻译完成,新文档:' + self.new_fullPath)
def parse():
    """Convert one hard-coded paper PDF into a .docx of numbered paragraphs.

    Every layout item that has a ``get_text`` method becomes a paragraph
    (style ``'List Number'``) in the module-level ``document``, with
    non-breaking spaces replaced by ordinary spaces. The document is saved
    next to the PDF under the same name with a ``.docx`` extension.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    rpath = r'C:\Users\hdp\Desktop\transformationPaper_EN_CN'
    # seq2seq paper
    doc_name = r'[36]_Sutskever,_Ilya,_Oriol_Vinyals,_and_Quoc_V_Le_Sequence_to_sequence_learning_with_neural_networks_Advances_in_neural_information_processing_systems_2014_.pdf'
    p = rpath+'\\'+doc_name
    print(p)

    # The ``with`` block closes the PDF on every exit path (the original
    # never closed it).
    with open(p, 'rb') as fn:
        # Create the parser and document and link them to each other.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise with an empty password.
        doc.initialize("")

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregator and interpreter.
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the layout items parsed from this page.
            layout = device.get_result()
            for out in layout:
                # Keep only the items that expose text.
                if hasattr(out, "get_text"):
                    # Replace non-breaking spaces (\xa0) with plain spaces.
                    content = out.get_text().replace(u'\xa0', u' ')
                    # Add the paragraph as a numbered-list item.
                    document.add_paragraph(content, style='List Number')

    # Save the Word document next to the source PDF.
    document.save(rpath+'\\'+doc_name.replace('.pdf', '.docx'))
def parse(_path):
    """Print the text boxes of a local PDF and append them to a text file.

    Each horizontal text box is printed with a ``results: `` prefix and
    appended, followed by a newline, to ``C:\\Users\\qinxd\\Desktop\\test.txt``.

    Args:
        _path: Path of the local PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # The ``with`` block closes the PDF on every exit path. The original left
    # ``fp`` open and instead called ``close()`` on the output handle that
    # its own ``with`` statement already manages.
    with open(_path, 'rb') as fp:
        # Create the parser and document and link them to each other.
        praser_pdf = PDFParser(fp)
        doc = PDFDocument()
        praser_pdf.set_document(doc)
        doc.set_parser(praser_pdf)

        # Initialise without a password.
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the layout items parsed from this page.
            layout = device.get_result()
            for out in layout:
                # Only horizontal text boxes carry text.
                if isinstance(out, LTTextBoxHorizontal):
                    results = out.get_text()
                    print("results: " + results)
                    with open(r'C:\Users\qinxd\Desktop\test.txt', 'a') as f:
                        f.write(results + '\n')