def parse_pdf_pdfminer(self, f, fpath):
    """Parse a PDF with the legacy pdfminer API, one page at a time.

    Each page's text is handed to ``self.parse_page`` as UTF-8 bytes;
    header/footer/error reporting goes through ``self.handler``.
    """
    try:
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()
        pagenos = set()
        if self.dedup:
            self.dedup_store = set()
        self.handler.print_header(fpath)
        page_num = 0
        parser = PDFParser(f)
        doc = PDFDocument(caching=True)
        # Legacy pdfminer API: parser and document must be wired both ways.
        parser.set_document(doc)
        doc.set_parser(parser)
        for page in doc.get_pages():
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            page_num += 1
            interpreter.process_page(page)
            data = retstr.getvalue()
            self.parse_page(fpath, bytes(data, 'UTF-8'), page_num)
            # Fix: close the per-page converter as well as the buffer to
            # avoid leaking device resources on multi-page documents.
            device.close()
            retstr.close()
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
def _convert_pdf_to_text(self, password=None):
    """Convert ``self.cvFile`` to a text file in the scratch directory.

    Writes the extracted text to a timestamp-suffixed ``.txt`` file,
    records its path in ``self.cvTextFile``, and returns 0.
    """
    input_pdf = self.cvFile
    if password is not None:
        self.cvFilePasswd = password
    pagenos = range(0, 30)        # only the first 30 pages are converted
    maxpages = len(pagenos)       # Fix: len() over pagenos.__len__()
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    outtype = 'txt'
    laparams = LAParams()
    laparams.all_texts = True
    laparams.showpageno = True
    outputPath = self.scratchDir
    inputPath = os.getcwd()
    if os.path.exists(input_pdf):
        inputPath = os.path.dirname(input_pdf)
    input_filename = os.path.basename(input_pdf)
    input_parts = input_filename.split(".")
    input_parts.pop()             # drop the extension
    randomStr = int(time.time())  # timestamp suffix keeps names unique
    output_filename = (outputPath + os.path.sep + ".".join(input_parts)
                       + str(randomStr) + ".txt")
    self.cvTextFile = output_filename
    # Fix: file() is Python-2-only; use open() and guarantee cleanup on
    # every exit path (the original leaked all handles on exception).
    outfp = open(output_filename, 'w')
    try:
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        fp = open(input_pdf, 'rb')
        try:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=self.cvFilePasswd, check_extractable=True)
        finally:
            fp.close()
            device.close()
    finally:
        outfp.close()
    return 0
def get_result_from_file(filename):
    """Extract per-page bounding boxes and text labels from a PDF file."""
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}
    fp = open(filename, "rb")
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.detect_vertical = True
    laparams.line_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Walk the pages, collecting one record per page.
    for page_index, page in enumerate(PDFPage.create_pages(document)):
        interpreter.process_page(page)
        layout = device.get_result()
        result["pages"].append({
            "index": page_index,
            "bounding_box": get_bounding_box(layout),
            "labels": get_text_labels(layout),
        })
    fp.close()
    return result
def dump_pdf_pdfminer(self, fpath_in):
    """Extract text from *fpath_in* page by page into a sibling .txt file."""
    fpath_out = os.path.splitext(fpath_in)[0] + ".txt"
    n = 0
    with open(fpath_in, 'rb') as fin:
        # NOTE(review): 'wb' with str data is Python-2 behaviour; under
        # Python 3 these writes would need bytes -- confirm target runtime.
        with open(fpath_out, 'wb') as fout:
            try:
                laparams = LAParams()
                laparams.all_texts = True
                rsrcmgr = PDFResourceManager()
                pagenos = set()
                page_num = 0
                for page in PDFPage.get_pages(fin, pagenos,
                                              check_extractable=True):
                    page_num += 1
                    retstr = StringIO()
                    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                    data = retstr.getvalue()
                    # Fix: close the converter too, not just the buffer.
                    device.close()
                    retstr.close()
                    fout.write(data)
                    n += len(data)
                # Fix: parenthesised print is valid on Python 2 and 3;
                # the bare print statement was Python-2-only.
                print("Written %d bytes to %s" % (n, fpath_out))
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception as e:
                print("Failed parsing %s" % (fpath_in))
def count_words(self):
    """Return the number of words in the PDF at ``self.filename``.

    Thanks to http://pinkyslemma.com/2013/07/02/word-frequency-from-pdfs/
    and http://www.unixuser.org/~euske/python/pdfminer/programming.html
    """
    with open(self.filename, "rb") as fp:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
        full_text = retstr.getvalue()
        # Fix: release converter and buffer (the original leaked both).
        device.close()
        retstr.close()
        # NOTE(review): two-argument str.translate / string.maketrans is
        # Python-2-only; confirm the target runtime before porting.
        full_text = full_text.translate(string.maketrans("", ""),
                                        string.punctuation)
        return len(full_text.split())
def initialize_pdf_miner(fh):
    """Set up pdfminer for file object *fh*; return (doc, interpreter, device)."""
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    # NOTE(review): PDFDocument(parser) already wires the two objects in
    # the newer pdfminer API, so this extra call looks redundant -- confirm.
    parser.set_document(doc)
    #doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    #doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        # Deliberately tolerated here (extraction attempted anyway).
        pass
        #raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    # for page in doc.get_pages():
    #     interpreter.process_page(page)
    # Set parameters for analysis.
    laparams = LAParams(line_overlap=0.3, char_margin=1.0, line_margin=0.5,
                        word_margin=0.1, boxes_flow=0.1,
                        detect_vertical=False, all_texts=False)
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object (replaces the plain device above).
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def pdf2xml(infile):
    '''Return a string of XML representation for given PDF file handle.

    Uses pdfminer to do the conversion and does some final post-processing.
    Note: closes *infile* as a side effect.
    '''
    outfile = StringIO()
    laparams = LAParams()
    laparams.char_margin = 0.4  # Empirically determined... See pdf2txt.py
    rsrcmgr = PDFResourceManager(caching=False)
    device = XMLConverter(rsrcmgr, outfile, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    empty_pagenos = set()
    if page_api:
        # Newer pdfminer exposes page iteration through PDFPage.
        for page in PDFPage.get_pages(infile, empty_pagenos):
            interpreter.process_page(page)
    else:
        process_pdf(rsrcmgr, device, infile, empty_pagenos)
    infile.close()
    xml_text = outfile.getvalue()
    return xml_text.replace("\n", "")
def pdf2str(path):
    """Extract all text from the PDF at *path* and return it as a string."""
    # Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    # Set parameters
    codec = 'utf-8'
    laparams.all_texts = True
    laparams.detect_vertical = True
    caching = True
    pagenos = set()
    # Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Open the file and parse.
    # Fix: file() is Python-2-only; open() is portable.
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    # Clean up
    fp.close()
    device.close()
    # Fix: renamed local so the builtin ``str`` is not shadowed.
    text = retstr.getvalue()
    retstr.close()
    return text
def parse(self, path):
    """Parse the PDF at *path* and return a Sample with its extracted text.

    Raises NotImplementedError for directories.
    """
    out = StringIO.StringIO()
    # Directory
    if os.path.isdir(path):
        raise NotImplementedError()
    # File.  Fix: file() is Python-2-only, and PDFs must be opened in
    # binary mode (the original used the default text mode).
    fp = open(path, 'rb')
    try:
        rsrc = PDFResourceManager()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
        # Legacy pdfminer API: wire parser and document both ways.
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        interpreter = PDFPageInterpreter(rsrc, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
        device.close()
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
    finally:
        # Fix: the original leaked the file handle.
        fp.close()
def parse_pdf_pdfminer(self, f, fpath):
    """Parse PDF file object *f*, handing each page's text to ``self.parse_page``."""
    try:
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()
        pagenos = set()
        if self.dedup:
            self.dedup_store = set()
        self.handler.print_header(fpath)
        page_num = 0
        for page in PDFPage.get_pages(f, pagenos, check_extractable=True):
            page_num += 1
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            data = retstr.getvalue()
            # Fix: close the per-page converter as well as the buffer
            # (the original leaked one device per page).
            device.close()
            retstr.close()
            self.parse_page(fpath, data, page_num)
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
def _pdf_to_text(path):
    """Extract ASCII text from the PDF at *path*.

    Returns (text, True) on success or ("", False) on any failure.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'ascii'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
            device.close()
        # fix the non-utf8 string: drop anything outside ASCII
        result = retstr.getvalue()
        txt = result.encode('ascii', 'ignore')
        retVal = (txt, True)
        retstr.close()
    except Exception as e:
        # PDF is not formatted correctly; signal failure to the caller.
        retVal = ("", False)
    # Fix: the original computed retVal but never returned it.
    return retVal
def initialize_pdf_miner(fh, password = None):
    """Prepare pdfminer for file object *fh*; return (doc, interpreter, device).

    Raises ValueError when the document forbids text extraction.
    """
    # Parser wraps the file object; the document holds the structure.
    parser = PDFParser(fh)
    doc = PDFDocument(parser, password)
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Shared resources (fonts, images) live in the resource manager.
    rsrcmgr = PDFResourceManager()
    # A plain device/interpreter pair is built first, then replaced by
    # the aggregator-backed pair below (mirrors the upstream example).
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Layout-analysis parameters.
    laparams = LAParams()
    laparams.word_margin = 0.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def __init__(self, line_overlap=0.5, header_perc=7.5, footer_perc=7.5):
    """Layout parameters plus header/footer margins expressed as % of page.

    NOTE(review): every LAParams margin below is driven by *line_overlap*;
    this mirrors the original wiring -- confirm it is intentional.
    """
    LAParams.__init__(self,
                      line_overlap=line_overlap,
                      char_margin=line_overlap,
                      line_margin=line_overlap,
                      word_margin=line_overlap,
                      boxes_flow=line_overlap,
                      detect_vertical=False,
                      all_texts=False)
    self.header_perc = header_perc  # fraction of the header (% of the page)
    self.footer_perc = footer_perc  # fraction of the footer (% of the page)
def parse_pdf(self, test_parse=False):
    """Parse a PDF and return text contents as an array.

    Converts ``self.source_file`` into ``self.text_file`` and loads the
    result into ``self.file_array``; raises DTPOFileError on any failure.
    """
    dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)
    # input options
    pagenos = set()
    maxpages = 0
    # output option
    codec = "utf-8"
    caching = True
    laparams = LAParams()
    laparams.char_margin = 8.0
    laparams.word_margin = 2.0
    rsrcmgr = PDFResourceManager(caching=caching)
    try:
        # Fix: file() is Python-2-only.
        outfp = open(self.text_file, "w")
    except IOError as io_error:
        raise DTPOFileError(self.text_file, 0, str(io_error))
    try:
        fp = open(self.source_file, "rb")
    except IOError as io_error:
        outfp.close()  # Fix: don't leak the output handle on failure
        raise DTPOFileError(self.source_file, 0, str(io_error))
    device = None
    try:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    caching=caching, check_extractable=True)
    except PDFException as pdf_error:
        message = "Failed to parse file {0} -> {1}".format(
            self.source_file, str(pdf_error))
        raise DTPOFileError(self.source_file, 0, message)
    except Exception as exception:
        message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(
            type(exception), str(exception))
        raise DTPOFileError(self.source_file, 0, message)
    finally:
        # Fix: the original leaked all three handles on every error path.
        fp.close()
        if device is not None:
            device.close()
        outfp.close()
    # Got the PDF converted - now get it into an array
    with open(self.text_file) as text_fp:
        self.file_array = list(text_fp)
    # Remove the last entry - it's always '\x0c'
    if len(self.file_array) > 0:
        del self.file_array[-1]
    # Remove the outfile
    if not test_parse:
        os.remove(self.text_file)
def to_text(path):
    """Wrapper around `pdfminer`.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        returns extracted text from pdf
    """
    try:
        # python 2
        from StringIO import StringIO
        import sys
        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')
    except ImportError:
        from io import StringIO

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    with open(path, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        pages = PDFPage.get_pages(
            fp,
            pagenos,
            maxpages=maxpages,
            password=password,
            caching=caching,
            check_extractable=True,
        )
        for page in pages:
            interpreter.process_page(page)

        device.close()
        # Fix: renamed local so the builtin ``str`` is not shadowed.
        text = retstr.getvalue()
        retstr.close()
        return text.encode('utf-8')
def to_text(self):
    """Return the document's full text decoded as UTF-8 (errors ignored)."""
    rsrcmgr = PDFResourceManager()
    output = StringIO()
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = True
    laparams.word_margin = 0.4
    device = TextConverter(rsrcmgr, output, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in self._doc.get_pages():
        interpreter.process_page(page)
    raw = output.getvalue()
    # Fix: release the converter (the original leaked it).
    device.close()
    return raw.decode('utf-8', 'ignore')
def get_text(self):
    """Returns all text content from the PDF as plain text.

    Falls back to OCR (GhostScript + tesseract) when the PDF contains
    no extractable text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Fix: if opening the file failed, the original finally clause raised
    # NameError because file_pointer was never bound; also file() is
    # Python-2-only.
    file_pointer = None
    try:
        file_pointer = open(self.path, 'rb')
        process_pdf(rsrcmgr, device, file_pointer)
    except Exception as e:
        logging.error("Error processing PDF: %s" % e)
        raise
    finally:
        if file_pointer is not None:
            file_pointer.close()
        device.close()
    text = retstr.getvalue()
    retstr.close()
    if (text is None) or (text.strip() == ""):
        logging.info("No text found in PDF. Attempting OCR. This will take a while.")
        #FIXME this should go in a separate method
        #First, convert to image
        import subprocess
        try:
            arglist = ["gs", "-dNOPAUSE", "-sOutputFile=temp/page%03d.png",
                       "-sDEVICE=png16m", "-r72", self.path]
            # NOTE(review): stdout=subprocess.STDOUT is not a documented
            # value for stdout (only stderr may redirect to STDOUT) --
            # confirm the intended redirection.
            process = subprocess.call(
                args=arglist,
                stdout=subprocess.STDOUT,
                stderr=subprocess.STDOUT)
        except OSError:
            logging.error("Failed to run GhostScript (using `gs`)")
        #Do OCR
        import time
        time.sleep(1)  # make sure the server has time to write the files
        import Image
        import pytesseract
        import os
        text = ""
        for file_ in os.listdir("temp"):
            if file_.endswith(".png"):
                text += pytesseract.image_to_string(
                    Image.open("temp/" + file_), lang="swe")
                os.unlink("temp/" + file_)
        self.text = text
    return text
def GetScript(filename):
    """Parse a script PDF, sort each page's text blocks, and feed them to ParseLine."""
    import sys
    global scriptName
    ResetGlobals()
    scriptName = filename
    password = ""
    # Open a PDF file.
    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser, password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        # Fix: parenthesised print is valid on Python 2 and 3.
        print("---Not translatable---")
        return
        #raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # First pass: process each page with the plain device.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.boxes_flow = 2
    # Second pass: aggregate layout objects, skipping the title page.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum, page in enumerate(PDFPage.create_pages(document)):
        if pgnum == 0:
            continue
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        text = []
        # Fix: renamed the inner loop variable (the original reused 'page').
        for item in layout:
            try:
                if item.get_text().strip():
                    text.append(TextBlock(item.x0, item.y1,
                                          item.get_text().strip()))
            except Exception:
                # Fix: narrowed from a bare except. Non-text layout
                # objects simply have no get_text().
                pass
        # Progress dot (was Python-2-only: print ".",).
        sys.stdout.write(". ")
        # Top-to-bottom reading order.
        text.sort(key=lambda row: (-row.y))
        # Parse all of the "line" objects in each page
        for line in text:
            ParseLine(line.text, line.x)
def _pdf2text(self, fp):
    """Extract (creation_date, text, ok) from an open PDF file object.

    Returns (created, text, True) on success or (None, "", False) on failure.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'ascii'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        process_pdf(rsrcmgr, device, fp)
        device.close()
        # fix the non-utf8 string ...
        result = retstr.getvalue()
        txt = result.encode('ascii', 'ignore')
        # TODO: clean this up, I feel like I'm doing the conversion twice ...
        # http://stackoverflow.com/a/16503222/2154772
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        #print doc.info[0]['CreationDate'].resolve()
        # CreationDate isn't always the same type as it comes back from
        # the PDFParser, so branch on whether it is already a string.
        created = ""
        try:
            if not isinstance(doc.info[0]['CreationDate'], basestring):
                # Fix: was misspelled 'creatd', silently leaving
                # 'created' empty for non-string CreationDate values.
                created = doc.info[0]['CreationDate'].resolve()[2:-7]
            else:
                created = doc.info[0]['CreationDate'][2:-7]
        except Exception:
            self._report("CreationDate field could not be decoded within PDF, setting to ''")
        created = created.encode('ascii', 'ignore')
        retVal = (created, txt, True)
        retstr.close()
    except Exception as e:
        self._report("Error: \n\t%s" % str(e))
        retVal = (None, "", False)
    # Fix: the original computed retVal but never returned it.
    return retVal
def extractrefs(infile, outfile):
    """Run the RefsExtractor device over *infile*, writing references to *outfile*."""
    src = open(infile, 'rb')
    dst = open(outfile, 'w')
    manager = PDFResourceManager()
    params = LAParams()
    params.line_margin = 1.4
    extractor = RefsExtractor(manager, dst, laparams=params)
    interp = PDFPageInterpreter(manager, extractor)
    for page in PDFPage.get_pages(src, set(), caching=True,
                                  check_extractable=True):
        interp.process_page(page)
    src.close()
    dst.close()
def initialize_pdf_interpreter():
    """Build and return an (interpreter, device) pair for layout analysis."""
    # Resource manager holds shared fonts/images.
    rsrcmgr = PDFResourceManager()
    # A plain device/interpreter pair is created first, then replaced by
    # the aggregator-backed pair below (mirrors the upstream example).
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Analysis parameters.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # The aggregator collects layout objects per page.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return interpreter, device
def _pdf2text(self, fp):
    """Extract (created_datetime, text, ok) from an open PDF file object.

    Returns (created, text, True) on success or (None, "", False) on failure.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'ascii'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        process_pdf(rsrcmgr, device, fp)
        device.close()
        # fix the non-utf8 string ...
        result = retstr.getvalue()
        txt = result.encode('ascii', 'ignore')
        # TODO: clean this up, I feel like I'm doing the conversion twice ...
        # http://stackoverflow.com/a/16503222/2154772
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        #print doc.info[0]['CreationDate'].resolve()
        # CreationDate isn't always the same type as it comes back from
        # the PDFParser, so branch on whether it is already a string.
        if not isinstance(doc.info[0]['CreationDate'], basestring):
            datestring = doc.info[0]['CreationDate'].resolve()[2:-7]
        else:
            datestring = doc.info[0]['CreationDate'][2:-7]
        #print "working on '{0}'...".format(datestring)
        ts = strptime(datestring, "%Y%m%d%H%M%S")
        created = datetime.fromtimestamp(mktime(ts))
        retVal = (created, txt, True)
        retstr.close()
    except Exception as e:
        self._reportstr("Error: \n\t%s" % str(e))
        retVal = (None, "", False)
    # Fix: the original computed retVal but never returned it.
    return retVal
def getPdfAsText(pdfPages = None, fileDescriptor = None):
    """Render the given PDF pages (or all pages of *fileDescriptor*) as text."""
    if pdfPages is None and fileDescriptor is not None:
        pdfPages = getPdfPages(fileDescriptor)
    resourceManager = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    # Fix: if TextConverter construction raised, the original finally
    # clause crashed with NameError on the unbound 'device'.
    outputStream = StringIO.StringIO()
    device = None
    try:
        device = TextConverter(resourceManager, outputStream, laparams=laparams)
        interpreter = PDFPageInterpreter(resourceManager, device)
        for pdfPage in pdfPages:
            interpreter.process_page(pdfPage)
        return outputStream.getvalue()
    finally:
        if device is not None:
            device.close()
        outputStream.close()
def pdf(f):
    """Return the full text of the PDF at path *f*."""
    rsrcmgr = PDFResourceManager()
    retstr = cStringIO.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(
        rsrcmgr, retstr, codec=codec, laparams=laparams
    )
    # Fix: file() is Python-2-only; open() is portable.
    fp = open(f, 'rb')
    process_pdf(rsrcmgr, device, fp)
    fp.close()
    device.close()
    # Fix: renamed local so the builtin ``str`` is not shadowed.
    text = retstr.getvalue()
    retstr.close()
    return text
def output_pdf_to_table(path):
    """Feed each page's layout of the PDF at *path* to getRows."""
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Module-level threshold controls how lines are grouped into rows.
    laparams.line_margin = line_margin_threshold
    codec = 'utf-8'
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = pages_to_view
    caching = True
    pagenos = set()
    # Fix: the original never closed the file handle.
    with open(path, "rb") as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
            layout = device.get_result()
            getRows(layout)
def readpdf(pdfFile):
    """Parse *pdfFile* into a DataFrame of positioned strings."""
    fp = open(pdfFile, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    #doc.initialize('password')  # leave empty for no password
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Too small and it splits the description, too big and
    # Quantity-Unit-Part number are not separated: 1.1 seems to work.
    # Fix: plain float literals instead of float('1.1') round-trips.
    laparams.char_margin = 1.1
    laparams.line_margin = 0.8
    device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        # receive the LTPage object for this page
        interpreter.process_page(page)
        device.get_result()
    #print(device.rows)
    df = pd.DataFrame(device.rows,
                      columns=['Page', 'x', 'y', 'c1', 'c2', 'String'])
    # Fix: the original leaked the file handle.
    fp.close()
    return df
def to_text(path):
    """Return the text extracted from the PDF at *path*."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    # Fix: removed a leftover debug statement (`print laparams`, which
    # was also Python-2-only syntax).
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Fix: file() is Python-2-only.
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    # Fix: renamed local so the builtin ``str`` is not shadowed.
    text = retstr.getvalue()
    retstr.close()
    return text
def pdfcn():
    """Resume batch PDF-to-text conversion, skipping already-converted files.

    Restarts from where a previous run crashed: compares the converted
    .txt directory against the source .pdf directory and converts only
    the files still missing.
    """
    laparams = LAParams()
    laparams.all_texts = True
    # Find which PDFs have already been converted.
    filelist4 = []
    finallist = []
    path2 = r'D:\dataset\acl10_12_txt'
    filelist2 = os.listdir(path2)
    path3 = r'D:\dataset\acl10_12s'
    filelist3 = os.listdir(path3)
    for i in filelist2:
        filelist4.append(i[:-4])
    print(filelist4)
    for filename in filelist3:
        if filename[:-4] not in filelist4:
            finallist.append(filename[:-4])
    for pdf in finallist:
        try:
            outfile = "D:\\dataset\\acl10_12_txt\\" + pdf + ".txt"
            codec = 'utf-8'
            args = [path3 + '\\' + pdf + '.pdf']
            rsrc = PDFResourceManager()
            # Fix: file() is Python-2-only; and close each file's
            # device/handles inside the loop -- the original closed only
            # the last pair after the loop (a NameError when the list was
            # empty, and a per-file leak otherwise).
            outfp = open(outfile, 'w')
            device = TextConverter(rsrc, outfp, codec=codec,
                                   laparams=laparams)
            try:
                for fname in args:
                    fp = open(fname, 'rb')
                    try:
                        process_pdf(rsrc, device, fp, None,
                                    maxpages=0, password='')
                        print('%s finishing ' % pdf)
                    finally:
                        fp.close()
            finally:
                device.close()
                outfp.close()
        except Exception:
            # Best-effort batch conversion: skip files that fail.
            continue
def convert_pdf_to_txt(path, txtname, buf=True):
    """Convert the PDF at *path* to text.

    With buf=True the cleaned text is printed; otherwise it is written
    to the file *txtname*.
    """
    rsrcmgr = PDFResourceManager()
    if buf:
        outfp = StringIO()
    else:
        # Fix: file() is Python-2-only.
        outfp = open(txtname, 'w')
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True
    # device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
    fp.close()
    device.close()
    if buf:
        # Strip whitespace (module-level 'space' pattern) before printing.
        text = re.sub(space, "", outfp.getvalue())
        print(text)
    # Fix: the original leaked outfp in the buf=False (file) branch.
    outfp.close()
def read_file(self):
    """Populate ``self.content`` with the text of the PDF at ``self.path``."""
    with open(self.path, 'rb') as f:
        # Legacy pdfminer API: wire parser and document both ways, then
        # initialize with an empty password.
        parser = PDFParser(f)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 0.1
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        extracted_text = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            # Keep only text boxes and text lines.
            extracted_text.extend(
                lt_obj.get_text() for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine)))
    self.content = ' '.join(extracted_text)
def get_text_from_pdf(pdfname, caption, skip_header, skip_footer):
    """Extract text from a PDF and reflow it into paragraphs.

    Each page is rendered with pdfminer; lines are then cleaned (null
    bytes removed, repeated spaces collapsed, numeric-only lines dropped)
    and joined according to simple heading/continuation heuristics.
    """
    # Read the PDF.
    fp = open(pdfname, 'rb')
    texts = []
    for page in tqdm(
            PDFPage.get_pages(fp, pagenos=None, maxpages=0, password=None,
                              caching=True, check_extractable=True)):
        rsrcmgr = PDFResourceManager()
        out_fp = StringIO()
        la_params = LAParams()
        la_params.detect_vertical = True
        device = TextConverter(rsrcmgr, out_fp, codec='utf-8',
                               laparams=la_params)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.process_page(page)
        texts.append(out_fp.getvalue())
        device.close()
        out_fp.close()
    fp.close()
    output = ""
    # Reflow the extracted text.
    for text in tqdm(texts):
        lines = text.splitlines()
        replace_strs = [b'\x00']  # UTF-8 byte sequences to strip out
        new_lines = []
        for line in lines:
            line_utf8 = line.encode('utf-8')
            for replace_str in replace_strs:
                line_utf8 = line_utf8.replace(replace_str, b'')
            line = line_utf8.decode()
            line = re.sub("[ ]+", " ", line)  # collapse runs of spaces
            line = line.strip()
            if len(line) == 0:
                continue  # skip empty lines
            if is_float(line):
                continue  # skip lines that are only a number
            new_lines.append(line)
        for index in range(len(new_lines)):
            if index == 0 and skip_header:
                continue
            if index == len(new_lines) - 1 and skip_footer:
                continue
            line = new_lines[index]
            # Break after a heading (numbered, short, not ending with '.').
            if is_float(line.split(".")[0]) and len(
                    line.split()) < caption and (not line.endswith(".")):
                output += str(line)
                output += "\r\n"
                continue
            if line.endswith("."):
                output += str(line)
                output += "\r\n"
            elif line.endswith("-"):
                # Continuation of the previous line (hyphenated word).
                output += str(line[:-1])
            elif line.endswith(":"):
                # A formula follows.
                output += str(line)
                output += "\r\n"
            else:
                # Otherwise insert a space as a word separator.
                output += str(line)
                output += " "
    return output
def request_pdf(url, case_id, court_name):
    """Download a case PDF, extract its text, save both, return the text.

    Returns "NULL" on any failure (logged, never raised).
    """
    try:
        response = requests.request("GET", url, verify=False,
                                    proxies=proxy_dict)
        if response.status_code == 200:
            res = response.text
            if res is None:
                logging.error("No data for: " + str(case_id))
                return "NULL"
            pdf_path = module_directory + "/../Data_Files/PDF_Files/" + \
                court_name + "_" + slugify(case_id) + ".pdf"
            # Fix: use context managers -- the original leaked every
            # file handle it opened (two writers and one reader).
            with open(pdf_path, "wb") as fw:
                fw.write(response.content)
            pdf_manager = PDFResourceManager()
            string_io = StringIO()
            pdf_to_text = TextConverter(pdf_manager, string_io,
                                        codec='utf-8', laparams=LAParams())
            interpreter = PDFPageInterpreter(pdf_manager, pdf_to_text)
            with open(pdf_path, 'rb') as fr:
                for page in PDFPage.get_pages(fr):
                    interpreter.process_page(page)
            text_data = string_io.getvalue()
            text_path = module_directory + "/../Data_Files/Text_Files/" + \
                court_name + "_" + slugify(case_id) + ".txt"
            with open(text_path, "w") as fw:
                fw.write(str(text_data))
            return str(text_data)
        else:
            logging.error("Failed to get text file for: " + str(case_id))
            return "NULL"
    except Exception as e:
        logging.error("Failed to get pdf file for: " + str(case_id) +
                      ". Error: %s", e)
        return "NULL"
def extract_text(my_file):
    """Pulling text boxes out of PDFs.

    First half of this defn copies off the internet; the second half
    measures each text box so that larger-than-average text (headlines)
    can be identified downstream.  Returns a DataFrame, or None if
    anything fails.
    """
    try:
        password = ""
        extracted_text = ""
        extracted_text_plus = []
        # Open and read the pdf file in binary mode
        fp = open(my_file, "rb")
        # Create parser object to parse the pdf content
        parser = PDFParser(fp)
        # Store the parsed content in PDFDocument object
        document = PDFDocument(parser, password)
        # Check if document is extractable, if not abort
        #if not document.is_extractable:
        #    raise PDFTextExtractionNotAllowed
        # Create PDFResourceManager object that stores shared resources
        # such as fonts or images
        rsrcmgr = PDFResourceManager()
        # set parameters for analysis
        laparams = LAParams()
        # Use a page aggregator device to get LT object elements
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create interpreter object to process page content from PDFDocument
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the document page by page, keeping LTTextBox/LTTextLine
        # objects only.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox) or isinstance(
                        lt_obj, LTTextLine):
                    extracted_text_plus.append(lt_obj)
        # close the pdf file
        fp.close()
        # Headlines are assumed to be large text. By comparing the number
        # of lines of text in a textbox with the height of the textbox,
        # the average size of the text can be found.  Text that's larger
        # than average is kept.
        df = pd.DataFrame()
        df['cords'] = 0
        df['num'] = 0
        df['height'] = 0
        df['text'] = ''
        df['TL_X'] = -1
        df['TL_Y'] = -1
        df['width'] = -1
        nums = []
        heights = []
        for n in range(0, len(extracted_text_plus)):
            # The repr of an LT object embeds 'x0,y0,x1,y1' as its second
            # space-separated token; the rest is the text.
            cords = str(extracted_text_plus[n]).split(' ')[1].split(',')
            vals = [float(elm) for elm in cords]
            a, b, c, d = vals
            text = ' '.join(str(extracted_text_plus[n]).split(' ')[2:])
            h = d - b
            w = c - a
            df.loc[n, 'cords'] = ' '.join(cords)
            df.loc[n, 'num'] = n
            df.loc[n, 'height'] = h
            df.loc[n, 'width'] = w
            df.loc[n, 'TL_X'] = a
            df.loc[n, 'TL_Y'] = b
            df.loc[n, 'text'] = text
        df['newlines'] = 0
        for x in range(0, len(df)):
            df.loc[x, 'newlines'] = df.loc[x, 'text'].count('\\n')
        df['text height'] = df['height'] / df['newlines']
        return df
    except Exception:
        # Fix: narrowed from a bare except. Still best-effort (returns
        # None on failure) but no longer swallows KeyboardInterrupt or
        # SystemExit.
        pass
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local);
        either a filesystem path or an io.BytesIO holding the PDF bytes
    :return: iterator of string of extracted text, one string per page
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/

    def _pages(fh):
        # Shared page loop: the original duplicated this code verbatim for
        # the local-file and remote (BytesIO) branches.
        try:
            for page in PDFPage.get_pages(
                    fh, caching=True, check_extractable=True):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(
                    resource_manager, fake_file_handle,
                    codec='utf-8', laparams=LAParams())
                try:
                    page_interpreter = PDFPageInterpreter(
                        resource_manager, converter)
                    page_interpreter.process_page(page)
                    yield fake_file_handle.getvalue()
                finally:
                    # Close the handles even if process_page raises
                    # (the original leaked them on error).
                    converter.close()
                    fake_file_handle.close()
        except PDFSyntaxError:
            # Malformed PDF: end the iterator instead of propagating.
            return

    if isinstance(pdf_path, io.BytesIO):
        # extract text from remote pdf file (already in memory)
        yield from _pages(pdf_path)
    else:
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            yield from _pages(fh)
args = parser.parse_args() # Open a PDF file. fp = open(args.filename, 'rb') parser = PDFParser(fp) document = PDFDocument(parser) if not document.is_extractable: raise PDFTextExtractionNotAllowed rsrcmgr = PDFResourceManager() # BEGIN LAYOUT ANALYSIS # Set parameters for analysis. laparams = LAParams( line_overlap=0.1, char_margin=0.1, line_margin=0.5, word_margin=0.1, boxes_flow=0.5, ) # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) INFO_FIRST_ROW = 720 INFO_SECOND_ROW = 650 FUZZINESS = 14 FUZZINESS_X = FUZZINESS FUZZINESS_Y = FUZZINESS
def scientific_analysis(password, path, title, topn):
    """Extract the text of a scientific-paper PDF, split it into sections,
    and POST the result to the tzagerlib analysis service.

    :param password: path segment appended to the service URL (acts as a key)
    :param path: local path of the PDF file
    :param title: paper title, forwarded as ``paper_title``
    :param topn: forwarded unchanged as ``topn``
    :return: dict of the service's JSON response, or ``{'error': status}``
        on a non-200 reply
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from io import StringIO
    print('Convering pdf to text ...')
    # Standard pdfminer extraction: all pages render into one StringIO.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password_pdf = ""
    maxpages = 0      # 0 = no page limit
    caching = True
    pagenos = set()   # empty = all pages
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password_pdf,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    # Undo end-of-line hyphenation and normalise curly apostrophes.
    # NOTE(review): the third replace swaps 'infl' for 'infl' — the two
    # arguments look identical here; presumably the first originally held a
    # ligature ("fi") variant that got normalised in this copy — confirm.
    text = text.replace('-\n', '').replace('’', "'").replace('infl', 'infl')
    lines = text.split('\n')
    # A section heading is assumed to be a non-numeric line of >3 chars with
    # a blank line on both sides.
    lines_section_ids_dict = {}
    lines_section_ids = []
    # NOTE(review): enumerate(lines[1:-2]) starts i at 0, so the first
    # iteration inspects lines[-1] / lines[0] / lines[1]; the blank-line
    # test looks like it intended enumerate(..., start=1) — confirm before
    # changing, since the service consumes the current behaviour.
    for i, line in enumerate(lines[1:-2]):
        if len(lines[i - 1]) == 0 and len(lines[i + 1]) == 0 and len(
                lines[i]) > 3 and not str(lines[i]).isdigit():
            lines_section_ids_dict[i] = lines[i]
            lines_section_ids.append(i)
    # Invert to heading-text -> line index (duplicate headings collapse).
    data = []
    for id in lines_section_ids_dict:
        data.append((lines_section_ids_dict[id], id))
    data = dict(data)
    final_data = {}
    new_txt = ''
    # Everything after the "References" heading is ignored.
    try:
        ref_id = data['References']
    except KeyError:
        ref_id = len(lines) - 1
    # Collect the text between consecutive headings.
    for i, id in enumerate(lines_section_ids):
        if i < len(lines_section_ids) - 1 and id < ref_id:
            start = lines_section_ids[i]
            end = lines_section_ids[i + 1]
            interval_lines = lines[start + 1:end]
            interval_lines_txt = ' '.join(interval_lines)
            # Skip boilerplate (abbreviation lists, a known page header).
            if 'Abbreviations' not in lines_section_ids_dict[
                    start] and '18 of 36' not in lines_section_ids_dict[start]:
                new_txt += interval_lines_txt
            # Keep only substantial sections (>100 chars).
            if interval_lines and len(interval_lines_txt) > 100:
                final_data[lines_section_ids_dict[start]] = ' '.join(
                    interval_lines)
    final_data['paper_title'] = title
    final_data['full_text'] = new_txt
    final_data['topn'] = topn
    print('Uploading text ...')
    # POST the sectioned text; the password is part of the URL path.
    response = requests.post(
        'http://tzagerlib1-env.eba-wjp8tqpj.eu-west-2.elasticbeanstalk.com/scientific_analysis/'
        + password,
        json=json.dumps(final_data))
    if response.status_code == 200:
        data = dict(response.json())
    else:
        data = {'error': response.status_code}
    data = dict(data)
    return data
# NOTE(review): the elif-chain below is the tail of a function whose `def`
# line sits above this chunk; it maps stroke names to camel-case event names.
# Its parameter is named `str`, shadowing the builtin.
    elif str == "Backstroke":
        return "BackStroke"
    elif str == "Breaststroke":
        return "BreastStroke"
    elif str == "Individual":
        return "IndividualMedley"
    else:
        return str
elif ir == "r":
    # Relay events ("r") only distinguish freestyle and medley relays.
    if str == "Freestyle":
        return "FreeRelay"
    elif str == "Medley":
        return "MedleyRelay"

# Configure the Layout Analysis parameters; enable vertical-text detection
# (the input contains Japanese vertical writing).
laparams = LAParams(detect_vertical=True)
# Resource manager for shared resources (fonts, images).
resource_manager = PDFResourceManager()
# PageAggregator object that collects the laid-out pages.
device = PDFPageAggregator(resource_manager, laparams=laparams)
# Interpreter object that renders pages into the device.
interpreter = PDFPageInterpreter(resource_manager, device)

# Output text file: output/text/<input-basename>_out.txt
filename = os.path.basename(sys.argv[1])
outputfilename = "output/text/" + os.path.splitext(filename)[0] + "_out.txt"
outputfile = open(outputfilename, 'w')
    def __init__(
            self,
            file,
            merge_tags=('LTChar', 'LTAnno'),
            round_floats=True,
            round_digits=3,
            input_text_formatter=None,
            normalize_spaces=True,
            resort=True,
            parse_tree_cacher=None,
    ):
        """Open *file* (path or file object) and wire up pdfminer parsing.

        :param file: file object or filepath string of the PDF
        :param merge_tags: layout tag names merged during tree building
        :param round_floats: whether to round coordinates
        :param round_digits: decimals kept when rounding
        :param input_text_formatter: callable applied to extracted text;
            when None and ``normalize_spaces`` is true, whitespace runs
            collapse to a single space
        :param normalize_spaces: see ``input_text_formatter``
        :param resort: stored flag, consumed elsewhere in the class
        :param parse_tree_cacher: optional parse-tree cache; a DummyCache
            is substituted when omitted
        :raises TypeError: when *file* is neither a file object nor a path
        """
        # store input
        self.merge_tags = merge_tags
        self.round_floats = round_floats
        self.round_digits = round_digits
        self.resort = resort

        # set up input text formatting function, if any
        if input_text_formatter:
            self.input_text_formatter = input_text_formatter
        elif normalize_spaces:
            r = re.compile(r'\s+')
            self.input_text_formatter = lambda s: re.sub(r, ' ', s)
        else:
            self.input_text_formatter = None

        # open doc
        if not hasattr(file, 'read'):
            try:
                file = open(file, 'rb')
            except TypeError:
                raise TypeError("File must be file object or filepath string.")
        parser = PDFParser(file)
        # pdfminer changed its parser/document wiring across releases, so the
        # construction order is chosen by feature detection rather than by
        # version number:
        if hasattr(QPDFDocument, 'set_parser'):
            # pdfminer < 20131022
            doc = QPDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
        else:
            # pdfminer >= 20131022
            doc = QPDFDocument(parser)
            parser.set_document(doc)
        if hasattr(doc, 'initialize'):
            # as of pdfminer==20140328, "PDFDocument.initialize() method is
            # removed and no longer needed."
            doc.initialize()
        self.doc = doc
        self.parser = parser
        self.tree = None
        self.pq = None
        self.file = file

        if parse_tree_cacher:
            self._parse_tree_cacher = parse_tree_cacher
            self._parse_tree_cacher.set_hash_key(self.file)
        else:
            self._parse_tree_cacher = DummyCache()

        # set up layout parsing
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, detect_vertical=True)
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

        # caches
        self._pages = []
        self._pages_iter = None
        self._elements = []
def main(argv):
    """Command-line entry point: convert PDF files to text/html/xml/tag.

    Parses pdf2txt-style options from *argv*, builds the requested output
    device, and runs every input file through pdfminer's ``process_pdf``.

    :param argv: full argument vector (argv[0] is the program name)
    :return: 100 when usage is printed (bad or missing arguments), else None
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()   # zero-based page numbers to process (empty = all)
    maxpages = 0      # 0 = no limit
    # output options (unused locals pageno/showpageno removed)
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    # Propagate the -d count to pdfminer's module-level debug flags.
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output filename when -t was not given.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # BUGFIX: the original used the Python-2-only builtin file(); this code
    # already uses print() as a function, so open() is required on Python 3.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        # BUGFIX: file() -> open() here as well.
        fp = open(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    password=password, caching=caching,
                    check_extractable=True)
        fp.close()
    device.close()
    outfp.close()
    return
def parse(): fp = open(path, 'rb') # 以二进制读模式打开 #用文件对象来创建一个pdf文档分析器 praser = PDFParser(fp) # 创建一个PDF文档 doc = PDFDocument() # 连接分析器 与文档对象 praser.set_document(doc) doc.set_parser(praser) # 提供初始化密码 # 如果没有密码 就创建一个空的字符串 doc.initialize() # 检测文档是否提供txt转换,不提供就忽略 if not doc.is_extractable: raise PDFTextExtractionNotAllowed else: # 创建PDf 资源管理器 来管理共享资源 rsrcmgr = PDFResourceManager() # 创建一个PDF设备对象 laparams = LAParams() device = PDFPageAggregator(rsrcmgr, laparams=laparams) # 创建一个PDF解释器对象 interpreter = PDFPageInterpreter(rsrcmgr, device) pdftext='' # 循环遍历列表,每次处理一个page的内容 pagei=1 x1 = 1 for page in doc.get_pages(): # doc.get_pages() 获取page列表 print(x1) x1=x1 +1 interpreter.process_page(page) # 接受该页面的LTPage对象 layout = device.get_result() # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象 一般包括LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等等 想要获取文本就获得对象的text属性, for x in layout: if (isinstance(x, LTTextBoxHorizontal)): #with open(r'E:\scrapy\json\1.txt', 'a') as f: str = x.get_text() results=str #results = str.replace('\n', ',') #print(results) # f.write(results + '\n') if (str.find('601668')>0): print(str) break #pdftext=pdftext+results pdftext = pdftext+'\n' pagei = pagei+1 print(pagei) start_keyword='重仓线' end_keyword='1,建仓线是指值得买入的价位,这个价位是相对低位,不存在追高风险。' # pdftext = pdftext.replace('\n', '') pat = re.compile(start_keyword + '(.*?)' + end_keyword, re.S) result = pat.findall(pdftext) print('result',result) filename=r'E:\scrapy\json\tushare.csv' convert2csv(result,filename)
# PDF文档的对象 doc = PDFDocument() # 链接解释器和文档 parser.set_document(doc) doc.set_parser(parser) # 初始化文档 doc.initialize('') # 没有密码,空字符串 # 创建PDF资源管理器 resource = PDFResourceManager() # 参数分析器 laparam = LAParams() # 创建一个聚合器 device = PDFPageAggregator(resource, laparams=laparam) # 创建页面解释器 interpreter = PDFPageInterpreter(resource, device) # 使用文档对象读取内容 for page in doc.get_pages(): # 使用页面解释器读取 interpreter.process_page(page) # 使用聚合器获得内容 layout = device.get_result()
def main(argv):
    """pdf2txt-style CLI: convert PDF files to text/html/xml/tag output.

    Parses the option string, builds the requested converter device, and
    feeds every input file page-by-page through a PDFPageInterpreter,
    applying the optional extra page rotation.

    :param argv: full argument vector (argv[0] is the program name)
    :return: 100 when usage is printed (bad or missing arguments), else None
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input options
    password = b''
    pagenos = set()   # zero-based page numbers (empty = all pages)
    maxpages = 0      # 0 = no limit
    # output options (unused locals pageno/showpageno removed)
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
    # Propagate the -d count to pdfminer's module-level debug flags.
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output filename when -t was not given.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                # Apply the user-requested extra rotation to each page.
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    device.close()
    outfp.close()
    return
def get_text_and_coordinates(pdf_path):
    """Extract room-number labels and their bounding boxes from a
    floor-plan PDF into a pandas DataFrame.

    :param pdf_path: path of the PDF; its last component is expected to
        look like "...-<level>..." so the room prefix can be derived
    :return: DataFrame with columns x0, y0, x1, y1, width, height, text
    NOTE(review): the file handle `fp` is never closed, and with a
    multi-page PDF only the final page's DataFrame is returned — confirm
    the inputs are single-page plans.
    """
    # Extract the room prefix from level in the pdf_path
    room_prefix = int(pdf_path.split(os.sep)[-1].split('-')[-1][:1]) - 1
    # Open a PDF file.
    fp = open(pdf_path, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Password for initialization as 2nd parameter
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    resource_manager = PDFResourceManager()
    # BEGIN LAYOUT ANALYSIS
    # Set parameters for analysis.
    la_params = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(resource_manager, laparams=la_params)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(resource_manager, device)

    def parse_obj(lt_objects):
        # (x0, y0) = Bottom left corner, (x1, y1) = Top right corner
        df_dictionary = {
            'x0': [],
            'y0': [],
            'x1': [],
            'y1': [],
            'width': [],
            'height': [],
            'text': []
        }
        # loop over the object list
        for obj in lt_objects:
            # if it's a textbox, print text and location
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                # Use some basic filtering: Remove letters, add hyphens,
                # ignore combined rooms
                text = re.sub('[^0-9]', '', obj.get_text())
                if not text.startswith(str(room_prefix)):
                    # Ignore noise that gives room numbers that cannot
                    # possibly belong to the floor
                    continue
                text_len = len(text)
                if text_len > 0:
                    bbox = obj.bbox
                    width = bbox[2] - bbox[0]
                    height = bbox[3] - bbox[1]
                    if text_len == 5:
                        # 5 digits: insert the "xxx.yy" separator dot.
                        text = text[:3] + '.' + text[3:]
                    elif text_len > 5 or text_len < 3:
                        # Currently just ignoring those few rooms which are
                        # problematic
                        continue
                    df_dictionary['x0'].append(bbox[0])
                    df_dictionary['y0'].append(bbox[1])
                    df_dictionary['x1'].append(bbox[2])
                    df_dictionary['y1'].append(bbox[3])
                    df_dictionary['width'].append(width)
                    df_dictionary['height'].append(height)
                    df_dictionary['text'].append(text)
            # if it's a container, recurse
            elif isinstance(obj, pdfminer.layout.LTFigure):
                parse_obj(obj._objs)
        return pd.DataFrame.from_dict(df_dictionary)

    # loop over all pages in the document
    for page in PDFPage.create_pages(document):
        # read the page into a layout object
        interpreter.process_page(page)
        layout = device.get_result()
        # extract text from this object
        df = parse_obj(layout._objs)
    return df
def processAddendaPdf(absDocUrl):
    """Download an addenda PDF and scrape its revision number.

    Scans the page text for a "HISTORY OF REVISIONS" heading, then collects
    up to two numeric tokens after it and joins them, sorted, with a dot.
    NOTE: Python-2-only code (print statements, urllib.urlopen, StringIO,
    pre-20131022 pdfminer API) — kept byte-identical.

    :param absDocUrl: absolute URL of the PDF document
    :return: dict {'revision': <string>}; on any failure the revision field
        carries an error message instead of raising
    """
    print 'Parsing addenda pdf %s ...' % absDocUrl
    # Fetch the PDF into an in-memory buffer so pdfminer can seek in it.
    pdfbin = urllib.urlopen(absDocUrl).read()
    cin = StringIO.StringIO()
    cin.write(pdfbin)
    cin.seek(0)
    parser = PDFParser(cin)
    doc = PDFDocument()
    parser.set_document(doc)
    try:
        doc.set_parser(parser)
        doc.initialize()
        assert doc.is_extractable
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        revHistFound = False
        revision = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            xlines, ylines, tlines = extractLinesText(layout)
            for tline in tlines:
                lineText = tline.get_text()
                if not revHistFound:
                    # First locate the revision-history heading ...
                    match = re.match('.*HISTORY OF REVISIONS.*', lineText)
                    if match:
                        revHistFound = True
                        print 'Revision History found'
                else:
                    # ... then collect the numeric tokens that follow.
                    match = re.match('(?P<version_info>\d+)\s*', lineText)
                    if match:
                        versionInfo = match.group('version_info')
                        revision.append(versionInfo)
                        if len(revision) == 2:
                            break
            # NOTE(review): this stops after the first page that yields ANY
            # number, so both numbers must sit on the same page or the
            # "Could not find" branch below fires — confirm intended.
            if len(revision) > 0:
                break
        if len(revision) < 2:
            raise BaseException('Could not find revision info')
        else:
            revision = '.'.join(sorted(revision))
    except BaseException as e:
        # Best-effort: report the error in the returned revision string.
        print 'ERROR: %s' % str(e)
        revision = 'ERROR while parsing the PDF: %s' % str(e)
    print 'Revision: %s' % revision
    data = {'revision': revision}
    return data
def parse():
    """Parse the PDF at module-level ``path`` into a positional table and
    write it to an Excel workbook.

    Characters are grouped into strings per text line, strings are bucketed
    into rows by their y-coordinate (via the module-level helpers
    is_not_in, insert_into_page_rows, get_page_rows_loc,
    insert_into_page_container and align_row), the last row and the first
    five rows are re-aligned, and the result is appended to the sheet.
    Uses the pdfminer.six API (PDFDocument(parser) + PDFPage.create_pages).
    """
    fp = open(path, 'rb')  # open in binary-read mode
    # Build a PDF parser from the file object.
    praser = PDFParser(fp)
    # Create the PDF document and connect parser and document.
    doc = PDFDocument(praser)
    praser.set_document(doc)
    # Create the resource manager for shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object (page aggregator).
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Iterate the page list, one page at a time.
    wb = Workbook()  # new Excel workbook
    ws = wb.active
    # Running row count across pages, so sheet rows stay contiguous.
    text_number = 0
    for page in PDFPage.create_pages(doc):  # page list of the document
        interpreter.process_page(page)  # feed this page's LTPage object
        # layout is an LTPage holding this page's parsed objects (LTTextBox,
        # LTFigure, LTImage, LTTextBoxHorizontal, ...); text is read from
        # each object's text attribute.
        layout = device.get_result()
        # Get the boxes.
        page_container = []  # all strings of this page, bucketed by row
        page_rows = []       # y-positions identifying the rows
        for text_box in layout:
            if (isinstance(text_box, LTTextBox)):
                # Walk the lines of the box.
                for text_line in text_box:
                    if (isinstance(text_line, LTTextLine)):
                        # Walk the characters of the line.
                        temp = []       # characters collected so far
                        temp_loc = []   # bbox of the collected string
                        isfirst = True  # first character of the string?
                        for text_index in text_line:
                            # Character datum: extend temp / temp_loc.
                            if (isinstance(text_index, LTChar)):
                                temp.append(text_index.get_text())
                                if isfirst == True:
                                    temp_loc.append(
                                        round(text_index.bbox[0], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[1], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[2], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[3], 3))
                                    isfirst = False
                                temp_loc[2] = round(text_index.bbox[2], 3)
                                temp_loc[3] = round(text_index.bbox[3], 3)
                            # LTText terminator: flush the collected string
                            # into page_container at the right row, then
                            # reset temp / temp_loc / isfirst.
                            elif (isinstance(text_index, LTText)):
                                # If page_rows has no entry for this row yet,
                                # insert the row position and the string.
                                if is_not_in(page_rows, temp_loc[1]):
                                    insert_loc = insert_into_page_rows(
                                        page_rows, temp_loc[1])
                                    page_container.insert(
                                        insert_loc, [{
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        }])
                                # The row already exists: insert into it.
                                elif not is_not_in(page_rows,
                                                   temp_loc[1]):
                                    loc = get_page_rows_loc(
                                        page_rows, temp_loc[1])
                                    temp_list = insert_into_page_container(
                                        page_container[loc], {
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        })
                                    page_container[loc] = temp_list[:]
                                temp = []
                                temp_loc = []
                                isfirst = True
        rows_num = len(page_container)
        # Re-align the last row against the second-to-last row: cells are
        # matched by horizontal overlap, gaps are padded with None.
        if len(page_container[rows_num - 1]) != len(
                page_container[rows_num - 2]):
            loc_for_no2 = []
            loc_for_no1 = []
            adjust_for_no1 = []
            temp_array = page_container[rows_num - 1][:]
            for i in page_container[rows_num - 2]:
                loc_for_no2.append([i['location'][0], i['location'][2]])
            for i in page_container[rows_num - 1]:
                loc_for_no1.append([i['location'][0], i['location'][2]])
            for i in range(len(loc_for_no1)):
                for j in range(len(loc_for_no2)):
                    if not (loc_for_no1[i][0] > loc_for_no2[j][1]
                            or loc_for_no1[i][1] < loc_for_no2[j][0]):
                        adjust_for_no1.append(j)
                        break
            page_container[rows_num - 1] = []
            for i in range(len(page_container[rows_num - 2])):
                if i in adjust_for_no1:
                    page_container[rows_num - 1].append(
                        temp_array[adjust_for_no1.index(i)])
                else:
                    page_container[rows_num - 1].append(None)
        # Re-align the first five rows: pad shorter rows against the longest
        # of the first six rows.
        if len(page_container[0]) != len(page_container[1]) or len(
                page_container[1]) != len(page_container[2]) or len(
                    page_container[2]) != len(page_container[3]) or len(
                        page_container[3]) != len(page_container[4]):
            rows_length = []
            the_max_row = []
            new_max_row = []
            for i in range(6):
                rows_length.append(len(page_container[i]))
            max_length = max(rows_length)
            the_max_row = page_container[rows_length.index(max_length)][:]
            for i in range(len(rows_length)):
                if rows_length[i] < max_length:
                    page_container[i] = align_row(the_max_row,
                                                  page_container[i])
        # Detect the table header.
        # Verification output.
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                print(page_container[i][j])
        # Write this page's data into the Excel sheet.
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                cell_index = ws.cell(row=i + 1 + text_number, column=j + 1)
                if page_container[i][j] == None:
                    cell_index.value = ' '
                else:
                    cell_index.value = page_container[i][j]['value']
        # Advance text_number so pages remain contiguous in the sheet.
        text_number += rows_num
    wb.save(r'C:\Users\15644\Desktop\pdf_file\test_pdf_list\test_1.xlsx')
def anotate_pdf(file_path, sht, query_dict):
    """Highlight every occurrence of each query string in a PDF.

    Uses pdfminer to locate the text coordinates of each query, then PyPDF2
    to add highlight annotations and write "<name>_highlighted<ext>" next to
    the input file.  Reports completion into cell B2 of *sht*.

    :param file_path: path of the PDF to annotate
    :param sht: xlwings-style sheet used for the status message
    :param query_dict: mapping of query string -> highlight color
    """
    # preparing the output file name
    path = pathlib.Path(file_path).parent
    extension = pathlib.Path(file_path).suffix
    name = pathlib.Path(file_path).name[:-len(extension)]
    result_file = str(path) + '\\' + name + '_highlighted' + extension
    #=========================================================
    # create a parser object associated with the file object
    parser = PDFParser(open(file_path, 'rb'))
    # create a PDFDocument object that stores the document structure
    doc = PDFDocument(parser)
    # Layout Analysis
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # create pdf layout - this is list with layout of every page
    layout = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout.append(device.get_result())
    # add tooltip info; not sure how to use this option in the most
    # useful way
    m_meta = {"author": "AK", "contents": "HL text1"}
    outputStream = open(result_file, "wb")
    pdfInput = PdfFileReader(open(file_path, 'rb'), strict=True)
    pdfOutput = PdfFileWriter()
    npage = pdfInput.numPages
    # PERF: the original recomputed every page's coordinates for every
    # (page, query) pair — O(pages^2 * queries) layout scans.  The
    # coordinate list depends only on the query, so compute it once per
    # query up front; the per-page loop below is unchanged otherwise.
    coords_by_query = {
        query: [get_page_coordinates(page, query) for page in layout]
        for query in query_dict
    }
    for pgn in range(0, npage):
        for query, all_coor in coords_by_query.items():
            page_hl = pdfInput.getPage(pgn)
            for item in all_coor[pgn]:
                highlight = create_highlight(item[0], item[1], item[2],
                                             item[3], m_meta,
                                             color=query_dict[query])
                highlight_ref = pdfOutput._addObject(highlight)
                if "/Annots" in page_hl:
                    page_hl[NameObject("/Annots")].append(highlight_ref)
                else:
                    page_hl[NameObject("/Annots")] = ArrayObject(
                        [highlight_ref])
        pdfOutput.addPage(page_hl)
    # save HL to new file
    pdfOutput.write(outputStream)
    outputStream.close()
    sht.range('B2').value = f'File {name+extension} completed'
def parse(file_name):
    """Parse a score-report PDF into a list of per-student dicts.

    Every horizontal text box on every page is run through the module-level
    ``get_*`` field extractors; the first box of each even page is treated
    as the course name.  Uses the old pdfminer API (PDFDocument() +
    set_parser + initialize).

    :param file_name: path of the PDF file
    :return: list of dicts with keys name, LASID, DOB, Grade, RD, School,
        District, Score, Score_level, low_top (and course on the first dict)
    """
    fp = open(file_name, 'rb')
    praser = PDFParser(fp)
    doc = PDFDocument()
    praser.set_document(doc)
    doc.set_parser(praser)
    useful = []
    doc.initialize()  # empty password
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        page_number = 0
        temp_use = []
        # Template record, re-created after each page is stored.
        temp_dict = {
            "name": "",
            "LASID": "",
            "DOB": "",
            "Grade": "",
            "RD": "",
            "School": "",
            "District": "",
            "Score": "",
            "Score_level": "",
            "low_top": "",
            "course": "",
        }
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            read_flag = 0
            for x in layout:
                if (isinstance(x, LTTextBoxHorizontal)):
                    results = x.get_text()
                    print(results)
                    # The first box of an even page carries the course name.
                    if page_number % 2 == 0 and read_flag == 0:
                        temp_dict["course"] = results.split("\n")[0]
                        read_flag = 1
                        continue
                    else:
                        # Try every field extractor on this text chunk;
                        # each stores its value only when it matches.
                        if get_name(results):
                            temp_dict["name"] = get_name(results)
                        if get_LASID(results):
                            temp_dict["LASID"] = get_LASID(results)
                        if get_DOB(results):
                            temp_dict["DOB"] = get_DOB(results)
                        if get_Grade(results):
                            temp_dict["Grade"] = get_Grade(results)
                        if get_RD(results):
                            temp_dict["RD"] = get_RD(results)
                        if get_School(results):
                            temp_dict["School"] = get_School(results)
                        if get_District(results):
                            temp_dict["District"] = get_District(results)
                        if get_Score(results):
                            temp_dict["Score"] = get_Score(results)
                        if get_Score_level(results):
                            temp_dict["Score_level"] = get_Score_level(results)
                        if get_low_top(results):
                            temp_dict["low_top"] = get_low_top(results)
            # NOTE(review): page_number is never incremented and `if 1:`
            # stores a record for EVERY page — the commented-out code
            # upstream suggests a one-record-per-two-pages design that was
            # abandoned; also note the reset dict drops the "course" key.
            if 1:
                useful.append(temp_dict)
                temp_dict = {
                    "name": "",
                    "LASID": "",
                    "DOB": "",
                    "Grade": "",
                    "RD": "",
                    "School": "",
                    "District": "",
                    "Score": "",
                    "Score_level": "",
                    "low_top": "",
                }
    return useful
def parse():
    """Parse the configured schedule PDF: per page, collect text boxes,
    rectangles, dates and hour marks, rebuild the timetable grid, match
    course keywords, optionally emit a debug SVG and text log, and finally
    write the matched course dates via ``write()``.

    Relies on module-level state: cfg, data, keywords, LOG_TEXTS,
    CREATE_SVG, SHOW_DATELINES, SHOW_TEXTBOXES and the grid/line helpers.
    """
    with open("schedule/{}".format(cfg.get("schedule_file")), "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # NOTE(review): this PDFDevice is immediately overwritten by the
        # PDFPageAggregator below — dead assignment kept as-is.
        device = PDFDevice(rsrcmgr)
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        def parse_obj(lt_objs):
            # Harvest text boxes, rectangles, dates and hour marks from one
            # page's layout objects, recursing into figures.
            for obj in lt_objs:
                if isinstance(obj, LTTextBoxHorizontal):
                    coor = getTextCoords(obj.bbox[0:2])
                    text = obj.get_text().replace('\n', ' ')
                    # check if content contains a date
                    match = re.search(r"\d{2}/\d{2}/\d{4}", text)
                    if match:
                        data["dates"].append({
                            "date": match.group(),
                            "coords": coor
                        })
                    # hour marks like 8:30 / 12:05, zero-padded to 5 chars
                    match = re.findall(r"\d{1,2}:\d{2}", text)
                    if match:
                        data["hours"].append({
                            "hours":
                            list(map(lambda x: "{0:0>5}".format(x), match)),
                            "coords": coor
                        })
                    data["textboxes"].append([coor, text, ""])
                if isinstance(obj, LTRect):
                    data["rects"].append(getRectCoords(obj.bbox[0:4]))
                if isinstance(obj, LTFigure):
                    parse_obj(obj._objs)

        if LOG_TEXTS:
            # Truncate the text log before appending page-by-page below.
            with open("outputs/" + cfg.get("folder") + "/pdf_texts.txt",
                      "w", encoding="utf8") as log:
                log.write("")
        with open("outputs/" + cfg.get("folder") + "/pdf_svg.html",
                  "w", encoding="utf8") as svg:
            ''' SVG HEAD '''
            if CREATE_SVG:
                svg.write(
                    "<style type=\"text/css\">svg{stroke:#000;stroke-width:1;fill:none}</style>\n"
                )
            i = 0
            # loop over all pages in the document
            for page in PDFPage.create_pages(document):
                # read the page into a layout object
                interpreter.process_page(page)
                layout = device.get_result()
                ''' CREATE SVG '''
                if CREATE_SVG:
                    svg.write(
                        "<svg id=\"s{}\" width=\"1200\" height=\"600\">\n".
                        format(i))
                # reset the per-page collections
                data["rects"] = []
                data["textboxes"] = []
                data["dates"] = []
                data["datelines"] = []
                data["hours"] = []
                # extract info from this page
                parse_obj(layout._objs)
                # rebuild the timetable grid from the collected rectangles
                lines = rectsToLines(data["rects"])
                lines = mergeLines(lines)
                lines.sort(key=lambda x: x[1][1])
                lines.sort(key=lambda x: x[0][1])
                grid = createGrid(lines)
                data["textboxes"] = mergeTexts(grid, data["textboxes"])
                data["textboxes"] = splitSimultaneousCourses(
                    data["textboxes"])
                data["hours"].sort(key=lambda x: x["coords"][1])
                if data["hours"]:
                    calcHourBoundaries(grid)
                if data["dates"]:
                    calcDateBoundaries(grid)
                # keyword matching for each textbox
                for t in data["textboxes"]:
                    t[1] = " ".join(t[1].split())
                    res = keywords.match(format_text(t[1]))
                    if len(res["indexes"]) == 1:
                        data["courses"][res["indexes"][0]] = {
                            "coords": t[0],
                            "date": getDate(t[0])
                        }
                        t[2] = " (match: {})".format(res["titles"][0])
                ''' DRAW LINES '''
                if CREATE_SVG:
                    minX, maxX = 1e10, 0
                    for l in lines:
                        svg.write(
                            "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#{}\"></line>\n"
                            .format(l[0][0], l[0][1], l[1][0], l[1][1],
                                    randomColor()))
                        if l[0][0] < minX:
                            minX = l[0][0]
                        if l[1][0] > maxX:
                            maxX = l[1][0]
                    if SHOW_DATELINES:
                        for h in data["hours"]:
                            svg.write(
                                "<circle cx=\"{}\" cy=\"{}\" r=\"1\" stroke=\"red\"></circle>\n"
                                .format(h["coords"][0], h["coords"][1]))
                        for d in data["dates"]:
                            if d["boundaries"][0] != 0 and d["boundaries"][
                                    1] != 0:
                                svg.write(
                                    "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#111111\"></line>\n"
                                    .format(minX, d["boundaries"][0], maxX,
                                            d["boundaries"][0]))
                                svg.write(
                                    "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#111111\"></line>\n"
                                    .format(minX, d["boundaries"][1], maxX,
                                            d["boundaries"][1]))
                    if SHOW_TEXTBOXES:
                        for t in data["textboxes"]:
                            svg.write(
                                "<text x=\"{}\" y=\"{}\" font-size=\"4\" font-weight=\"lighter\">{}</text>\n"
                                .format(t[0][0], t[0][1], t[1][:5]))
                if LOG_TEXTS:
                    with open(
                            "outputs/" + cfg.get("folder") +
                            "/pdf_texts.txt", "a", encoding="utf8") as log:
                        for t in data["textboxes"]:
                            log.write("{}, {}, {}{}\n".format(
                                t[0][0], t[0][1], t[1], t[2]))
                ''' CLOSE SVG '''
                if CREATE_SVG:
                    svg.write('</svg>' + "\n")
                i += 1
    # Reduce the matched courses to key -> date and hand off to write().
    coursedates = {}
    for key, c in data["courses"].items():
        coursedates[key] = c["date"]
    write(coursedates)
def pdf2text(stream: IO[bytes]) -> TextIOWrapper:
    """Convert a binary PDF stream into a readable UTF-8 text stream."""
    buffer = BytesIO()
    # Let pdfminer render every page's text into the in-memory buffer.
    extract_text_to_fp(stream, buffer, laparams=LAParams())
    # Rewind so the caller starts reading from the first byte.
    buffer.seek(0)
    return TextIOWrapper(buffer, "utf-8")
def pdf_to_csv(filename, separator, threshold):
    """Convert a PDF to a CSV-like text file at 'txt\\<filename>.txt'.

    Characters on each page are bucketed into rows by their (negated) y
    coordinate; within a row, ``separator`` is inserted wherever the gap
    between two characters exceeds ``threshold`` times the row's average
    character gap.

    :param filename: path of the PDF to convert.
    :param separator: column separator written into the output file.
    :param threshold: multiplier of the average gap that marks a column break.
    :return: 0 on completion.
    """
    #from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda: {})
            for child in self.cur_item._objs:  # <-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    # Keep the character as str: outfp is a text-mode file, and
                    # joining encoded bytes with a str separator fails on Python 3.
                    line[x] = child._text  # <-- changed (was .encode(self.codec))
            for y in sorted(lines.keys()):
                line = lines[y]
                # BUG FIX: line_creator() was called twice per row, with the
                # first result discarded.
                self.outfp.write(self.line_creator(line))
                self.outfp.write("\n")

        def line_creator(self, line):
            keys = sorted(line.keys())
            # Guard: an empty row would divide by zero / IndexError below.
            if not keys:
                return ''
            # calculate the average distance between each character on this row
            average_distance = sum(
                [keys[i] - keys[i - 1] for i in range(1, len(keys))]) / len(keys)
            # append the first character to the result
            result = [line[keys[0]]]
            for i in range(1, len(keys)):
                # if the distance between this character and the last character
                # is greater than the average*threshold
                if (keys[i] - keys[i - 1]) > average_distance * self.threshold:
                    # append the separator into that position
                    result.append(self.separator)
                # append the character
                result.append(line[keys[i]])
            return ''.join(result)

    # the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    ft = 'txt\\' + filename + '.txt'
    outfp = open(ft, 'w')
    #outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    # because my test documents are utf-8 (note: utf-8 is the default codec)
    fp = open(filename, 'rb')
    interpreter = PDFPageInterpreter(rsrc, device)
    for i, page in enumerate(PDFPage.get_pages(fp)):
        outfp.write("START PAGE %d\n" % i)
        if page is not None:
            interpreter.process_page(page)
        else:
            print('none')  # was a Python 2 print statement
        outfp.write("END PAGE %d\n" % i)
    device.close()
    fp.close()
    outfp.close()
    #return outfp.getvalue()
    return 0
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# Buffer that accumulates the text of every page.
text_buffer = StringIO()
with open('simple1.pdf', 'rb') as in_file:
    pdf_parser = PDFParser(in_file)
    document = PDFDocument(pdf_parser)
    manager = PDFResourceManager()
    converter = TextConverter(manager, text_buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(manager, converter)
    # Feed each page through the interpreter; text lands in text_buffer.
    for pdf_page in PDFPage.create_pages(document):
        page_interpreter.process_page(pdf_page)

print(text_buffer.getvalue())
def createDeviceInterpreter():
    """Build a layout-aggregating device plus a matching page interpreter.

    :return: tuple (PDFPageAggregator, PDFPageInterpreter) sharing one
             resource manager.
    """
    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=LAParams())
    return aggregator, PDFPageInterpreter(manager, aggregator)
import matplotlib.patches as patches
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine

with open('/home/cyan/Downloads/Barron-s-1100-Words-You-Need-table.pdf',
          'rb') as pdf_doc:
    # Wire the parser and document together (old pdfminer API).
    pdf_parser = PDFParser(pdf_doc)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')  # empty password

    manager = PDFResourceManager()
    layout_params = LAParams()
    layout_params.char_margin = 1.0
    layout_params.word_margin = 1.0
    aggregator = PDFPageAggregator(manager, laparams=layout_params)
    page_interpreter = PDFPageInterpreter(manager, aggregator)

    # One list of LTTextBox objects per page.
    extracted_text = []
    for pdf_page in document.get_pages():
        page_interpreter.process_page(pdf_page)
        page_layout = aggregator.get_result()
        extracted_text.append(
            [obj for obj in page_layout if isinstance(obj, LTTextBox)])
# print("aaaa") # # print ("fontname %s"%c.fontname) # # print ("fontname %s"%c.fontsize) # if it's a container, recurse elif isinstance(objs[i], pdfminer.layout.LTFigure): parse_obj_title(objs[i]._objs) else: pass # print("adhjabjha",new_text) return new_text document = open('2b_Agents.pdf', 'rb') #Create resource manager rsrcmgr = PDFResourceManager() # Set parameters for analysis. laparams = LAParams() # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) pages = PDFPage.create_pages(document) # interpreter.process_page(pages) #page is the iterator of the pages, it is for one single page object title = [] content = [] for page in PDFPage.get_pages(document): interpreter.process_page(page) layout = device.get_result() if layout.pageid > 1: # print ("aaa") if parse_obj(layout._objs) == True:
def convert_pdf_to_txt_csv(path):
    """Extract sentences from a PDF into a DataFrame with page metadata.

    :param path: path to the PDF file to parse.
    :return: pandas.DataFrame with columns 'Page No.', 'Sentence',
             'Sentence No. of Doc', 'Source Document' and 'Country'
             (country is taken from the filename prefix before '_').
    """
    fp = open(path, 'rb')
    try:
        # Wire parser and document together (old pdfminer API).
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        page_list = []
        sent_list = []
        n = 0
        for page in doc.get_pages():
            n += 1
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    t = lt_obj.get_text()
                    t = re.sub('\\d+', ' ', t)      # drop digits (page numbers etc.)
                    t = t.replace('.....', ' ')      # drop TOC dot leaders
                    # NOTE(review): '[\s+]' matches whitespace OR a literal '+';
                    # probably meant r'\s+' — kept as-is to preserve behavior.
                    t = re.sub('[\s+]', ' ', t)
                    t = re.sub(' +', ' ', t)         # collapse runs of spaces
                    t = sent_tokenize(t)
                    if len(t) > 0:
                        for each in t:
                            sent_list.append(each)
                            page_list.append(n)
                    else:
                        # no sentences: record an empty entry for this page
                        page_list.append(n)
                        sent_list.append(''.join(t))
    finally:
        # BUG FIX: the original leaked the file handle on any exception.
        fp.close()

    # Build the DataFrame once (was created twice in the original).
    df = pd.DataFrame()
    df['Page No.'] = page_list
    df['Sentence'] = sent_list
    # Clean dataframe
    df = df[df.Sentence != ' ']
    # Keep only sentences longer than 10 characters
    mask = (df['Sentence'].str.len() > 10)
    df = df.loc[mask]
    # Remove duplicates
    df = df.drop_duplicates('Sentence', keep='first', inplace=False)
    # Sentence number within each page
    df['Sentence No. of Doc'] = df.groupby('Page No.').cumcount() + 1
    # FileName
    fileName = os.path.basename(path)
    df['Source Document'] = fileName
    # Country Name
    Country = fileName.split('_')[0]
    df['Country'] = Country
    return df
def pdf2txt(self):
    '''
    Convert the PDF at ``self.input_path`` to a UTF-8 text file.

    If ``self.output_path`` is None it is derived from the input path
    ('<input stem>_trans.txt'). A tqdm progress bar tracks the pages.
    =============================
    return : str, text File path
    '''
    # extraction parameters
    password = ''
    pagenos = set()
    maxpages = 0
    imagewriter = None
    rotation = 0
    codec = 'UTF-8'
    caching = True
    laparams = LAParams()

    # BUG FIX: 'is None' instead of '== None'; the two branches of the
    # original if/else opened the same file identically, so collapse them.
    if self.output_path is None:
        self.output_path = self.input_path[:-4] + '_trans.txt'

    infp = open(self.input_path, "rb")
    outfp = open(self.output_path, "w", encoding='UTF8')
    try:
        # Total page count, read from the document catalog, for the bar.
        parser = PDFParser(infp)
        document = PDFDocument(parser)
        page_total_num = resolve1(document.catalog['Pages'])['Count']

        rsrcmgr = PDFResourceManager(caching=caching)
        # pdf -> text converter
        # NOTE(review): recent pdfminer.six rejects 'codec' with a text-mode
        # outfp — kept for compatibility with the pinned version; verify.
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
        # pdf -> text interpreter
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # pdf -> text start
        with tqdm(total=page_total_num) as pbar:
            for page in PDFPage.get_pages(infp,
                                          pagenos,
                                          maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
                pbar.update(1)
        print('[INFO] pdf -> text')
    finally:
        # BUG FIX: handles were leaked if any page raised.
        outfp.close()
        infp.close()
    return self.output_path
def parse(DataIO, save_path, start=None, end=None):
    """Parse a PDF stream page by page, dumping text (and figures) per page.

    Each page's text is written to './text/<page_num>.txt'; LTFigure and
    LTImage objects are delegated to parse_lt_figure / save_image.

    :param DataIO: file-like object holding the PDF data.
    :param save_path: unused here (per-page files are written instead).
    :param start: first 1-based page number to process, or None for all.
    :param end: last page number (inclusive), or None for all.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Create a PDF parser from the file object
    parser = PDFParser(DataIO)
    # Create the PDF document
    doc = PDFDocument(parser)
    # Link parser and document together
    parser.set_document(doc)
    #doc.set_parser(parser)
    # Supply the initial password (empty by default)
    #doc.initialize()
    # If the document cannot be converted to text, abort
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Resource manager for shared resources
        rsrcmagr = PDFResourceManager()
        # Layout parameters for the device
        laparams = LAParams()
        # Aggregate the resource manager and the device
        device = PDFPageAggregator(rsrcmagr, laparams=laparams)
        # PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmagr, device)
        # Iterate over the pages, handling one page at a time
        #pages = PDFPage.get_pages(doc)
        #for page in pages:
        page_num = 0
        for page in PDFPage.create_pages(doc):
            page_num = page_num + 1
            if start is not None and end is not None:
                if page_num < start:
                    continue
                if page_num > end:
                    break
            interpreter.process_page(page)
            # Receive the LTPage object for this page
            layout = device.get_result()
            # BUG FIX: 'with' guarantees the per-page file is closed even if
            # an object handler raises (the original leaked it on error).
            with open('./text/' + str(page_num) + '.txt', 'w') as f:
                # 'layout' is an LTPage holding the parsed objects: usually
                # LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, etc.
                # Text is obtained through each object's get_text().
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        # extract the text
                        result = x.get_text()
                        try:
                            print(
                                "***************** LTTextBoxHorizontal ************"
                            )
                            print(result)
                            # write it to the page file
                            f.write(result + "\n")
                        except Exception:
                            # BUG FIX: narrowed from a bare 'except:' so
                            # KeyboardInterrupt/SystemExit still propagate.
                            print('写入文件错误', result)
                    if isinstance(x, LTTextBox):
                        print("***************** LTTextBox ************")
                        print(x.get_text())
                    if isinstance(x, LTFigure):
                        print("***************** LTFigure ************")
                        parse_lt_figure(x, page_num, f)
                    if isinstance(x, LTImage):
                        print("***************** LTImage ************")
                        save_image(x, page_num)  # was bound to unused 'saved_file'
                        print('save image ' + x.name)
                    if isinstance(x, LTChar):
                        print('ppppppppppppppp')
                        print(x.get_text())
                        f.write(x.get_text())
                    if isinstance(x, LTCurve):
                        print("***************** LTCurve ************")
def pdf2csv(fp):
    """Render the table grid of each PDF page to 'out<pageno>.png'.

    Collects the x/y coordinates of every LTRect/LTLine on a page, merges
    near-duplicate coordinates via filterclose(), and draws the resulting
    cell rectangles into a 1-bit image per page.

    :param fp: binary file object of the PDF to analyse.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        #import code; code.interact(local=locals());
        hlines = []
        vlines = []
        for i in layout:
            if not type(i) in (LTRect, LTLine):
                continue
            hlines.append(int(i.x0))
            hlines.append(int(i.x1))
            # flip y: PDF origin is bottom-left, image origin is top-left
            vlines.append(int(layout.height - i.y0))
            vlines.append(int(layout.height - i.y1))
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        # BUG FIX: were Python 2 print statements ('print hlines'), a
        # syntax error under Python 3; parenthesized form works on both.
        print(hlines)
        print(vlines)
        print(layout.width, layout.height)
        i = 0
        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        while (i < len(vlines) - 1):
            # skip grid gaps narrower than 5px
            if not vlines[i + 1] - vlines[i] > 5:
                i = i + 1
                continue
            j = 0
            while (j < len(hlines) - 1):
                if not hlines[j + 1] - hlines[j] > 5:
                    j = j + 1
                    continue
                draw.rectangle([(int(hlines[j]), int(vlines[i])),
                                (int(hlines[j + 1]), int(vlines[i + 1]))],
                               outline=1)
                j = j + 1
            i = i + 1
        del draw
        # BUG FIX: the original reused the name 'fp' here, clobbering the
        # PDF file-object parameter; use a distinct handle for the PNG.
        png_fp = open("out%s.png" % pageno, 'wb')
        im.save(png_fp, "PNG")
        png_fp.close()
def writeCSV(dirpdf):
    """Extract text from every PDF in ``dirpdf`` and write it to pdfFiles.csv.

    Falls back to OCR (extract_text_image on a pre-rendered JPG in IMAGES/)
    when a page yields almost no text, an implausibly large amount of text,
    or raises while being processed.

    :param dirpdf: directory to scan for .pdf/.PDF files.
    """
    nameCSVfile = 'pdfFiles.csv'
    codec = 'utf-8'
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    save_dir = 'IMAGES'
    with open(nameCSVfile, 'w') as csvFile:
        fields = ['File', 'Kind', 'Text']
        # dir of folder and filter for pdf files
        files = [
            f for f in os.listdir(dirpdf)
            if os.path.isfile(os.path.join(dirpdf, f))
        ]
        files = list(filter(lambda f: f.endswith(('.pdf', '.PDF')), files))
        # variables for print information
        cnt_files = len(files)
        i = 0
        writer = csv.DictWriter(csvFile, fieldnames=fields)
        writer.writeheader()
        for filepdf in files:
            # BUG FIX: was 'row = dict()'. writerows() needs a list of dicts,
            # and the original only assigned 'row' on the fallback/error
            # paths — cleanly parsed PDFs were never written to the CSV.
            row = []
            data = ''
            try:
                filename = os.path.join(dirpdf, filepdf)
                fp = open(filename, 'rb')
                try:
                    rsrcmgr = PDFResourceManager()
                    retstr = StringIO()
                    laparams = LAParams()
                    device = TextConverter(rsrcmgr,
                                           retstr,
                                           codec=codec,
                                           laparams=laparams)
                    # Create a PDF interpreter object.
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    # Process each page contained in the document.
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=False):
                        try:
                            interpreter.process_page(page)
                            data = retstr.getvalue()
                            if (len(data) < 2 or len(data) > 100000):
                                # implausible text volume: OCR the rendered page
                                base_filename = os.path.splitext(
                                    os.path.basename(filename))[0] + '.jpg'
                                imgPath = os.path.join(save_dir, base_filename)
                                data = extract_text_image(imgPath)
                                row = [{
                                    'File': filepdf,
                                    'Kind': filepdf.split('.')[0],
                                    'Text': data
                                }]
                        except Exception as ex:
                            print(filepdf)
                            print(ex)
                            base_filename = os.path.splitext(
                                os.path.basename(filename))[0] + '.jpg'
                            imgPath = os.path.join(save_dir, base_filename)
                            data = extract_text_image(imgPath)
                            row = [{
                                'File': filepdf,
                                'Kind': filepdf.split('.')[0],
                                'Text': data
                            }]
                            break
                    # Normal extraction: no fallback fired, so record the
                    # accumulated text (the original silently dropped it).
                    if not row:
                        row = [{
                            'File': filepdf,
                            'Kind': filepdf.split('.')[0],
                            'Text': data
                        }]
                    # Cleanup
                    device.close()
                    retstr.close()
                finally:
                    # BUG FIX: the PDF handle was never closed.
                    fp.close()
            except Exception as ex:
                print(filepdf)
                print(ex)
                row = [{
                    'File': filepdf,
                    'Kind': filepdf.split('.')[0],
                    'Text': 'Exception'
                }]
            i += 1
            # show an update every 50 pdf
            if (i > 0 and i % 50 == 0):
                print("[INFO] processed {}/{}".format(i, cnt_files))
            writer.writerows(row)
        # note: no explicit csvFile.close() — the 'with' block handles it
def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech)
    :param password: CAS pdf password
    :return: array of lines from the CAS.
    """
    file_type: Optional[FileType] = None

    # Accept a path, an io stream, or anything file-like (e.g. Django's
    # UploadedFile, which only guarantees a read() method).
    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif isinstance(filename, io.IOBase) or hasattr(filename, "read"):
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect:
            raise CASParseError("Incorrect PDF password!")
        except PDFSyntaxError:
            raise CASParseError("Unhandled error while opening file")

        # Kfintech statements need a tighter line margin than CAMS ones.
        margins = {FileType.KFINTECH: 0.1, FileType.CAMS: 0.2}
        line_margin = margins.get(detect_pdf_source(document), 0.2)

        resource_manager = PDFResourceManager()
        layout_params = LAParams(line_margin=line_margin, detect_vertical=True)
        aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

        pages: List[Iterator[LTTextBoxHorizontal]] = []
        investor_info = None
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
            layout = aggregator.get_result()
            text_elements = filter(
                lambda x: isinstance(x, LTTextBoxHorizontal), layout)

            # Identify the statement source from vertical watermark text
            # the first time we see it.
            if file_type is None:
                for element in layout:
                    if not isinstance(element, LTTextBoxVertical):
                        continue
                    watermark = element.get_text()
                    if re.search("CAMSCASWS", watermark):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", watermark):
                        file_type = FileType.KFINTECH

            # Investor details appear once; parse them on first sight.
            if investor_info is None:
                investor_info = parse_investor_info(layout, *page.mediabox[2:])

            pages.append(text_elements)

        lines = group_similar_rows(pages)
        return PartialCASData(
            file_type=file_type, investor_info=investor_info, lines=lines)