def get_tables(fh):
    """
    Return a list of 'tables' from the given file handle, where a table is a
    list of rows, and a row is a list of strings.

    Pages rejected by ``page_contains_tables`` are skipped. Every other page
    is rendered, converted with ``page_to_tables`` (no y-extension, no hints,
    atomised), cropped with ``crop_table`` and wrapped as
    ``Table(table, page_index + 1, page_count, 1, 1)``.
    """
    doc, interpreter, device = initialize_pdf_miner(fh)
    # One full walk over the page generator, only to learn the page count.
    total_pages = len(list(PDFPage.create_pages(doc)))

    tables = []
    for index, pdf_page in enumerate(PDFPage.create_pages(doc)):
        if page_contains_tables(pdf_page, interpreter, device):
            # Render the page and fetch the LTPage object produced for it.
            interpreter.process_page(pdf_page)
            layout = device.get_result()
            table, _ = page_to_tables(
                layout, extend_y=False, hints=[], atomise=True)
            crop_table(table)
            tables.append(Table(table, index + 1, total_pages, 1, 1))
    return tables
def GetScript(filename): global scriptName ResetGlobals() scriptName = filename password = "" # Open a PDF file. fp = open(filename, 'rb') # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. # Supply the password for initialization. document = PDFDocument(parser, password) # Check if the document allows text extraction. If not, abort. if not document.is_extractable: print "---Not translatable---" return #raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() # Create a PDF device object. device = PDFDevice(rsrcmgr) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. for page in PDFPage.create_pages(document): interpreter.process_page(page) # Set parameters for analysis. laparams = LAParams() laparams.boxes_flow = 2 # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) for pgnum,page in enumerate(PDFPage.create_pages(document)): if pgnum == 0: continue interpreter.process_page(page) # receive the LTPage object for the page. layout = device.get_result() text = [] for page in layout: try: if page.get_text().strip(): text.append(TextBlock(page.x0,page.y1,page.get_text().strip())) except: temp=5 print ".", text.sort(key = lambda row:(-row.y)) # Parse all of the "line" objects in each page for line in text: ParseLine(line.text, line.x)
def calculate_locations(filename,keywords): locations = [] fp = open(filename, 'rb') parser = PDFParser(fp) # Create a PDF document object that stores the document structure. # Supply the password for initialization. document = PDFDocument(parser) # Check if the document allows text extraction. If not, abort. if not document.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() # Create a PDF device object. device = PDFDevice(rsrcmgr) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. for page in PDFPage.create_pages(document): interpreter.process_page(page) #Set parameters for analysis. laparams = LAParams() # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) pages = PDFPage.create_pages(document) pagenum = 0 reader = PdfFileReader(file(filename,"rb")) for page in pages: interpreter.process_page(page) # receive the LTPage object for the page. layout = device.get_result() page = reader.getPage(pagenum) x = page.trimBox[0].as_numeric() y = page.trimBox[1].as_numeric() #Handling special case if (x > 0 and y < 0): x = 0 # print "At page = %s X = %s , y = %s"%(pagenum,x,y) for keyword in keywords: print '********************************' co_ordinates = get_location(keyword,layout,x,y) print'Keyword %s , location %s'%(keyword,co_ordinates) print '********************************' if co_ordinates != None : for location in co_ordinates: print "PageNum-->%s"%pagenum l = LocationKeeper(keyword,location,pagenum) locations.append(l) pagenum+=1 return locations
def dwn_pdf_txt(url):
    """
    Given a readable but encrypted PDF URL, parses document to text.

    The response body is wrapped in an in-memory buffer, every page is run
    through a ``TextConverter`` (utf-8 codec, default ``LAParams``) and the
    accumulated text is returned.
    """
    response = requests.get(url)
    pdf_buffer = StringIO(response.content)
    document = PDFDocument(PDFParser(pdf_buffer))

    resource_manager = PDFResourceManager()
    text_sink = StringIO()
    converter = TextConverter(
        resource_manager, text_sink, codec='utf-8', laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    for page in PDFPage.create_pages(document):
        page_interpreter.process_page(page)

    # Everything is stored here; the text still needs to be cleaned up.
    return text_sink.getvalue()
def setup(path):
    """
    Print the dialogue text found in the PDF at ``path``.

    The title page (index 0) is skipped. On the remaining pages every
    ``LTTextBox`` / ``LTTextLine`` whose left edge (``bbox[0]``) lies strictly
    between ``DIALOGUE_BBOX_MIN`` and ``DIALOGUE_BBOX_MAX`` is NFKD-normalised,
    reduced to ASCII (other characters dropped) and printed.

    Uses the module-level ``rsrcmgr`` and ``laparams`` objects.
    Raises ``PDFTextExtractionNotAllowed`` if the document forbids extraction.
    """
    # The handle is closed on every exit path (it used to be leaked).
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            if i == 0:
                # Skip the title page.
                continue
            interpreter.process_page(page)
            layout = device.get_result()
            for obj in layout:
                # Only LTTextBox and LTTextLine items carry dialogue text.
                if not isinstance(obj, (LTTextBox, LTTextLine)):
                    continue
                # Only keep text segments within the dialogue margin range.
                if DIALOGUE_BBOX_MIN < obj.bbox[0] < DIALOGUE_BBOX_MAX:
                    # Unicode characters need converting to plain ASCII.
                    converted = unicodedata.normalize(
                        'NFKD', obj.get_text()).encode('ascii', 'ignore')
                    print(converted)
def pdf_to_txt(in_file):
    """
    Turn a PDF file into a TXT file (roughly processed).

    The text of every ``LTTextBoxHorizontal`` is utf-8 encoded and appended,
    followed by a newline, to a file named like ``in_file`` with its last
    three characters replaced by ``'txt'``. The output is opened in append
    mode, so existing content is kept.

    Returns ``None``.
    Raises ``PDFTextExtractionNotAllowed`` if the document forbids extraction.
    """
    # Naming rule unchanged: swap the last three characters for 'txt'.
    out_file = in_file[:-3] + 'txt'
    with open(in_file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for the page.
            layout = device.get_result()
            chunks = [box.get_text().encode('utf-8')
                      for box in layout
                      if isinstance(box, LTTextBoxHorizontal)]
            if not chunks:
                # Nothing to write: do not create or touch the output file.
                continue
            # One open per page instead of one per text box.
            with open(out_file, 'a') as dst_file:
                for chunk in chunks:
                    dst_file.write(chunk + '\n')
    return None
def readPdf(file):
    """
    Lay out the first two pages of the PDF at path ``file``.

    Layout analysis uses ``LAParams(line_margin=0.1)``.

    Returns a list with the LTPage object of each processed page (at most
    two). Raises ``PDFTextExtractionNotAllowed`` if the document forbids
    extraction.
    """
    pages = []
    # Close the handle once the pages are processed (it used to be leaked).
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(line_margin=0.1)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # islice stops the page generator after the second page.
        for page in islice(PDFPage.create_pages(document), 2):
            interpreter.process_page(page)
            # Receive the LTPage object for the page.
            pages.append(device.get_result())
    return pages
def convert_pdf_table(pdf_file):
    """
    Convert the tables of the PDF at path ``pdf_file`` into row dictionaries.

    Each page layout is passed to ``tabulate_page``; the first row of its
    result is treated as the header and every following row becomes a dict
    mapping the lower-cased header cell to the row cell. Cells equal to ``''``
    are left out. Pages whose table is empty are skipped.

    Returns the list of row dicts from all pages, in page order.
    Raises ``PDFTextExtractionNotAllowed`` if the document forbids extraction.
    """
    table = []
    # A separate name for the handle keeps the path parameter intact and lets
    # the file be closed reliably.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            page_table = tabulate_page(layout)
            if len(page_table) == 0:
                # No header row to index; previously this raised IndexError.
                continue
            header = page_table[0]
            for row in page_table[1:]:
                table.append({
                    header[item].lower(): detail
                    for item, detail in enumerate(row)
                    if detail != ''
                })
    return table
def parsepdf(filename): fp = open(filename, 'rb') parser = PDFParser(fp) # Create a PDF document object that stores the document structure. # Supply the password for initialization. document = PDFDocument(parser) # Check if the document allows text extraction. If not, abort. if not document.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() laparams = LAParams() # Create a PDF device object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. found_randers = False found_aarhus = False _randers = [] headings = [u'Ledige lejligheder\n',u'afd. adresse\n',u'rum m2\n',u'leje \n', u'a\xb4c varme a\xb4c vand\n',u'indskud\n',u'ledig pr.\n',u'bem\xe6rkning\n' ] location_map = OrderedDict() header_ycord = [] for page in PDFPage.create_pages(document): interpreter.process_page(page) layout = device.get_result() for obj in layout._objs: # print obj if isinstance(obj,LTTextBoxHorizontal): for o in obj._objs: y0 = o.y0 # print o if isinstance(o,LTTextLineHorizontal) and obj.get_text() not in headings: if y0 not in header_ycord: if y0 in location_map : objs = location_map.get(y0) else: objs = [] string_val = o.get_text().encode('ascii', 'ignore') string_val = string_val.replace('\n','') objs.append(string_val) location_map.__setitem__(y0,objs) else : if y0 not in header_ycord: header_ycord.append(y0) for key in location_map: print '**************************' # # print key print location_map.get(key) print '**************************' print 'Total Rowss = %s'%len(location_map)
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
            codec=None, extractdir=None):
    """
    Dump parts of the PDF ``fname`` as XML to ``outfp``.

    * ``objids``: each listed object is fetched with ``doc.getobj`` and dumped.
    * ``pagenos``: for each 0-based page index contained in it, either the
      page's content streams (when ``codec`` is set) or its attributes are
      dumped.
    * ``dumpall``: all objects are dumped via ``dumpallobjs``.
    * If none of the three is given, the trailers are dumped.

    A trailing newline is written unless ``codec`` is ``'raw'`` or
    ``'binary'``. ``extractdir`` is accepted but not used here.
    """
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        # Close the input even when one of the dump helpers raises.
        fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def read_fields(pdffile):
    """
    List the AcroForm fields of the PDF at ``pdffile``.

    Returns a list of ``(name, default, pageno, rect, field_type)`` tuples:
    * ``pageno`` is the 1-based number of the page referenced by the field's
      ``P`` entry, or 1 when the field has no ``P`` entry;
    * ``default`` is ``"Yes"``/``"No"`` for ``/Btn`` fields (``"Yes"`` only
      when the value equals ``'/Yes'``), ``'${ user.signature }'`` for
      ``/Sig`` fields, and otherwise the field value or ``word("something")``
      when there is no value.

    Each field's name and type are reported through ``logmessage``.
    """
    outfields = list()
    # The handle stays open while fields are resolved and is closed after.
    with open(pdffile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # Map page object ids to 1-based page numbers.
        id_to_page = dict()
        for pageno, page in enumerate(PDFPage.create_pages(doc), 1):
            id_to_page[page.pageid] = pageno
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        for i in fields:
            field = resolve1(i)
            name = field.get('T')
            value = field.get('V')
            rect = field.get('Rect')
            page = field.get('P')
            field_type = field.get('FT')
            logmessage("name is " + str(name) + " and FT is |" + str(field_type) + "|")
            if page is not None:
                pageno = id_to_page[page.objid]
            else:
                pageno = 1
            if str(field_type) == '/Btn':
                default = "Yes" if value == '/Yes' else "No"
            elif str(field_type) == '/Sig':
                default = '${ user.signature }'
            elif value is not None:
                default = value
            else:
                default = word("something")
            outfields.append((name, default, pageno, rect, field_type))
    return outfields
def pdf_to_text(page_object):
    """
    Return the text of every ``LTTextBox`` / ``LTTextLine`` in a PDF.

    ``page_object`` is handed straight to ``PDFParser``. The result is a flat
    list of strings in page order, one entry per matching layout item.
    """
    parser = PDFParser(page_object)
    doc = PDFDocument(parser)
    # Connect the parser and document objects, then initialise with an empty
    # password.
    parser.set_document(doc)
    doc.initialize('')

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    text_content = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # device.get_result() is the LTPage object for the page just rendered.
        text_content.extend(
            item.get_text()
            for item in device.get_result()
            if isinstance(item, (LTTextBox, LTTextLine)))
    return text_content
def dumppdf(fname, objids, pagenos, password='', dumpall=False, codec=None,
            extractdir=None):
    """
    Dump parts of the PDF ``fname`` and return the result as one string.

    * ``objids``: each listed object is fetched with ``doc.getobj`` and dumped.
    * ``pagenos``: for each 0-based page index contained in it, either the
      page's content streams (when ``codec`` is set) or its attributes are
      dumped.
    * ``dumpall``: all objects are dumped via ``dumpallobjs``.
    * If none of the three is given, the trailers are dumped.

    A trailing newline is appended unless ``codec`` is ``'raw'`` or
    ``'binary'``. ``extractdir`` is accepted but not used here.
    """
    # Pieces are collected and joined once instead of repeated string +=.
    parts = []
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                parts.append(dumpxml(obj, codec=codec))
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        parts.append(dumpxml(obj, codec=codec))
                else:
                    parts.append(dumpxml(page.attrs))
        if dumpall:
            parts.append(dumpallobjs(doc, codec=codec))
        if (not objids) and (not pagenos) and (not dumpall):
            parts.append(dumptrailers(doc))
    finally:
        # Close the input even when one of the dump helpers raises.
        fp.close()
    if codec not in ('raw', 'binary'):
        parts.append('\n')
    return "".join(parts)
def __init__(self, file, password='', just_text=1, check_extractable=True,
             char_margin=1.0, line_margin=0.1, word_margin=0.1):
    """
    Parse ``file`` and append the result of processing each page to ``self``.

    The document is built through one of two construction paths selected by
    ``PYTHON_3``. Pages are processed only when ``check_extractable`` is
    false or the document reports itself extractable; in that case
    ``self.metadata`` is set from ``doc.info``. When ``just_text`` is truthy
    ``self._cleanup()`` is called at the end.
    """
    self.parser = PDFParser(file)
    self.laparams = LAParams(char_margin=char_margin,
                             line_margin=line_margin,
                             word_margin=word_margin)

    if not PYTHON_3:
        self.doc = PDFDocument(self.parser, password)
    else:
        # Two-step wiring of parser and document, then explicit initialise.
        self.doc = PDFDocument()
        self.parser.set_document(self.doc)
        self.doc.set_parser(self.parser)
        self.doc.initialize(password)

    may_extract = (not check_extractable) or self.doc.is_extractable
    if may_extract:
        self.resmgr = PDFResourceManager()
        self.device = TextConverter(
            self.resmgr, outfp=StringIO(), laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(self.resmgr, self.device)
        page_generator = (self.doc.get_pages() if PYTHON_3
                          else PDFPage.create_pages(self.doc))
        for page in page_generator:
            self.append(self.interpreter.process_page(page))
        self.metadata = self.doc.info

    if just_text:
        self._cleanup()
def generateFileContent(self):
    """
    Build the abbreviation dictionary text from the downloaded PDF.

    The PDF is downloaded to a temporary file and laid out page by page.
    Layout objects other than ``LTRect`` / ``LTCurve`` are grouped with
    ``layout.group_objects``; for every group whose text contains ``":"`` the
    part before the first colon (stripped, with ``".."`` replaced by ``"."``)
    is collected as an entry.

    Returns a unicode string: a three-line header followed by the
    concatenated output of ``formatEntriesForDictionary(entries,
    u"abreviatura")``.
    """
    import tempfile
    import urllib

    abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"
    entries = set()
    # The same analysis parameters are used for every page.
    params = LAParams()

    # Both the temporary file and the read handle are closed when done.
    with tempfile.NamedTemporaryFile() as temporaryFile:
        urllib.urlretrieve(abbreviationsPdfUrl, temporaryFile.name)
        with open(temporaryFile.name, "rb") as fileObject:
            parser = PDFParser(fileObject)
            document = PDFDocument(parser)
            resourceManager = PDFResourceManager()
            device = PDFPageAggregator(resourceManager)
            interpreter = PDFPageInterpreter(resourceManager, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                objects = [item for item in layout
                           if not isinstance(item, (LTRect, LTCurve))]
                for line in layout.group_objects(params, objects):
                    text = line.get_text()
                    if u":" in text:
                        entry = text.split(u":")[0].strip()
                        entries.add(entry.replace(u"..", "."))

    dictionary = u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n"
    dictionary += u"# http://www.realacademiagalega.org/abreviaturas\n"
    dictionary += u"\n"
    dictionary += u"".join(formatEntriesForDictionary(entries, u"abreviatura"))
    return dictionary
def main(argv):
    """
    Convert the PDF named by ``sys.argv[1]`` to XHTML in ``test.xhtml``.

    Note that the input path is read from ``sys.argv``; the ``argv``
    parameter itself is not consulted. Conversion uses an empty password,
    the utf-8 codec, caching enabled and ``LAArticle`` layout parameters.
    Returns ``None``.
    """
    infile = sys.argv[1]
    outfile = 'test.xhtml'
    password = ''
    codec = 'utf-8'
    caching = True
    # Both files are closed even if parsing or conversion raises.
    with open(infile, 'rb') as fp, open(outfile, 'w') as outfp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password=password, caching=caching)
        rsrcmgr = PDFResourceManager(caching=caching)
        device = XHTMLConverter(rsrcmgr, outfp, codec=codec,
                                laparams=LAArticle(), document=document)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
        # Close the converter while its output file is still open.
        device.close()
    return
def getPDFText(path): ''' Takes in PDF files and converts to Python human-readable Python data. Input: path -> full path to PDF file Output: String representation of parsed data ''' retstr = StringIO() parser = PDFParser(open(path, 'r')) try: document = PDFDocument(parser) except Exception: print path + 'is not a readable pdf' return '' if document.is_extractable: rsrcmgr = PDFResourceManager() device = TextConverter(rsrcmgr, retstr, codec = 'ascii', laparams = LAParams()) interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.create_pages(document): interpreter.process_page(page) return retstr.getvalue() else: print path, "Warning: could not extract text from PDF file." return ''
def analyze_pages(file_name): ''' Input: the file path to the PDF file Output: yields the layout object for each page in the PDF ''' # Open a PDF file. with open(os.path.realpath(file_name), 'rb') as fp: # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. # Supply the password for initialization. document = PDFDocument(parser, password = '') # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() # Set parameters for analysis. laparams = LAParams(char_margin = 2.0, word_margin = 0.1, detect_vertical = True) # Create a PDF page aggregator object. device = CustomPDFPageAggregator(rsrcmgr, laparams = laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. for page_num, page in enumerate(PDFPage.create_pages(document)): try: interpreter.process_page(page) except OverflowError as oe: print oe, ', skipping page', page_num, 'of', file_name traceback.print_exc() continue layout = device.get_result() yield layout
def main(): # Open a PDF file. with open('/home/chris/Documents/Literature/DFT Primer.pdf', 'rb') as fp: # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. # Supply the password for initialization. document = PDFDocument(parser) # Check if the document allows text extraction. If not, abort. if not document.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() print rsrcmgr # Create a PDF device object. device = PDFDevice(rsrcmgr) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. for page in PDFPage.create_pages(document): print interpreter.process_page(page) outlines = document.get_outlines() for (level,title,dest,a,se) in outlines: print (level, title) return 0
def parse_pages(pdf_buffer, password):
    """
    With a PDF buffer object, get the pages, parse each one, and return the
    entire pdf text.

    Returns a list with one entry per page: the value of
    ``parse_lt_objects`` applied to that page's layout objects.
    """
    document = PDFDocument(PDFParser(pdf_buffer), password)
    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(manager, aggregator)

    def render(page):
        """Process one page and return its LTPage layout."""
        page_interpreter.process_page(page)
        return aggregator.get_result()

    # The layout is an LTPage whose children may be LTTextBox, LTFigure,
    # LTImage, etc.; they are passed on through its _objs list.
    return [
        parse_lt_objects(render(page)._objs)  # pylint: disable=protected-access
        for page in PDFPage.create_pages(document)
    ]
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    The document is read with pdfminer. Entries of the last ``doc.info``
    dictionary (keys lower-cased and stripped, ``pages`` excluded) are copied
    into the result via ``safe_text``. Each page layout is converted by
    ``_convert_page(layout, languages)`` and appended to ``result['pages']``.

    If the document is not extractable a warning is logged and the result is
    returned with an empty ``pages`` list.

    Returns a dict with a ``'pages'`` list plus any metadata keys.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')
            result = {'pages': []}
            if len(doc.info):
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    if k != 'pages':
                        result[k] = safe_text(v)
            if not doc.is_extractable:
                log.warning("PDF not extractable: %s", path)
                return result
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                result['pages'].append(_convert_page(layout, languages))
            return result
        finally:
            # Previously skipped on the early return above.
            device.close()
def parsePDF(url): # Open the url provided as an argument to the function and read the content open = urllib2.urlopen(Request(url)).read() # Cast to StringIO object from StringIO import StringIO memory_file = StringIO(open) # Create a PDF parser object associated with the StringIO object parser = PDFParser(memory_file) # Create a PDF document object that stores the document structure document = PDFDocument(parser) # Define parameters to the PDF device objet rsrcmgr = PDFResourceManager() retstr = StringIO() laparams = LAParams() codec = 'utf-8' # Create a PDF device object device = TextConverter(rsrcmgr, retstr, codec = codec, laparams = laparams) # Create a PDF interpreter object interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document for page in PDFPage.create_pages(document): interpreter.process_page(page) data = retstr.getvalue() print type(data) sys.exit()
def parsePDF(pdf_file): pdf_file = open(pdf_file, "r").read() # Cast to StringIO object from StringIO import StringIO memory_file = StringIO(pdf_file) # Create a PDF parser object associated with the StringIO object parser = PDFParser(memory_file) # Create a PDF document object that stores the document structure document = PDFDocument(parser) # Define parameters to the PDF device objet rsrcmgr = PDFResourceManager() retstr = StringIO() laparams = LAParams() codec = "utf-8" # Create a PDF device object device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) # Create a PDF interpreter object interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document for page in PDFPage.create_pages(document): interpreter.process_page(page) data = retstr.getvalue() print data break
def Layout(): # Set parameters for analysis. with open('/home/chris/Documents/Literature/Donghun_ACSNano_2014', 'rb') as fp: # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. # Supply the password for initialization. document = PDFDocument(parser) # Check if the document allows text extraction. If not, abort. if not document.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() print rsrcmgr laparams = LAParams() # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.create_pages(document): interpreter.process_page(page) # receive the LTPage object for the page. layout = device.get_result() return layout
def parse_page(self):
    """
    Lay out the pages of ``self.document`` and pass each page's layout
    objects to ``self.parse_layout_objects(objs, page_number)``.

    Page numbers are 1-based. When ``self.pages`` is truthy only the first
    ``self.pages`` pages are handled.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for i, page in enumerate(PDFPage.create_pages(self.document)):
        # Check the limit before rendering, so the first page past the limit
        # is no longer processed only to be discarded.
        if self.pages and i == self.pages:
            break
        # Read the page into a layout object.
        interpreter.process_page(page)
        layout = device.get_result()
        self.parse_layout_objects(layout._objs, (i + 1))
def iter_tables(fh, x_comb=None, y_comb=None, hints=None):
    """
    Iterate over the tables in a document. See get_tables for the non-iter
    version.

    :param x_comb: Specify x_comb and y_comb to override the automatic comb
        creation.
    :param y_comb:
    :param hints: strings to search for to determine the y limits of the
        page. Defaults to an empty list.

    Yields ``(Table(table, page_number, -1, 1, 1), diag)`` for every page
    accepted by ``page_contains_tables``; ``page_number`` is 1-based.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if hints is None:
        hints = []
    doc, interpreter, device = initialize_pdf_miner(fh)
    pdf_iter = PDFPage.create_pages(doc)
    for i, pdf_page in enumerate(pdf_iter):
        interpreter.process_page(pdf_page)
        # Receive the LTPage object for the page.
        processed_page = device.get_result()
        if not page_contains_tables(processed_page, device):
            continue
        (table, diag) = page_to_tables(
            processed_page,
            extend_y=True,
            hints=hints,
            atomise=True,
            x_comb=x_comb,
            y_comb=y_comb)
        crop_table(table)
        yield Table(table, i + 1, -1, 1, 1), diag
def fix_text(self, filename):
    """
    Return the text of the PDF at ``filename``.

    ``self.password`` is supplied to the document when it is truthy and
    ``self.codec`` is passed to the ``TextConverter``.
    Raises ``PDFTextExtractionNotAllowed`` if the document forbids
    extraction.
    """
    pdfText = StringIO()
    # The handle is closed on every exit path (it used to be leaked).
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        if not self.password:
            document = PDFDocument(parser)
        else:
            document = PDFDocument(parser, self.password)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, pdfText, codec=self.codec,
                               laparams=LAParams(), imagewriter=None)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
    return pdfText.getvalue()
def parse_pdf(fname):
    """
    Return the non-empty text blocks of the PDF at ``fname``.

    The text of every ``LTTextBoxHorizontal`` is stripped; blocks that are
    empty after stripping are dropped. The result is a list of strings in
    document order.
    Raises ``PDFTextExtractionNotAllowed`` if the document forbids
    extraction.
    """
    contents = []
    # The handle is closed on every exit path (it used to be leaked).
    with open(fname, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check whether the file allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Parameters for layout analysis.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    content = x.get_text().strip()
                    if content:
                        contents.append(content)
    return contents
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    Metadata from the last ``doc.info`` dictionary is copied into the result
    (keys lower-cased and stripped; the ``pages`` key, ``None`` values and
    values containing ``"<PDFObjRef:"`` are left out). Every page is
    converted by ``_convert_page`` and appended to ``result["pages"]``.

    If pdfminer hits an unexpected end of file (``PSEOF``) the event is
    logged and whatever was collected so far is returned.
    """
    with open(path, "rb") as fh:
        result = {"pages": []}
        try:
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            doc = PDFDocument(PDFParser(fh), "")

            if len(doc.info):
                for key, raw in doc.info[-1].items():
                    key = key.lower().strip()
                    value = string_value(raw)
                    if key == "pages" or value is None or "<PDFObjRef:" in value:
                        continue
                    result[key] = string_value(value)

            for number, page in enumerate(PDFPage.create_pages(doc), 1):
                converted = _convert_page(
                    interpreter, page, device, number, path, languages)
                result["pages"].append(converted)
            device.close()
        except PSEOF as eof:
            log.info("Unexpected EOF: %r", eof)
        return result
def parsePDF(filename):
    """
    Parse every page of the PDF at ``filename`` with ``parsePage``.

    Returns a list with the value of
    ``parsePage(page, interpreter, device, filename)`` for each page, in
    page order.
    Raises ``PDFTextExtractionNotAllowed`` if the document forbids
    extraction.
    """
    result = []
    # The handle is closed on every exit path (it used to be leaked).
    with open(filename, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check whether the file allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Parameters for layout analysis.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page.
        for page in PDFPage.create_pages(document):
            result.append(parsePage(page, interpreter, device, filename))
    return result
def parse_pdf(self, fp):
    """
    Process every page of the PDF read from ``fp`` and hand pages that carry
    annotations to ``self.parse_annotations(pgnum, page)``.

    ``pgnum`` is the 0-based page index. Pages are interpreted with a bare
    ``PDFDevice``.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # The page aggregator that used to be built here was overwritten before
    # use, so only the device actually used is created.
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        if page.annots:
            self.parse_annotations(pgnum, page)
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None, extractdir=None):
    """
    Write the outline (bookmarks) of the PDF ``fname`` to ``outfp`` as XML.

    For each outline entry an ``<outline>`` element with its level and title
    is written, followed by a ``<dest>`` dump when a destination is known
    and a ``<pageno>`` (1-based) when the destination resolves to a page.
    A destination is taken from the entry itself or from a ``GoTo`` action.
    Documents without outlines produce no output.

    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    accepted but not used here.
    """
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    try:
        doc = PDFDocument(parser, password)
        # Map page object ids to 1-based page numbers.
        pages = {page.pageid: pageno for (pageno, page)
                 in enumerate(PDFPage.create_pages(doc), 1)}

        def resolve_dest(dest):
            """Resolve a named or indirect destination to its final value."""
            if isinstance(dest, str):
                dest = resolve1(doc.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(doc.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            if isinstance(dest, PDFObjRef):
                dest = dest.resolve()
            return dest

        try:
            outlines = doc.get_outlines()
            outfp.write('<outlines>\n')
            for (level, title, dest, a, se) in outlines:
                pageno = None
                if dest:
                    dest = resolve_dest(dest)
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                                'D'):
                            dest = resolve_dest(action['D'])
                            pageno = pages[dest[0].objid]
                s = e(title).encode('utf-8', 'xmlcharrefreplace')
                outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
                if dest is not None:
                    outfp.write('<dest>')
                    dumpxml(outfp, dest)
                    outfp.write('</dest>\n')
                if pageno is not None:
                    outfp.write('<pageno>%r</pageno>\n' % pageno)
                outfp.write('</outline>\n')
            outfp.write('</outlines>\n')
        except PDFNoOutlines:
            pass
    finally:
        # Release parser and file even when an unexpected error escapes.
        parser.close()
        fp.close()
    return
def extract_text_from_pdf(fobj):
    """
    Return the text of all pages of the PDF read from ``fobj``.

    Pages are converted in order with a ``TextConverter`` using default
    ``LAParams``; the output of all pages is returned as one string.
    """
    parser = PDFParser(fobj)
    doc = PDFDocument(parser)
    # One resource manager, converter and output buffer are shared by all
    # pages instead of being rebuilt for each page; the page texts still end
    # up concatenated in page order.
    rsrcmgr = PDFResourceManager()
    result = io.StringIO()
    device = TextConverter(rsrcmgr, result, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
    return result.getvalue()
def parse(DataIO, save_path, paperID):
    '''
    Parse a PDF and append its text to the target path.

    The text of every ``LTTextBoxHorizontal`` is appended to ``save_path``,
    followed by a newline.

    * If the data is not a valid PDF (``PDFSyntaxError``), ``paperID`` is
      appended to ``data/nonpdfdoc.txt`` and the function returns ``None``.
    * If the document forbids extraction, ``paperID`` is appended to
      ``data/parseFailed.txt`` and ``PDFTextExtractionNotAllowed`` is raised.
    * If writing a text box fails, ``paperID`` is appended to
      ``data/parseFailed.txt``, "Failed" is printed and parsing continues.
    '''
    def record(log_path):
        """Append paperID as one line to the given log file."""
        with open(log_path, 'a') as f:
            f.write(paperID)
            f.write('\n')

    # Create the pdf parser and document.
    parser = PDFParser(DataIO)
    try:
        doc = PDFDocument(parser)
    except PDFSyntaxError:
        print("can't parse this file!")
        record('data/nonpdfdoc.txt')
        return
    # Link document and parser.
    parser.set_document(doc)
    # Check if the document can be converted to text.
    if not doc.is_extractable:
        print("Can't Parse this File! Ignore it and keep parsing")
        record('data/parseFailed.txt')
        raise PDFTextExtractionNotAllowed

    rsrcmagr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmagr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmagr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # Get the LTPage object for this page.
        layout = device.get_result()
        for x in layout:
            try:
                if isinstance(x, LTTextBoxHorizontal):
                    with open('%s' % save_path, 'a') as f:
                        f.write(x.get_text() + '\n')
            except Exception:
                # Best effort per text box; Exception (not a bare except)
                # lets KeyboardInterrupt / SystemExit through.
                record('data/parseFailed.txt')
                print("Failed")
def read_pdf(pdf_path):
    """Read a PDF and return its page count and horizontal-box text.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A ``(page_count, content)`` tuple, where ``content`` is the
        concatenated text of every ``LTTextBoxHorizontal`` in page order.
        Returns ``None`` when the document does not allow text extraction
        or when ``PdfReadError`` is raised (an error dialog is shown in
        both cases).
    """
    try:
        # The context manager closes the file on every exit path.
        with open(pdf_path, 'rb') as fp:
            # Create a parser for the file and the document built on it.
            parser = PDFParser(fp)
            doc = PDFDocument(parser=parser)
            # Link the parser and the document.
            parser.set_document(doc)

            # Documents that forbid text extraction are rejected here
            # (OCR would be the alternative for such files).
            if not doc.is_extractable:
                messagebox.showerror(
                    message='无法解析 PDF 文件 {},请重新选择。'.format(pdf_path))
                return

            # Resource manager, layout-analysing device and interpreter.
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            page_count = 0
            chunks = []
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # ``layout`` is the LTPage for this page; it may contain
                # LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ...
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        result = x.get_text()
                        chunks.append(result)
                        print(result)
                page_count += 1
            return page_count, ''.join(chunks)
    except PdfReadError:
        # NOTE(review): PdfReadError looks like a leftover from an earlier
        # PyPDF2-based implementation; confirm whether pdfminer errors
        # should be handled here as well.
        messagebox.showerror(message='{}文件已加密或损坏,请重新选择。'.format(pdf_path))
        traceback.print_exc()
def parsepdf(self):
    """Run layout analysis on every page of ``self.filename``.

    Each page's layout objects are passed to ``self.parse_page`` together
    with the 1-based page number.

    Returns:
        ``self.word_array`` (presumably filled in by ``parse_page`` —
        confirm against that method).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # The context manager closes the file even when extraction is refused
    # or a page fails to parse.
    with open(self.filename, 'rb') as fp:
        parser = PDFParser(fp)
        # A password could be supplied as the second argument.
        document = PDFDocument(parser)
        if not document.is_extractable:
            print('extraction not allowed')
            raise PDFTextExtractionNotAllowed

        # Shared resources plus a layout-analysing aggregator device.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_number, page in enumerate(PDFPage.create_pages(document), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            # Extract text from the page's layout objects.
            self.parse_page(layout._objs, page_number)
    return self.word_array
#test = PDFpos("FinancialAccounting1.pdf")
#test.parsepdf()
def get_page_layout(
    filename,
    char_margin=1.0,
    line_margin=0.5,
    word_margin=0.1,
    detect_vertical=True,
    all_texts=True,
):
    """Return the PDFMiner layout and the dimensions of a one-page PDF.

    The margin and detection keyword arguments are forwarded unchanged to
    PDFMiner's ``LAParams``; see https://euske.github.io/pdfminer/ for
    their definitions.

    Parameters
    ----------
    filename : string
        Path to pdf file.
    char_margin : float
    line_margin : float
    word_margin : float
    detect_vertical : bool
    all_texts : bool

    Returns
    -------
    layout : object
        PDFMiner LTPage object (of the last page processed).
    dim : tuple
        Dimension of pdf page in the form (width, height).

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document does not permit text extraction.

    """
    with open(filename, "rb") as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        layout_params = LAParams(
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
            detect_vertical=detect_vertical,
            all_texts=all_texts,
        )
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
            layout = aggregator.get_result()
            # bbox[2] and bbox[3] are used as the page width and height.
            dim = (layout.bbox[2], layout.bbox[3])
        return layout, dim
def convert(self, source_path: str = None) -> None:
    """Parse source PDF into entities which can be used for text searches,
    for example.

    This is also used inside other PDF keywords. Every page of the active
    document is run through the converter device, and the converted
    document is stored back on the context and flagged as converted.

    **Examples**

    **Robot Framework**

    .. code-block:: robotframework

        ***Settings***
        Library    RPA.PDF

        ***Tasks***
        Example Keyword
            Convert    /tmp/sample.pdf

    **Python**

    .. code-block:: python

        from RPA.PDF import PDF

        pdf = PDF()

        def example_keyword():
            pdf.convert("/tmp/sample.pdf")

    :param source_path: source PDF filepath.
    """
    self.ctx.switch_to_pdf(source_path)
    pdf_parser = PDFParser(self.ctx.active_pdf_document.fileobject)
    pdf_document = PDFDocument(pdf_parser)
    page_stream = PDFPage.create_pages(pdf_document)

    resource_manager = PDFResourceManager()
    layout_params = pdfminer.layout.LAParams(
        detect_vertical=True,
        all_texts=True,
    )
    converter = Converter(
        self.ctx.active_pdf_document, resource_manager, laparams=layout_params
    )
    page_interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
        resource_manager, converter
    )

    # Feed every page (and its nested objects) through the converter.
    for page in page_stream:
        page_interpreter.process_page(page)

    self.ctx.active_pdf_document = converter.close()
    self.ctx.active_pdf_document.is_converted = True
def parse(in_path, out_path, start_num, end_num):
    """Append the horizontal text boxes of selected PDF pages to a text file.

    Pages are numbered from 1. The selection is:

    * ``start_num == 0`` and ``end_num == 0``: every page;
    * ``start_num != 0`` and ``end_num == 0``: only page ``start_num``;
    * otherwise: pages ``start_num`` through ``end_num`` inclusive (so
      ``start_num == 0`` with a non-zero ``end_num`` means "up to
      ``end_num``"; this combination previously failed with an unbound
      local variable).

    Args:
        in_path: Path of the PDF to read.
        out_path: Path of the UTF-8 text file that text is appended to.
        start_num: First page to extract, or 0.
        end_num: Last page to extract, or 0.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    if start_num == 0 and end_num == 0:
        first_page = last_page = None  # no restriction
    elif end_num == 0:
        first_page = last_page = start_num
    else:
        first_page, last_page = start_num, end_num

    # The context manager closes the input on every exit path.
    with open(in_path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument(praser)
        # Link the parser and the document.
        praser.set_document(doc)

        # Documents that do not allow text conversion are rejected.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_no, page in enumerate(PDFPage.create_pages(doc), 1):
            if first_page is not None:
                if page_no < first_page:
                    continue
                if page_no > last_page:
                    break
            interpreter.process_page(page)
            # ``layout`` is the LTPage for this page; it may contain
            # LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ...
            layout = device.get_result()
            texts = [x.get_text() for x in layout
                     if isinstance(x, LTTextBoxHorizontal)]
            if texts:
                # Open the output once per page rather than once per box.
                # The explicit encoding avoids garbled non-ASCII output.
                with open(out_path, 'a', encoding='utf-8') as out_txt:
                    for results in texts:
                        out_txt.write(results + '\n')
def build_frequency_matrix(my_pdf, arr_stemmed_ontology, stemmer):
    """Count ontology-term occurrences on every page of a PDF.

    Args:
        my_pdf: Path of the PDF file.
        arr_stemmed_ontology: Mapping from a word to the list of its
            stemmed synonyms. Each synonym is used as a regular-expression
            pattern by ``re.findall``.
        stemmer: Passed through to ``process_pdf`` together with each
            page's raw text.

    Returns:
        ``(frequency_matrix, doc_text)`` where ``frequency_matrix[p][k]`` is
        the total number of matches of the k-th ontology entry's synonyms
        on page ``p`` (0-based, in the mapping's iteration order) and
        ``doc_text`` is the concatenation of all processed page texts.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    frequency_matrix = []
    page_texts = []

    # The context manager closes the PDF even if extraction is refused or
    # a page fails to parse.
    with open(my_pdf, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            raw_text = "".join(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
            # Process the text (the original comment says: stem all words).
            page_text = process_pdf(raw_text, stemmer)
            page_texts.append(page_text)

            # One row per page, one column per ontology entry: the summed
            # match count of all of that entry's synonyms on this page.
            frequency_matrix.append([
                sum(len(re.findall(syn, page_text)) for syn in synonym_list)
                for synonym_list in arr_stemmed_ontology.values()
            ])

    return frequency_matrix, "".join(page_texts)
def mine_pdf(fp):
    """Extract a PDF's text as one lower-cased, de-hyphenated string.

    Args:
        fp: Path of the PDF file.

    Returns:
        The extracted lines joined together: each line is lower-cased, an
        empty line becomes a single space, a line ending in ``-`` loses
        that hyphen and is joined directly to the next line, and every
        other line is followed by a space.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    print('mining pdf')
    with open(Path(fp), 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        resources = PDFResourceManager()
        buffer = StringIO()
        # Spacing parameters for parsing, see
        # https://github.com/obeattie/pdfminer/wiki/pdfminer.layout
        layout_params = LAParams(char_margin=4.0, word_margin=0)
        converter = TextConverter(
            resources, buffer, codec='utf-8', laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)

    raw_lines = buffer.getvalue().splitlines()
    print('iterate over lines')
    pieces = []
    for raw in raw_lines:
        lowered = raw.lower()
        if lowered == '':
            pieces.append(' ')
        elif lowered[-1] == '-':
            # Drop the trailing hyphen so the word continues on the next line.
            pieces.append(lowered[:-1])
        else:
            pieces.append(lowered + ' ')
    return ''.join(pieces)
def hash_from_file(cls, file: pathlib.Path) -> str:
    """Return the TLSH hash of the text extracted from a PDF file.

    Args:
        file: Path of the file to hash. Its string form must end in
            ``.pdf`` (case-sensitive).

    Returns:
        ``str(tlsh.hash(...))`` of the UTF-8 encoded extracted text, or an
        empty string (after emitting a ``UserWarning``) when the path does
        not end in ``.pdf``.
    """
    if str(file).endswith(".pdf"):
        extracted = StringIO()
        with open(file, "rb") as pdf_file:
            document = PDFDocument(PDFParser(pdf_file))
            resources = PDFResourceManager()
            converter = TextConverter(resources, extracted, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resources, converter)
            for page in PDFPage.create_pages(document):
                page_interpreter.process_page(page)
        return str(tlsh.hash(extracted.getvalue().encode()))

    warnings.warn("File does not appear to be a pdf. ", category=UserWarning)
    return ""
def __init__(self, document, laparams=None):
    """Set up page-by-page layout analysis of ``document``.

    Args:
        document: The PDF document whose pages are iterated.
        laparams: Layout-analysis parameters; a default ``LAParams()`` is
            used when omitted.

    The first page is fetched immediately, so ``next`` raises
    ``StopIteration`` here if the document yields no pages.
    """
    layout_params = LAParams() if laparams is None else laparams

    self._page_iterator = iter(PDFPage.create_pages(document))
    self._page = next(self._page_iterator)

    resources = PDFResourceManager()
    self.device = PDFPageAggregator(resources, laparams=layout_params)
    self.interpreter = PDFPageInterpreter(resources, self.device)

    self._lines = None
    self.i_page = 1  # page counter starts at 1 for the page fetched above
    self.is_at_end = False
def pdf2text(path):
    """Return the text content of the PDF at ``path``.

    Based on the composable-components example at
    https://pdfminersix.readthedocs.io/en/latest/tutorial/composable.html

    Args:
        path: Path of the PDF file.

    Returns:
        All text written by ``TextConverter`` for the document's pages.
    """
    buffer = StringIO()
    with open(path, "rb") as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resources = PDFResourceManager()
        converter = TextConverter(resources, buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
    return buffer.getvalue()
def read_fields(pdffile):
    """Return the form fields of a PDF, sorted with ``fieldsorter``.

    Args:
        pdffile: Path of the PDF file.

    Returns:
        The list filled in by ``recursively_add_fields`` from the
        document's ``AcroForm`` ``Fields`` entry, sorted with
        ``key=fieldsorter``; an empty list when the catalog has no
        ``AcroForm``.
    """
    outfields = []
    # Everything that touches the document stays inside the ``with`` block
    # so the file is still open while objects are resolved, and is closed
    # on every exit path (including the early return).
    with open(pdffile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # Map each page's object id to its 1-based page number.
        id_to_page = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(doc), 1)
        }
        if 'AcroForm' not in doc.catalog:
            return []
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
def iterate_pages(
        pdf_fn: str,
        use_advanced_detection: bool = False) -> Generator[LTPage, None, None]:
    """Yield the analysed layout of each page of a PDF, in order.

    Args:
        pdf_fn: Path of the PDF file.
        use_advanced_detection: When false, ``boxes_flow=None`` is
            additionally passed to ``LAParams``; when true it is left at
            its default.

    Yields:
        The ``LTPage`` produced for each page. The file stays open while
        the generator is being consumed.
    """
    layout_options = dict(all_texts=True, grid_size=0)
    if not use_advanced_detection:
        layout_options['boxes_flow'] = None

    with open(pdf_fn, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(
            resources, laparams=LAParams(**layout_options))
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
            page_layout: LTPage = aggregator.get_result()
            yield page_layout
def _parse_pages(doc, images_folder):
    """Parse every page of an open PDF document.

    Args:
        doc: The PDF document whose pages are processed.
        images_folder: Passed through to ``parse_lt_objs``.

    Returns:
        A list with one entry per page: the value ``parse_lt_objs`` returns
        for that page's layout objects and 1-based page number.
    """
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    per_page_text = []
    for page_number, page in enumerate(PDFPage.create_pages(doc), start=1):
        page_interpreter.process_page(page)
        # The LTPage for this page may contain child objects such as
        # LTTextBox, LTFigure, LTImage, etc.
        page_layout = aggregator.get_result()
        per_page_text.append(
            parse_lt_objs(page_layout._objs, page_number, images_folder))
    return per_page_text
def le_pdf(filename_or_fobj):
    """Return the stripped text of every page of a PDF.

    Args:
        filename_or_fobj: A filename or file object, resolved through
            ``get_filename_and_fobj``.

    Returns:
        The text emitted by ``TextConverter`` for all pages, in page order,
        with leading and trailing whitespace removed.
    """
    _, fobj = get_filename_and_fobj(filename_or_fobj)
    parser = PDFParser(fobj)
    doc = PDFDocument(parser)

    # One resource manager, converter and buffer serve the whole document;
    # the converter appends each page's text to the same buffer, so the
    # result equals the per-page concatenation without rebuilding the
    # pipeline for every page.
    rsrcmgr = PDFResourceManager()
    resultado = io.StringIO()
    conversor = TextConverter(rsrcmgr, resultado, laparams=LAParams())
    interpretador = PDFPageInterpreter(rsrcmgr, conversor)

    for pagina in PDFPage.create_pages(doc):
        interpretador.process_page(pagina)
    return resultado.getvalue().strip()
def extract_text_from_pdf(pdf_path):
    """Return a PDF's text with every newline replaced by a space.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The text written by ``TextConverter`` for all pages, with each
        ``"\\n"`` replaced by ``" "``.
    """
    buffer = StringIO()
    with open(pdf_path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resources = PDFResourceManager()
        converter = TextConverter(resources, buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
    return buffer.getvalue().replace("\n", " ")
def read_pdf_PDFMINER(pdf_file_path):
    """Read all text from a PDF file and return it as a string.

    Args:
        pdf_file_path: Path of the PDF file, e.g. ``'dir/aaa.pdf'``.

    Returns:
        The text written by ``TextConverter`` for all pages.
    """
    buffer = StringIO()
    with open(pdf_file_path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resources = PDFResourceManager()
        converter = TextConverter(resources, buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
    extracted = buffer.getvalue()
    return str(extracted)
def convert_pdf_to_string(file_path):
    """Return the lower-cased alphabetic word tokens of a PDF.

    Args:
        file_path: Path of the PDF file.

    Returns:
        A list of tokens from ``nltk.word_tokenize`` over the extracted
        text, keeping only tokens for which ``str.isalpha()`` is true and
        lower-casing each of them.
    """
    buffer = StringIO()
    with open(file_path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resources = PDFResourceManager()
        converter = TextConverter(resources, buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)

    alphabetic_words = []
    for token in nltk.word_tokenize(buffer.getvalue()):
        if token.isalpha():
            alphabetic_words.append(token.lower())
    return alphabetic_words
def PdfToString(path_to_file):
    """Return the text of a PDF, or the string ``'error'`` on failure.

    Args:
        path_to_file: Path of the PDF file.

    Returns:
        The text written by ``TextConverter`` for all pages, or the
        literal ``'error'`` when opening or parsing the file raises.
    """
    try:
        txt = StringIO()
        # The context manager closes the file even when parsing fails; the
        # original only closed it on the success path.
        with open(path_to_file, 'rb') as pdf_file:
            parse = PDFParser(pdf_file)
            document = PDFDocument(parse)
            manage = PDFResourceManager()
            convert = TextConverter(manage, txt, laparams=LAParams())
            interpret = PDFPageInterpreter(manage, convert)
            for page in PDFPage.create_pages(document):
                interpret.process_page(page)
        return txt.getvalue()
    except Exception:
        # Deliberate best effort: callers receive 'error' instead of an
        # exception. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt/SystemExit propagate.
        return 'error'
def get_layout_elements(content):
    '''Return the text of the first page's layout elements, in layout order.

    ``content`` is the raw bytes of a PDF. Only the first page is
    processed; children of the resulting layout that have no ``get_text``
    attribute are skipped.
    '''
    # Layout-analysing device and the interpreter that feeds it.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    # Parse the in-memory bytes and take the first page only.
    document = PDFDocument(PDFParser(io.BytesIO(content)))
    first_page = next(PDFPage.create_pages(document))
    page_interpreter.process_page(first_page)

    # Collect the text of each child of the page's LTPage object.
    texts = []
    for child in aggregator.get_result():
        if hasattr(child, 'get_text'):
            texts.append(child.get_text())
    return texts
def pages(self):
    """Return the list of ``Page`` objects, building it on first access.

    The list is cached on ``self._pages``. When ``self.pages_to_parse`` is
    not ``None``, only pages whose 1-based number is in it are included.
    Each page is given the summed height of the previously included pages
    as its ``initial_doctop``.
    """
    try:
        return self._pages
    except AttributeError:
        pass

    running_top = 0
    wanted = self.pages_to_parse
    self._pages = []
    for page_number, raw_page in enumerate(
            PDFPage.create_pages(self.doc), start=1):
        if wanted is not None and page_number not in wanted:
            continue
        built = Page(
            self, raw_page, page_number=page_number, initial_doctop=running_top)
        self._pages.append(built)
        running_top += built.height
    return self._pages
def pdf2text_all(stream):
    """Yield the text of each text box or text line in a PDF, page by page.

    Args:
        stream: Binary file-like object containing the PDF.

    Yields:
        ``get_text()`` of every top-level ``LTTextBox`` or ``LTTextLine``
        in each page's layout.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction
            (raised when iteration starts, since this is a generator).
    """
    document = PDFDocument(PDFParser(stream))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    for page in PDFPage.create_pages(document):
        page_interpreter.process_page(page)
        for element in aggregator.get_result():
            if not isinstance(element, (LTTextBox, LTTextLine)):
                continue
            yield element.get_text()
def convert_pdf_to_string(self, txt_edit, path):
    """Replace the contents of a text widget with the text of a PDF.

    Args:
        txt_edit: Widget exposing ``delete`` and ``insert``; it is cleared
            before the PDF is read.
        path: Path of the PDF file.
    """
    buffer = StringIO()
    txt_edit.delete(1.0, tk.END)

    with open(path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resources = PDFResourceManager()
        converter = TextConverter(resources, buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)

    txt_edit.insert(tk.END, buffer.getvalue())
def convert_pdf_to_txt(content):
    """Return the text of an in-memory PDF.

    Args:
        content: Either the raw PDF bytes, or an object whose ``content``
            attribute holds them (e.g. an HTTP response).

    Returns:
        The text written by ``TextConverter`` for all pages, as ``str``.
    """
    # Local imports keep this function self-contained: the conversion
    # pipeline below was previously taken from names (``interpreter``,
    # ``retstr``) that this function never defined.
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager

    # Accept both raw bytes and response-like objects without a bare except.
    raw = getattr(content, "content", content)
    parser = PDFParser(io.BytesIO(raw))
    # Use the library's default (empty) password rather than ``None``.
    document = PDFDocument(parser)

    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)

    # The buffer already accumulates every page, so it is read once at the
    # end; reading it after each page would repeat the earlier pages.
    return str(retstr.getvalue())
def PDF_to_TXT_regex2(title):
    """Print the title and the extracted text of the PDF at ``title``.

    Args:
        title: Path of the PDF file; also printed as the title line.

    The text is written into ``output_string``, which is not defined in
    this function.
    """
    # NOTE(review): ``output_string`` is presumably a module-level buffer;
    # if so, text accumulates across calls — confirm that is intended.
    print("Title: {}".format(title))
    with open(title, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resources = PDFResourceManager()
        converter = TextConverter(
            resources, output_string, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
    print(output_string.getvalue())
def _parse_pages(doc, images_folder):
    """With an open PDFDocument object, get the pages and parse each one.

    [This is a higher-order function to be passed to with_pdf().]

    Args:
        doc: The open PDF document.
        images_folder: Passed through to ``parse_lt_objs``.

    Returns:
        A list with one entry per page: the value ``parse_lt_objs`` returns
        for that page's layout and 1-based page number.
    """
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    per_page_text = []
    for page_number, page in enumerate(PDFPage.create_pages(doc), start=1):
        page_interpreter.process_page(page)
        # The LTPage for this page may contain child objects such as
        # LTTextBox, LTFigure, LTImage, etc.
        page_layout = aggregator.get_result()
        per_page_text.append(
            parse_lt_objs(page_layout, page_number, images_folder))
    return per_page_text
def convert_pdf_to_string(file_path):
    """Return the text of a PDF, analysed with ``all_texts`` enabled.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The text written by ``TextConverter`` for all pages.
    """
    output_string = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    with open(file_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # Pass the configured parameters: previously a fresh ``LAParams()``
        # was given to the converter, so ``all_texts = True`` had no effect.
        device = TextConverter(rsrcmgr, output_string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    return output_string.getvalue()