def getPageLayouts(f1):
    """Extract the layout object of every page of a PDF.

    NOTE(review): ``f1`` is never read. The file that is parsed is opened
    from ``fpath`` and the password is ``pss_wd``; neither is defined in
    this function, so both are presumably module-level names -- confirm.

    Returns:
        A list with one ``device.get_result()`` value per page; the list is
        empty when the document does not allow text extraction.

    Raises:
        IOError: if the file cannot be opened or read.
    """
    # Bound up front so the final return works for non-extractable files.
    page_layouts = []
    try:
        # The parser and the document are connected to each other; pages
        # are processed while the file is still open.
        with open(fpath, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)
            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise IOError("issue with loading file, please try again")
    return page_layouts
def read_invoice_pdfminer3k(pdfFile):
    """Read one invoice PDF and hand its client / invoice number to Excel.

    The text of every ``LTTextBox`` / ``LTTextLine`` on every page is
    concatenated; the client is extracted from that text and the invoice
    number from the file name, then both are passed to ``write_excel``.

    ``invoice_path``, ``client_start``, ``client_end``, ``invoice_start``,
    ``invoice_end``, ``extract_info`` and ``write_excel`` are not defined in
    this function and are expected to exist at module level.

    Args:
        pdfFile: file name of the invoice inside ``invoice_path``.
    """
    # Let os.path.join pick the separator instead of hard-coding "\\".
    pdf_path = os.path.join(invoice_path, pdfFile)
    text_parts = []
    with open(pdf_path, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
    invoice_text = "".join(text_parts)

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)
    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)
    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
def ParseAllPages(self, filepath):
    """Run every page of the PDF at ``filepath`` through a page interpreter.

    The path is remembered in ``self.filepath``. Pages are processed with a
    plain ``PDFDevice``; nothing is returned.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    self.filepath = filepath
    # The context manager closes the file even when parsing fails.
    with open(filepath, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument()
        # Connect the parser and document objects.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the password for initialization (empty: no password).
        doc.initialize("")
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
def WithPdf(self, pdfdoc, password, fn, *args):
    """Open the pdf document, apply ``fn`` to it and return the result.

    Args:
        pdfdoc: path of the PDF file.
        password: when truthy it replaces ``self.password`` before the
            document is initialised; otherwise the existing
            ``self.password`` is used.
        fn: callable invoked as ``fn(doc, *args)`` while the file is open.
        *args: extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the document is not
        extractable or an ``IOError`` occurred.
    """
    result = None
    try:
        # ``with`` closes the file even if the parser or ``fn`` raises.
        with open(pdfdoc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument()
            # connect the parser and document objects
            parser.set_document(doc)
            doc.set_parser(parser)
            # supply the password for initialization
            if password:
                self.password = password
            doc.initialize(self.password)
            if doc.is_extractable:
                # apply the function and keep the result
                result = fn(doc, *args)
    except IOError:
        # Best effort by design: a missing or unreadable file yields None.
        pass
    return result
def create_pages(self):
    """Parse ``self.pdf_file`` and save one ``Page`` row per parsed page.

    Nothing is saved when the document does not allow extraction. Page
    numbers start at 1 and follow the order of ``self._parse_pages``.
    """
    from public_project.models import Page

    # Wire a parser and a document together around the stored file object.
    pdf_parser = PDFParser(self.pdf_file)
    pdf_doc = PDFDocument()
    pdf_parser.set_document(pdf_doc)
    pdf_doc.set_parser(pdf_parser)
    # No password is supplied.
    pdf_doc.initialize('')

    if not pdf_doc.is_extractable:
        return

    for number, raw_page in enumerate(self._parse_pages(pdf_doc), 1):
        record = Page(
            document=self.document,
            number=number,
            content=smart_unicode(raw_page, encoding='utf-8',
                                  strings_only=False, errors='strict'),
        )
        record.save()
def pdf_function(pdf_doc, password='', *args, **kwargs):
    """Open ``pdf_doc`` and return ``function(doc, *args, **kwargs)``.

    NOTE(review): ``function`` is not defined in this block; it is
    presumably supplied by an enclosing scope -- confirm.

    Args:
        pdf_doc: path of the PDF file.
        password: password passed to ``doc.initialize``.

    Returns:
        The result of ``function``, or ``None`` when the document is not
        extractable or an ``IOError`` occurred.
    """
    result = None
    try:
        # ``with`` closes the file even when parsing or ``function`` raises.
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument()
            # connect the parser and document objects
            parser.set_document(doc)
            doc.set_parser(parser)
            # supply the password for initialization
            doc.initialize(password)
            if doc.is_extractable:
                # apply the function and keep the result
                result = function(doc, *args, **kwargs)
    except IOError:
        # Best effort by design: a missing or unreadable file yields None.
        pass
    return result
def pdf_to_csv(filename):
    """Convert the PDF at ``filename`` with ``CsvConverter`` and return the text.

    Each page is bracketed by ``START PAGE n`` / ``END PAGE n`` marker lines
    (``n`` is zero-based). A remix of the ``convert()`` function in the
    pdfminer/tools/pdf2text module.

    Returns:
        Everything the converter and the markers wrote to the buffer.
    """
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because the author's test documents are utf-8.
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    doc = PDFDocument()
    with open(filename, 'rb') as fp:
        try:
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            # Close the device on error too; the file closes right after.
            device.close()
    return outfp.getvalue()
def _GetFromPdf(self, pdf):
    """Print the text of every ``LTTextContainer`` found in a PDF.

    Reference: http://www.unixuser.org/~euske/python/pdfminer/programming.html

    Args:
        pdf: path of the PDF file.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    with open(pdf, 'rb') as fp:
        # Create a PDF parser from the file object.
        parser = PDFParser(fp)
        # Create the PDF document.
        doc = PDFDocument(parser)
        # Connect the parser and the document object.
        parser.set_document(doc)
        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # Page aggregator device plus the interpreter that feeds it.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextContainer):
                    # Single-argument call form prints the same on Python 2 and 3.
                    print(x.get_text())
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path`` as produced by ``TextConverter``.

    Pages come from ``PDFPage.get_pages`` with an empty password, no page
    limit, caching enabled and the extractability check turned on.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # ``open`` replaces the Python-2-only ``file`` builtin.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The original also built a PDFParser/PDFDocument pair that was
            # never used; it was dropped on the assumption that get_pages
            # parses the stream itself (see the pdfminer PDFPage reference).
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Runs after the return value is computed, and on errors too.
        device.close()
        retstr.close()
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Dump selected parts of the PDF ``fname`` to ``outfp``.

    Args:
        outfp: writable stream receiving the dump.
        fname: path of the PDF file.
        objids: object ids to dump with ``dumpxml``; may be empty.
        pagenos: zero-based page indexes to dump; may be empty.
        password: password passed to ``doc.initialize``.
        dumpall: when true, ``dumpallobjs`` is called for the document.
        codec: forwarded to the dump helpers; for pages it also selects
            between dumping the content streams (truthy) and the page
            attributes (falsy).

    When none of ``objids``, ``pagenos`` or ``dumpall`` is set, the
    trailers are dumped. A newline is appended unless ``codec`` is
    ``'raw'`` or ``'binary'``.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file``; ``with`` closes on error.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for pageno, page in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if not (objids or pagenos or dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
def parse(self, path):
    """Convert the PDF at ``path`` to text and wrap it in a ``Sample``.

    Args:
        path: path of a PDF file. Directories are not supported.

    Returns:
        ``Sample(path, None, text)`` where ``text`` is what ``TextConverter``
        wrote to the buffer.

    Raises:
        NotImplementedError: if ``path`` is a directory.
    """
    # Guard first so nothing is allocated for the unsupported case.
    if os.path.isdir(path):
        raise NotImplementedError()

    out = StringIO.StringIO()
    # Binary mode: a PDF is not text (the original opened it in text mode).
    with open(path, 'rb') as fp:
        rsrc = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
        finally:
            device.close()
    sample = Sample(path, None, out.getvalue())
    out.close()
    return sample
def pdf_to_text(page_object):
    """Collect the text of every text box / text line in a PDF stream.

    Args:
        page_object: object handed to ``PDFParser``.

    Returns:
        A list of strings, one per ``LTTextBox`` / ``LTTextLine`` layout
        object, in page order.
    """
    pdf_parser = PDFParser(page_object)
    # Document object that stores the document structure.
    document = PDFDocument(pdf_parser)
    # Connect the parser and document objects.
    pdf_parser.set_document(document)
    document.initialize('')
    # Shared resources, layout aggregator and the interpreter feeding it.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    interpreter = PDFPageInterpreter(resources, aggregator)

    text_content = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # The aggregator now holds the layout of the page just processed.
        text_content.extend(
            item.get_text()
            for item in aggregator.get_result()
            if isinstance(item, (LTTextBox, LTTextLine))
        )
    return text_content
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the ``info`` metadata of a local or remote PDF.

    NOTE(review): ``args`` is not defined in this function; it is
    presumably a module-level argument list -- confirm. When it has more
    than one element, ``prefix`` is replaced by ``fileOrUrl + ':'``.

    Args:
        fileOrUrl: local path, or an ``http://`` / ``https://`` URL that is
            downloaded into memory first.
        textmode: when true, print one ``source:name=value`` line per
            metadata entry; otherwise print ``prefix`` followed by the info
            list (unwrapped when it has exactly one element).
        prefix: text printed before the value in non-text mode.
        basicauth: value appended to ``'Basic '`` in the ``Authorization``
            header of the request.
    """
    if len(args) > 1:
        prefix = fileOrUrl + ':'

    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')

    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        # Closed before printing, as before, but now also on parse errors.
        fp.close()

    if textmode:
        for obj in doc.info:
            # ``items`` exists on both Python 2 and 3 (``iteritems`` does not).
            for name, val in obj.items():
                print('{0}:{1}={2}'.format(fileOrUrl, name, val))
    else:
        val = doc.info
        if isinstance(val, list) and len(val) == 1:
            val = val[0]
        print(prefix + str(val))
def getData(self):
    """Load the trailer ``Info`` dictionary of ``self.fname``.

    The resolved dictionary is stored in ``self.metadata`` and ``self.raw``;
    when several xrefs carry an ``Info`` entry the last one wins.

    Returns:
        ``"ok"`` on success, ``"Empty metadata"`` when no ``Info``
        dictionary was found, ``"error"`` when the document could not be
        initialised, or the exception instance raised while reading the
        trailers.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin.
    fp = open(self.fname, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(self.password)
        except Exception:
            return "error"
        parser.close()
    finally:
        # Also runs on the "error" return, which used to leak the handle.
        fp.close()

    try:
        # Pre-bound so a trailer without Info cannot leave it undefined.
        info = None
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                info = resolve1(info_ref)
        self.metadata = info
        self.raw = info
        if self.raw is None:
            return "Empty metadata"
        return "ok"
    except Exception as e:
        # Callers receive the exception object itself as the return value.
        return e
def parse_pdf_pdfminer(self, f, fpath):
    """Extract text page by page from ``f`` and feed it to ``parse_page``.

    A header is printed through ``self.handler`` before the first page and a
    footer after the last one. Each page is converted with its own
    ``TextConverter`` and passed on as UTF-8 bytes together with its 1-based
    page number. Any exception other than ``KeyboardInterrupt`` /
    ``SystemExit`` is reported via ``self.handler.print_error``.

    Args:
        f: object handed to ``PDFParser``.
        fpath: path passed through to the handler and ``parse_page``.
    """
    try:
        layout_params = LAParams()
        layout_params.all_texts = True
        resources = PDFResourceManager()

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)

        pdf_parser = PDFParser(f)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)

        for page_num, page in enumerate(document.get_pages(), 1):
            page_text = StringIO()
            converter = TextConverter(resources, page_text, laparams=layout_params)
            page_interpreter = PDFPageInterpreter(resources, converter)
            page_interpreter.process_page(page)
            self.parse_page(fpath, bytes(page_text.getvalue(), 'UTF-8'), page_num)
            page_text.close()

        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to lay out pages of ``fh``.

    Args:
        fh: open file object handed to ``PDFParser``.

    Returns:
        A ``(doc, interpreter, device)`` tuple where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` feeds it. No page is
        processed here, and extractability is not enforced.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Layout parameters; word_margin carries the value that the original
    # assigned right after construction (0.0).
    laparams = LAParams(line_overlap=0.3,
                        char_margin=1.0,
                        line_margin=0.5,
                        word_margin=0.0,
                        boxes_flow=0.1,
                        detect_vertical=False,
                        all_texts=False)
    # Create a PDF page aggregator object and its interpreter. The plain
    # PDFDevice/interpreter pair the original created first was overwritten
    # before use and has been removed.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def parse_pdf(pdf_url):
    """Download a PDF and return its text grouped by page.

    Args:
        pdf_url: URL passed to ``urllib.request.urlopen``.

    Returns:
        A list with one entry per page; each entry is a list holding, for
        every non-blank ``LTTextBox`` / ``LTTextLine``, the list of its
        text lines.
    """
    # The response is a context manager; close it once the body is read.
    with urllib.request.urlopen(pdf_url) as response:
        memory_file = io.BytesIO(response.read())

    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        page_lines = []
        for lt_obj in device.get_result():
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # Fetch the text once instead of twice per object.
                text = lt_obj.get_text()
                if text.strip():
                    page_lines.append(text.splitlines())
        ret.append(page_lines)
    return ret
def __init__(self, filepath):
    """Open ``filepath`` and attach an initialised document as ``self.doc``.

    The file handle is not closed here.
    NOTE(review): presumably the document keeps reading from it -- confirm.
    """
    document = PDFDocument()
    self.doc = document  # the underlying pdf document
    stream = open(filepath, 'rb')
    pdf_parser = PDFParser(stream)
    # Parser and document reference each other.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize()
def getData(fileName):
    """Print selected trailer ``Info`` fields of the PDF ``fileName``.

    The ``Author``, ``Company``, ``Producer`` and ``Creator`` entries are
    printed when present. When several xrefs carry an ``Info`` entry the
    last one wins.

    Returns:
        ``None`` after printing, ``"Empty metadata"`` when no ``Info``
        dictionary was found, ``"error"`` when the parser could not be
        attached, or the exception instance raised while reading or
        printing the metadata.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin.
    fp = open(fileName, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
        except Exception:
            return "error"
        parser.close()
    finally:
        # Also runs on the "error" return, which used to leak the handle.
        fp.close()

    try:
        # Pre-bound so a trailer without Info cannot leave it undefined.
        metadata = None
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                metadata = resolve1(info_ref)
        if metadata is None:
            return "Empty metadata"
        for field in ('Author', 'Company', 'Producer', 'Creator'):
            # ``in`` replaces ``has_key``, which Python 3 dicts lack.
            if field in metadata:
                print(field + " " + metadata[field])
    except Exception as e:
        print("\t [x] Error in PDF extractor")
        return e
def get_toc(self):
    """Return the normalised outline titles at the selected level.

    Side effect: when the first info dictionary has a ``Title`` entry,
    ``self.title`` is set to its normalised value.

    Returns:
        A list of ``normalize_toc_item(title)`` values for every outline
        entry whose level equals ``self.get_level1(outlines)``, or ``None``
        when the outlines cannot be read or the level cannot be determined.
    """
    # Outlines are read from the document while the file is open.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        # title
        if doc.info:
            metadict = doc.info[0]
            if 'Title' in metadict:
                self.title = normalize_title(metadict['Title'])

        # level 1 of toc
        try:
            select_level = self.get_level1(doc.get_outlines())
        except Exception:
            # Narrowed from a bare ``except`` so Ctrl-C still propagates.
            return None

        return [
            normalize_toc_item(title)
            for (level, title, dest, a, se) in doc.get_outlines()
            if level == select_level
        ]
def get_metadata(self):
    """Return metadata from both the info field (older PDFs) and XMP (newer PDFs).

    Every dictionary in ``doc.info`` is added. When the catalog has a
    ``Metadata`` stream, its ``xap``, ``pdf`` and ``dc`` sections are added
    as well (``dc`` with ``metadataType="dc"``). The result is also stored
    in ``self.metadata``.

    Returns:
        A ``Metadata`` object.
    """
    # ``with`` closes the file even when parsing or XMP decoding fails.
    with open(self.path, 'rb') as file_pointer:
        parser = PDFParser(file_pointer)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        metadata = Metadata()
        for i in doc.info:
            metadata.add(i)

        if 'Metadata' in doc.catalog:
            xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
            xmp_dict = xmp_to_dict(xmp_metadata)
            # Let's add only the most useful ones
            if "xap" in xmp_dict:
                metadata.add(xmp_dict["xap"])
            if "pdf" in xmp_dict:
                metadata.add(xmp_dict["pdf"])
            if "dc" in xmp_dict:
                metadata.add(xmp_dict["dc"], metadataType="dc")

    self.metadata = metadata
    return metadata
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    ``pageno`` is the zero-based index of the destination page, looked up
    through the entry's destination or its ``/GoTo`` action, or ``None``
    when neither yields a page.

    Args:
        outfp: writable stream receiving the lines.
        fname: path of the PDF file.
        password: password passed to ``doc.initialize``.
        objids, pagenos, dumpall, codec: accepted but not used here.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file``; ``with`` closes on error.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map page object id -> zero-based page index.
            pages = {page.pageid: pageno
                     for (pageno, page) in enumerate(doc.get_pages())}
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
def prepare(self):
    """Open ``self.filename`` and attach an initialised document.

    Sets ``self.doc`` (the document) and ``self.source`` (the open binary
    file handle, which is not closed here). The document is initialised
    with an empty password.
    """
    document = PDFDocument()
    self.doc = document
    stream = open(self.filename, "rb")
    self.source = stream
    pdf_parser = PDFParser(stream)
    # Parser and document reference each other.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize("")
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Interpret the pages of the PDF in ``fp`` with the given device.

    Args:
        rsrcmgr: resource manager passed to ``PDFPageInterpreter``.
        device: device passed to ``PDFPageInterpreter``.
        fp: open file object handed to ``PDFParser``.
        pagenums: when truthy, only these zero-based page indexes are
            interpreted.
        maxpages: when truthy, processing stops after an interpreted page
            whose one-based number reaches this value.
        password: password passed to ``doc.initialize``.

    Returns:
        A dict mapping zero-based page index to page object for ALL pages
        of the document, not only the interpreted ones.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = dict(enumerate(doc.get_pages()))
    # Walk the indexes in ascending order explicitly: the ``break`` below
    # must not depend on dict iteration order, and ``iteritems`` does not
    # exist on Python 3.
    for num in sorted(pages):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(pages[num])
        if maxpages and maxpages <= num + 1:
            break
    return pages
def parse(self):
    """Dump every object of ``self.pdf`` into an XML-like string.

    Results are stored on the instance: ``self.xml`` (the dump),
    ``self.errors`` (``doc.errors``), ``self.bytes_read``
    (``parser.BYTES``) and, when data follows the EOF marker by more than
    3 bytes, ``self.bin_blob``. Objects that cannot be found are emitted
    as ``type="malformed"`` with up to 4096 raw bytes and noted through
    ``self.takenote``; any other failure is emitted as ``type="exception"``.

    NOTE(review): ``dbg=``, ``found_eof``, ``eof_distance``,
    ``read_from_end``, ``read_n_from``, ``get_pos``, ``errors`` and
    ``BYTES`` are taken as-is from the parser/document classes in scope;
    they look like a customised pdfminer -- confirm.
    """
    # ``open`` replaces the Python-2-only ``file``; ``with`` closes on error.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp, dbg=self.debug)
        doc = PDFDocument(parser, dbg=self.debug)
        # extract blob of data after EOF (if it exists)
        if doc.found_eof and doc.eof_distance > 3:
            self.bin_blob = parser.read_from_end(doc.eof_distance)
        res = '<pdf>'
        visited = set()  # keep track of the objects already visited
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited:
                    continue
                # NOTE(review): looks like leftover debug output -- confirm.
                if objid == 21 or objid == 67:
                    print(objid)
                visited.add(objid)
                try:
                    obj = doc.getobj(objid)
                    res += '<object id="' + str(objid) + '">\n'
                    res += self.dump(obj)
                    res += '\n</object>\n\n'
                except PDFObjectNotFound:
                    mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                    mal_obj = mal_obj.replace('<', '0x3C')
                    res += '<object id="%d" type="malformed">\n%s\n</object>\n\n' % (objid, mal_obj)
                    self.takenote(self.malformed, 'objects', objid)
                except Exception as e:
                    # ``e.message`` only exists on Python 2 exceptions.
                    message = getattr(e, 'message', str(e))
                    res += '<object id="%d" type="exception">\n%s\n</object>\n\n' % (objid, message)
    res += self.dumptrailers(doc)
    res += '</pdf>'
    self.xml = res
    self.errors = doc.errors
    self.bytes_read = parser.BYTES
def pdf_isvalid(filelike):
    """Return True if ``filelike`` holds a valid, extractable PDF, else False.

    The stream must start with ``PDF_MAGIC`` and pdfminer must be able to
    initialise the document with an empty password. The stream is rewound
    to position 0 before returning, on every path.

    @param filelike: filelike object, seekable
    """
    logger = logging.getLogger()
    isvalid = False
    filelike.seek(0)
    if filelike.read(len(PDF_MAGIC)) == PDF_MAGIC:
        filelike.seek(0)
        try:
            parser = PDFParser(filelike)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            if doc.is_extractable:
                isvalid = True
        except PDFException as excobj:
            # Arguments are passed lazily; the rendered message is unchanged.
            logger.warning("pdf has valid header but, still not valid pdf, exception was %r", excobj)
            isvalid = False
    # Rewind on the bad-header path too, which used to leave the stream
    # positioned just after the bytes that were read.
    filelike.seek(0)
    return isvalid
def with_pdf(pdf_doc, pdf_pwd, fn, *args):
    """Open the pdf document, and apply the function, returning the results.

    Args:
        pdf_doc: a path, or an object with a ``read`` attribute that is
            used directly as the file.
        pdf_pwd: document password; a falsy value means no password.
        fn: callable invoked as ``fn(doc, *args)`` while the file is open.
        *args: extra positional arguments forwarded to ``fn``.

    Returns:
        The result of ``fn``, or ``None`` when the document is not
        extractable.

    Raises:
        IOError: propagated unchanged when the file cannot be opened/read.

    Note:
        The file is closed before returning, including a file object
        supplied by the caller.
    """
    result = None
    if hasattr(pdf_doc, 'read'):
        fp = pdf_doc
    else:
        fp = open(pdf_doc, 'rb')
    try:
        # create a parser object associated with the file object
        parser = PDFParser(fp)
        # The password used to be dropped (the initialize() call was
        # commented out); it is now handed to the document constructor.
        doc = PDFDocument(parser, pdf_pwd or '')
        # connect the parser and document objects
        parser.set_document(doc)
        if doc.is_extractable:
            # apply the function and keep the result
            result = fn(doc, *args)
    finally:
        # Close on success and on failure alike.
        fp.close()
    return result
def convert_pdf_to_txt(path):
    """Return the text ``TextConverter`` produces for the PDF at ``path``.

    The document is initialised with an empty password and every page is
    interpreted in order.
    """
    resources = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resources, text_buffer, laparams=LAParams())

    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument(caching=True)
        # Parser and document reference each other.
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        page_interpreter = PDFPageInterpreter(resources, converter)
        # Feed every page of the document through the converter.
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)

        text = text_buffer.getvalue()
        converter.close()
        text_buffer.close()
    return text
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every element that is text based (LTText).

    Args:
        path: path of the PDF file.
        j: job object whose ``iter_with_progress`` wraps the page loop.

    Returns:
        A ``(pages, all_elements)`` tuple: ``pages`` holds one
        ``Page(width, height)`` per page, ``all_elements`` the elements of
        all pages, each tagged with its ``page`` index and its ``order``
        within that page.
    """
    pages = []
    all_elements = []
    # ``with`` closes the file, which the original never did.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialised up front so the progress iterator knows the total.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
def load(self, open_file):
    """Read page text (and annotations) from an already opened PDF.

    ``self.fields`` and ``self.text`` are reset; afterwards ``self.text``
    maps 1-based page numbers to the text returned by ``self._get_text``.
    Pages with annotations are additionally passed to
    ``self._build_annotations``.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    self.fields = {}
    self.text = {}

    # Parser and document reference each other; no password is supplied.
    pdf_parser = PDFParser(open_file)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')

    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    for page_number, page in enumerate(document.get_pages(), 1):
        page_interpreter.process_page(page)
        if page.annots:
            self._build_annotations(page)
        self.text[page_number] = self._get_text(aggregator)
def parse_document(self):
    """Lay out every page of ``self.pdf`` and collect its objects.

    Resets ``self.res``, ``self.media_boxes`` and ``self.n``. For every
    page the crop box and media box coordinates are stored in
    ``self.media_boxes[page_index]``, ``self.box_id`` is reset to -1 and
    ``self.get_objects`` extends the result set. Nothing is collected when
    the document does not allow extraction.

    Returns:
        The ``(self.res, self.media_boxes)`` tuple.
    """
    self.res = []  # result set
    self.media_boxes = dict()  # media coordinate dictionary
    self.n = 0  # page count

    # Build the layout parameters once instead of constructing a default
    # object and then replacing it.
    if constants.USE_CUSTOM_PDF_PARAMETERS:
        la_params = LAParams(detect_vertical=constants.DEFAULT_DETECT_VERTICAL,
                             line_overlap=constants.DEFAULT_LINE_OVERLAP,
                             line_margin=constants.DEFAULT_LINE_MARGIN,
                             word_margin=constants.DEFAULT_WORD_MARGIN,
                             char_margin=constants.DEFAULT_CHAR_MARGIN,
                             boxes_flow=constants.DEFAULT_BOXES_FLOW)
    else:
        la_params = LAParams(detect_vertical=True)

    # ``with`` closes the file, which the original never did.
    with open(self.pdf, "rb") as pdf:
        pdf_parser = PDFParser(pdf)
        pdf_document = PDFDocument(pdf_parser)
        if pdf_document.is_extractable:
            resource_manager = PDFResourceManager()
            page_aggregator = PDFPageAggregator(resource_manager, laparams=la_params)
            page_interpreter = PDFPageInterpreter(resource_manager, page_aggregator)
            for page in PDFPage.create_pages(pdf_document):
                page_interpreter.process_page(page)
                layout = page_aggregator.get_result()
                crop_box = page.cropbox
                page_box = page.mediabox
                self.media_boxes[self.n] = {
                    "x0": crop_box[0], "y0": crop_box[1],
                    "x1": crop_box[2], "y1": crop_box[3],
                    "x0page": page_box[0], "y0page": page_box[1],
                    "x1page": page_box[2], "y1page": page_box[3],
                }
                self.box_id = -1
                self.res = self.get_objects(layout._objs, self.res, self.n, self.media_boxes)
                self.n += 1
    return self.res, self.media_boxes
def extract_pdf(path, languages=None):
    """Extract metadata and per-page text from a PDF file.

    pdfminer is tried first for every page; when that fails or yields
    fewer than 3 characters, the page is sent through
    ``_extract_image_page`` (OCR) instead.

    Returns:
        A dict with a ``'pages'`` list (one text per page) plus one entry
        per usable key of the last info dictionary (lower-cased, stripped).
    """
    with open(path, 'rb') as fh:
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        document = PDFDocument(PDFParser(fh), '')

        result = {'pages': []}
        if len(document.info):
            for raw_key, raw_value in document.info[-1].items():
                key = raw_key.lower().strip()
                value = string_value(raw_value)
                # Skip the page list itself, empty values and unresolved refs.
                if key == 'pages' or value is None or '<PDFObjRef:' in value:
                    continue
                result[key] = string_value(value)

        for index, page in enumerate(PDFPage.create_pages(document)):
            page_no = index + 1
            text = None
            try:
                page_interpreter.process_page(page)
                text = _convert_page(aggregator.get_result(), path)
            except Exception as ex:
                log.warning("Failed to parse PDF page: %r", ex)

            if text is None or len(text) < 3:
                log.info("OCR: %r, pg. %s", path, page_no)
                text = _extract_image_page(path, page_no, languages)
            result['pages'].append(text)

        aggregator.close()
        return result
def get_total(filename):
    """Return the largest integer found from the subtotal marker onwards.

    Horizontal text boxes are scanned in document order. Once a box
    containing ``"Subtotal for all regions"`` has been seen, the first line
    of that box and of every later box is stripped of spaces and parsed as
    an integer; the maximum is kept.

    Returns:
        The largest integer found, or ``-1`` when none was found.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    temp_total = -1
    # ``with`` closes the file, which the original never did.
    with open(filename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        check_total = False
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = str(x.get_text())
                if "Subtotal for all regions" in results:
                    check_total = True
                if not check_total:
                    continue
                match = re.search(r'(.*)\n', results, re.M | re.I)
                if match is None:
                    # Text without a newline: nothing to parse here.
                    continue
                temp_results = match.group(1).replace(" ", "").replace("\\n", "")
                try:
                    temp_num = int(temp_results)
                except ValueError:
                    continue
                if temp_num > temp_total:
                    temp_total = temp_num
    return temp_total
def extractembedded(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, extractdir=None):
    """Write every embedded file of the PDF ``fname`` into ``extractdir``.

    Args:
        fname: path of the PDF file.
        password: document password.
        extractdir: directory receiving the extracted files.
        outfp, objids, pagenos, dumpall, codec: accepted but not used here.

    Raises:
        PDFValueError: if a file specification does not reference an
            embedded-file stream.
        IOError: if the target file already exists.
    """
    def extract1(obj):
        # Extract the single file described by the file specification
        # ``obj``. 'UF' is optional, so fall back to 'F' without raising
        # KeyError. Only the base name is used, which keeps the target
        # inside extractdir.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        # Same text the Python 2 ``print >> sys.stderr`` statement produced.
        sys.stderr.write('extracting: %r\n' % path)
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())

    # ``open`` replaces the Python-2-only ``file``; ``with`` closes on error.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
def pdfminer(f):
    """Render the first page of the PDF at ``f`` with an ``HTMLConverter``.

    Only the first page is laid out and rendered; ``add_features`` is
    called on the converter afterwards whether or not a page existed.

    NOTE(review): the file is left open on return -- confirm whether the
    returned converter still needs it.

    Returns:
        The ``HTMLConverter`` created for ``os.path.basename(f)``.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    pdf_file = open(f, 'rb')
    # Parser bound to the file; the document stores the structure.
    document = PDFDocument(PDFParser(pdf_file))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams(all_texts=True))
    page_interpreter = PDFPageInterpreter(resources, aggregator)
    converter = HTMLConverter(os.path.basename(f))

    # Take at most one page from the page iterator.
    first_page = next(iter(PDFPage.create_pages(document)), None)
    if first_page is not None:
        page_interpreter.process_page(first_page)
        layout = aggregator.get_result()
        converter.current_page = first_page
        converter.render(layout)

    converter.add_features()
    return converter
def generateFileContent(self):
    """Build the abbreviation dictionary from the RAG abbreviations PDF.

    The PDF is downloaded to a temporary file. For every grouped layout
    line containing a colon, the text before the first colon (stripped,
    with ``..`` collapsed to ``.``) becomes an entry. The entries are
    formatted with ``formatEntriesForDictionary`` and appended to a
    three-line header.

    Returns:
        The dictionary file content as one string.
    """
    import tempfile
    try:
        from urllib import urlretrieve  # Python 2
    except ImportError:
        from urllib.request import urlretrieve  # Python 3

    abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"
    entries = set()
    # Created once; the original rebuilt it for every page.
    params = LAParams()

    with tempfile.NamedTemporaryFile() as temporaryFile:
        urlretrieve(abbreviationsPdfUrl, temporaryFile.name)
        # ``with`` closes the read handle, which the original never did.
        with open(temporaryFile.name, "rb") as fileObject:
            parser = PDFParser(fileObject)
            document = PDFDocument(parser)
            resourceManager = PDFResourceManager()
            device = PDFPageAggregator(resourceManager)
            interpreter = PDFPageInterpreter(resourceManager, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                # Rectangles and curves carry no text.
                objects = [item for item in layout
                           if not isinstance(item, (LTRect, LTCurve))]
                for line in layout.group_objects(params, objects):
                    text = line.get_text()
                    if u":" in text:
                        entry = text.split(u":")[0].strip().replace(u"..", ".")
                        entries.add(entry)

    dictionary = u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n"
    dictionary += u"# http://www.realacademiagalega.org/abreviaturas\n"
    dictionary += u"\n"
    dictionary += u"".join(formatEntriesForDictionary(entries, u"abreviatura"))
    return dictionary
def ParsePDF():
    """Copy the text of the first PDF page into ``a.docx``.

    ``pdfpath`` (the input path) and ``document`` (the object receiving
    ``add_paragraph`` / ``save`` calls) are not defined in this function
    and are expected to exist at module level.

    NOTE(review): the ``break`` is read as ending the page loop after the
    first page; the original layout was ambiguous -- confirm.

    Returns:
        1 after the document has been saved.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Opened in binary read mode; ``with`` closes it on errors as well.
    with open(pdfpath, 'rb') as pdf_file:
        # Create a PDF parser from the file object.
        parser = PDFParser(pdf_file)
        # Create the PDF document that stores the structure; the password
        # is supplied here (empty: none).
        doc = PDFDocument(parser, password='')
        # Check whether the file allows text extraction.
        if not doc.is_extractable:
            print("Not Allowd Extractable")
            raise PDFTextExtractionNotAllowed
        # Resource manager for shared resources; caching=False disables caching.
        rsrcmgr = PDFResourceManager(caching=False)
        laparams = LAParams()
        # Page aggregator device and the interpreter that feeds it.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Walk the pages, handling one page at a time.
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # ``layout`` is the LTPage holding the objects parsed from the page.
            layout = device.get_result()
            for i in layout:
                if hasattr(i, "get_text"):
                    content = i.get_text().replace(u'\xa0', u'').replace('\n', '')
                    document.add_paragraph(content, style=None)
            break
        document.save("a.docx")
    return 1
def parse_question_file(question_file_path):
    """Return the non-empty horizontal text boxes of a PDF as lines.

    Each box's text is decoded, stripped and, when not empty, appended with
    a trailing newline.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    text_content = []
    with open(question_file_path, 'rb') as question_file:
        document = PDFDocument(PDFParser(question_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
            for item in aggregator.get_result():
                if not isinstance(item, LTTextBoxHorizontal):
                    continue
                line = item.get_text().decode().strip()
                if line:
                    text_content.append(line + '\n')
    return text_content
def get_blurb():
    """Pick a random page of a random PDF under ``/pdfs`` and return a blurb.

    Returns:
        A ``(link, text)`` tuple: ``link`` is the PDF path with ``pdfs``
        replaced by ``view`` plus ``#page=<zero-based index>``, ``text`` is
        the first 100 characters of the page. ``('', '')`` is returned when
        no PDF is available.
    """
    pdfs = glob.glob('/pdfs/*')
    if not pdfs:
        # Same output as the Python 2 ``print >> sys.stderr`` statement.
        sys.stderr.write('NO PDFS\n')
        return '', ''
    pdf = random.choice(pdfs)
    sys.stderr.write('pdf: %s\n' % pdf)
    with open(pdf, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        assert document.is_extractable
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = list(PDFPage.get_pages(f))
        # randrange excludes the upper bound; randint(0, len(pages)) could
        # return len(pages) and index past the end of the list.
        pnum = random.randrange(len(pages))
        interpreter.process_page(pages[pnum])
        txt = retstr.getvalue()
    return pdf.replace('pdfs', 'view') + '#page=' + str(pnum), txt[:100]
def extract_text(in_path, out_path):
    """Extract the text of every ``*.pdf`` matching ``in_path`` into ``.txt`` files.

    Approach taken from
    https://towardsdatascience.com/pdf-text-extraction-in-python-5b6ab9e92dd

    Args:
        in_path: prefix that is concatenated with ``'*.pdf'`` to build the
            glob pattern (include the trailing separator for a directory).
        out_path: prefix that is concatenated with each output file name.

    Returns:
        None. One text file is written per input PDF, and a percentage is
        printed before each file is processed.
    """
    files = glob.glob(in_path + '*.pdf')
    total = len(files)
    for i, file_path in enumerate(files):
        print(str(i / total * 100)[:4] + "%", end="\r")
        output_string = StringIO()
        # glob already returns paths that include in_path; the original
        # prepended in_path a second time and opened a non-existent file.
        with open(file_path, 'rb') as infile:
            parser = PDFParser(infile)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
        # Swap only the extension; replace("pdf", "txt") also rewrote any
        # "pdf" occurring inside the file name itself.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        out_filename = out_path + stem + ".txt"
        with open(out_filename, 'w') as outfile:
            outfile.write(output_string.getvalue())
def extract_pages(fp, start=None, end=None):
    """Yield the layout of each selected page of an open PDF.

    Slightly modified from:
    https://euske.github.io/pdfminer/programming.html

    Args:
        fp: file object handed to ``PDFParser``.
        start: zero-based index of the first page to yield; ``None`` means
            no lower bound.
        end: zero-based index one past the last page to yield; ``None``
            means no upper bound.

    Yields:
        ``device.get_result()`` for every page whose index ``i`` satisfies
        ``start <= i < end`` (each bound applied only when given).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    laparams = LAParams()
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    manager = PDFResourceManager()
    device = PDFPageAggregator(manager, laparams=laparams)
    interpreter = PDFPageInterpreter(manager, device)

    for i, page in enumerate(PDFPage.create_pages(document)):
        # The bounds are tested separately. The original one-line condition
        # parsed as (... and i < start) or (i >= end), so `i >= end` was
        # evaluated even when end was None.
        if start is not None and i < start:
            continue
        if end is not None and i >= end:
            # Every later index is also >= end, so nothing more is yielded.
            break
        interpreter.process_page(page)
        yield device.get_result()
def get_problem_page(problem, pdf_path):
    """
    Fetch the PDF page on which a problem's widget is located.

    Parameters
    ----------
    problem : Problem
        Problem object of the currently selected problem; its
        ``widget.page`` attribute gives the page index
    pdf_path : str
        Path to the exam PDF for this problem

    Returns
    -------
    page : PDFPage
        The page at index ``problem.widget.page``
    """
    # NOTE(review): the file handle is not closed here; presumably the
    # returned page still reads from it lazily. Confirm before closing it.
    pdf_file = open(pdf_path, 'rb')
    document = PDFDocument(PDFParser(pdf_file))

    index = problem.widget.page
    all_pages = PDFPage.create_pages(document)
    # Skip the first `index` pages and take exactly one.
    wanted = itertools.islice(all_pages, index, index + 1)
    return next(wanted)
def parse(Path):
    """Collect the text of every horizontal text box in a PDF.

    Args:
        Path: handed straight to ``PDFParser``; presumably an open binary
            file object despite the name. TODO confirm with callers.

    Returns:
        List of ``get_text()`` results, one per ``LTTextBoxHorizontal``, in
        page order. The list is also printed before it is returned.

    Raises:
        PDFTextExtractionNotAllowed: if the PDF forbids text extraction.
    """
    pdf_parser = PDFParser(Path)
    document = PDFDocument(pdf_parser)
    re_list = []
    # Stop early when the PDF cannot be extracted.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    interpreter = PDFPageInterpreter(resources, aggregator)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        re_list.extend(
            element.get_text()
            for element in aggregator.get_result()
            if isinstance(element, LTTextBoxHorizontal)
        )
    # NOTE(review): the flattened source does not show the nesting of this
    # print; it is assumed to run once, after all pages. Confirm.
    print(re_list)
    return re_list
def upload_file():
    """Turn an uploaded PDF into speech and send it back as an MP3 download.

    For a POST request, the file in the ``file`` form field is parsed, its
    text is synthesised with gTTS (``lang="pt-br"``), saved as
    ``mp3_fp.mp3`` and returned as an attachment. Any other method returns
    a plain-text message.
    """
    if request.method != 'POST':
        return "Method POST not found"

    output_string = StringIO()
    in_file = request.files["file"]
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
    texto = output_string.getvalue()

    # Generate the MP3.
    tts = gtts.gTTS(texto, lang="pt-br")
    # NOTE(review): every request writes the same file name, so concurrent
    # uploads can overwrite each other's audio. Consider a per-request file.
    tts.save('mp3_fp.mp3')

    # The original attached the CORS header to a separate jsonify() response
    # that was never returned; it is now set on the response actually sent.
    response = send_file('mp3_fp.mp3', as_attachment=True)
    response.headers["Access-Control-Allow-Origin"] = "*"
    return response
def extract_layout_by_page(pdf_path):
    """
    Extract the layout object of every page of a PDF file.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        List of ``device.get_result()`` values, one per page, in page order.

    Raises:
        PDFTextExtractionNotAllowed: if the PDF forbids text extraction.
    """
    laparams = LAParams()

    # The original never closed the file; the context manager releases it on
    # both the normal and the error path.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layouts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())

    return layouts
def parse_pdf(self, source_pdf: str = None) -> None:
    """Run every page of the active PDF through an ``RPAConverter``.

    The value returned by the converter's ``close()`` is stored in
    ``self.rpa_pdf_document``.

    :param source_pdf: when given, passed to ``switch_to_pdf_document``
        before parsing; otherwise the current ``active_fileobject`` is used
    """
    if source_pdf is not None:
        self.switch_to_pdf_document(source_pdf)

    document = PDFDocument(PDFParser(self.active_fileobject))
    pages = PDFPage.create_pages(document)

    resources = PDFResourceManager()
    layout_params = LAParams(detect_vertical=True, all_texts=True)
    converter = RPAConverter(resources, laparams=layout_params)
    interpreter = PDFPageInterpreter(resources, converter)

    # Feed each page to the converter in document order.
    for page in pages:
        interpreter.process_page(page)

    self.rpa_pdf_document = converter.close()
def main(fname):
    """Print a snippet around "volunteer recycling" found in a PDF's text.

    Each page layout is converted with ``parse_layout``, the results are
    concatenated, and ``find_pattern(text, "volunteer recycling", 200)`` is
    printed.

    Args:
        fname: path of the PDF file to read.

    Raises:
        Exception: if the document does not allow text extraction.
    """
    with open(fname, 'rb') as fd:
        parser = PDFParser(fd)
        doc = PDFDocument(parser)

        # Abort when the document is not extractable. The exception type is
        # unchanged; only a message was added.
        if not doc.is_extractable:
            raise Exception("text extraction is not allowed for %s" % fname)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        page_texts = []
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            page_texts.append(parse_layout(layout))

    # Join once instead of growing a string page by page.
    all_txt = "".join(page_texts)
    snip = find_pattern(all_txt, "volunteer recycling", 200)
    # print() call form; the original Python 2 print statement is a syntax
    # error on Python 3.
    print(snip)
def __init__(self, stream, pages=None, laparams=None, precision=0.001):
    """Open a PDF stream, read its metadata and prepare the page interpreter.

    Args:
        stream: file object handed to ``PDFParser``.
        pages: stored unchanged as ``self.pages_to_parse``.
        laparams: mapping of keyword arguments for ``LAParams``, or ``None``
            to leave ``self.laparams`` as ``None``.
        precision: stored unchanged as ``self.precision``.
    """
    self.laparams = None if laparams is None else LAParams(**laparams)
    self.stream = stream
    self.pages_to_parse = pages
    self.precision = precision
    rsrcmgr = PDFResourceManager()
    self.doc = PDFDocument(PDFParser(stream))

    # Merge every info dictionary of the document; later ones win.
    self.metadata = {}
    for info in self.doc.info:
        self.metadata.update(info)

    # Iterate over a snapshot so values can be reassigned while looping.
    for k, v in list(self.metadata.items()):
        if hasattr(v, "resolve"):
            v = v.resolve()
        if isinstance(v, list):
            self.metadata[k] = list(map(decode_text, v))
        elif isinstance(v, PSLiteral):
            self.metadata[k] = decode_text(v.name)
        elif isinstance(v, bool):
            self.metadata[k] = v
        else:
            self.metadata[k] = decode_text(v)

    self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
    self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
def Pdf2Txt(DataIO, Save_path):
    """Append the text of every horizontal text box of a PDF to a file.

    Args:
        DataIO: handed straight to ``PDFParser``; presumably an open binary
            file object. TODO confirm with callers.
        Save_path: path of the output file, opened in append mode and
            written as UTF-8.

    Raises:
        PDFTextExtractionNotAllowed: if the PDF forbids text extraction.

    A text box that cannot be written is reported with ``Failed!`` and
    skipped; processing continues with the next one.
    """
    # Parser for the PDF data.
    parser = PDFParser(DataIO)
    # Document object holding the document structure.
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager for shared resources.
    rsrcmgr = PDFResourceManager()
    # Layout analysis parameters.
    laparams = LAParams()
    # Aggregator device and the interpreter that drives it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process every page.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Layout object for this page.
        layout = device.get_result()
        for x in layout:
            if not isinstance(x, LTTextBoxHorizontal):
                continue
            try:
                # Text is written through a UTF-8 text file. The original
                # concatenated encoded bytes with a str newline, which fails
                # on Python 3 and sent every box to the except branch.
                with open('%s' % (Save_path), 'a', encoding='utf-8') as f:
                    f.write(x.get_text() + '\n')
            except (OSError, UnicodeError):
                print("Failed!")
def shan_convert(pdf_path):
    """Return the text of a PDF, re-saving it with pikepdf when needed.

    When the document reports that it is not extractable, it is opened with
    pikepdf and saved next to the original as ``<name>shan_temp.pdf``
    (``pdf_path`` minus its last four characters plus that suffix). The text
    is then read from that copy.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        The extracted text of all pages.
    """
    fp = open(pdf_path, 'rb')
    try:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            temp_path = pdf_path[:-4] + "shan_temp" + ".pdf"
            # Close the pikepdf handle once the copy has been written.
            with pikepdf.open(pdf_path) as temp_file:
                temp_file.save(temp_path)
            # Release the original handle before switching to the copy; the
            # original code dropped it without closing.
            fp.close()
            fp = open(temp_path, 'rb')
            # NOTE(review): the temporary copy is left on disk, as before.

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty set is passed as the page-number filter.
            pagenos = set()
            for page in PDFPage.get_pages(fp, pagenos):
                interpreter.process_page(page)
            text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()
    finally:
        # Runs on success and on failure, so the handle is never leaked.
        fp.close()
    return text
def character_extraction(self, address):
    """Pass every text box and text line of a PDF to ``self.fetch_chars``.

    Args:
        address: path of the PDF file to read.
    """
    # The file is closed when the block exits, whether or not parsing fails.
    with open(address, 'rb') as pdf_file:
        # Parse the file with an empty password.
        document = PDFDocument(PDFParser(pdf_file), '')

        # The resource manager is shared by the device and the interpreter.
        resources = PDFResourceManager()
        # The aggregator turns interpreted page content into layout objects.
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        interpreter = PDFPageInterpreter(resources, aggregator)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Only LTTextBox and LTTextLine objects of the layout are used.
            for element in aggregator.get_result():
                if isinstance(element, (LTTextBox, LTTextLine)):
                    self.fetch_chars(element)
def test_pdf(self):
    """Exercise PDF capture through the library API and the service URL."""
    absolute_url = server.base_url + self.url

    # Capture library API, called directly
    self.check_pdf(self.capture.pdf(url=absolute_url))

    # Service: absolute URL, parent-relative URL and plain path
    for target in (absolute_url, '..' + self.url, self.url):
        response = self.fetch(self.src, params={'url': target})
        self.check_filename(response, 'screenshot.pdf')
        self.check_pdf(response.content)

    # delay=. After 500ms, page changes text and color to blue
    # file=. Changes filename
    delayed_params = {'url': self.url, 'delay': 600, 'file': 'delay'}
    delayed = self.fetch(self.src, params=delayed_params)
    self.check_filename(delayed, 'delay.pdf')
    self.assertIn('Blueblock', normalize(get_text(delayed.content)))

    # --format and --orientation
    sized_params = {'url': self.url, 'format': 'A3', 'orientation': 'landscape'}
    sized = self.fetch(self.src, params=sized_params)
    document = PDFDocument(PDFParser(io.BytesIO(sized.content)))
    first_page = next(PDFPage.create_pages(document))
    media_box = [round(x) for x in first_page.attrs['MediaBox']]
    self.assertIn(media_box, (
        [0, 0, 1188, 842],  # noqa: Chrome uses 1188 x 842 for A3
        [0, 0, 1191, 842],  # noqa: PhantomJS uses 1191 x 842 for A3
    ))

    # cookie=. The Cookie is printed on the screen via JS
    with_param = self.fetch(
        self.src, params={'url': self.url + '?show-cookie', 'cookie': 'a=x'})
    self.assertIn('a=x', normalize(get_text(with_param.content)))

    # Cookie: header is the same as ?cookie=.
    # Old request cookies vanish. Only new ones remain
    with_header = self.fetch(
        self.src,
        params={'url': self.url + '?show-cookie'},
        headers={'Cookie': 'b=z'})
    header_text = normalize(get_text(with_header.content))
    self.assertIn('js:cookie=b=z', header_text)
    self.assertIn('server:cookie=b=z', header_text)
def parse_file(file: Path):
    """Build a chained list of ``PDFBox`` objects from a PDF's text boxes.

    On each page, horizontal text boxes whose text contains '猿题库' are
    ignored. Of the remaining boxes, the first and last are dropped (treated
    as page header and footer). Every kept box is wrapped in
    ``PDFBox(box, page_index, previous_box)``.

    Args:
        file: path of the PDF file to read.

    Returns:
        List of the ``PDFBox`` objects in document order.

    Raises:
        PDFTextExtractionNotAllowed: if the PDF forbids text extraction.
    """
    with open(file, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        laparams = LAParams()
        text_boxes = []  # cleaned boxes of all pages
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        previous = None
        for page_index, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            # Raw candidate boxes of this page.
            candidates = [
                element
                for element in device.get_result()
                if isinstance(element, LTTextBoxHorizontal)
                and ('猿题库' not in element.get_text())
            ]
            # Remove the page header and footer.
            for box in candidates[1:-1]:
                pdf_box = PDFBox(box, page_index, previous)
                text_boxes.append(pdf_box)
                previous = pdf_box

    print('parse end')
    return text_boxes
def get_result_from_file(filename):
    """Collect the bounding box and text labels of every page of a PDF.

    Args:
        filename: path of the PDF file to read.

    Returns:
        ``{"filename": filename, "pages": [...]}`` where each page entry has
        ``index`` (zero-based), ``bounding_box`` (from ``get_bounding_box``)
        and ``labels`` (from ``get_text_labels``).

    Raises:
        PDFTextExtractionNotAllowed: if the PDF forbids text extraction.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}

    # The context manager closes the file on the error paths as well; the
    # original only closed it after a fully successful run.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            bounding_box = get_bounding_box(layout)
            labels = get_text_labels(layout)
            result["pages"].append({
                "index": page_index,
                "bounding_box": bounding_box,
                "labels": labels
            })

    return result
def pdf2txt(self, path,
            out_path=r'E:\pycharm_len\py_learn\learn\office\file\linux_pdf.txt'):
    """Append the text of a PDF to a text file and print it.

    No ``is_extractable`` check is performed (it is commented out in the
    original).

    Args:
        path: path of the PDF file to read.
        out_path: file the text is appended to; defaults to the location
            that used to be hard-coded.

    Errors while handling a single layout object are printed and skipped,
    so one bad object does not stop the conversion.
    """
    print('解析pdf中...')
    with open(path, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        pdfrm = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(pdfrm, laparams=laparams)
        interpreter = PDFPageInterpreter(pdfrm, device)
        result = ''
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                try:
                    if not hasattr(x, "get_text"):
                        continue
                    content = x.get_text()
                    # A separate name for the output handle: the original
                    # reused `f` and shadowed the PDF file object.
                    with open(out_path, 'a') as out_file:
                        try:
                            result += content
                            out_file.write(content)
                        except Exception as err:
                            print('error_write', err)
                except Exception as err:
                    print('error', err)
    print('__________' * 10)
    print(result)
def pdf2txt(filePath, outPath):
    """Append the text of every horizontal text box of a PDF to a file.

    Args:
        filePath: handed straight to ``PDFParser``; presumably an open
            binary file object despite the name. TODO confirm with callers.
        outPath: path of the output file, opened in append mode so existing
            content is kept.

    Raises:
        PDFTextExtractionNotAllowed: if the PDF forbids text extraction.
    """
    # Parser that reads the PDF data.
    parser = PDFParser(filePath)
    # Document object holding the parsed structure.
    document = PDFDocument(parser)

    # Check whether the file allows text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager for shared resources.
    rsrcmgr = PDFResourceManager()
    # Layout analysis parameters.
    laparams = LAParams()
    # Aggregator device and the interpreter that processes page content.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process every page of the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Layout object for the whole page.
        layout = device.get_result()
        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                # The original encoded to UTF-8 and decoded straight back
                # with errors="xmlcharrefreplace", a round trip that changes
                # nothing. The handler is now on the file, where characters
                # the target encoding lacks are actually replaced.
                with open('%s' % (outPath), 'a',
                          errors='xmlcharrefreplace') as f:
                    f.write(x.get_text() + '\n')
def process_pdf(title, path):
    """
    Build a DrocerDocument holding the text boxes of every page of a PDF.

    @param title string Title to apply to the document.
    @param path string Path to the input PDF.
    @returns DrocerDocument with one DrocerPage per PDF page; pages and
             text boxes are numbered from 1.
    """
    output_document = DrocerDocument(title, path)
    with open(path, 'rb') as pdf_file:
        # Set up the pdf reader with an empty password.
        pdf_document = PDFDocument(PDFParser(pdf_file), '')
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        interpreter = PDFPageInterpreter(resources, aggregator)

        # Walk the document page by page.
        numbered_pages = enumerate(PDFPage.create_pages(pdf_document), start=1)
        for page_number, pdf_page in numbered_pages:
            logger.info("processing %s page number %s" % (title, page_number))
            output_page = DrocerPage(page_number)
            interpreter.process_page(pdf_page)
            # Only LTTextBox objects are kept; other objects are ignored.
            text_boxes = (
                obj for obj in aggregator.get_result()
                if isinstance(obj, LTTextBox)
            )
            for box_number, text_box in enumerate(text_boxes, start=1):
                output_page.boxes.append(DrocerBox(
                    page_number, box_number,
                    text_box.x0, text_box.y0, text_box.x1, text_box.y1,
                    text_box.get_text().encode('utf8')))
            output_document.pages.append(output_page)
    return output_document
def process_attachment(name: str, data: bytes) -> str:
    """Extract text from an attachment, chosen by its file-name suffix.

    ``.txt`` is decoded as UTF-8. ``.docx``, ``.pdf``, ``.pptx`` and
    ``.xlsx`` are first written to ``./data/temp`` and read back with the
    matching library. Any other suffix yields an empty string.

    Args:
        name: attachment file name; only its suffix is inspected.
        data: raw attachment content.

    Returns:
        The extracted text, or ``""`` when the text cannot be decoded or
        the suffix is not handled.
    """
    result = ""
    if name.endswith(".txt"):
        try:
            result = data.decode("utf-8")
        except UnicodeDecodeError:
            print("unable to decode the given text by 'utf-8'")
        return result

    # NOTE(review): a fixed path is shared by every call and requires the
    # ./data directory to exist; concurrent calls overwrite each other.
    temp_file_path = "./data/temp"
    with open(temp_file_path, mode='wb') as temp_file:
        temp_file.write(data)

    if name.endswith(".docx"):
        result = docx2txt.process(temp_file_path)
    elif name.endswith(".pdf"):
        output_string = StringIO()
        with open(temp_file_path, mode='rb') as pdf:
            parser = PDFParser(pdf)
            doc = PDFDocument(parser)
            resource_manager = PDFResourceManager()
            device = TextConverter(resource_manager, output_string,
                                   laparams=LAParams())
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
        result = output_string.getvalue()
    elif name.endswith(".pptx"):
        ppt = Presentation(temp_file_path)
        for slide in ppt.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    result += shape.text
    elif name.endswith(".xlsx"):
        # The workbook is closed on exit, and it no longer rebinds the
        # `data` parameter as the original did.
        with pd.ExcelFile(temp_file_path) as workbook:
            for sheet in workbook.sheet_names:
                frame = workbook.parse(sheet)
                result += str(frame.columns)
            # NOTE(review): the flattened source does not show whether the
            # sheet names were appended once or once per sheet; once, after
            # the loop, is assumed. Confirm.
            result += str(workbook.sheet_names)
    return result
def get_pdf_metadata(self, pdf_file_stream):
    """Read author, title and year from a PDF's first info dictionary.

    Args:
        pdf_file_stream: file object handed to ``PDFParser``.

    Returns:
        Dict with keys ``author``, ``title`` and ``year``. A field keeps its
        ``UNKNOWN_*`` placeholder when the PDF has no info dictionary, the
        entry is missing, or the converted value is empty.
    """
    metadata = {
        'author': 'UNKNOWN_AUTHOR',
        'title': 'UNKNOWN_TITLE',
        'year': 'UNKNOWN_YEAR'
    }

    pdf_parser = PDFParser(pdf_file_stream)
    pdf_doc = PDFDocument(pdf_parser)

    # The original indexed info[0][...] directly, so a missing entry raised
    # instead of falling back to the placeholders above.
    info = pdf_doc.info[0] if pdf_doc.info else {}

    if 'Author' in info:
        author = make_pdf_metadata_str(info['Author'])
        if author:
            metadata['author'] = author

    if 'Title' in info:
        title = make_pdf_metadata_str(info['Title'])
        if title:
            metadata['title'] = title

    if 'ModDate' in info:
        # The year is derived from the modification date entry.
        year = pdf_metadata_moddate_to_year(
            make_pdf_metadata_str(info['ModDate']))
        if year:
            metadata['year'] = year

    return metadata
def get_page_num(fpath):
    """
    Get the page count of a PDF file, cached in a JSON file.

    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python

    Args:
        fpath: path of the PDF file.

    Returns:
        The ``Count`` entry of the document catalog's ``Pages`` object, or
        the cached ``page_num`` value when
        ``<get_tmp_path(fpath)>.page_num.json`` already exists.
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    if os.path.isfile(cache_path):
        tmp_dict = load_general(cache_path)
        return tmp_dict['page_num']

    # The original never closed the file. The count is read inside the
    # block so the parser can still access the file while resolving.
    with open(fpath, 'rb') as fp:
        # Parser object associated with the file object.
        parser = PDFParser(fp)
        # Document object that stores the document structure.
        document = PDFDocument(parser)
        c = resolve1(document.catalog['Pages'])['Count']

    tmp_dict = {'page_num': c}
    dump_general(tmp_dict, cache_path)
    return c