def extract_pdf(file):
    """Extract the string content of a PDF.

    Args:
        file: Binary file-like object holding the PDF. It is handed both to
            ``PDFParser`` and to ``PDFPage.get_pages``.

    Returns:
        The text accumulated in the in-memory buffer, or ``-1`` when the
        document reports that it is not extractable.
    """
    parser = PDFParser(file)
    document = PDFDocument(parser)
    document.initialize("")
    if not document.is_extractable:
        # Sentinel value kept so existing callers keep working.
        return -1

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           showpageno=False, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(file, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        # The return value is computed before the ``finally`` clause runs,
        # so closing the buffer below does not affect it.
        return retstr.getvalue()
    finally:
        # Release the converter and the buffer even when a page fails.
        device.close()
        retstr.close()
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the PDF document, apply ``fn`` to it and return the result.

    Args:
        pdf_doc: Path of the PDF file to open.
        fn: Callable invoked as ``fn(doc, *args)`` with the initialized
            ``PDFDocument``.
        pdf_pwd: Password passed to ``doc.initialize``.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the document is not
        extractable or an ``IOError`` occurs.
    """
    result = None
    try:
        # The context manager closes the file even if parsing or ``fn``
        # raises; the previous open()/close() pair leaked it in that case.
        with open(pdf_doc, "rb") as fp:
            # Parser bound to the file object.
            parser = PDFParser(fp)
            # Document object that stores the document structure.
            doc = PDFDocument(parser)
            # Connect the parser and document objects.
            parser.set_document(doc)
            # Supply the password for initialization.
            doc.initialize(pdf_pwd)
            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # The file doesn't exist or a similar problem: report "no result".
        pass
    return result
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file found in a PDF into ``extractdir``.

    Only ``fname``, ``password`` and ``extractdir`` are used; the other
    parameters are accepted but ignored by this function.

    Args:
        fname: Path of the PDF to scan.
        password: Password passed to ``doc.initialize``.
        extractdir: Directory the embedded files are written to.

    Raises:
        PDFValueError: An embedded-file reference is not a ``PDFStream`` or
            is not typed as an embedded file.
        IOError: The target path already exists.
    """
    def extract1(obj):
        """Extract the single file described by file-specification ``obj``."""
        # ``.get`` on 'UF' so a specification that only carries 'F' falls
        # through to it instead of raising KeyError.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        # Context manager so the output is closed even if get_data() fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())

    # The source PDF is closed on every exit path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the PDF document, apply ``fn`` to it and return the result.

    Args:
        pdf_doc: Path of the PDF file to open.
        fn: Callable invoked as ``fn(doc, *args)`` with the initialized
            ``PDFDocument``.
        pdf_pwd: Password passed to ``doc.initialize``.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the document is not
        extractable or an ``IOError`` occurs.
    """
    result = None
    try:
        # ``with`` guarantees the handle is released when parsing or ``fn``
        # raises; previously fp.close() was skipped on any exception.
        with open(pdf_doc, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            # Connect the parser and document objects.
            parser.set_document(doc)
            # Supply the password for initialization.
            doc.initialize(pdf_pwd)
            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # The file doesn't exist or a similar problem: report "no result".
        pass
    return result
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of a PDF to ``outfp`` via the ``dump*`` helpers.

    Args:
        outfp: Writable stream receiving the dump.
        fname: Path of the PDF file.
        objids: Object ids to dump with ``dumpxml`` (may be empty).
        pagenos: Zero-based page indexes whose contents/attrs are dumped.
        password: Password passed to ``doc.initialize``.
        dumpall: When true, dump every object with ``dumpallobjs``.
        codec: Forwarded to the dump helpers; also decides whether a
            trailing newline is written.
        extractdir: Unused here.
    """
    # ``with`` closes the file even when a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        # Nothing specific requested: fall back to dumping the trailers.
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def pdf_to_text(page_object):
    """Return the text of every text box / text line found in a PDF.

    Args:
        page_object: Object handed to ``PDFParser`` as the PDF source.

    Returns:
        A list with one ``get_text()`` string per ``LTTextBox`` or
        ``LTTextLine`` layout element, in page order.
    """
    pdf_parser = PDFParser(page_object)
    # Document object that stores the document structure.
    document = PDFDocument(pdf_parser)
    # Connect the parser and document objects.
    pdf_parser.set_document(document)
    document.initialize('')

    # Resource manager (shared resources) and the page aggregator device.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    collected = []
    for _index, page in enumerate(PDFPage.create_pages(document)):
        page_interpreter.process_page(page)
        # The aggregator yields the LTPage layout of the page just processed.
        for element in aggregator.get_result():
            if isinstance(element, (LTTextBox, LTTextLine)):
                collected.append(element.get_text())
    return collected
def parse(pdf_path):
    """Print ``pdf_path`` and return ``None``.

    NOTE(review): the bare ``return`` directly after the print makes all of
    the extraction code below unreachable. It is kept exactly as it was;
    confirm whether the short-circuit is intentional before removing it.
    """
    print(pdf_path)
    return

    # ---- unreachable from here on (see NOTE above) ----
    pdf_file = open(pdf_path, 'rb')  # open in binary read mode
    # Create a PDF parser from the file object.
    pdf_parser = PDFParser(pdf_file)
    # Create a PDF document.
    document = PDFDocument()
    # Connect the parser and the document object.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    # Supply the initialization password (none given here).
    document.initialize()

    # Check whether the document allows text conversion.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager for shared resources.
    resources = PDFResourceManager()
    # PDF device object.
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    # PDF interpreter object.
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    # Counters for pages, images, curves, figures and horizontal text boxes.
    num_page = num_image = num_curve = num_figure = num_TextBoxHorizontal = 0

    # Process one page per iteration.
    for page in document.get_pages():
        num_page += 1
        page_interpreter.process_page(page)
        # LTPage object of the page just processed.
        for element in aggregator.get_result():
            if isinstance(element, LTImage):  # image object
                num_image += 1
            if isinstance(element, LTCurve):  # curve object
                num_curve += 1
            if isinstance(element, LTFigure):  # figure object
                num_figure += 1
            if isinstance(element, LTTextBoxHorizontal):  # text content
                num_TextBoxHorizontal += 1
                # Append the text to the output document.
                with open(r'test.doc', 'a', encoding='utf-8') as out:
                    text = element.get_text()
                    out.write(text)
                    out.write('\n')
    print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
          '曲线数:%s\n' % num_curve,
          '水平文本框:%s\n' % num_TextBoxHorizontal)
def pdf2metadata(fp):
    """Return the "Info" metadata of the PDF read from ``fp``.

    Args:
        fp: Object handed to ``PDFParser`` as the PDF source.

    Returns:
        The ``info`` attribute of the initialized ``PDFDocument``.
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    if 'Metadata' in document.catalog:
        # The raw XMP metadata is still resolved and decoded (so a broken
        # stream raises here as before) but it is not part of the result.
        _raw_xmp = resolve1(document.catalog['Metadata']).get_data()
    return document.info  # The "Info" metadata
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline (bookmark tree) of a PDF to ``outfp`` as XML.

    Only ``outfp``, ``fname`` and ``password`` are used; the remaining
    parameters are accepted but ignored by this function.

    Args:
        outfp: Writable stream receiving the ``<outlines>`` markup.
        fname: Path of the PDF file.
        password: Password passed to ``doc.initialize``.

    A document without outlines (``PDFNoOutlines``) is silently skipped.
    """
    # ``with`` / ``finally`` release the file and the parser even when
    # resolving an outline entry raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser)
            doc.initialize(password)
            # Map page object id -> zero-based page index.
            pages = dict((page.pageid, pageno) for (pageno, page)
                         in enumerate(PDFPage.create_pages(doc)))

            def resolve_dest(dest):
                """Resolve a named or literal destination."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        # No direct destination: look for a GoTo action.
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if (subtype and repr(subtype) == '/GoTo'
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
def loadPDF(library, file_name):
    """Add a paper to the library.

    Prints a colour-coded preview of the first layout objects of the PDF,
    then registers (currently empty) author and citation id lists on the
    paper obtained from ``library.getPaper(file_name)``.

    Args:
        library: Object exposing ``getPaper(title)``.
        file_name: Path of the PDF; also used as the paper title.
    """
    # ``with`` closes the PDF on every exit path; it was never closed before.
    with open(file_name, 'rb') as fp:
        # Parser bound to the file, and the document storing its structure.
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # No password is set, so supply an empty string.
        document.initialize("")
        # Extraction not allowed: only report it, processing continues.
        if not document.is_extractable:
            print("CANT")
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for ii, page in enumerate(PDFPage.create_pages(document)):
            print('---------------------------------------------------------------------------------------------------')
            print("page number {}".format(ii))
            interpreter.process_page(page)
            # Receive the LTPage object for the page.
            layout = device.get_result()
            for jj, lt_obj in enumerate(layout._objs):
                # Only the first four layout objects are inspected.
                if jj > 3:
                    break
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    cur_line = lt_obj.get_text().encode('ascii', 'ignore')
                    match = pattern_ignore.match(cur_line)
                    if match is None and len(cur_line) < 200:
                        print(bcolors.OKGREEN + " " + cur_line + bcolors.ENDC)
                    else:
                        print(bcolors.FAIL + " " + cur_line[0:150] + bcolors.ENDC)
                else:
                    print("PICTURE")
            # NOTE(review): the original nesting of this ``break`` was lost;
            # it is assumed to stop after the first page - confirm.
            break

    authors = []  # list of authors
    citations = []  # list of authors that have been cited
    paper = library.getPaper(file_name)
    paper.addAuthorIds(authors)
    paper.addCitationIds(citations)
def pdf_to_csv(filename):
    """Convert a PDF into ``;``-separated text, one output line per text row.

    Characters are grouped by the integer part of their negated y
    coordinate and ordered by x coordinate within each group. Every page is
    wrapped in ``START PAGE n`` / ``END PAGE n`` marker lines.

    Args:
        filename: Path of the PDF file.

    Returns:
        The accumulated output as returned by the in-memory buffer.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that emits one ``;``-joined line per text row."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    # Negated so that sorting ascending walks top to bottom.
                    lines[int(-y)][x] = child._text.encode(self.codec)
            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # The rest is a remix of the convert() function in the
    # pdfminer/tools/pdf2text module.
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    try:
        # ``with`` / ``finally`` release the file and the device even when
        # a page fails to parse.
        with open(filename, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()

    return outfp.getvalue()
def proc(self, pdfFp):
    """Read the metadata available from a PDF document and store it on self.

    Sets ``self.info`` from the document, sets ``self.metadata`` to the
    ``xmp_to_dict`` conversion of the catalog's 'Metadata' stream when the
    catalog has one, and stores ``pdfFp.getvalue()`` in ``self.raw_doc``.

    Args:
        pdfFp: PDF source handed to ``PDFParser``; must provide
            ``getvalue()``.
    """
    pdf_parser = PDFParser(pdfFp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    self.info = document.info

    if 'Metadata' in document.catalog:
        xmp_stream = resolve1(document.catalog['Metadata'])
        self.metadata = xmp_to_dict(xmp_stream.get_data())

    self.raw_doc = pdfFp.getvalue()
def getDocumentInfoAndAnnotations(pdfFile):
    """Scan the annotations of a PDF for document info and text comments.

    The first 'Sticky Note' annotation supplies the document info; every
    'Comment on Text' annotation is recorded as ``"<page>:<contents>"``.
    If reading the annotations of any page fails, the error is logged and
    the file is moved to the error directory once processing has finished.

    NOTE(review): the collected values are not returned here, which matches
    the previous behaviour - confirm whether a ``return`` was intended.

    Args:
        pdfFile: Path of the PDF file.
    """
    logger.info("Parsing pdf file " + pdfFile)
    docInfo = None
    docAnnotations = []
    hadError = False

    # ``with`` closes the PDF on every exit path; it was never closed before.
    with open(pdfFile, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # No password is set, so supply an empty string.
        document.initialize('')
        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Pages are numbered from 1 in the recorded annotations.
        for pageNum, page in enumerate(PDFPage.create_pages(document), 1):
            interpreter.process_page(page)
            if not page.annots:
                continue
            try:
                if isinstance(page.annots, list):
                    annots = page.annots
                else:
                    annots = page.annots.resolve()
                for annot in annots:
                    if isinstance(annot, PDFObjRef):
                        annot = annot.resolve()
                    if 'Subj' not in annot:
                        continue
                    if annot['Subj'] == 'Sticky Note' and docInfo is None:
                        logger.debug('DOC INFO ' + annot['Subj'] +
                                     ' Contents=' + annot['Contents'])
                        docInfo = annot['Contents']
                    elif annot['Subj'] == 'Comment on Text':
                        logger.debug('COMMENT ON TEXT ' + annot['Subj'] +
                                     ' Contents=' + annot['Contents'])
                        contents = annot['Contents']
                        docAnnotations.append(str(pageNum) + ':' + contents)
                    else:
                        logger.debug('UNKNOWN ANNOTATION: ' + annot['Subj'] +
                                     ' Contents=' + annot['Contents'])
            except Exception as e:
                logger.error("error getting annotation")
                logger.exception(e)
                hadError = True

    if hadError:
        # Move the file to the error directory. This previously passed the
        # builtin ``file`` instead of ``pdfFile``; it now also runs once,
        # after the handle is closed, so a second failing page cannot try
        # to rename a file that has already been moved.
        os.rename(pdfFile,
                  "/home1/northbr6/batch/apps/catalogue/output/error/" +
                  os.path.basename(pdfFile))
def load_document(self, _file, password=""):
    """Turn ``_file`` into an initialized PDFMiner document.

    Args:
        _file: PDF source handed to ``module_parser``.
        password: Password passed to ``initialize``.

    Returns:
        The initialized ``PDFDocument``.

    Raises:
        ValueError: The document does not allow text extraction.
    """
    log.info("loading document...")

    pdf_parser = module_parser(_file)
    document = PDFDocument()
    # Parser and document need to know about each other before initialize().
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password)

    if document.is_extractable:
        return document
    raise ValueError("PDF text extraction not allowed")
def pdf_from_resource(resource):
    """Build an initialized PDF document object from ``resource``.

    Args:
        resource: PDF source handed to ``PDFParser``.

    Returns:
        The ``PDFDocument`` after it has been linked to its parser and
        initialized without a password.
    """
    pdf_parser = PDFParser(resource)
    pdf_document = PDFDocument()

    # Link both objects to each other, then initialize the document.
    pdf_parser.set_document(pdf_document)
    pdf_document.set_parser(pdf_parser)
    pdf_document.initialize()

    return pdf_document
def Parse_PDF(self):
    """Extract the text of every page of ``self.filePath``.

    One stripped string per page is appended to ``self.text_content``
    (which must already exist on the instance).

    Returns:
        ``self.text_content`` after the pages have been appended.
    """
    def parse_lt_objs(lt_objs, page_number, text=None):
        """Return the text of the LT* objects, joined with newlines.

        ``page_number`` and ``text`` are unused and kept only so the helper
        accepts the same arguments as before. ``text`` no longer has a
        mutable default value.
        """
        text_content = []
        for lt_obj in lt_objs:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                text_content.append(lt_obj.get_text())
            elif isinstance(lt_obj, LTFigure):
                # LTFigure objects are containers for other LT* objects,
                # so recurse through the children.
                text_content.append(
                    parse_lt_objs(lt_obj, page_number, text_content))
        return '\n'.join(text_content)

    # ``with`` closes the PDF on every exit path; it was never closed before.
    with open(self.filePath, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        try:
            document.initialize('')
        except Exception:
            # Best effort, as before, but no longer swallowing
            # KeyboardInterrupt / SystemExit.
            pass
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            self.text_content.append(parse_lt_objs(layout, i + 1).strip())
    return self.text_content
def check_pdf_password(pdf, password):
    """Report whether ``password`` initializes the PDF as extractable.

    Args:
        pdf: Path of the PDF file.
        password: Candidate password passed to ``doc.initialize``.

    Returns:
        ``True`` when the document initializes and is extractable (the
        password is printed), otherwise ``False``.
    """
    import sys

    # ``with`` closes the PDF on every exit path; it was never closed before.
    with open(pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        try:
            doc.initialize(password)
            if doc.is_extractable:
                print('')
                print('The PDF Password Is:' + password)
                return True
            print('exception')
            return False
        except Exception:
            # Any failure counts as "wrong password", but KeyboardInterrupt
            # and SystemExit are no longer swallowed. Writes the carriage
            # return without a trailing newline, like the old ``print '\r',``.
            sys.stdout.write('\r')
            return False
def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.
    Returns either None (if none found) or a string of data for the image.
    There is no 100% guarantee for this code, yet it seems to work fine with
    most scanner-produced images around. More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or
    other image types from PDFs, however at the moment I do not have enough
    test data to try this, and the one I have seems to be unsuitable for
    PDFMiner.

    :param fstream: Readable binary stream of the PDF
    :return: binary stream, containing the whole contents of the JPEG image
             or None if extraction failed.
    """
    parser = PDFParser(fstream)
    if PY2:
        document = PDFDocument(parser)
    else:
        document = PDFDocument()
        parser.set_document(document)
        document.set_parser(parser)
        # NOTE(review): assumed to belong to this branch together with the
        # other explicit-linking calls - confirm against the PY2 library.
        document.initialize('')
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document) if PY2 else document.get_pages()
    for page in pages:
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                # Found an image; prefer decoded data, fall back to raw.
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Failed to decode (seems to happen nearly always).
                    # Narrowed from a bare ``except`` so KeyboardInterrupt
                    # and SystemExit still propagate.
                    imdata = im.stream.get_rawdata()
                # Only data starting with these four bytes is accepted.
                if imdata is not None and imdata.startswith(
                        b'\xff\xd8\xff\xe0'):
                    return imdata
    return None
def convert_file(pdf_file, file_name):
    """Convert a PDF to text.

    Args:
        pdf_file: PDF source handed to ``PDFParser``.
        file_name: Name used in the error message only.

    Returns:
        The text accumulated by the ``TextConverter``.

    Raises:
        PDFTextExtractionNotAllowed: The document does not allow text
            extraction.
    """
    parser = PDFParser(pdf_file)
    pdf = PDFDocument(parser)
    pdf.initialize("")
    if not pdf.is_extractable:
        # The exception is a module-level name of pdfminer.pdfpage, not an
        # attribute of the PDFPage class; the old attribute lookup failed
        # with AttributeError instead of raising the intended error.
        from pdfminer.pdfpage import PDFTextExtractionNotAllowed
        raise PDFTextExtractionNotAllowed(
            "Document does not allow text extraction: " + file_name)

    resource = PDFResourceManager()
    output = StringIO.StringIO()
    device = TextConverter(resource, output, codec="utf-8",
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(resource, device)
        for page in PDFPage.create_pages(pdf):
            interpreter.process_page(page)
    finally:
        # Release the converter even when a page fails to parse.
        device.close()
    return output.getvalue()
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline (bookmark tree) of a PDF to ``outfp`` as XML.

    Only ``outfp``, ``fname`` and ``password`` are used; the remaining
    parameters are accepted but ignored by this function.

    Args:
        outfp: Writable stream receiving the ``<outlines>`` markup.
        fname: Path of the PDF file.
        password: Password passed to ``doc.initialize``.

    A document without outlines (``PDFNoOutlines``) is silently skipped.
    """
    # The file and the parser are now released even when an outline entry
    # cannot be resolved; previously both leaked on any exception.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser)
            doc.initialize(password)
            # Map page object id -> zero-based page index.
            pages = dict(
                (page.pageid, pageno)
                for (pageno, page) in enumerate(PDFPage.create_pages(doc))
            )

            def resolve_dest(dest):
                """Resolve a named or literal destination."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        # No direct destination: look for a GoTo action.
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if (subtype and repr(subtype) == '/GoTo'
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
class PDF(list):
    """List of per-page results extracted from a PDF.

    Each element is whatever ``PDFPageInterpreter.process_page`` returns
    for a page; ``text()`` joins the elements, so they are presumably
    strings - this depends on the interpreter class in scope (TODO confirm).
    """

    def __init__(self, file, password='', just_text=1):
        """Parse ``file`` and append one entry per page.

        Args:
            file: PDF source handed to ``PDFParser`` and
                ``PDFPage.get_pages``.
            password: Password used to initialize the document and to read
                its pages.
            just_text: When truthy, drop the parsing objects afterwards.
        """
        self.parser = PDFParser(file)
        self.doc = PDFDocument(self.parser)
        self.parser.set_document(self.doc)
        self.doc.initialize(password)
        if self.doc.is_extractable:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=StringIO())
            self.interpreter = PDFPageInterpreter(self.resmgr, self.device)
            # Forward the caller's password; a hard-coded '' was passed
            # here before, which ignored the ``password`` argument.
            for page in PDFPage.get_pages(file, [], maxpages=0,
                                          password=password, caching=True,
                                          check_extractable=True):
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info
        if just_text:
            self._cleanup()

    def _cleanup(self):
        """
        Frees lots of non-textual information, such as the fonts
        and images and the objects that were needed to parse the
        PDF.
        """
        # Some of these attributes are only set for extractable documents;
        # remove whichever exist instead of failing with AttributeError.
        for name in ('device', 'doc', 'parser', 'resmgr', 'interpreter'):
            self.__dict__.pop(name, None)

    def text(self, clean=True):
        """
        Returns the text of the PDF as a single string.
        Options:

          :clean:
            Removes misc cruft, like lots of whitespace.
        """
        if clean:
            return utils.normalise_whitespace(''.join(self))
        else:
            return ''.join(self)
def PDFCreationDate(self):
    """Return the creation date stored in the PDF at ``self.file``.

    Returns:
        A ``date`` built from the year, month and day digits of the
        'CreationDate' entry, or ``None`` when the file name does not end
        in ".pdf" or no string creation date is available (a message is
        printed in both cases).
    """
    if not self.file.endswith(".pdf"):
        # This message used to be a bare string statement and was never
        # actually printed.
        print("The file doesn't appear to be a PDF.")
        return None

    # ``with`` closes the PDF on every exit path; it was never closed before.
    with open(self.file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        # A missing info dictionary or 'CreationDate' key now reaches the
        # "No Creation Date" branch instead of raising.
        info = doc.info[0] if doc.info else {}
        cdate = info.get('CreationDate')

    if isinstance(cdate, str):
        # Skips the first two characters, then reads YYYY, MM, DD.
        return date(int(cdate[2:6]), int(cdate[6:8]), int(cdate[8:10]))

    print("No Creation Date for " + self.file)
    return None
def __init__(self, *args, **kwargs):
    """Initialize the page and extract the text of the PDF in ``self.doc``.

    ``self.parsed_text`` starts as ``''`` and is replaced by the converter
    output once every page has been processed. If pdfminer cannot be
    imported a warning is logged; if the document raises
    ``PDFSyntaxError`` the method returns early. In both cases
    ``self.parsed_text`` keeps its initial value.
    """
    super(AccountRIB, self).__init__(*args, **kwargs)

    self.parsed_text = ''

    try:
        # Newer releases split PDFDocument / PDFPage into their own modules.
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            has_new_api = False
        else:
            has_new_api = True
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        self.logger.warning('Please install python-pdfminer to get IBANs')
        return

    pdf_parser = PDFParser(BytesIO(self.doc))
    try:
        if has_new_api:
            document = PDFDocument(pdf_parser)
        else:
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
    except PDFSyntaxError:
        return

    resources = PDFResourceManager()
    sink = BytesIO()
    converter = TextConverter(resources, sink)
    page_interpreter = PDFPageInterpreter(resources, converter)

    if has_new_api:
        page_iter = PDFPage.create_pages(document)
    else:
        document.initialize()
        page_iter = document.get_pages()

    for page in page_iter:
        page_interpreter.process_page(page)

    self.parsed_text = sink.getvalue()
def __init__(self, *args, **kwargs):
    """Initialize the page and extract the text of the PDF in ``self.doc``.

    ``self.parsed_text`` starts as ``b''`` and is replaced by the converter
    output once every page has been processed. If pdfminer cannot be
    imported a warning is logged; if the document raises
    ``PDFSyntaxError`` the method returns early. In both cases
    ``self.parsed_text`` keeps its initial value.
    """
    super(AccountRIB, self).__init__(*args, **kwargs)

    self.parsed_text = b''

    try:
        # Newer releases split PDFDocument / PDFPage into their own modules.
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            has_new_api = False
        else:
            has_new_api = True
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        self.logger.warning('Please install python-pdfminer to get IBANs')
        return

    pdf_parser = PDFParser(BytesIO(self.doc))
    try:
        if has_new_api:
            document = PDFDocument(pdf_parser)
        else:
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
    except PDFSyntaxError:
        return

    resources = PDFResourceManager()
    sink = BytesIO()
    converter = TextConverter(resources, sink)
    page_interpreter = PDFPageInterpreter(resources, converter)

    if has_new_api:
        page_iter = PDFPage.create_pages(document)
    else:
        document.initialize()
        page_iter = document.get_pages()

    for page in page_iter:
        page_interpreter.process_page(page)

    self.parsed_text = sink.getvalue()
def getAbs(pdfFile=None):
    """Print the abstract found on the first page of a PDF.

    Text boxes are printed starting with the first one that contains
    "abstract" and stopping at the first one that contains "introduction"
    or "Categories and Subject Descriptors" (all case-insensitive).

    Args:
        pdfFile: Path of the PDF file.

    Raises:
        PDFTextExtractionNotAllowed: The document does not allow text
            extraction.
    """
    print(pdfFile)
    # ``with`` closes the PDF on every exit path; it was never closed before.
    with open(pdfFile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize()
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # The abstract is usually on the first page.
        pagenos = [0]
        for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            interpreter.process_page(page)
            layout = device.get_result()
            flag = False
            for x in layout:
                if isinstance(x, LTTextBox):
                    text = x.get_text()
                    if (re.search('introduction', text, re.IGNORECASE) or
                            re.search('Categories and Subject Descriptors',
                                      text, re.IGNORECASE)):
                        break
                    if re.search('abstract', text, re.IGNORECASE):
                        flag = True
                    # NOTE(review): the original nesting of this check was
                    # lost; assumed to apply to text boxes only - confirm.
                    if flag:
                        print(text)
            # NOTE(review): assumed to end the scan after the selected page
            # has been handled - confirm.
            break
def extract_text(data):
    """Extract the text of a PDF given as bytes.

    Args:
        data: Raw PDF content, wrapped in a ``BytesIO`` for parsing.

    Returns:
        The converter output, or ``None`` when the document raises
        ``PDFSyntaxError``.

    Raises:
        ImportError: pdfminer cannot be imported.
    """
    try:
        # Newer releases split PDFDocument / PDFPage into their own modules.
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            has_new_api = False
        else:
            has_new_api = True
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        raise ImportError('Please install python3-pdfminer to parse PDF')

    pdf_parser = PDFParser(BytesIO(data))
    try:
        if has_new_api:
            document = PDFDocument(pdf_parser)
        else:
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
    except PDFSyntaxError:
        return

    resources = PDFResourceManager()
    # Byte buffer on Python 2, text buffer otherwise.
    sink = BytesIO() if sys.version_info.major == 2 else StringIO()
    converter = TextConverter(resources, sink)
    page_interpreter = PDFPageInterpreter(resources, converter)

    if has_new_api:
        page_iter = PDFPage.create_pages(document)
    else:
        document.initialize()
        page_iter = document.get_pages()

    for page in page_iter:
        page_interpreter.process_page(page)

    return sink.getvalue()
class PDF(list):
    """List of per-page results extracted from a PDF.

    Each element is whatever ``PDFPageInterpreter.process_page`` returns
    for a page; ``text()`` joins the elements, so they are presumably
    strings - this depends on the interpreter class in scope (TODO confirm).
    """

    def __init__(self, file, password='', just_text=1):
        """Parse ``file`` and append one entry per page.

        Args:
            file: PDF source handed to ``PDFParser``.
            password: Password passed to ``doc.initialize``.
            just_text: When truthy, drop the parsing objects afterwards.
        """
        self.parser = PDFParser(file)
        self.doc = PDFDocument(self.parser)
        self.parser.set_document(self.doc)
        self.doc.initialize(password)
        if self.doc.is_extractable:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=StringIO())
            self.interpreter = PDFPageInterpreter(
                self.resmgr, self.device)
            for page in PDFPage.create_pages(self.doc):
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info
        if just_text:
            self._cleanup()

    def _cleanup(self):
        """
        Frees lots of non-textual information, such as the fonts
        and images and the objects that were needed to parse the
        PDF.
        """
        # Some of these attributes are only set for extractable documents;
        # remove whichever exist instead of failing with AttributeError.
        for name in ('device', 'doc', 'parser', 'resmgr', 'interpreter'):
            self.__dict__.pop(name, None)

    def text(self, clean=True):
        """
        Returns the text of the PDF as a single string.
        Options:

          :clean:
            Removes misc cruft, like lots of whitespace.
        """
        if clean:
            return utils.normalise_whitespace(''.join(self))
        else:
            return ''.join(self)
def parse(pdf_path='G:/机器学习1/gg.pdf', output_path='test.txt'):
    """Print the text of a PDF and append it to a text file.

    Args:
        pdf_path: Path of the PDF to read. Defaults to the path that was
            previously hard-coded.
        output_path: File every text element is appended to. Defaults to
            the previously hard-coded 'test.txt'.

    Raises:
        PDFTextExtractionNotAllowed: The document does not allow text
            extraction.
    """
    # Open the PDF in binary read mode; ``with`` closes it on every exit
    # path (it was never closed before).
    with open(pdf_path, 'rb') as fn:
        # Create the parser and the document, then connect them.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password: initialize with an empty string.
        doc.initialize("")

        # Check whether the document allows text conversion.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregator and interpreter.
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        # Process one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # ``layout`` is the LTPage object holding the parsed objects.
            layout = device.get_result()
            for out in layout:
                # Only objects exposing get_text() carry the wanted text.
                if hasattr(out, "get_text"):
                    print(out.get_text())
                    # Opened per element, as before, so the output file is
                    # only created when there is something to write.
                    with open(output_path, 'a') as f:
                        f.write(out.get_text() + '\n')
def parse(InputPath, OutputPath):
    """Write the text of all horizontal text boxes of a PDF to a file.

    Args:
        InputPath: Path of the PDF to read.
        OutputPath: Path of the output file; it receives the encoded text,
            one text box per line.

    Raises:
        PDFTextExtractionNotAllowed: The document does not allow text
            extraction.
    """
    # Open the PDF in binary read mode; ``with`` closes it on every exit
    # path (it was never closed before).
    with open(InputPath, 'rb') as fn:
        # Create the parser and the document, then connect them.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password: initialize with an empty string. A single space was
        # passed before, which contradicted the stated intent.
        doc.initialize("")

        # Check whether the document allows text conversion.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregator and interpreter.
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        # Collect the pieces and join once instead of repeated ``+``.
        pieces = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # ``layout`` is the LTPage object holding the parsed objects.
            layout = device.get_result()
            for out in layout:
                if isinstance(out, LTTextBoxHorizontal):
                    pieces.append(out.get_text() + '\n')
        pdfStr = ''.join(pieces)

        # ``with`` flushes and closes the output file; it was left open.
        with open(OutputPath, 'wb') as f:
            f.write(pdfStr.encode())
def extract_text(data):
    """Extract the text of a PDF given as bytes.

    Args:
        data: Raw PDF content, wrapped in a ``BytesIO`` for parsing.

    Returns:
        The content of the ``BytesIO`` output buffer, or ``None`` when the
        document raises ``PDFSyntaxError``.

    Raises:
        ImportError: pdfminer cannot be imported.
    """
    try:
        # Newer releases split PDFDocument / PDFPage into their own modules.
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            has_new_api = False
        else:
            has_new_api = True
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        raise ImportError('Please install python-pdfminer to parse PDF')

    pdf_parser = PDFParser(BytesIO(data))
    try:
        if has_new_api:
            document = PDFDocument(pdf_parser)
        else:
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
    except PDFSyntaxError:
        return

    resources = PDFResourceManager()
    sink = BytesIO()
    converter = TextConverter(resources, sink)
    page_interpreter = PDFPageInterpreter(resources, converter)

    if has_new_api:
        page_iter = PDFPage.create_pages(document)
    else:
        document.initialize()
        page_iter = document.get_pages()

    for page in page_iter:
        page_interpreter.process_page(page)

    return sink.getvalue()
def pdf_to_string(pdf_file):
    """Print every layout object of every page of a PDF.

    Despite its name, this function returns ``None``; the layout objects
    are only printed.

    Args:
        pdf_file: Path of the PDF file.
    """
    # ``with`` closes the PDF on every exit path; it was never closed before.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        # Margin configuration.
        laparams = LAParams()
        laparams.line_margin = 0.3
        laparams.word_margin = 0.3
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                print(lt_obj)
def parse_pdf(path, output_path):
    """Extract the text of a PDF and write it to ``output_path`` as UTF-8.

    Args:
        path: Path of the PDF to read.
        output_path: Path of the text file to (over)write.
    """
    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        resources = PDFResourceManager()
        layout_params = LAParams(all_texts=True, boxes_flow=2.0,
                                 heuristic_word_margin=True)
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        # Gather the text of every text box / text line, page by page.
        chunks = []
        for page in document.get_pages():
            page_interpreter.process_page(page)
            for element in aggregator.get_result():
                if isinstance(element, (LTTextBox, LTTextLine)):
                    chunks.append(element.get_text())
        extracted_text = ''.join(chunks)

        with open(output_path, "w", encoding="utf-8") as sink:
            sink.write(extracted_text)
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

path = '/Users/mattstringer/Dropbox/ProyectoLaCumbre/DataClean/pdfs/example_finca.pdf'

# NOTE(review): ``PDFParser`` and ``password`` are not defined in this span;
# presumably they are provided earlier in the script - confirm.

# Open the PDF file. ``open`` replaces the Python-2-only ``file`` builtin and
# the ``with`` block closes the handle, which was previously left open.
with open(path, 'rb') as fp:
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    document.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).

    :param data: raw PDF content; it is wrapped in ``BytesIO`` before parsing.
    :param miner_layout: when true, the page aggregator is created with
        ``LAParams()``; when false, it is created without layout parameters and
        the extracted texts are sorted by ``(y0, x0)`` instead.
    :raises ImportError: if ``pdfminer.pdfparser`` cannot be imported.

    This is a generator. If building the document raises ``PDFSyntaxError`` it
    finishes without yielding anything. When ``LOGGER`` is enabled for the
    ``DEBUGFILES`` level, PNG images of the intermediate steps are written to a
    fresh temporary directory (this path needs PIL).
    """
    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    # Two PDFMiner layouts are supported: the one providing
    # pdfminer.pdfdocument / pdfminer.pdfpage ("new API") and the one where
    # PDFDocument lives in pdfminer.pdfparser.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        # Unparsable input: end the generator without yielding any page.
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        # Pages are read from a second BytesIO over the same data, not from `doc`.
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    if LOGGER.isEnabledFor(DEBUGFILES):
        # Debug-only dependencies and output directory; `path`, `Image`,
        # `ImageDraw` and `random` are only used under the same level check below.
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        # Step 1: flatten the text objects of the page into a single list.
        texts = sum([list(lttext_to_multilines(obj, page_layout)) for obj in
                     page_layout._objs if isinstance(obj, (LTTextBox, LTTextLine, LTChar))], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            # Debug image 1: bounding box and content of every text object.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        if not miner_layout:
            texts.sort(key=lambda t: (t.y0, t.x0))

        # Step 2: collect the coordinates of drawn rectangles, lines and curves.
        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs
                 if isinstance(obj, (LTRect, LTLine, LTCurve))]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            # Debug image 2: every collected line, before de-duplication.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        lines = list(uniq_lines(lines))
        LOGGER.debug('found %d unique lines', len(lines))

        # Step 3: build rows of boxes from the unique lines.
        rows = build_rows(lines)
        LOGGER.debug('built %d rows (%d boxes)', len(rows), sum(len(row) for row in rows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            # Debug image 3: every box of every row, inset by one unit.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for r in rows:
                for b in r:
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
            fpath = '%s/3rows-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        # Step 4: assign the texts to the boxes.
        textrows = arrange_texts_in_rows(rows, texts)
        LOGGER.debug('assigned %d strings', sum(sum(len(c) for c in r) for r in textrows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            # Debug image 4: every box together with the text assigned to it.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for row, trow in zip(rows, textrows):
                for b, tlines in zip(row, trow):
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                    draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
            fpath = '%s/4cells-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        yield textrows
    # Only reached when the caller consumes the generator to the end.
    device.close()
def get_pdf_rows(data, miner_layout=True):
    """
    Yield, for each page of a PDF, the texts arranged in table rows.

    The PDF content is given as ``data`` and wrapped in ``BytesIO``. For every
    page the generator collects the text objects, collects the coordinates of
    the drawn rectangles and lines, builds rows of boxes from the unique lines
    and yields the result of arranging the texts in those boxes. Rows yielded
    for one page may belong to different tables.

    With ``miner_layout`` false, the page aggregator is created without layout
    parameters and the texts are sorted by ``(y0, x0)``.

    Nothing is yielded if building the document raises ``PDFSyntaxError``.
    ``ImportError`` is raised if ``pdfminer.pdfparser`` cannot be imported.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """
    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    # Prefer the layout that provides pdfminer.pdfdocument / pdfminer.pdfpage.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    else:
        newapi = True

    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar

    pdf_parser = PDFParser(BytesIO(data))
    try:
        if not newapi:
            doc = PDFDocument()
            pdf_parser.set_document(doc)
            doc.set_parser(pdf_parser)
        else:
            doc = PDFDocument(pdf_parser)
    except PDFSyntaxError:
        return

    resources = PDFResourceManager()
    if not miner_layout:
        aggregator = PDFPageAggregator(resources)
    else:
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    if not newapi:
        doc.initialize()
        page_iter = doc.get_pages()
    else:
        page_iter = PDFPage.get_pages(BytesIO(data), check_extractable=True)

    for page in page_iter:
        page_interpreter.process_page(page)
        layout = aggregator.get_result()

        # Flatten the multi-line texts of every text object, in layout order.
        texts = []
        for obj in layout._objs:
            if isinstance(obj, (LTTextBox, LTTextLine, LTChar)):
                texts.extend(lttext_to_multilines(obj, layout))
        if not miner_layout:
            texts.sort(key=lambda item: (item.y0, item.x0))

        # Coordinates of rectangles and lines, de-duplicated.
        coords = (lt_to_coords(obj, layout)
                  for obj in layout._objs
                  if isinstance(obj, (LTRect, LTLine)))
        unique_lines = list(uniq_lines(coords))

        yield arrange_texts_in_rows(build_rows(unique_lines), texts)

    aggregator.close()
def pdf2csv(fp):
    """Draw the grid found on each page of a PDF into ``out<pageno>.png``.

    For every page, the x coordinates (``hlines``) and the flipped y
    coordinates (``vlines``) of all ``LTRect`` and ``LTLine`` objects are
    collected, de-duplicated, sorted and passed through ``filterclose``. A
    rectangle outline is drawn for each pair of neighbouring coordinates that
    are more than 5 units apart, and the 1-bit image is saved as PNG in the
    current directory. The document and the coordinate lists are printed to
    stdout as the function goes.

    Args:
        fp: Binary file object of the PDF; it is left open.

    Returns:
        None.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(doc)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # NOTE(review): doc.get_pages() is kept from the original; confirm that the
    # PDFDocument class imported by this file provides it.
    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()

        hlines = []
        vlines = []
        for item in layout:
            # Exact type match on purpose (as in the original): subclasses
            # of LTRect / LTLine are skipped.
            if type(item) not in (LTRect, LTLine):
                continue
            hlines.append(int(item.x0))
            hlines.append(int(item.x1))
            # Flip the y axis so that coordinates grow downwards in the image.
            vlines.append(int(layout.height - item.y0))
            vlines.append(int(layout.height - item.y1))
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        print(hlines)
        print(vlines)
        # Printed as a tuple, which is what the Python 2 statement produced.
        print((layout.width, layout.height))

        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        for i in range(len(vlines) - 1):
            if not vlines[i + 1] - vlines[i] > 5:
                continue
            for j in range(len(hlines) - 1):
                if not hlines[j + 1] - hlines[j] > 5:
                    continue
                draw.rectangle([(int(hlines[j]), int(vlines[i])),
                                (int(hlines[j + 1]), int(vlines[i + 1]))],
                               outline=1)
        del draw

        # A separate name keeps the input ``fp`` parameter intact, and the
        # ``with`` block closes the PNG even if saving fails.
        with open("out%s.png" % pageno, 'wb') as png_file:
            im.save(png_file, "PNG")
import json
from itertools import groupby
from collections import Counter

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTChar

# Module-level pdfminer pipeline over 'announcer.pdf'. The file object is
# handed straight to the parser and stays open for the life of the module.
parser = PDFParser(open('announcer.pdf', 'rb'))
document = PDFDocument(parser)
document.initialize()

rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)


class Entry(object):
    """Record whose attributes are filled positionally from ``KEYS``."""

    # Attribute names, in the order the positional values are expected.
    KEYS = (
        'dept', 'sect', 'id', 'title', 'room',
        'staff', 'space', 'block', 'code',
    )

    def __init__(self, *row):
        """Store each value of ``row`` under the matching name in ``KEYS``.

        ``sect`` and ``space`` are converted to ``int`` afterwards.
        """
        for key, value in zip(Entry.KEYS, row):
            setattr(self, key, value)
        self.sect = int(self.sect)
        self.space = int(self.space)
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# Script: print the text of every text-bearing layout object of a PDF.

with open("Lista_samurai_x.pdf", "rb") as fp:
    parser = PDFParser(fp)
    # PDFDocument(parser) already receives the parser, so the separate
    # parser.set_document / doc.set_parser calls of the original were dropped.
    # NOTE(review): this assumes the pdfminer.pdfdocument API used by the
    # other snippets in this file (PDFPage.create_pages) - confirm against
    # the installed pdfminer version.
    doc = PDFDocument(parser)
    doc.initialize("")  # empty password

    rsrcmgr = PDFResourceManager()
    laparamns = LAParams()
    laparamns.line_margin = 0.3
    laparamns.word_margin = 0.3
    # The keyword is ``laparams``; the original passed ``laparamns=``, which
    # is not a parameter of PDFPageAggregator.
    device = PDFPageAggregator(rsrcmgr, laparams=laparamns)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for ltobject in layout:
            # Skip layout objects that carry no text instead of failing
            # with AttributeError on them.
            if hasattr(ltobject, "get_text"):
                print(ltobject.get_text())
# coding=utf-8
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# LAParams and PDFPageAggregator are classes, not top-level modules, so the
# original ``import LAParams`` / ``import PDFPageAggregator`` could not work.
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# Script: print the text of every text-bearing layout object of a PDF.

# Open the file in binary mode; PDF content is not text.
with open('/home/zzq/learngit/pdf_document/php.pdf', 'rb') as fp:
    parser = PDFParser(fp)  # parser
    # Document. The parser is given to the constructor, as the other
    # snippets in this file that import pdfminer.pdfdocument do.
    doc = PDFDocument(parser)
    doc.initialize("")  # initialize with an empty password

    resource = PDFResourceManager()  # resource manager
    laparams = LAParams()  # layout analysis parameters
    # Aggregator: needs the resource manager; laparams enables layout analysis.
    device = PDFPageAggregator(resource, laparams=laparams)
    # Page interpreter
    interpreter = PDFPageInterpreter(resource, device)

    # NOTE(review): PDFPage.create_pages replaces doc.get_pages() to match the
    # already-imported PDFPage API - confirm against the installed pdfminer.
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            # Only some layout objects carry text.
            if hasattr(out, 'get_text'):
                print(out.get_text())
import sys

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

# Script: print "name: value" for every AcroForm field of the PDF given as
# the first command-line argument.

filename = sys.argv[1]

# Everything that reads from the document stays inside the ``with`` block so
# that the file is open while objects are resolved and closed afterwards.
with open(filename, 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize()

    if 'AcroForm' not in doc.catalog:
        # A PDF without a form used to end in an unexplained KeyError.
        sys.exit('{0}: no AcroForm in document catalog'.format(filename))

    # resolve1 returns non-reference values unchanged, so resolving the
    # field list as well is safe whether or not it is an indirect object.
    fields = resolve1(resolve1(doc.catalog['AcroForm'])['Fields'])
    for i in fields:
        field = resolve1(i)
        name, value = field.get('T'), field.get('V')
        print('{0}: {1}'.format(name, value))
def createFromPdfminer(filename):
    """Build a ``PDFInfos`` object from the PDF at ``filename`` using pdfminer.

    The result gets these attributes:

    * ``_metaInfo``: string entries of ``doc.info[0]``; values starting with
      the ``'\\xfe\\xff'`` byte-order mark are decoded as UTF-16.
    * ``_pageCount``: number of pages.
    * ``_outline``: list of ``(level, title, page index)`` tuples, or ``None``
      if the document has no outlines.
    * ``_pageInfos``: one ``PDFPageInfos`` per page with its link rectangles
      and targets (page index or URI string) and the page box.
    * ``_names``: mapping of named destinations to page indices; left unset
      when the lookup of ``Names`` / ``Dests`` raises ``KeyError``.

    The file is closed before returning.

    NOTE(review): relies on Python 2 names (``basestring``, ``str.decode``).

    Raises:
        AssertionError: if the document is not extractable, or an action or
            destination does not have the expected form.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdftypes import PDFObjRef

    # All reads from the document happen inside this block; the values stored
    # on ``result`` are built here, so the file can be closed on exit instead
    # of being leaked as in the original.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize()
        assert doc.is_extractable

        result = PDFInfos()
        result._metaInfo = dict(
            (key, str.decode(value, 'utf-16') if value.startswith('\xfe\xff') else value)
            for key, value in doc.info[0].items()
            if isinstance(value, basestring))

        pageids = [page.pageid for page in PDFPage.create_pages(doc)]
        result._pageCount = len(pageids)

        def get(obj, attr = None):
            """Resolve PDFObjRefs, otherwise a no-op. May also perform
            dict lookup, i.e. get(obj, 'A') is roughly the same as
            get(obj)['A']."""
            while isinstance(obj, PDFObjRef):
                obj = obj.resolve()
            if attr is not None:
                return get(obj[attr])
            return obj

        def actionToPageIndex(action):
            """Return the page index targeted by a 'GoTo' action."""
            assert get(action, 'S').name == 'GoTo'
            name = get(action, 'D')
            # resolve "named destination":
            dest = get(doc.get_dest(name))
            return destToPageIndex(dest)

        def destToPageIndex(dest):
            """Return the page index a destination points to."""
            dest = get(dest)
            if isinstance(dest, dict):
                # list(...) keeps the comparison meaningful where keys()
                # returns a view rather than a list.
                assert list(dest.keys()) == ['D'], repr(dest)
                dest = get(dest, 'D')
            # destinations contain the page as first element,
            # the rest concerns the ROI / zoom state (various modes there):
            return pageids.index(dest[0].objid)

        try:
            result._outline = [
                (level, title,
                 actionToPageIndex(a) if a else destToPageIndex(dest))
                for level, title, dest, a, se in doc.get_outlines()]
        except PDFNoOutlines:
            result._outline = None

        result._pageInfos = []

        # get annotations (links):
        for page in PDFPage.create_pages(doc):
            pageLinks = []

            for anno in get(page.annots) or []:
                anno = get(anno)
                rect = numpy.array(get(anno, 'Rect'), float).reshape((2, 2))
                if 'Dest' in anno:
                    # 'Dest' is the older (more compatible) way to
                    # specify links
                    dest = get(anno, 'Dest')
                    pageLinks.append((rect, destToPageIndex(dest)))
                elif 'A' in anno:
                    # actions are much more general and include 'GoTo'
                    # (with viewport spec.) with variants for remote
                    # and embedded documents
                    action = get(anno, 'A')
                    subType = get(action, 'S').name
                    if subType == 'GoTo':
                        pageLinks.append((rect, actionToPageIndex(action)))
                    elif subType == 'URI':
                        #assert sorted(action.keys()) == ['S', 'Type', 'URI']
                        link = get(action, 'URI')
                        if link.startswith('file:'):
                            # resolve relative pathname w.r.t. PDF filename:
                            link = 'file:' + os.path.join(os.path.dirname(filename),
                                                          link[5:])
                        pageLinks.append((rect, link))

            pageBox = numpy.array([page.mediabox], float).reshape((2, 2))
            result._pageInfos.append(PDFPageInfos(links = pageLinks, pageBox = pageBox))

        # extract all named destinations:
        def extract_names(dests, result = None):
            """Collect name -> page index pairs from a name tree node and its kids."""
            if result is None:
                result = {}
            if 'Names' in dests:
                # 'Names' is a flat sequence of alternating name / reference
                # entries; zipping one iterator with itself pairs them up.
                it = iter(get(dests, 'Names'))
                for name, ref in zip(it, it):
                    result[name] = destToPageIndex(ref)
            if 'Kids' in dests:
                for kid in get(dests, 'Kids'):
                    extract_names(get(kid), result)
            return result

        try:
            result._names = extract_names(get(doc.catalog['Names'], 'Dests'))
        except KeyError:
            pass

    return result
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

# Script: run every page of one PDF through a PDFPageInterpreter.

path = '/Users/mattstringer/Dropbox/ProyectoLaCumbre/DataClean/pdfs/example_finca.pdf'

# Password for document initialization. ``password`` was used below without
# being defined anywhere (NameError); the empty string is the value the
# comment further down prescribes when no password is set.
password = ''

# Open a PDF file. ``file`` only exists on Python 2; ``open`` works on both
# major versions, and the ``with`` block guarantees the handle is closed.
with open(path, 'rb') as fp:
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    document.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
class SreenplayPdfProcessor:
    """Classify the text boxes of a screenplay PDF by their left margin.

    Text boxes are sorted into screenplay elements ("slug", "action",
    "character", "parentheses", "dialogue") according to their ``x0``
    coordinate, and collected as ``dict(name=..., content=...)`` entries in
    ``screenplay_object_list``.
    """

    # Class-level defaults, kept for backward compatibility. __init__ gives
    # every instance its own list so that instances no longer share entries.
    screenplay_object_list = []
    char_margin = False

    def __init__(self, pdf):
        """Open ``pdf.name`` and set up the pdfminer pipeline.

        Args:
            pdf: Object whose ``name`` attribute is the path of the PDF file.
        """
        self.document = pdf
        self.screenplay_object_list = []
        # initialize parsing parameters
        self.file_pointer = open(self.document.name, 'rb')
        try:
            self.parser = PDFParser(self.file_pointer)
            self.pdf_document = PDFDocument(self.parser)
            self.pdf_document.initialize()
            # set resource management
            self.resource_manager = PDFResourceManager()
            self.pdf_device = PDFPageAggregator(self.resource_manager,
                                                laparams=LAParams())
            # set interpreter
            self.interpreter = PDFPageInterpreter(self.resource_manager,
                                                  self.pdf_device)
        except Exception:
            # Do not leak the file handle when the setup fails.
            self.file_pointer.close()
            raise
        # The file must stay open here: pages are only processed later, in
        # get_pages(). Call close() when done with the processor.

    def close(self):
        """Close the underlying PDF file."""
        self.file_pointer.close()

    def get_pages(self, *args):
        """Process every page and return one list entry per page.

        Each entry is the value returned by
        ``get_screenplay_element_objects`` for that page, i.e. the
        instance's accumulating ``screenplay_object_list`` - the same list
        object for every page. Extra positional arguments are ignored.
        """
        screenplay_object_list = []
        pages = PDFPage.create_pages(self.pdf_document)
        for page_number, page in enumerate(pages, 1):
            self.interpreter.process_page(page)
            # receive the LTPage object for this page; it may contain child
            # objects like LTTextBox, LTFigure, LTImage, etc.
            processed_pdf_page = self.pdf_device.get_result()
            screenplay_object_list.append(
                self.get_screenplay_element_objects(processed_pdf_page,
                                                    page_number, '/tmp/'))
        return screenplay_object_list

    def is_left_justified(self, page_object):
        """Return True if the object starts at x0 108 or 90."""
        return page_object.x0 == 108 or page_object.x0 == 90

    def is_character_margin(self, pdf_page_object):
        """Return True if the object starts at x0 252.

        The result is also stored in ``self.char_margin``.
        """
        margin_list = [252,]
        self.char_margin = pdf_page_object.x0 in margin_list
        return self.char_margin

    def is_dialogue_margin(self, pdf_page_object):
        """Return True if the object starts at x0 180."""
        margin_list = [180,]
        return pdf_page_object.x0 in margin_list

    def remove_block_from_holder(self, item):
        """Remove ``item`` from ``self.text_block_holder``.

        A tuple is joined with single spaces before being removed.
        """
        if isinstance(item, tuple):
            item = " ".join(item)
        self.text_block_holder = self.text_block_holder.replace(item, "")

    def add_item_to_screenplay_object_list(self, item_type, func):
        """Call ``func()`` and record a truthy result under ``item_type``.

        The recorded text is removed from ``self.text_block_holder`` first,
        so that later extractors only see what is left of the block.
        """
        item = func()
        if item:
            self.remove_block_from_holder(item)
            self.screenplay_object_list.append(dict(name=item_type, content=item))

    def get_screenplay_element_objects (self, pdf_page_objects, page_number, images_folder):
        """Iterate through the list of LT* objects and capture the text contained in each.

        Only non-blank ``LTTextBox`` objects are considered. ``page_number``
        and ``images_folder`` are accepted but not used.

        Returns:
            ``self.screenplay_object_list``.
        """
        for pdf_page_object in pdf_page_objects:
            if not isinstance(pdf_page_object, LTTextBox):
                continue
            self.text_block_holder = pdf_page_object.get_text()
            if self.text_block_holder.strip() == "":
                continue
            if self.is_left_justified(pdf_page_object):
                # ACTION and HEADINGS
                self.add_item_to_screenplay_object_list("slug", self.get_heading)
                self.add_item_to_screenplay_object_list("action", self.get_action)
            elif self.is_character_margin(pdf_page_object):
                # CHARACTER and DIALOGUE
                self.add_item_to_screenplay_object_list("character", self.get_character_name)
                self.add_item_to_screenplay_object_list("parentheses", self.get_parentheses)
            elif self.is_dialogue_margin(pdf_page_object):
                self.add_item_to_screenplay_object_list("dialogue", self.get_dialogue)
            elif pdf_page_object.x0 > 400:
                # TRANSITIONS are not collected.
                continue
        return self.screenplay_object_list

    def get_element(self, search_string):
        """Return the first ``re.findall`` match in the current block, or None.

        With several groups in ``search_string`` the match is a tuple.
        """
        matches = re.findall(search_string, self.text_block_holder)
        if not matches:
            return None
        return matches[0] or None

    # The patterns below are raw strings: the original literals contained
    # escapes such as ``\s`` and ``\!`` that are invalid in normal strings.
    # The compiled patterns are unchanged.

    def get_character_name(self):
        """Return the character name, only if the block is at the character margin."""
        character_name = self.get_element(
            r"([^<>a-z\s][^...][^a-z:\!\?]*?[^a-z\(\!\?:,][\s]??)\n{1}")
        if character_name and self.char_margin:
            return character_name
        return None

    def get_parentheses(self):
        """Return the first parenthesized span of the current block, or None."""
        return self.get_element(r"(\([^<>]*?\)[\s]??)")

    def get_dialog(self):
        """Return the first match of the dialog pattern, or None."""
        return self.get_element(r".*|.*\n{0,1}(.+?)\n")

    def get_heading(self):
        """Return the INT/EXT/EST heading line of the current block, or None."""
        heading = self.get_element(r"((INT|EXT|[^a-zA-Z0-9]EST)([\.\-\s]+?).+)")
        try:
            # The pattern has three groups, so a match is a tuple whose first
            # item is the whole heading.
            heading = heading[0]
        except TypeError:
            # No match: heading is None.
            pass
        return heading

    def get_action(self):
        """Return what is left of the current text block."""
        return self.text_block_holder

    def get_dialogue(self):
        """Return what is left of the current text block."""
        return self.text_block_holder
def pdf2csv(fp):
    """Render the cell grid of every PDF page to ``out<pageno>.png``.

    Per page, the x coordinates (``hlines``) and the y coordinates measured
    from the top of the page (``vlines``) of all ``LTRect`` / ``LTLine``
    objects are gathered, de-duplicated, sorted and run through
    ``filterclose``. For each pair of neighbouring coordinates more than 5
    units apart a rectangle outline is drawn into a 1-bit image, which is
    saved as PNG in the current directory. The document object and the
    coordinate lists are printed to stdout.

    Args:
        fp: Binary file object of the PDF; the function does not close it.

    Returns:
        None.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # print(x) with a single argument is valid on Python 2 and Python 3.
    print(doc)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # NOTE(review): doc.get_pages() is unchanged from the original; verify
    # that the PDFDocument class in scope offers this method.
    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()

        xs = []
        ys = []
        for element in layout:
            # Exact type check kept from the original: subclasses of
            # LTRect / LTLine are ignored.
            if type(element) not in (LTRect, LTLine):
                continue
            xs.extend((int(element.x0), int(element.x1)))
            # Measure y from the top of the page, as image coordinates do.
            ys.extend((int(layout.height - element.y0),
                       int(layout.height - element.y1)))
        hlines = filterclose(sorted(set(xs)))
        vlines = filterclose(sorted(set(ys)))
        print(hlines)
        print(vlines)
        # The Python 2 print statement printed this pair as a tuple.
        print((layout.width, layout.height))

        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        for i in range(len(vlines) - 1):
            if not vlines[i + 1] - vlines[i] > 5:
                continue
            for j in range(len(hlines) - 1):
                if not hlines[j + 1] - hlines[j] > 5:
                    continue
                draw.rectangle([(int(hlines[j]), int(vlines[i])),
                                (int(hlines[j + 1]), int(vlines[i + 1]))],
                               outline=1)
        del draw

        # Use a dedicated name instead of rebinding the ``fp`` parameter, and
        # let ``with`` close the PNG even when saving raises.
        with open("out%s.png" % pageno, 'wb') as out_file:
            im.save(out_file, "PNG")
# NOTE(review): the original snippet ended its import list with a stray
# closing triple quote, which left the rest of the file inside an
# unterminated string. The imports are active again here, and the four
# pdfminer names the code below uses without importing were added - confirm
# they are not already imported elsewhere in the full file.
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
import sys

# Script: run layout analysis over every page of the PDF given as the first
# command-line argument.

# Open a PDF file; the ``with`` block closes it once all pages are processed.
with open(sys.argv[1], 'rb') as fp:
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    document.initialize('')
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF device object: a page aggregator is used rather than a
    # plain PDFDevice so that layout analysis is performed.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
import json
from itertools import groupby
from collections import Counter

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTChar

# pdfminer objects shared by the rest of the module. The PDF is opened
# inline and the file object is owned by the parser from here on.
parser = PDFParser(open('announcer.pdf', 'rb'))
document = PDFDocument(parser)
document.initialize()

rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)


class Entry(object):
    """One record; positional values are mapped onto the names in ``KEYS``."""

    # Names of the attributes, in positional order.
    KEYS = (
        'dept', 'sect', 'id', 'title', 'room',
        'staff', 'space', 'block', 'code',
    )

    def __init__(self, *row):
        """Assign ``row`` values to the ``KEYS`` attributes.

        ``sect``, ``space`` and ``block`` are converted to ``int``.
        """
        for attr_name, attr_value in zip(Entry.KEYS, row):
            setattr(self, attr_name, attr_value)
        for numeric in ('sect', 'space', 'block'):
            setattr(self, numeric, int(getattr(self, numeric)))