def getPageLayouts(f1):
    """Extract per-page layout objects from an already-open PDF file object.

    :param f1: binary file object positioned at the start of a PDF.
        The caller owns the handle; it is NOT closed here (the original
        closed it in a ``finally``, surprising the caller).
    :return: list of LTPage layouts, one per page; empty if the document
        forbids text extraction.
    :raises IOError: if the document cannot be parsed/loaded.
    """
    page_layouts = []
    try:
        # Pair parser and document: the parser feeds raw bytes, the
        # document exposes the page tree (a "pipe" of sorts).
        parser = PDFParser(f1)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password; the original used an undefined name
        # Can we extract text?
        if doc.is_extractable:
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Pages are parsed lazily: PDFs can be large, so each page is
            # processed only when requested.
            for page in doc.get_pages():
                interpreter.process_page(page)
                # receive the LTPage object for the page
                page_layouts.append(device.get_result())
    except IOError:
        # Python-3 compatible re-raise (the original used the removed
        # `raise IOError, "msg"` form).
        raise IOError("issue with loading file, please try again")
    return page_layouts
def readPdf(file):
    """Parse the first two pages of a PDF and return their layouts.

    :param file: path of the PDF file. (Name kept for backward
        compatibility although it shadows the builtin.)
    :return: list of LTPage layouts (at most 2 — see islice below).
    :raises PDFTextExtractionNotAllowed: if the PDF forbids extraction.
    """
    pages = []
    # `with` guarantees the handle is closed (the original leaked it).
    with open(file, 'rb') as fp:
        # Parser reads the bytes; document stores the structure.
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # Tight line_margin so closely spaced lines are not merged.
        laparams = LAParams(line_margin=0.1)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Only the first two pages are analysed.
        for page in islice(PDFPage.create_pages(document), 2):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            pages.append(device.get_result())
    return pages
def convert_pdf_table(pdf_file):
    """Parse tabular PDF pages into a list of row dicts.

    Each page is tabulated (via the project helper tabulate_page); the
    first row of each page is treated as the header, and every following
    row becomes a dict mapping lower-cased heading -> non-empty cell.

    :param pdf_file: path of the PDF file.
    :raises PDFTextExtractionNotAllowed: if extraction is forbidden.
    """
    table = []
    # `with` closes the handle (the original leaked it and also rebound
    # the parameter to the open handle).
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            page_table = tabulate_page(layout)  # project helper: grid of strings
            header = page_table[0]
            for row in page_table[1:]:
                # Map only non-empty cells to their column heading.
                row_dict = {header[i].lower(): cell
                            for i, cell in enumerate(row) if cell != ''}
                table.append(row_dict)
    return table
def parsepdf(filename):
    # Parse a Danish housing-vacancy PDF: group text lines by the
    # y-coordinate of their baseline (one visual table row per y) and print
    # each reconstructed row plus a total row count.
    # NOTE(review): Python 2 code (print statements); found_randers,
    # found_aarhus and _randers are initialised but never used in this body.
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Create a PDF device object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    found_randers = False
    found_aarhus = False
    _randers = []
    # Danish column headings to exclude from data rows (note the trailing
    # newlines: pdfminer text boxes end with '\n').
    headings = [u'Ledige lejligheder\n',u'afd. adresse\n',u'rum m2\n',u'leje \n', u'a\xb4c varme a\xb4c vand\n',u'indskud\n',u'ledig pr.\n',u'bem\xe6rkning\n' ]
    # y-coordinate -> list of cell strings on that visual row; OrderedDict
    # keeps rows in the order they were first encountered.
    location_map = OrderedDict()
    # y-coordinates known to belong to heading lines (skipped for data).
    header_ycord = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        for obj in layout._objs:
            # print obj
            if isinstance(obj,LTTextBoxHorizontal):
                for o in obj._objs:
                    y0 = o.y0
                    # print o
                    if isinstance(o,LTTextLineHorizontal) and obj.get_text() not in headings:
                        if y0 not in header_ycord:
                            # Append this line's ASCII-folded text to the
                            # row already collected at this y, if any.
                            if y0 in location_map :
                                objs = location_map.get(y0)
                            else :
                                objs = []
                            string_val = o.get_text().encode('ascii', 'ignore')
                            string_val = string_val.replace('\n','')
                            objs.append(string_val)
                            location_map.__setitem__(y0,objs)
                    else :
                        # Heading line: remember its y so data at the same
                        # height is not collected.
                        if y0 not in header_ycord:
                            header_ycord.append(y0)
    for key in location_map:
        print '**************************'
        # # print key
        print location_map.get(key)
        print '**************************'
    print 'Total Rowss = %s'%len(location_map)
def read_invoice_pdfminer3k(pdfFile):
    """Extract client name and invoice number from one invoice PDF and
    write them to the Excel sheet via write_excel().

    :param pdfFile: file name of the invoice inside the module-level
        invoice_path directory.
    """
    invoice_text = ""
    # os.path.join with SEPARATE components: the original pre-joined the
    # path with "\\" and passed a single argument, defeating os.path.join.
    # `with` also closes the handle (the original leaked it).
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        # Old pdfminer3k wiring: parser and document reference each other.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")  # empty password
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Concatenate the text of every text box/line on every page.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    invoice_text += lt_obj.get_text()
    # Client name comes from the page text...
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)
    # ...and the invoice number from the file name itself.
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)
    # Persist both values to the Excel file.
    write_excel(client, invoice_no)
def parse_pdf(fname):
    """Return the non-empty horizontal-text-box strings of a PDF.

    :param fname: path of the PDF file.
    :return: list of stripped text strings, one per non-blank
        LTTextBoxHorizontal.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    contents = []
    # `with` closes the handle; the original opened it and never closed it.
    with open(fname, 'rb') as fp:
        # Parser feeds the bytes; document exposes the structure.
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Guard clause instead of nesting the whole body under `else`.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager for shared resources (fonts, images, ...).
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The LTPage object for the page just processed.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    content = x.get_text().strip()
                    # Skip boxes that are whitespace-only.
                    if content:
                        contents.append(content)
    return contents
def extract_text_elements_from_pdf(path, j=nulljob):
    """Opens a PDF and extracts every element that is text based (LTText).

    :param path: path of the PDF file.
    :param j: progress-reporting job object (defaults to nulljob).
    :return: (pages, all_elements) — pages is a list of Page(width, height),
        all_elements the text elements tagged with page number and order.
    """
    pages = []
    all_elements = []
    # Close the handle deterministically (the original leaked it).
    with open(path, 'rb') as fp:
        # Old-API wiring: document and parser reference each other.
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5,
                            heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialise the page list so the progress reporter knows the total.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            # Tag each element with its page and reading order.
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
def Layout():
    # Build and return the pdfminer layout of a hard-coded article PDF.
    # NOTE(review): exploratory Python 2 code — `layout` is reassigned on
    # every iteration, so only the LAST page's layout is returned, and the
    # `print rsrcmgr` looks like a leftover debug statement.
    # Set parameters for analysis.
    with open('/home/chris/Documents/Literature/Donghun_ACSNano_2014', 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print rsrcmgr
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
        return layout
def get_result_from_file(filename):
    """Analyse a PDF and return its per-page bounding boxes and labels.

    :param filename: path of the PDF file.
    :return: {"filename": ..., "pages": [{"index", "bounding_box",
        "labels"}, ...]} built with the project helpers get_bounding_box()
        and get_text_labels().
    :raises PDFTextExtractionNotAllowed: if extraction is forbidden.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}
    # `with` replaces the manual fp.close(), which was skipped whenever an
    # exception (e.g. PDFTextExtractionNotAllowed) fired.
    with open(filename, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # Looser margins than the defaults, plus vertical text detection.
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # enumerate() replaces the hand-maintained page_index counter.
        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            result["pages"].append({"index": page_index,
                                    "bounding_box": get_bounding_box(layout),
                                    "labels": get_text_labels(layout)})
    return result
def pdf2txt(data, save_path):
    """Append the horizontal-text content of a PDF to a text file.

    :param data: open binary file object containing the PDF.
    :param save_path: path of the text file to append to.
    :raises PDFTextExtractionNotAllowed: if extraction is forbidden.
    """
    parser = PDFParser(data)
    document = PDFDocument(parser)
    # Guard clause instead of nesting everything under `else`.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Open the output once instead of re-opening it for every line.
    with open('%s' % (save_path), 'a') as f:
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for line in layout:
                # Tolerate per-line failures (e.g. encoding) without
                # aborting the whole document; the original used a bare
                # `except:` which also swallowed KeyboardInterrupt.
                try:
                    if isinstance(line, LTTextBoxHorizontal):
                        f.write(line.get_text().encode('utf-8') + '\n')
                except Exception:
                    print("failed!")
def get_num(source_file):
    """Pull the value after ':' from the second text box of a PDF.

    :param source_file: path of the PDF file.
    :return: the extracted number string, or None if not found.
    :raises PDFTextExtractionNotAllowed: if extraction is forbidden.
    """
    # `with` closes the handle (the original never closed it).
    with open(source_file, 'rb') as fp:
        # Parser feeds the bytes; document stores the structure. Pass a
        # password argument to PDFDocument if the file is encrypted.
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for n, l in enumerate(layout):
                if isinstance(l, LTTextBox):
                    text = l.get_text()
                    if n == 0:
                        # First text box is skipped (title line).
                        pass
                    elif n == 1:
                        # Second box looks like "label:value\n".
                        num = text.split(":")[1].replace("\n", '')
                        return num
                    else:
                        # Nothing useful after the second box on this page.
                        break
def extract_layout_by_page(pdf_path):
    """
    Return one pdfminer layout (LTPage) per page of the PDF.

    See:
    - https://euske.github.io/pdfminer/programming.html
    - http://denis.papathanasiou.org/posts/2010.08.04.post.html

    :param pdf_path: path of the PDF file.
    :raises PDFTextExtractionNotAllowed: if extraction is forbidden.
    """
    laparams = LAParams()
    layouts = []
    # `with` closes the file handle; the original leaked it.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())
    return layouts
def pdf2text(path, save_file):
    """Read a local PDF and append its horizontal text to a txt file.

    :param path: source PDF file path.
    :param save_file: output txt file name (script directory if no path).
    :return: None
    :raises PDFTextExtractionNotAllowed: if extraction is forbidden.
    """
    # BUG FIX: PDFParser expects an OPEN BINARY FILE OBJECT; the original
    # passed the path string itself, which pdfminer cannot .read() from.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Open the output once, not once per extracted line.
        with open('%s' % (save_file), 'a') as f:
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                for line in layout:
                    if isinstance(line, LTTextBoxHorizontal):
                        f.write(line.get_text().encode('utf-8'))
def setup(path):
    """Print ASCII-normalised dialogue text from a script PDF.

    Skips the title page and keeps only text boxes whose left edge lies
    within (DIALOGUE_BBOX_MIN, DIALOGUE_BBOX_MAX) — module-level constants.

    :param path: path of the PDF file.
    :raises PDFTextExtractionNotAllowed: if extraction is forbidden.
    """
    # `with` closes the handle (the original leaked it).
    with open(path, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Supply a password to PDFDocument here if the file is encrypted.
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # BUG FIX: the original referenced rsrcmgr/laparams without ever
        # creating them (NameError at runtime); create them here.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Extract dialogue from every page but the first.
        for i, page in enumerate(PDFPage.create_pages(document)):
            if i == 0:
                continue  # skip the title page
            interpreter.process_page(page)
            layout = device.get_result()
            for obj in layout:
                # We only want to bother with LTTextBox and LTTextLine.
                if isinstance(obj, (LTTextBox, LTTextLine)):
                    # Only text segments within the dialogue margin range.
                    if DIALOGUE_BBOX_MIN < obj.bbox[0] < DIALOGUE_BBOX_MAX:
                        # Fold unicode characters down to plain ASCII.
                        converted = unicodedata.normalize(
                            'NFKD', obj.get_text()).encode('ascii', 'ignore')
                        print(converted)
def _GetFromPdf(self, pdf):
    '''
    Print the text of every LTTextContainer in the given PDF.
    Reference: http://www.unixuser.org/~euske/python/pdfminer/programming.html
    '''
    pass  # NOTE(review): dead statement left over from development
    # NOTE(review): the file handle is opened but never closed.
    fp = open(pdf, 'rb')
    # Create a pdf parser fed by the file object.
    parser = PDFParser(fp)
    # Create the PDF document.
    doc = PDFDocument(parser)
    # Connect the parser to the document object.
    parser.set_document(doc)
    # Skip documents that do not permit text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Resource manager for shared resources (fonts, images, ...).
    rsrcmgr = PDFResourceManager()
    # Aggregator device with layout-analysis parameters.
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        for x in layout:
            # Print every text-bearing container (Python 2 print).
            if(isinstance(x, LTTextContainer)):
                print x.get_text()
            pass
def generateFileContent(self):
    """Download the RAG abbreviations PDF and build a dictionary file.

    Returns the dictionary content as a unicode string: a fixed header
    followed by one formatted entry per abbreviation found in the PDF.
    NOTE(review): Python 2 code (urllib.urlretrieve, u'' literals); the
    PDF file object is opened but never explicitly closed.
    """
    import tempfile
    import urllib
    abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"
    # Fetch the PDF into a named temporary file so pdfminer can read it;
    # the file is deleted automatically when temporaryFile is collected.
    temporaryFile = tempfile.NamedTemporaryFile()
    urllib.urlretrieve(abbreviationsPdfUrl, temporaryFile.name)
    entries = set()
    fileObject = open(temporaryFile.name, "rb")
    parser = PDFParser(fileObject)
    document = PDFDocument(parser)
    resourceManager = PDFResourceManager()
    # No LAParams here: grouping is done manually below.
    device = PDFPageAggregator(resourceManager)
    interpreter = PDFPageInterpreter(resourceManager, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        # Drop drawing objects (rules/frames); keep text-bearing ones.
        objects = [object for object in layout if not isinstance(object, LTRect) and not isinstance(object, LTCurve)]
        params = LAParams()
        # Group raw objects into text lines; entries look like "abbr.: text".
        for line in layout.group_objects(params, objects):
            text = line.get_text()
            if u":" in text:
                # Keep only the abbreviation part, left of the colon.
                entry = text.split(u":")[0]
                entry = entry.strip()
                # Collapse double dots produced by the PDF layout.
                entry = entry.replace(u"..", ".")
                entries.add(entry)
    dictionary = u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n"
    dictionary += u"# http://www.realacademiagalega.org/abreviaturas\n"
    dictionary += u"\n"
    for entry in formatEntriesForDictionary(entries, u"abreviatura"):
        dictionary += entry
    return dictionary
def get_layout(url, pages=None):
    """
    Return the pdfminer layout tree of each requested page of a PDF.

    The layout is an object of pdfminer corresponding to the tree structure
    of a pdf. More information about the layout here:
    http://www.unixuser.org/~euske/python/pdfminer/programming.html

    :param url: path (str) of the pdf file to be analysed
    :param pages: list (int) of pages of which you want the layout. Beware
        that the first page of the pdf correspond to number 0, even if its
        id is 1
    :return layouts: List of layouts (One layout per page).
    """
    # An empty set tells pdfminer "no page restriction".
    pagenums = set(pages) if pages else set()

    # Set parameters for analysis and build the aggregation pipeline.
    analysis_params = LAParams()
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=analysis_params)
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    layouts = []
    with open(url, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenos=pagenums):
            page_interpreter.process_page(page)
            layouts.append(aggregator.get_result())
    aggregator.close()
    return layouts
def parse_pages(pdf_buffer, password):
    """
    With an PDF buffer object, get the pages, parse each one,
    and return the entire pdf text
    """
    # Wire the buffer into pdfminer's parser/document pair; the password
    # unlocks encrypted documents.
    doc_parser = PDFParser(pdf_buffer)
    document = PDFDocument(doc_parser, password)

    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, aggregator)

    # One string of collected text per page of the document.
    text_content = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # The LTPage for this page; its children may be LTTextBox,
        # LTFigure, LTImage, etc.
        page_layout = aggregator.get_result()
        text_content.append(parse_lt_objects(page_layout._objs))  # pylint: disable=protected-access
    return text_content
def getTemPdf(file):
    # Extract the horizontal text of a PDF (open file object) into
    # result\tem_pdf and return that file re-opened for binary reading.
    # NOTE(review): the output file is truncated first and then re-opened
    # in append mode once PER TEXT BOX; the f.close() after the `with`
    # block is redundant. Python-2 style byte handling (encode + str '\n').
    parser = PDFParser(file)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Truncate/create the temp output file.
        f = open('result\\' + 'tem_pdf', 'w')
        f.write(''.encode('utf-8'))
        f.close()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Append one text box per line.
                    with open('result\\' + 'tem_pdf', 'a') as f:
                        sentence = x.get_text()
                        f.write(sentence.encode('utf-8') + '\n')
                    f.close()
        # Hand the collected text back as an open binary file object.
        return_tem_pdf = open('result\\' + 'tem_pdf', 'rb')
        return return_tem_pdf
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If none is found, it'll send the images through OCR.

    :param path: path of the PDF file.
    :param languages: optional OCR language hint passed to _convert_page.
    :return: dict with lower-cased metadata keys plus a "pages" list.
    """
    fh = open(path, "rb")
    result = {"pages": []}
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        parser = PDFParser(fh)
        doc = PDFDocument(parser, "")
        # Copy document metadata (author, title, ...) into the result,
        # skipping the page count and unresolvable object references.
        if len(doc.info):
            for k, v in doc.info[-1].items():
                k = k.lower().strip()
                v = string_value(v)
                if k != "pages" and v is not None and "<PDFObjRef:" not in v:
                    result[k] = string_value(v)
        # One converted entry per page (text, falling back to OCR).
        for i, page in enumerate(PDFPage.create_pages(doc)):
            result["pages"].append(
                _convert_page(interpreter, page, device, i + 1, path, languages))
        return result
    except PSEOF as eof:
        # Truncated PDF: report whatever was parsed so far.
        log.info("Unexpected EOF: %r", eof)
        return result
    finally:
        # The original closed the device only on the success path; close
        # both resources on every exit.
        device.close()
        fh.close()
def convertWithCoordinatesPara(fname, pages=None): fontSize = {} pdfText = [] print fname if not pages: pagenums = set() else: pagenums = set(pages) infile = file(fname, 'rb') parser = PDFParser(infile) document = PDFDocument(parser) laparams = LAParams() manager = PDFResourceManager() device = PDFPageAggregator(manager, laparams=laparams) interpreter = PDFPageInterpreter(manager, device) for page in PDFPage.create_pages(document): interpreter.process_page(page) layout = device.get_result() parse_obj_para(layout._objs, fontSize, pdfText) return {'fontSize': fontSize, 'pdfText': pdfText}
def extract_text(doc, config):
    # Generator: for each processed page, yield the list of LTTextLine
    # objects found inside that page's top-level LTTextBoxes.
    # `doc` is an already-initialised pdfminer document; `config.page`
    # optionally restricts processing to a single page (1-based).
    rsrcmanager = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmanager, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmanager, device)
    pages = doc.get_pages()
    if config.page is not None:
        # take only 1 page
        # note: Use page-1 because of 0 index (where pdfs index from 1)
        # islice advances the live generator; next() defaults to None if
        # the requested page does not exist.
        pages = [next(itertools.islice(pages, config.page - 1, None), None)]
    for page in pages:
        interpreter.process_page(page)
        layout = device.get_result()
        text = []
        for obj in layout:
            if isinstance(obj, LTTextBox):
                # Collect the individual lines of each text box.
                for line in obj:
                    # coord = ((line.x0, line.y0), (line.x1, line.y1))
                    text.append(line)
            elif isinstance(obj, LTTextLine):
                # Layout analysis should wrap every line in a box.
                assert False, "Expected no lines at top of tree"
            else:
                # Ignore figures, images, rects, etc.
                pass
        yield text
def pdf_to_text(page_object):
    """Return a list with the text of every text box/line in a PDF.

    :param page_object: open binary file object containing the PDF.
    :return: list of strings, one per LTTextBox/LTTextLine encountered.
    """
    parser = PDFParser(page_object)
    # Create a PDF document object that stores the document structure,
    # connect it to the parser, and unlock it with an empty password.
    doc = PDFDocument(parser)
    parser.set_document(doc)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    for i, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        for item in layout:
            # `item` renamed from `object` (which shadowed the builtin);
            # the original also routed each string through a pointless
            # one-element `trial` list before appending.
            if isinstance(item, (LTTextBox, LTTextLine)):
                text_content.append(item.get_text())
    return text_content
def __init__(self, rsrcmgr, pageno=1, laparams=None):
    # Delegate layout aggregation to PDFPageAggregator, then add the
    # bookkeeping fields this subclass fills while scanning pages.
    PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
    self.rows = []              # accumulated row data
    self.page_number = 0        # pages processed so far
    self.outline = False        # not an outline page
    self.interesting_text = []  # filled only if there is any outline info
    self.aux_text = []          # possibly helpful info, but maybe mixed
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use PyPDF2 to extract textual content first.
    If none is found, it'll send the file through OCR.

    :param path: path of the PDF file.
    :param languages: optional OCR language hint passed to _convert_page.
    :return: dict with lower-cased metadata keys plus a "pages" list.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')
            result = {'pages': []}
            # Copy document metadata into the result, skipping the page count.
            if len(doc.info):
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    if k != 'pages':
                        result[k] = safe_text(v)
            if not doc.is_extractable:
                log.warning("PDF not extractable: %s", path)
                return result
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                text = _convert_page(layout, languages)
                result['pages'].append(text)
            return result
        finally:
            # The original reached device.close() only after a fully
            # successful pass; close it on every exit path instead.
            device.close()
def get_layout(path):
    '''returns a list of every character in the document as well as its location'''
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    password = ""
    maxpages = 0      # 0 = no page limit
    caching = True
    pagenos = set()   # empty set = all pages
    layout = []
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # open() instead of the Python-2-only file() builtin; the unused
    # StringIO buffer and codec locals from the original are dropped.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
            layout.append(device.get_result())
    device.close()
    return layout
def parsing(pdfPath, pdfFileName):
    """Parse one PDF and dump each page's layout to a .pmlo file under a
    per-document folder in C:\\Projects\\PDFparser\\pageLayout.

    :param pdfPath: directory containing the PDF.
    :param pdfFileName: PDF file name (extension included).
    """
    pathOut = r'C:\Projects\PDFparser\pageLayout'
    layoutName = pdfFileName.split('.', 1)[0].replace(' ', '_')
    outDir = os.path.join(pathOut, layoutName)
    # BUG FIX: the original tested os.path.exists(layoutName) — a path
    # relative to the CWD — but created pathOut\layoutName; test the path
    # it actually creates.
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    # `with` closes the input handle deterministically.
    with open(os.path.join(pdfPath, pdfFileName), 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for pageNum, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_layout(layout)  # fills the module-level layoutStream
            # .pmlo stands for PDFminer Layout; `with` closes each page's
            # output file (the original leaked every one of them).
            with open(os.path.join(outDir, str(pageNum + 1) + '.pmlo'), 'w') as fileOut:
                for line in layoutStream:
                    fileOut.write(str(line))
            # Start a new page
            del layoutStream[:]
def Parse(self):
    # Parse the first page of self.pdfFileName into self.RawData, using a
    # pickle cache keyed by file name. The cache is honoured only when it
    # exists, is non-empty and is newer than the PDF itself.
    # NOTE(review): `laparams` is not defined in this method — presumably a
    # module-level default; confirm before refactoring.
    if not os.path.exists(parseCacheDir):
        os.makedirs(parseCacheDir)
    cacheFile = os.path.join(parseCacheDir, os.path.basename(self.pdfFileName) + '.cache')
    foundCache = (os.path.isfile(cacheFile) and \
                  os.path.getsize(cacheFile) > 0 and \
                  os.path.getmtime(cacheFile) > os.path.getmtime(self.pdfFileName))
    if (foundCache):
        # Cache hit: restore the previously parsed raw data.
        fp = open(cacheFile, 'rb')
        self.RawData = pickle.load(fp)
        fp.close()
    else:
        # Cache miss: run pdfminer on the first page only (maxpages=1).
        fp = open(self.pdfFileName, 'rb')
        for page in PDFPage.get_pages(fp, None, maxpages=1):
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            layout = device.get_result()
            self.__readobj(layout._objs)
            # PDF y-axis grows upward; flip it using the page height
            # (layout.bbox[3]) so rows read top-to-bottom.
            for category in self.RawData.values():
                self.__reverseYaxis(category, layout.bbox[3])
        # Persist the parsed data for the next run.
        cacheFp = open(cacheFile, 'wb')
        pickle.dump(self.RawData, cacheFp)
        cacheFp.close()
        fp.close()
    # Post-processing: derive cell boundaries, assign content to cells,
    # then evaluate the cells.
    self.__calculateBoundary()
    self.__assignCharsAndLinesToCell()
    self.__processCells()
    return (self.effectiveFrom, self.__getResult())
def parse_pdf(pdf_url):
    """Download a PDF and return its text, one list per page.

    Each page entry holds, for every non-blank text box/line on that page,
    the list of its text lines.
    """
    # Pull the remote document fully into memory so pdfminer can seek in it.
    pdf_bytes = urllib.request.urlopen(pdf_url).read()
    pdf_buffer = io.BytesIO(pdf_bytes)

    # Old-style pdfminer wiring: parser and document reference each other.
    parser = PDFParser(pdf_buffer)
    doc = PDFDocument()
    parser.set_document(doc)  # Warning sometimes, error in pdf?
    doc.set_parser(parser)
    doc.initialize('')

    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, aggregator)

    ret = []
    # Process each page contained in the document.
    for pageIdx, page in enumerate(doc.get_pages()):
        ret.append([])
        interpreter.process_page(page)
        page_layout = aggregator.get_result()
        for lt_obj in page_layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # Keep only boxes that contain something besides whitespace.
                if lt_obj.get_text().strip():
                    ret[pageIdx].append(lt_obj.get_text().splitlines())
    return ret
def pdf_to_txt(in_file):
    """ turn a PDF file to a TXT file (roughly processed) """
    # `with` closes the input handle (the original leaked it).
    with open(in_file, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # os.path.splitext instead of slicing 3 chars off the name (which
        # breaks for any extension other than ".pdf"); open the output
        # once in binary append mode rather than once per text box.
        out_file = os.path.splitext(in_file)[0] + '.txt'
        with open(out_file, 'ab') as dst_file:
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # Receive the LTPage object for the page.
                layout = device.get_result()
                for klass in layout:
                    if isinstance(klass, LTTextBoxHorizontal):
                        # b'\n' keeps bytes+bytes concatenation valid on
                        # Python 3 as well (the original mixed bytes + str).
                        dst_file.write(klass.get_text().encode('utf-8') + b'\n')
    return None
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing
    instructions and tries to find rectangles and arrange them in rows,
    then arrange text in the rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """
    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')
    # Newer pdfminer releases moved PDFDocument/PDFPage; detect which API
    # is available and remember the result in `newapi`.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        # Not a parsable PDF: generator yields nothing.
        return

    rsrcmgr = PDFResourceManager()
    # Without LAParams the aggregator emits raw LTChar objects instead of
    # analysed text boxes (the not-miner_layout branch relies on that).
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    if LOGGER.isEnabledFor(DEBUGFILES):
        # Debug artifacts: PNG renderings of each analysis stage are
        # written to a fresh temp directory.
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random
        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        # Stage 1: flatten all text objects into multiline text records.
        texts = sum([list(lttext_to_multilines(obj, page_layout)) for obj in page_layout._objs if isinstance(obj, (LTTextBox, LTTextLine, LTChar))], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)
        if not miner_layout:
            # Raw chars are unordered; sort top-to-bottom, left-to-right.
            texts.sort(key=lambda t: (t.y0, t.x0))

        # Stage 2: collect drawing primitives that may form table borders.
        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs if isinstance(obj, (LTRect, LTLine, LTCurve))]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        lines = list(uniq_lines(lines))
        LOGGER.debug('found %d unique lines', len(lines))

        # Stage 3: assemble border lines into row/box structures.
        rows = build_rows(lines)
        LOGGER.debug('built %d rows (%d boxes)', len(rows), sum(len(row) for row in rows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for r in rows:
                for b in r:
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
            fpath = '%s/3rows-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        # Stage 4: place the text records into the row boxes.
        textrows = arrange_texts_in_rows(rows, texts)
        LOGGER.debug('assigned %d strings', sum(sum(len(c) for c in r) for r in textrows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for row, trow in zip(rows, textrows):
                for b, tlines in zip(row, trow):
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                    draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
            fpath = '%s/4cells-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)
        yield textrows
    device.close()
#创建一个与文档相关联的解释器 parser = PDFParser(fp) #PDF文档对象 doc = PDFDocument(parser) #链接解释器和文档对象 parser.set_document(doc) doc.set_parser(parser) #初始化文档 doc.initialize("") #创建PDF资源管理器 resource = PDFResourceManager() #参数分析器 laparam = LAParams() #创建一个聚合器 device = PDFPageAggregator(resource, laparams=laparam) #创建PDF页面解释器 interpreter = PDFPageInterpreter(resource, device) #新建一个文档来写入数据 grades = open('grades.txt', 'w') for page in doc.get_pages(): #使用页面解释器来读取 interpreter.process_page(page) #使用聚合器来获得内容 layout = device.get_result() for out in layout: if hasattr( out, 'get_text' ): # 需要注意的是在PDF文档中不只有 text 还可能有图片等等,为了确保不出错先判断对象是否具有 get_text()方法 grades.write(out.get_text())
# Check if the document allows text extraction. If not, abort. if not document.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() # Create a PDF device object. device = PDFDevice(rsrcmgr) # BEGIN LAYOUT ANALYSIS # Set parameters for analysis. laparams = LAParams() # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) def parse_obj(lt_objs): # loop over the object list for obj in lt_objs: # if it's a textbox, print text and location if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal): extracted_text = obj.get_text().replace('\n', '') coordinates = dict() coordinates['x1'] = obj.bbox[0]
def read_pdf_text(filename):
    """Extract text from every page of a PDF file.

    Runs pdfminer layout analysis over the whole document and collects the
    text of each text box / text line, page by page.

    :param filename: path to the .pdf file to read.
    :return: list of strings, one per LTTextBox/LTTextLine found.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Open the PDF; the context manager guarantees the handle is closed.
    with open(filename, 'rb') as fp:
        # Parser ties the binary stream to the document structure.
        parser = PDFParser(fp)
        # Pass a password as the second argument for encrypted documents.
        document = PDFDocument(parser)
        # Abort early when the document does not permit text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # caching=False: do not cache shared resources between pages.
        rsrcmgr = PDFResourceManager(caching=False)
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        text_list = []
        # Process one page at a time; the aggregator yields an LTPage
        # holding the layout objects parsed from that page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for item in layout:
                # Only text-like objects carry extractable text; the
                # original code handled horizontal boxes separately but
                # did exactly the same thing in both branches.
                if isinstance(item, (LTTextBox, LTTextLine)):
                    text_list.append(item.get_text())
    return text_list
        # Tail of find_textboxes_recursively (header not visible in this
        # chunk): recurse into each child of a container layout object.
        for child in layout_obj:
            boxes.extend(find_textboxes_recursively(child))
        return boxes
    return []  # Any other object kind: return an empty list.


# Configure Layout Analysis parameters; enable vertical-text detection.
laparams = LAParams(detect_vertical=True)
# Resource manager holding shared resources across pages.
resource_manager = PDFResourceManager()
# PageAggregator object that collects page layouts.
device = PDFPageAggregator(resource_manager, laparams=laparams)
# Interpreter object driving the aggregator.
interpreter = PDFPageInterpreter(resource_manager, device)

# Output text file (disabled).
# output_txt = open('output.txt', 'w')


def print_and_write(txt):
    """Print *txt*; writing to the output file is currently disabled."""
    print(txt)
    # output_txt.write(txt)
    # output_txt.write('\n')


# NOTE(review): this `with` statement is truncated in this chunk — its body
# continues beyond the visible source.
with open(sys.argv[1], 'rb') as f:
def _extract_qp(self, file_name):
    """Extract question images and mark totals from a question-paper PDF.

    For each page: parse the text boxes with pdfminer to find the answer
    area position and the marks awarded, then render the page with Wand,
    crop out the question, save it as q<N>.jpg in self._folder, and append
    a Question object to self._questions.  (Python 2 code.)
    """
    mark_sum = 0
    print "Extracting QP"
    # Load pdf
    laparams = LAParams()
    rsrcmgr = PDFResourceManager()
    document = file(file_name, 'rb')
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    q_num = 0
    for i, page in enumerate(PDFPage.get_pages(document)):
        # Get page layout
        interpreter.process_page(page)
        layout = device.get_result()
        # Extract metadata: only horizontal text boxes carry the markers.
        textboxes = [
            r for r in layout._objs if type(r) is LTTextBoxHorizontal
        ]
        work_out_y = 0       # y of the "Answer space" marker (0 = absent)
        answer_header = 0    # extra top offset when the page has a header
        marks = []
        for t in textboxes:
            text = t.get_text()
            if "Answer space for question" in text:
                work_out_y = int(t.y0)
            elif "marks]" in text:
                marks.extend(find_between(text, "[", " marks]"))
            elif "mark]" in text:
                marks.extend(find_between(text, "[", " mark]"))
            elif "......" in text:
                # TODO: Find the correct amount of dots
                pass
            elif text in ["QUESTION\n", "PART\n", "REFERENCE\n"]:
                pass
            elif text in [
                    "Do not write\noutside the\n", "box\n", "Turn over s\n"
            ]:
                pass
            elif text == "Answer all questions.\n":
                answer_header = 74
            else:
                pass
                # print repr(text)
        marks = [int(m) for m in marks]
        mark_sum += sum(marks)
        # Convert the page into an image ("file.pdf[i]" selects page i).
        img_path = "{}[{}]".format(file_name, i)
        img = Image(filename=img_path, resolution=int(72 * Paper.QUALITY))
        # Set crop positions (points scaled by the render quality factor).
        x = 46 * Paper.QUALITY
        y = (66 + answer_header) * Paper.QUALITY
        width = 489 * Paper.QUALITY
        height = (761 - answer_header - work_out_y) * Paper.QUALITY
        # Check for blank pages
        if height <= Paper.QUALITY or work_out_y <= 0:
            continue
        # Crop and save the image
        q_num += 1
        img.crop(x, y, width=width, height=height)
        img_path = os.path.join(self._folder, "q{}.jpg".format(q_num))
        img.save(filename=img_path)
        # Add question to questions
        self._questions.append(Question(img_path, q_num, marks))
    print "Marks: {}".format(mark_sum)
def parse_pdf(path=None, data=None, savePath=None, y_tolerance=1.5,
              char_tolerance=0.5):
    '''Parse a table-style PDF and save it as an .xlsx workbook.

    :param path: input .pdf path (used when *data* is None; also used to
        derive the default output name).
    :param data: optional already-open binary file object for the PDF.
    :param savePath: explicit output path; default replaces .pdf with .xlsx.
    :param y_tolerance: maximum vertical gap treated as the same line.
    :param char_tolerance: maximum horizontal gap treated as the same word.
    :return: None (the workbook is written to disk).
    '''
    # Running count of rows already written, so pages append continuously.
    pdfRowNumber = 0
    theMaxColSize = []
    wb = Workbook()
    ws = wb.active
    if data == None:
        data = open(path, 'rb')
    parser = PDFParser(data)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        height = page.mediabox[3] - page.mediabox[1]
        layout = device.get_result()
        pageContainer, theMaxColNum = get_line_word(
            layout,
            height,
            y_tolerance=y_tolerance,
            char_tolerance=char_tolerance)
        # Sort words in each line left-to-right, then lines top-to-bottom.
        for line in pageContainer:
            line.sort(key=itemgetter('x0'))
        pageContainer.sort(key=lambda line: line[0]['top'])
        # If the first rows are incomplete (continued from the previous
        # page), merge them into one repaired leading row.
        if len(pageContainer[0]) < theMaxColNum:
            for i in range(len(pageContainer)):
                if len(pageContainer[i]) == theMaxColNum:
                    repairList = align_front_row(pageContainer[0:i],
                                                 theMaxColNum)
                    del pageContainer[0:i]
                    pageContainer.insert(0, repairList)
                    break
        # Repair the last row if it is missing columns.
        if len(pageContainer[-1]) < theMaxColNum:
            pageContainer[-1] = align_last_row(pageContainer[-2:],
                                               theMaxColNum)
        # Write the page's rows into the worksheet.
        alignment = Alignment(horizontal='center', vertical='center')
        for idx, line in enumerate(pageContainer):
            for idy, item in enumerate(line):
                cellIndex = ws.cell(row=idx + 1 + pdfRowNumber,
                                    column=idy + 1)
                if item['text'] == '':
                    pass
                elif item['text'] == None:
                    # A None cell marks a row that spans the full width.
                    ws.merge_cells(start_row=idx + 1 + pdfRowNumber,
                                   start_column=1,
                                   end_row=idx + 1 + pdfRowNumber,
                                   end_column=theMaxColNum)
                    ws.cell(idx + 1 + pdfRowNumber, 1).alignment = alignment
                    break
                else:
                    if idx == 0 and len(line) == 2:
                        pass
                    else:
                        cellIndex.alignment = alignment
                    # Store numbers as numbers, everything else as text.
                    if item['text'].isdigit():
                        cellIndex.value = int(item['text'])
                        cellIndex.number_format = '0'
                    elif is_float(item['text']):
                        cellIndex.value = float(item['text'])
                    else:
                        cellIndex.value = item['text']
        # Track the widest cell text seen per column on this page.
        thePageMaxColSize = [0 for i in range(theMaxColNum)]
        for line in pageContainer:
            if len(line) == 2:
                continue
            for col, item in enumerate(line):
                if len(item['text']) > thePageMaxColSize[col]:
                    thePageMaxColSize[col] = len(item['text'])
        if theMaxColSize == []:
            theMaxColSize = thePageMaxColSize[:]
        else:
            for i in range(theMaxColNum):
                if theMaxColSize[i] < thePageMaxColSize[i]:
                    theMaxColSize[i] = thePageMaxColSize[i]
        # Add this page's row count so the next page continues below it.
        pdfRowNumber += len(pageContainer)
    # Size columns from the widest text seen, then save the workbook.
    letter = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
    ]
    for col, theSize in enumerate(theMaxColSize):
        rest = (col + 1) % 26
        cut = int((col + 1) / 26)
        colLetter = ''
        # NOTE(review): when (col + 1) is a multiple of 26 (column Z, AZ, …)
        # rest is 0, so letter[rest - 1] picks 'Z' via negative indexing and
        # cut is off by one — e.g. column 26 yields 'BZ' instead of 'Z'.
        # Works only for fewer than 26 columns; confirm intent.
        if cut == 0:
            colLetter = letter[rest - 1]
        else:
            colLetter = letter[cut] + letter[rest - 1]
        ws.column_dimensions[colLetter].width = theSize * 2
    if savePath != None:
        wb.save(savePath)
    else:
        wb.save(path.replace('.pdf', '.xlsx'))
def createDeviceInterpreter():
    """Build a fresh pdfminer (aggregator, interpreter) pair.

    Both objects share a single resource manager; default layout-analysis
    parameters are used.

    :return: tuple of (PDFPageAggregator, PDFPageInterpreter).
    """
    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=LAParams())
    return aggregator, PDFPageInterpreter(manager, aggregator)
def parse(_path):
    """Download the PDF at *_path* and return the text of its first pages.

    Only LTTextBoxHorizontal objects are collected, and at most the first
    four pages are processed.  Uses the legacy pdfminer document API.

    :param _path: URL of an online PDF document.
    :return: concatenated text of the horizontal text boxes, as a str.
    :raises PDFTextExtractionNotAllowed: if extraction is forbidden.
    """
    # fp = open(_path, 'rb')  # rb: open a local pdf in binary-read mode
    # Pick a random User-Agent from the user_agent list for the request.
    request = Request(url=_path,
                      headers={'User-Agent': random.choice(user_agent)
                               })
    fp = urlopen(request)  # open the online PDF document
    # Create a pdf parser from the file object.
    praser_pdf = PDFParser(fp)
    # Create a PDF document.
    doc = PDFDocument()
    # Link the parser with the document object.
    praser_pdf.set_document(doc)
    doc.set_parser(praser_pdf)
    # Initialize with a password, e.g. doc.initialize("123456");
    # with no password, initialize with an empty string.
    doc.initialize()
    # Abort if the document does not allow text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters.
        laparams = LAParams()
        # Aggregator device.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # PDF page interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Iterate the page list, one page per pass.
        pa = 0
        pdf_news = ''
        for page in doc.get_pages():
            # Read the page with the interpreter.
            interpreter.process_page(page)
            # Fetch the aggregated layout.
            layout = device.get_result()
            i = 0
            # pdf_news = ''
            # layout is an LTPage holding the parsed objects (LTTextBox,
            # LTFigure, LTImage, LTTextBoxHorizontal, ...); text lives on
            # the text-box objects.
            for out in layout:
                # Images etc. have no get_text(); only text boxes do.
                # if hasattr(out,"get_text"):
                i += 1
                # if i <= 15:  # only take the first lines of the pdf
                if isinstance(out, LTTextBoxHorizontal):
                    results = out.get_text()
                    pdf_news += results
                    # print(results)
            # Stop after the first few pages of a multi-page document.
            pa += 1
            if pa >= 4:
                # print(pdf_news)
                # return pdf_news
                break
        # print(pdf_news)
        return pdf_news
import calendar

# Script fragment: load previously-scraped metadata (hrefs, DOIs, issue
# dates) and walk the downloaded PDFs for extraction.
f = open("hrefs.txt", "r", encoding="utf-8")
hrefs = eval(f.read())
f.close()
f = open("DOIs.txt", "r", encoding="utf-8")
DOIs = eval(f.read())
f.close()
f = open("month_year.txt", "r", encoding="utf-8")
month_year = eval(f.read())
f.close()
# show warning
logging.propagate = False
logging.getLogger().setLevel(logging.ERROR)
# NOTE(review): the interpreter is constructed with a *different*
# PDFResourceManager than the aggregator device — pdfminer expects both to
# share one manager; confirm this works as intended.
device = PDFPageAggregator(PDFResourceManager(), laparams=LAParams())
interpreter = PDFPageInterpreter(PDFResourceManager(), device)
month_year_no = []
DOIs_no = []
texts = []
for i in range(len(hrefs)):
    for j in range(len(DOIs[i])):
        # File name pattern: "<month>-<year>-<doi-with-slashes-stripped>".
        name = month_year[i].split(" ")[0] + "-" + month_year[i].split(
            " ")[1] + "-" + DOIs[i][j].replace("/", "")
        file_path = "C:/Users/Administrator/Desktop/AER/extract/" + month_year[
            i].split(" ")[0] + "-" + month_year[i].split(
                " ")[1] + "/" + name + ".pdf"
        if os.path.exists(file_path) and isValidPDF_pathfile(file_path):
            # NOTE(review): this try block is truncated in this chunk.
            try:
                doc = PDFDocument()
import os

from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

# An alternative would be Py2PDF, but that module's encoding handling does
# not work very well.
# TODO : move this whole routine into tools

# Demo script: dump every layout object of a bundled sample PDF.
dirname = os.path.split(__file__)[0]
file = open("{}/../static/20180808CAPDJETJRJ_1.pdf".format(dirname), 'rb')
laparams = LAParams()
pdfrm = PDFResourceManager()
parser = PDFParser(file)
document = PDFDocument(parser)
device = PDFPageAggregator(pdfrm, laparams=laparams)
interpreter = PDFPageInterpreter(pdfrm, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    layout = device.get_result()
    # Print every layout object (text boxes, figures, lines, ...).
    for i in layout:
        print(i)
def rename_file(filename, path):
    # given a file and a path to a pdf downloaded from drf.com with either the time
    # or beyer figure results from a race (ex. TUP--04-30-2017 (1).pdf) extracts the
    # name, date, and number of the race, as well as the horses that ran in it, checks
    # that the former matches the filename, and renames the file.
    # (ex. TUP170430_2_lb.pdf (beyer figure) or TUP170430_2_lt.pdf (time)).
    # returns the list of horses and the new filename as a tuple or None if the name doesn't
    # match the file contents.
    # NOTE(review): `fw` is opened here but never closed in this function —
    # confirm it is intentional (relies on interpreter exit to flush).
    if write_flag:
        fw = open('new_pdfs.txt', "w")
    # group: 0-filename, 1-track, 2-month, 3-day, 4-year, 5-race number
    progN = re.compile(r'([A-Z]+)(\d{2})(\d{2})(\d{2})_(\d+)_(l[bt])\.pdf'
                       )  # if file is already renamed
    prog = re.compile(r'([A-Z]+)--(\d+)-(\d+)-(\d+) ?\(?(\d*)\)?\.pdf'
                      )  # if file hasn't yet been renamed
    if progN.fullmatch(filename):
        m = progN.match(filename)
        num = int(m.group(5))
        # NOTE(review): int(m.group(4)) + ', 20' adds an int to a str and
        # will raise TypeError when this branch runs — likely meant
        # str(int(m.group(4))) + ', 20' + m.group(2); confirm and fix.
        date = months[int(m.group(3)) - 1] + ' ' + str(int(m.group(4)) +
                                                       ', 20' + m.group(2))
        track = race_abbrev[m.group(1)]
        new_name = filename
    elif prog.fullmatch(filename):
        m = prog.match(filename)
        # Race number: an un-suffixed file is race 1; "(n)" means race n+1.
        num = m.group(5)
        if num == '':
            num = 1
        else:
            num = int(num) + 1
        date = months[int(m.group(2)) - 1] + ' ' + str(int(
            m.group(3))) + ', ' + m.group(4)
        track = race_abbrev[m.group(1)]
        new_name = m.group(1) + m.group(4)[-2:] + m.group(2) + m.group(
            3) + "_" + str(num)
        # Suffix encodes which result type the folder holds (time/beyer).
        if timefolder in path:
            new_name += time_ending
        else:
            new_name += beyer_ending
        shutil.copy(os.path.join(path, filename),
                    os.path.join(new_pdf_folder, new_name))
    else:
        # Unrecognized filename: stash it for manual review and bail out.
        shutil.copy(os.path.join(path, filename),
                    os.path.join(incorrectfiles, filename))
        return None, None, None, None
    # Skip files that have already been converted to CSV.
    if new_name.replace('.pdf', '.csv') in all_csvs_list:
        return None, None, None, None
    race = races[num - 1]
    # open file as a pdfminer layout to be parsed
    fp = open(os.path.join(path, filename), 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    unmatched = None
    horses = None
    top = None
    bottom = None
    # parse the pdf to make sure the name, date, and track match, and get a list of the horse names
    for page in doc.get_pages():
        if unmatched or horses:
            break
        interpreter.process_page(page)
        layout = device.get_result()
        unmatched, horses, top, bottom = parse_layout(layout, None, [], None,
                                                      None, race, date, track)
    # copy the file to a new folder with the corrected file name if it contained the key words
    fp.close()
    if unmatched == []:
        if write_flag:
            fw.write(new_name + "\n")
        else:
            return horses, new_name, top, bottom
    else:
        shutil.copy(os.path.join(path, filename),
                    os.path.join(incorrectfiles, filename))
        return None, None, None, None
def _extract_ms(self, file_name): print "Extracting MS" self._new_questions = [] self._questions.reverse() current_question = self._questions.pop() # Load PDF laparams = LAParams() rsrcmgr = PDFResourceManager() document = file(file_name, 'rb') device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) for i, page in enumerate(PDFPage.get_pages(document)): if i != 3: continue # Convert pdf page to png img_path = os.path.join(self._folder, "temp.png") with Image(filename=file_name + "[{}]".format(i), resolution=int(72 * Paper.QUALITY)) as img: with Image(width=img.width, height=img.height, background=Color("white")) as background: background.composite(img, 0, 0) background.save(filename=img_path) # Load png to crop with Image(filename=img_path) as img: # Set initial crop x = 28 * Paper.QUALITY y = 0 width = 567 * Paper.QUALITY height = img.height # Get page layout interpreter.process_page(page) layout = device.get_result() # Extract metadata textboxes = [ r for r in layout._objs if isinstance(r, LTTextBoxHorizontal) ] ms_top = 0 for textbox in textboxes: text = textbox.get_text() if text in [ "Solution \n", "Mark \n", "Total \n", "Comment \n" ]: if ms_top == text.y0 or ms_top == 0: ms_top = text.y0 else: pass # Crop up to this point and reset (2 questions on page) else: print repr(text) path = os.path.join(self._folder, "m{}.png".format(i)) img.crop(x, y, width, height) img.save(filename=path) # # Extract metadata # textboxes = [r for r in layout._objs if isinstance( # r, LTTextBoxHorizontal)] # q_num = 0 # q_y = 0 # q_height = 0 # for textbox in textboxes: # text = textbox.get_text() # q_passed = False # if text.startswith("Q"): # q_passed = True # try: # q_num = int(text[1]) # except ValueError: # pass # elif text.startswith("\nQ"): # q_passed = True # try: # q_num = int(text[3]) # except ValueError: # pass # if q_passed: # if q_y > 0: # y = q_y # new_q_y = img.height - int(textbox.y1) - 4 # height = new_q_y - q_y # 
self._save_cropped("m{}".format( # q_num), img, img_x, y, img_width, height) # q_y = new_q_y # else: # q_y = img.height - int(textbox.y1) - 4 # if q_num > 0: # self._save_cropped("m{}".format( # q_num), img, img_x, q_y, img_width, img_height) os.remove(img_path)
# ab+ 以二进制读写模式打开 fp = open('A Compact and Embedded Balanced.pdf', 'rb') #获取文档对象 #rb --> 以二进制读的模式打开 #若要读取网络上的PDF: #from urllib.request import urlopen #fp = urlopen('http://gk.xmu.edu.cn/_upload/article/files/8f/f0/03862022456bb3a0421b8be3223d/90eeb364-6195-4cd3-958b-5bcc0bb71b99.pdf') parser = PDFParser(fp) #创建一个与文档关联的解释器 doc = PDFDocument() #PDF文档对象 parser.set_document(doc) #连接解释器和文档对象 doc.set_parser(parser) #连接解释器和文档对象 doc.initialize('') #初始化文档 #因为文档没有密码,所以为空 resource = PDFResourceManager() #创建PDF资源管理器 laparam = LAParams() #参数分析器 device = PDFPageAggregator(resource, laparams=laparam) #创建聚合器=资源管理器+参数分析器 interpreter = PDFPageInterpreter(resource, device) #创建PDF页面解释器 for page in doc.get_pages(): #使用文档对象得到页面的集合 interpreter.process_page(page) #使用页面解释器来读取 layout = device.get_result() #使用聚合器来获得内容 for out in layout: if hasattr(out, 'get_text'): #判断out是否有get_text这个属性 print(out.get_text())
def scan_PDF():
    """Scan every .pdf under exe_path for internal-control disclosures.

    Extracts the text of each PDF with pdfminer, looks for the company
    code/name and for "material weakness"/"significant deficiency"/
    "consistency" checkbox answers, logs flagged companies to
    Final_Result.txt, unreadable files to Error_Log.txt, and writes all
    question/answer rows into Log_File.xlsx.
    """
    log = []
    # Walk the files under exe_path; files[0] is the root, files[1] the
    # sub-directories, files[2] the file names.
    for files in os.walk(exe_path):
        for i in range(0, len(files[2])):
            # os.path.splitext splits off the extension to test for .pdf.
            if os.path.splitext(files[2][i])[1] == '.pdf' or os.path.splitext(
                    files[2]
                [i])[1] == '.PDF':
                file_name = files[2][i]
                try:
                    # ---- Extract the text content of the PDF ----
                    file = open(os.path.join(exe_path, file_name), 'rb')
                    parser = PDFParser(
                        file
                    )  # Translate the binary file to recognizable datastream (PDFParser object)
                    document = PDFDocument()  # create a PDF document
                    parser.set_document(document)  # link parser and document
                    document.set_parser(parser)
                    document.initialize()  # initialize the pdf (no password)
                    resource = PDFResourceManager()
                    laparams = LAParams()
                    device = PDFPageAggregator(
                        resource, laparams=laparams)  # PDF device object
                    interpreter = PDFPageInterpreter(resource,
                                                     device)  # PDF interpreter
                    content = []
                    i = 0
                    for page in document.get_pages():
                        # The interpreter feeds the parsed page into the
                        # device (which holds the shared PDF resources).
                        interpreter.process_page(
                            page
                        )
                        layout = device.get_result()
                        for raw_data in layout:
                            # Only TextBox content is wanted here; print(x)
                            # other objects to inspect them if needed.
                            if (
                                    isinstance(raw_data, LTTextBoxHorizontal)
                            ):
                                #print(i,raw_data.get_text())
                                content.append(
                                    raw_data.get_text())  # collect the text
                                #i=i+1
                    # ---- Recognize the disclosures in the text ----
                    tag = 0  # incremented whenever a defect/problem is found
                    validation = -1  # -1 = data unreadable; 0 once code/name were read
                    for i in range(0, len(content)):
                        if content[i].split(" ")[0].find("公司代码") > -1:
                            print("公司代码: " + content[i].split(":")[1].split(" ")[0])
                            sec_code = content[i].split(":")[1].split(" ")[
                                0]  # read the security code
                            print("公司简称: " + content[i].split(":")[2])
                            sec_name = content[i].split(
                                ":")[2].strip()  # read the security name
                            validation = 0
                        # A checkbox answer follows a line that mentions a
                        # material/significant weakness or consistency.
                        if (content[i].find("重大缺陷") > -1
                                or content[i].find("重要缺陷") > -1
                                or content[i].find("一致") > -1
                            ) and content[i + 1].find("√") > -1:
                            #print(content[i])
                            #print(content[i+1])
                            if content[i].find("一致") > -1:
                                question = content[i].split(" ")[1]
                                if content[i + 1].find("√否") > -1:
                                    answer = "不一致"
                                    tag = tag + 1
                                elif content[i + 1].find("√是") > -1:
                                    answer = "一致"
                                else:
                                    continue
                                log.append(
                                    [sec_code, sec_name, question, answer])
                            elif (content[i].find("重大缺陷") > -1
                                  and content[i].split(" ")[1] == "重大缺陷") or (
                                      content[i].find("重要缺陷") > -1
                                      and content[i].split(" ")[1] == "重要缺陷"):
                                question = content[i + 1].split(" ")[0]
                                if content[i + 1].find("√是") > -1:
                                    answer = "是"
                                    tag = tag + 1
                                elif content[i + 1].find("√否") > -1:
                                    answer = "否"
                                else:
                                    continue
                                log.append(
                                    [sec_code, sec_name, question, answer])
                            elif (content[i].find("重大缺陷") > -1
                                  and content[i].split(" ")[1] != "重大缺陷") or (
                                      content[i].find("重要缺陷") > -1
                                      and content[i].split(" ")[1] != "重要缺陷"):
                                question = content[i].split(" ")[1]
                                if content[i + 1].find("√是") > -1:
                                    answer = "是"
                                    tag = tag + 1
                                elif content[i + 1].find("√否") > -1:
                                    answer = "否"
                                else:
                                    continue
                                log.append(
                                    [sec_code, sec_name, question, answer])
                            else:
                                continue
                    print(tag, validation)
                    if tag > 0:
                        # tag > 0 means an anomaly was found: record it.
                        file = open(os.path.join(exe_path, 'Final_Result.txt'),
                                    'a+')
                        file.write(sec_code + " " + sec_name + "\r\n")
                        file.close()
                    if validation == -1:
                        # -1 means the data could not be read: flag for
                        # manual inspection.
                        file = open(os.path.join(exe_path, 'Error_Log.txt'),
                                    'a+')
                        file.write(os.path.join(exe_path, file_name) + "\r\n")
                        file.close()
                except:
                    # NOTE(review): bare except hides all failures; the file
                    # is merely logged for manual inspection.
                    file = open(os.path.join(exe_path, 'Error_Log.txt'),
                                'a+')
                    file.write(os.path.join(exe_path, file_name) + "\r\n")
                    file.close()
            else:
                continue
    # Write the accumulated rows to an Excel log.
    workbook = xlsxwriter.Workbook(os.path.join(exe_path, "Log_File.xlsx"))
    worksheet = workbook.add_worksheet("Log_File")
    columns = ['Sec_Code', 'Sec_Name', 'Question', 'Answer']
    for i in range(0, 4):
        worksheet.write(0, i, columns[i])
    for i in range(0, len(log)):
        for j in range(0, 4):
            worksheet.write(i + 1, j, log[i][j])
    workbook.close()
    print("识别完成")
    os.system('pause')
class PdfFileParser(object):
    """Stateful parser for splitting/trimming a PDF file.

    Caches previously computed page coordinates in a '<infile>.pickle'
    side file and rebuilds the pdfminer pipeline only when no valid cache
    is found.  (Python 2 code.)
    """

    def __init__(self,
                 infile,
                 outfile=None,
                 password=None,
                 selectedpages=None,
                 maxSplit=3,
                 W=1440.0,
                 H=1080.0,
                 outputJson=False,
                 trimbox=None,
                 trimboxes=None,
                 exclude=False,
                 debug=0):
        # Snapshot the constructor arguments (minus self/outputJson) so a
        # cached pickle can be matched against the current configuration.
        self.args = {
            a[0]: a[1]
            for a in locals().items() if a[0] not in ['self', 'outputJson']
        }
        self.outputJson = outputJson
        self.DEBUG = debug
        self.picklefile = infile + '.pickle'
        self.selectedpages = selectedpages
        self.pickleLoaded = False
        self.savedconfig = None
        self.coords = []
        self.pagesCoords = []
        self.trimbox = trimbox
        self.trimboxes = trimboxes
        self.exclude = exclude
        self.pageRanges = SelectedPages(selectedpages)
        # Try to reuse cached page coordinates when the saved arguments
        # match the current ones.
        if ENABLE_PICKLE and os.path.isfile(self.picklefile):
            try:
                with open(self.picklefile, 'rb') as f:
                    self.savedconfig = pickle.load(f)
                savedargs = self.savedconfig['args']
                equal = True
                for k, v in self.args.items():
                    if k == 'selectedpages':
                        # Page selection only needs to be a subset of the
                        # cached selection, not identical.
                        if v not in SelectedPages(savedargs[k]):
                            equal = False
                    elif k not in savedargs:
                        equal = False
                    elif v != savedargs[k]:
                        equal = False
                    if not equal:
                        break
                if equal:
                    self.pickleLoaded = True
                    self.pagesCoords = self.savedconfig['pagesCoords']
            except Exception, e:
                # A corrupt/stale cache is non-fatal; fall through and
                # re-parse the document from scratch.
                print e
        self.fname = infile
        self.W = float(W)
        self.H = float(H)
        self.maxSplit = maxSplit
        self.outfile = outfile
        # Default output name: '<infile>-out<ext>'.
        if self.outfile == None:
            outFilename, outExt = os.path.splitext(infile)
            self.outfile = outFilename + '-out' + outExt
        # Embed the page selection into the output name.
        if not (self.selectedpages == None or self.selectedpages == ''):
            outFilename, outExt = os.path.splitext(self.outfile)
            self.outfile = '%s(%s)%s' % (outFilename, self.selectedpages,
                                         outExt)
        # Avoid clobbering an existing output file by appending a counter.
        # NOTE(review): the while loop probes names of the form
        # "name(i).ext" but the final assignment builds "name<i>.ext"
        # without parentheses — the probed and produced names differ;
        # confirm which format is intended.
        if os.path.isfile(self.outfile):
            i = 1
            outfile, outExt = os.path.splitext(self.outfile)
            while os.path.isfile("%s(%d)%s" % (outfile, i, outExt)):
                i += 1
            self.outfile = "%s%d%s" % (outfile, i, outExt)
        self.password = password
        self.endPage = self.pageRanges.getEndPage(
            30000) - 1  # 1 base vs 2 base
        self.inFile = open(self.fname, 'rb')
        self.parser = PDFParser(self.inFile)
        self.document = PDFDocument(self.parser)
        self.rsrcmgr = PDFResourceManager()
        self.laparams = LAParams()
        # The pdfminer pipeline is only needed when nothing was cached.
        if not self.pickleLoaded:
            self.device = PDFPageAggregator(self.rsrcmgr,
                                            laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
            self.pagesEnumerator = enumerate(
                PDFPage.create_pages(self.document))
from pdfminer.layout import *
from pdfminer.converter import PDFPageAggregator

# Script (Python 2): print each horizontal text box of a local PDF.
fp = open('D:\\temp\\pdf_html\\data\\sub\\1202098108.pdf', 'rb')
# Create a pdf parser for the file.
parser = PDFParser(fp)
# PDF document object holding the document structure.
document = PDFDocument(parser)
# Abort if the document does not allow text extraction.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
else:
    # Resource manager storing shared resources.
    rsrcmgr = PDFResourceManager()
    # Parameters for the layout analysis.
    laparams = LAParams()
    # PDF device object.
    # device=PDFDevice(rsrcmgr)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process every page.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                print x, x.get_text()
                # with open('a.txt','a') as f:
                #     f.write(x.get_text().encode('utf-8')+'\n')
def get_columns(fname):
    """Detect column-partition rectangles in a multi-page PDF.

    Stacks all pages into one vertical coordinate space (900 units per
    page), starts from one candidate gutter rectangle per page, and
    repeatedly subtracts every horizontal text box from the candidates.
    The surviving rectangles are scored; a low total score means the
    document is effectively single-column.

    :param fname: path to the PDF file.
    :return: (flat list of text boxes, flat list of partition rectangles,
        number of pages).

    NOTE(review): `element.bbox[1] / round_up` relies on integer division —
    this looks like Python 2 code; under Python 3 the rounding-to-grid no
    longer happens. Also `laparams` is read from an enclosing scope.
    """
    document = open(fname, 'rb')
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = []
    for page in PDFPage.get_pages(document):
        pages.append(page)
    num_pages = len(pages)
    doc_part_rects = []   # per-page surviving partition rectangles
    doc_text_boxes = []   # per-page horizontal text boxes
    # Candidate gutter bounds and minimum sizes (page-space units).
    leftmost_x = 150
    rightmost_x = 350
    minimum_height_top = 150
    minimum_height_bottom = 250
    round_up = 4          # snap text-box y coordinates to this grid
    page_num = 0
    for page in pages:
        interpreter.process_page(page)
        layout = device.get_result()
        doc_text_boxes.append([])
        doc_part_rects.append([])
        # Pages are stacked: page 0 occupies the topmost 900-unit band.
        topmost_y = 900 * (num_pages - page_num)
        bottommost_y = 900 * (num_pages - page_num - 1)
        # assume there is a single rectangle that partions the page into 2 sections
        part_rects = [(leftmost_x, bottommost_y, rightmost_x, topmost_y)]
        # Process narrow boxes first.
        layout = sorted(layout,
                        key=lambda element:
                        (element.bbox[2] - element.bbox[0]))
        # for each text box eliminate possibility of possible partion rectangles
        for element in layout:
            if isinstance(element, LTTextBoxHorizontal):
                # Snap y coordinates to the grid and shift into the
                # stacked document coordinate space.
                element.bbox = list(element.bbox)
                element.bbox[1] = (int(element.bbox[1]) /
                                   round_up) * round_up + bottommost_y
                element.bbox[3] = (int(element.bbox[3]) /
                                   round_up) * round_up + bottommost_y
                doc_text_boxes[page_num].append(element)
                new_part_rects = []
                for part_rect in part_rects:
                    if not overlap_rects(part_rect, element.bbox):
                        # Untouched candidate survives as-is.
                        new_part_rects.append(part_rect)
                    else:
                        # Box spans the candidate horizontally: keep the
                        # strips above and below it, if tall enough.
                        if element.bbox[0] <= part_rect[
                                0] and element.bbox[2] >= part_rect[2]:
                            if (part_rect[3] -
                                    element.bbox[3]) > minimum_height_top:
                                new_part_rects.append(
                                    (part_rect[0], element.bbox[3],
                                     part_rect[2], part_rect[3]))
                            if (element.bbox[1] -
                                    part_rect[1]) > minimum_height_bottom:
                                new_part_rects.append(
                                    (part_rect[0], part_rect[1],
                                     part_rect[2], element.bbox[1]))
                        # Box overlaps only the candidate's left edge.
                        elif element.bbox[0] <= part_rect[
                                0] and element.bbox[2] > part_rect[0]:
                            new_part_rects.append(
                                (element.bbox[2], part_rect[1],
                                 part_rect[2], part_rect[3]))
                            if part_rect[3] == topmost_y and (
                                    part_rect[3] -
                                    element.bbox[3]) > minimum_height_top:
                                new_part_rects.append(
                                    (part_rect[0], element.bbox[3],
                                     part_rect[2], part_rect[3]))
                            if part_rect[1] == bottommost_y and (
                                    element.bbox[1] -
                                    part_rect[1]) > minimum_height_bottom:
                                new_part_rects.append(
                                    (part_rect[0], part_rect[1],
                                     part_rect[2], element.bbox[1]))
                        # Box overlaps only the candidate's right edge.
                        elif element.bbox[0] < part_rect[
                                2] and element.bbox[2] >= part_rect[2]:
                            new_part_rects.append(
                                (part_rect[0], part_rect[1],
                                 element.bbox[0], part_rect[3]))
                            if part_rect[3] == topmost_y and (
                                    part_rect[3] -
                                    element.bbox[3]) > minimum_height_top:
                                new_part_rects.append(
                                    (part_rect[0], element.bbox[3],
                                     part_rect[2], part_rect[3]))
                            if part_rect[1] == bottommost_y and (
                                    element.bbox[1] -
                                    part_rect[1]) > minimum_height_bottom:
                                new_part_rects.append(
                                    (part_rect[0], part_rect[1],
                                     part_rect[2], element.bbox[1]))
                        # Box lies strictly inside the candidate: split it
                        # into left/right (and top/bottom) remainders.
                        elif element.bbox[0] > part_rect[
                                0] and element.bbox[2] < part_rect[2]:
                            new_part_rects.append(
                                (part_rect[0], part_rect[1],
                                 element.bbox[0], part_rect[3]))
                            new_part_rects.append(
                                (element.bbox[2], part_rect[1],
                                 part_rect[2], part_rect[3]))
                            if part_rect[3] == topmost_y and (
                                    part_rect[3] -
                                    element.bbox[3]) > minimum_height_top:
                                new_part_rects.append(
                                    (part_rect[0], element.bbox[3],
                                     part_rect[2], part_rect[3]))
                            if part_rect[1] == bottommost_y and (
                                    element.bbox[1] -
                                    part_rect[1]) > minimum_height_bottom:
                                new_part_rects.append(
                                    (part_rect[0], part_rect[1],
                                     part_rect[2], element.bbox[1]))
                        # Touching edges only: the candidate is unaffected.
                        elif element.bbox[0] == part_rect[
                                2] or element.bbox[2] == part_rect[0]:
                            new_part_rects.append(part_rect)
                        else:
                            print(part_rect)
                            print(element.bbox)
                            raise Exception(
                                "Unhandled case in overlaping rectangles")
                part_rects = new_part_rects
        # From the survivors, keep a full-height rectangle if one exists;
        # otherwise keep the tallest bottom-anchored and top-anchored ones.
        largest_lower_rect_height = 0
        largest_upper_rect_height = 0
        largest_lower_rect = None
        largest_upper_rect = None
        single_largest_rect_present = False
        single_largest_rect = None
        for part_rect in part_rects:
            if part_rect[1] == bottommost_y and part_rect[3] == topmost_y:
                single_largest_rect_present = True
                single_largest_rect = part_rect
                break
            elif part_rect[1] == bottommost_y and (
                    part_rect[3] -
                    part_rect[1]) > largest_lower_rect_height:
                largest_lower_rect = part_rect
                largest_lower_rect_height = (part_rect[3] - part_rect[1])
            elif part_rect[3] == topmost_y and (
                    part_rect[3] -
                    part_rect[1]) > largest_upper_rect_height:
                largest_upper_rect = part_rect
                largest_upper_rect_height = (part_rect[3] - part_rect[1])
        if single_largest_rect_present:
            doc_part_rects[page_num].append(single_largest_rect)
        else:
            if largest_lower_rect:
                doc_part_rects[page_num].append(largest_lower_rect)
            if largest_upper_rect:
                doc_part_rects[page_num].append(largest_upper_rect)
        page_num += 1
    # Flatten per-page lists and score the partition rectangles by how much
    # of the text band they cover.
    flat_doc_text_boxes = []
    flat_doc_part_rects = []
    column_score = 0
    for i in range(num_pages):
        text_section_height_score = (
            max(doc_text_boxes[i],
                key=lambda element: element.bbox[3]).bbox[3] -
            min(doc_text_boxes[i],
                key=lambda element: element.bbox[1]).bbox[1]) / 900
        for part_rect in doc_part_rects[i]:
            column_score += (part_rect[3] -
                             part_rect[1]) * text_section_height_score
            flat_doc_part_rects.append(part_rect)
        for element in doc_text_boxes[i]:
            flat_doc_text_boxes.append(element)
    column_score = column_score / (900 * num_pages)
    # Low score: treat the document as single-column (keep one rectangle).
    if column_score < 0.4:
        flat_doc_part_rects = [flat_doc_part_rects[0]]
    return flat_doc_text_boxes, flat_doc_part_rects, num_pages
def render_image(self, name, stream):
    # Skip image handling entirely when no image writer is configured;
    # otherwise delegate to the aggregator's default implementation.
    # NOTE(review): statement grouping reconstructed from flattened source —
    # confirm the early return belongs to the None case.
    if self.imagewriter is None:
        return
    PDFPageAggregator.render_image(self, name, stream)
    return
def __init__(self, filename, laparams=None):
    """Open *filename* and wire up the pdfminer aggregation pipeline.

    :param filename: path to the PDF file to open in binary mode.
    :param laparams: optional layout-analysis parameters for the aggregator.
    """
    self.fp = open(filename, 'rb')
    # One resource manager is shared by the aggregator and the interpreter.
    shared_resources = PDFResourceManagerNew()
    self.device = PDFPageAggregator(shared_resources, laparams=laparams)
    self.interpreter = PDFPageInterpreter(shared_resources, self.device)
    # Cache of extracted values, filled in later.
    self.val = {}
def load_file_text(self, import_file):
    """ Import from file types of odt, docx pdf, epub, txt, html, htm.

    Extracts plain text from *import_file* (dispatching on the filename
    extension), then inserts the result as a new row in the ``source``
    database table and appends it to ``self.source``.

    :param import_file: path to the file to import (extension decides the parser)
    """
    text = ""

    # Import from odt
    if import_file[-4:].lower() == ".odt":
        text = self.convert_odt_to_text(import_file)
    # Import from docx
    if import_file[-5:].lower() == ".docx":
        #text = convert(importFile)  # uses docx_to_html
        document = opendocx(import_file)
        list_ = getdocumenttext(document)
        text = "\n".join(list_)
    # Import from epub
    if import_file[-5:].lower() == ".epub":
        book = epub.read_epub(import_file)
        for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            #print(d.get_content())
            bytes_ = d.get_body_content()
            string = bytes_.decode('utf-8')
            text += html_to_text(string) + "\n"
    # import PDF
    if import_file[-4:].lower() == '.pdf':
        # NOTE(review): fp is never closed in this branch — consider a
        # with-statement; confirm against callers before changing.
        fp = open(import_file, 'rb')  # read binary mode
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # potential error with encrypted PDF
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            # Only keep text-like layout objects; figures/images are dropped.
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                    text += lt_obj.get_text()
    # import from html
    if import_file[-5:].lower() == ".html" or import_file[-4:].lower() == ".htm":
        importErrors = 0
        with open(import_file, "r") as sourcefile:
            fileText = ""
            while 1:
                line = sourcefile.readline()
                if not line:
                    break
                fileText += line
            text = html_to_text(fileText)
        QtWidgets.QMessageBox.warning(None, _('Warning'),
            str(importErrors) + _(" lines not imported"))
    # Try importing as a plain text file.
    if text == "":
        import_errors = 0
        try:
            with open(import_file, "r") as sourcefile:
                while 1:
                    line = sourcefile.readline()
                    if not line:
                        break
                    try:
                        text += line
                    except Exception as e:
                        #logger.debug("Importing plain text file, line ignored: " + str(e))
                        import_errors += 1
            # NOTE(review): "\ufeff" is a single character; the [0:6] slice
            # looks like a leftover from a 6-char literal — confirm intent.
            if text[0:6] == "\ufeff":  # associated with notepad files
                text = text[6:]
        except Exception as e:
            QtWidgets.QMessageBox.warning(None, _('Warning'),
                _("Cannot import ") + str(import_file) + "\n" + str(e))
            return
        if import_errors > 0:
            QtWidgets.QMessageBox.warning(None, _('Warning'),
                str(import_errors) + _(" lines not imported"))
            logger.warning(import_file + ": " + str(import_errors) + _(" lines not imported"))
    # import of text file did not work
    if text == "":
        # NOTE(review): `e` may be unbound here if the try above never
        # raised — verify this branch is reachable only after an exception.
        QtWidgets.QMessageBox.warning(None, _('Warning'),
            _("Cannot import ") + str(import_file) + "\n" + str(e))
        return
    # Final checks: check for duplicated filename and update model, widget and database
    nameSplit = import_file.split("/")
    filename = nameSplit[-1]
    if any(d['name'] == filename for d in self.source):
        QtWidgets.QMessageBox.warning(None, _('Duplicate file'),
            _("Duplicate filename.\nFile not imported"))
        return
    entry = {'name': filename, 'id': -1, 'fulltext': text, 'mediapath': None,
        'memo': "", 'owner': self.settings['codername'],
        'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
    cur = self.settings['conn'].cursor()
    #logger.debug("type fulltext: " + str(type(entry['fulltext'])))
    cur.execute("insert into source(name,fulltext,mediapath,memo,owner,date) values(?,?,?,?,?,?)",
        (entry['name'], entry['fulltext'], entry['mediapath'], entry['memo'],
        entry['owner'], entry['date']))
    self.settings['conn'].commit()
    # Fetch the autogenerated row id so the in-memory entry matches the DB.
    cur.execute("select last_insert_rowid()")
    id_ = cur.fetchone()[0]
    entry['id'] = id_
    self.parent_textEdit.append(entry['name'] + _(" imported."))
    self.source.append(entry)
def __init__(self, rsrcmgr, pageno=1, laparams=None):
    """Initialise the aggregator plus the per-page row accumulator.

    Delegates pdfminer setup to ``PDFPageAggregator``, then resets the
    collected ``rows`` list and the running ``page_number`` counter.
    """
    base_init = PDFPageAggregator.__init__
    base_init(self, rsrcmgr, pageno=pageno, laparams=laparams)
    self.page_number = 0
    self.rows = []
parser = PDFParser(fp) # # Create a PDF document object that stores the document structure. doc = PDFDocument() # # Connect the parser and document objects. parser.set_document(doc) doc.set_parser(parser) # Supply the password for initialization. # # (If no password is set, give an empty string.) # doc.initialize(password) # # Check if the document allows text extraction. If not, abort. #if not doc.is_extractable: # raise PDFTextExtractionNotAllowed # # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() # # Create a PDF device object. #device = PDFDevice(rsrcmgr) laparams = LAParams() device = PDFPageAggregator(rsrcmgr, laparams=laparams) # # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # # Process each page contained in the document. text_content = [] for page in doc.get_pages(): interpreter.process_page(page) lt_objs = device.get_result() for lt_obj in lt_objs: if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine): #page_text = update_page_text_hash(page_text, lt_obj) print lt_obj.get_text() print "\n \n new =================="
class StatementReader:
    """ Reader Class of a pdf statement.

    Parses a bank-statement PDF into rows/columns of strings and maps them
    to a running ``transaction_list`` plus a ``start_balance``.
    """

    def __init__(self, file: Union[str, Path]):
        """Open the statement file and build the pdfminer pipeline.

        :param file: path to the pdf statement
        """
        self.start_balance = None      # set on the first BALANCE BROUGHT FORWARD
        self.transaction_list = []     # accumulated transaction dicts
        self.f = open(file, 'rb')      # kept open; released by close_statement()
        resource_manager = PDFResourceManager()
        params = LAParams()
        self.device = PDFPageAggregator(resource_manager, laparams=params)
        self.interpreter = PDFPageInterpreter(resource_manager, self.device)

    def read_statement(self):
        """ Read the pdf statement pages
        :return: statement pages (one string-iterator per page)
        """
        # Process each page contained in the statement.
        page_list = []
        for page in PDFPage.get_pages(self.f):
            str_list = self.read_page(page)
            page_list.append(str_list)
        return page_list

    def read_page(self, page: Iterator[PDFPage]):
        """ Read a page from the statement
        :param page: statement page
        :return: strings of the page, sorted by (row, col)
        """
        characters = []
        self.interpreter.process_page(page)
        layout = self.device.get_result()
        for box in layout:
            if isinstance(box, LTTextBoxHorizontal):
                characters.extend(extract_characters(box))
        # Create list of characters (project Char wrapper around LTChar)
        char_list = [
            Char(char) for char in characters if isinstance(char, LTChar)
        ]
        # Sort top-to-bottom first (pdf y grows upward, hence reverse).
        char_list = sorted(char_list, key=lambda char: char.y0, reverse=True)
        # Attribute a row number to each character: a vertical gap larger
        # than half a character height starts a new row.
        char_list[0].row = 0
        for i in range(1, len(char_list)):
            if (char_list[i - 1].y0 - char_list[i].y0) > CHAR_HEIGHT / 2:
                char_list[i].row = char_list[i - 1].row + 1
            else:
                char_list[i].row = char_list[i - 1].row
        # Re-sort in reading order: row, then left-to-right.
        char_list = sorted(char_list, key=lambda char: (char.row, char.x0))
        # Create list of strings: consecutive characters in the same
        # (row, col) cell are glued together; a horizontal gap wider than
        # CHAR_WIDTH inserts a space.
        str_list = []
        previous_row = char_list[0].row
        previous_col = char_list[0].col
        i = 1
        while i < len(char_list):
            current_row = char_list[i].row
            current_col = char_list[i].col
            current_col_name = char_list[i].col_name
            string = String(current_row, current_col, current_col_name)
            while True and i < len(char_list):
                if (char_list[i].row == previous_row
                        and char_list[i].col == previous_col):
                    if (char_list[i].x0 - char_list[i - 1].x1) > CHAR_WIDTH:
                        string.text = ' '.join(
                            (string.text, char_list[i].text))
                    else:
                        string.text = ''.join((string.text, char_list[i].text))
                else:
                    # Cell boundary reached: finalise the string and restart
                    # the outer loop at the current character.
                    previous_row = char_list[i].row
                    previous_col = char_list[i].col
                    string.clean()
                    str_list.append(string)
                    break
                i = i + 1
        str_list = iter(sorted(str_list, key=lambda x: (x.row, x.col)))
        return str_list

    def get_statement_details(self):
        """ Map the strings from the all the statement pages to attributes """
        page_list = self.read_statement()
        for str_list in page_list:
            self.get_transaction_details(str_list)

    def get_transaction_details(self, str_list: Iterator[String]):
        """ Map the strings from the page to attributes
        :param str_list: string list of a page
        """
        while True:
            string = next(str_list, None)
            if string is None:
                break
            # First BALANCE BROUGHT FORWARD
            elif string.text == 'BALANCE BROUGHT FORWARD':
                string = next(str_list)
                if self.start_balance is None:
                    # Some time, there is a '.' in the first line so we pass it
                    if string.text == '.':
                        string = next(str_list)
                    self.start_balance = to_float(string.text)
                    string = next(str_list)
                # Last BALANCE BROUGHT FORWARD: consume rows until the
                # closing marker, one statement row per iteration.
                while string.text != 'BALANCE CARRIED FORWARD':
                    current_row = string.row
                    new_transaction = False
                    date = None
                    method_symbol = None
                    entity = None
                    amount = 0
                    # Gather every cell belonging to the current row.
                    while string.row == current_row:
                        if string.text == 'BALANCE CARRIED FORWARD':
                            break
                        if string.col_name == 'date':
                            date = to_date_str(to_date(string.text))
                        elif string.col_name == 'payment_type':
                            method_symbol = string.text
                            new_transaction = True
                        elif string.col_name == 'entity':
                            entity = string.text
                        elif string.col_name == 'paid_out':
                            amount = amount - to_float(string.text)
                        elif string.col_name == 'paid_in':
                            amount = amount + to_float(string.text)
                        elif string.col_name == 'balance':
                            pass
                        else:
                            raise ValueError('col name not found')
                        string = next(str_list)
                    else:
                        # while-else: only runs when the row ended normally
                        # (no CARRIED FORWARD break).
                        if new_transaction:
                            if date is None:
                                # Date omitted: inherit it from the previous
                                # transaction (same-day entries).
                                prev_transaction = self.transaction_list[-1]
                                date = prev_transaction['date']
                            transaction = dict(date=date,
                                               method=METHOD[method_symbol],
                                               method_symbol=method_symbol,
                                               entity=entity,
                                               amount=amount,
                                               ccy=CCY,
                                               account=ACCOUNT)
                            self.transaction_list.append(transaction)
                        else:
                            # Continuation row: merge into the previous
                            # transaction's amount and entity text.
                            prev_transaction = self.transaction_list[-1]
                            prev_transaction['amount'] = amount
                            prev_transaction['entity'] = ' '.join(
                                (prev_transaction['entity'], entity))
                            self.transaction_list[-1] = prev_transaction
            else:
                # Anything else ends the interesting section of the page.
                break

    def close_statement(self):
        """ Close the pdf statement """
        self.f.close()
        self.device.close()
def parse_pdf(pdf_path):
    """
    Read a pdf file and append its text content to a .txt file next to it.
    TODO: if the text cannot be read directly, try Tesseract (OCR library).
    :param pdf_path: path to the pdf file (output is written to the same
        path with a 'txt' suffix)
    :return: None (prints object counts as a side effect)
    """
    fp = open(pdf_path, 'rb')  # open in binary read mode
    # Create a pdf document analyzer from the file object
    parser = PDFParser(fp)
    # Create a PDF document
    doc = PDFDocument()
    # Connect the parser and the document object
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the initialization password
    # (an empty call when there is no password)
    doc.initialize()
    # Check whether the document allows txt extraction; abort if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create a PDF resource manager to manage shared resources
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Counters for pages, images, curves, figures and horizontal text boxes
        num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0
        # Loop over the page list, handling one page at a time
        for page in doc.get_pages():
            num_page += 1
            interpreter.process_page(page)  # receive the LTPage object for this page
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTImage):  # image object
                    num_image += 1
                if isinstance(x, LTCurve):  # curve object
                    num_curve += 1
                if isinstance(x, LTFigure):  # figure object
                    # 2018.11.27 change: some pdf text is recognised as an
                    # LTFigure object; add that text content to the result.
                    num_figure += 1
                    # Append text content to the sibling .txt file
                    new_path = pdf_path[:-3] + 'txt'
                    with open(new_path, 'a', encoding='utf8') as f:
                        # NOTE(review): result of get_textboxes() is
                        # discarded — confirm whether it was meant to be used.
                        x.get_textboxes()
                        for x_in in x:
                            if isinstance(x_in, LTChar):
                                results = x_in.get_text()
                                f.write(results)
                if isinstance(x, LTTextBoxHorizontal):  # get the text content
                    num_TextBoxHorizontal += 1
                    # Append text content to the sibling .txt file
                    new_path = pdf_path[:-3] + 'txt'
                    with open(new_path, 'a', encoding='utf8') as f:
                        results = x.get_text()
                        f.write(results + '\n')
        # Print the object counts (figures are counted but not printed).
        print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
              '曲线数:%s\n' % num_curve, '水平文本框:%s\n' % num_TextBoxHorizontal)
def parse(fileName):
    """Extract product data from an uploaded pdf and emit a QR-stamped copy.

    Reads ``upload_path + fileName + '.pdf'`` page by page, parses product
    fields from each page's text, merges a generated QR-code page onto the
    matching original page with PyPDF2, and writes the combined pdf into
    ``download_path``.

    :param fileName: base name (without extension) of the uploaded pdf
    :return: the generated output file name
    """
    text_path = upload_path + fileName + ".pdf"
    hmPdfSaveName = ""
    fileOpen = open(text_path, 'rb')
    doc = PDFDocument()
    parser = PDFParser(fileOpen)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    # The original file (read again by PyPDF2 for page merging)
    hmPdfReaderPDF = PyPDF2.PdfFileReader(fileOpen)
    # The file that the stamped pages will be written into
    hmPdfWriter = PyPDF2.PdfFileWriter()
    # Check whether the document allows txt extraction; abort if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create a PDF resource manager to share resources
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        openFileArr = []  # QR-code files kept open until the final write
        allPages = doc.get_pages()
        for page in allPages:
            interpreter.process_page(page)
            layout = device.get_result()
            textValueArr = []
            for x in layout:
                if (isinstance(x, LTTextBoxHorizontal)):
                    textValueArr.append(x.get_text())
            pdfTxt = ''.join(textValueArr)
            textValueArr.clear()
            dpText = bgGetProductDp(pdfTxt)
            # Only these production departments are processed.
            gtList = ['CWF', 'CWFS', 'CWFH', 'CWFN', 'CSB']
            if (gtList.__contains__(dpText) == False):
                continue
            # init
            hmPd = HmProduct()
            # production department
            hmPd.productDp = dpText
            # production order number
            hmPd.productCasNum = bgGetProductInvoicesNum(pdfTxt)
            # product number
            hmPd.productNum = bgGetProductNumber(pdfTxt, hmPd)
            # specification
            hmPd.productSf = bgGetProductSpecification(pdfTxt, hmPd)
            # sales order number
            hmPd.productSealNum = bgGetProductSealNum(pdfTxt)
            # order quantity / unit
            hmPd.productCount = getHmProductCount(pdfTxt, hmPd)
            # customer number
            hmPd.productGuest = bgGetProductGuest(pdfTxt)
            # product batch
            hmPd.productBatch = bgGetProductBatch(pdfTxt)
            # product description (Chinese)
            hmPd.productRamk = bgGetProductDetilRamk(pdfTxt, hmPd)
            # generate the production order uuid
            hmPd.hm_pd_uuid = bgGetPageMd5(hmPd)
            # Update the production order date from the master sheet
            #updateOutDate(hmPd,pdfTxt)
            #----------------1. generate file _START----------------#
            payStr = ''
            # layout.pageid is 1-based; PyPDF2 pages are 0-based.
            layoutPageId = layout.pageid - 1
            # Generate the "eye" sheet
            if hmPd.hm_pd_uuid:
                newPage = hmPdfReaderPDF.getPage(layoutPageId)
                ePatch = hmCreateQRCode(hmPd, 'E', payStr)
                eMarkFile = open(ePatch, 'rb')
                pdfECodePage = PyPDF2.PdfFileReader(eMarkFile)
                newPage.mergePage(pdfECodePage.getPage(0))
                hmPdfWriter.addPage(newPage)
                openFileArr.append(eMarkFile)
                del newPage
                del pdfECodePage
                gc.collect()
                # Use the sales order number as the file name
                if hmPdfSaveName == "":
                    hmPdfSaveName = hmPd.productSealNum
            #----------------1. generate file _END----------------#
        # When finished, close the open files and save the output file
        #----------------close while generating the file----------------#
        nowTime = datetime.datetime.now()
        nowTimeStr = nowTime.strftime("%Y%m%d%H%M%S_s")
        hmPdfSaveName = nowTimeStr + "_" + hmPdfSaveName + ".pdf"
        hmPdfSavePath = download_path + hmPdfSaveName
        resultPdfFile = open(hmPdfSavePath, 'wb')
        hmPdfWriter.write(resultPdfFile)
        # Close and remove every temporary QR-code file.
        for closeItem in openFileArr:
            closeItem.close()
            os.remove(closeItem.name)
        openFileArr.clear()
        resultPdfFile.close()
        fileOpen.close()
    return hmPdfSaveName
def extract_text_from_pdf(pdf_path):
    """Parse a parts-catalogue pdf into a nested dict of metadata blocks.

    Pages 32..922 are scanned; text elements are classified by sentinel
    headers (NOTE / PART NUMBER / PART NAME / QTY / SEE) into table columns,
    and each "<END>" marker closes one metadata block, zipping part names,
    quantities and part numbers into a "Parts/Components" mapping.

    :param pdf_path: path to the pdf to parse
    :return: the accumulated dict, or None if nothing was extracted
    """
    new_dict = {}  # To store extracted data as key, value pairs
    lines = []  # To store data alternatively as list
    counter = 1  # increments when <END> of block is reached
    a = []  # dummy array to append elements of any section
    # Reset switches for data
    table = False
    partNo = False
    notes = False
    qty = False
    partname = False
    see = False
    ending = True
    new_dict["Metadata_%d" % counter] = {}
    # PDF Miner Objects
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    converter = PDFPageAggregator(resource_manager, laparams=laparams)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    w = 0
    with open(pdf_path, 'rb') as fh:
        #text_from_pdf = open('text2FromPdf.txt','w')
        for pageNumber, page in enumerate(
                PDFPage.get_pages(fh, caching=True, check_extractable=True)):
            # Only the catalogue body pages are parsed.
            if pageNumber > 31 and pageNumber < 923:
                page_interpreter.process_page(page)
                layout = converter.get_result()
                for element in layout:
                    if isinstance(element, LTTextBox) or isinstance(
                            element, LTTextLine):
                        lines.extend(element.get_text().strip())
                        if (element.get_text().split()[0] == "<END>"):
                            # End of a block: finalise the current metadata
                            # entry and start a new one.
                            #print(element.get_text().strip())
                            a = []
                            table = False
                            #see=False
                            #notes=False
                            #partNo=False
                            #qty=False
                            #partname=False
                            ending = True
                            pno = new_dict["Metadata_%d" % counter]["Part Numbers"][1:]  # Delete??,check last iter
                            #new_dict["Metadata_%d" %counter]["Part Numbers"]=pno
                            check = len(pno)
                            q = new_dict["Metadata_%d" % counter]["QTYs"][1:check + 1]
                            #new_dict["Metadata_%d" %counter]["QTYs"]=q
                            pname = new_dict["Metadata_%d" % counter]["PART NAMEs"][3:check + 3]
                            #new_dict["Metadata_%d" %counter]["PART NAMEs"]=pname
                            # Zip the three columns only when they line up.
                            if (len(pno) == len(q) and len(pno) == len(pname)):
                                new_dict3 = {
                                    i: {
                                        "q": j,
                                        "p/n": k
                                    }
                                    for i, j, k in zip(pname, q, pno)
                                }
                                #print(new_dict3)
                                new_dict["Metadata_%d" % counter]["Parts/Components"] = new_dict3
                            # Delete table columns
                            #del new_dict["Metadata_%d" %counter]["Notes"]
                            del new_dict["Metadata_%d" % counter]["Part Numbers"]
                            del new_dict["Metadata_%d" % counter]["QTYs"]
                            del new_dict["Metadata_%d" % counter]["PART NAMEs"]
                            # replace metadata_counter with system name
                            super_list = new_dict["Metadata_%d" % counter]["Super"]
                            sup_idx = [
                                i for i, item in enumerate(super_list)
                                if re.search('^PART', item)
                            ]
                            sn_idx = [
                                i for i, item in enumerate(super_list)
                                if re.search('^S/N', item)
                            ]
                            if sup_idx:
                                sup_idx = sup_idx[0]
                                #print(sup_idx)
                                #print(new_dict["Metadata_%d" %counter]["Super"][sup_idx])
                                new_dict["Metadata_%d" % counter]["Top"] = new_dict[
                                    "Metadata_%d" % counter]["Super"][sup_idx]
                            if sn_idx:
                                sn_idx = sn_idx[0]
                                #print(sn_idx)
                                # The entry just before S/N is the component name.
                                this_idx = sn_idx - 1
                                new_dict["Metadata_%d" % counter]["Serial_No"] = new_dict[
                                    "Metadata_%d" % counter]["Super"][sn_idx]
                                newkey = new_dict["Metadata_%d" % counter]["Super"][this_idx]
                                new_dict["Metadata_%d" % counter]["Component"] = newkey
                                # Re-key the block under the component name.
                                new_dict[newkey] = new_dict["Metadata_%d" % counter]
                                del new_dict["Metadata_%d" % counter]["Super"]
                                del new_dict["Metadata_%d" % counter]
                            counter = counter + 1
                            new_dict["Metadata_%d" % counter] = {}
                        elif (element.get_text().strip()) == "NOTE":
                            a = []
                            table = True
                            notes = True
                            partNo = False
                            qty = False
                            partname = False
                            see = False
                            ending = False
                        elif (element.get_text().split()[0]) == "PART" and len(
                                element.get_text().split()) > 1:
                            if (element.get_text().split()[1]) == "NUMBER":
                                a = []
                                partNo = True
                                table = True
                                notes = False
                                qty = False
                                see = False
                                ending = False
                            elif (element.get_text().split()[1]) == "NAME":
                                a = []
                                partname = True
                                table = True
                                notes = False
                                qty = False
                                see = False
                                ending = False
                        elif (element.get_text().strip().split()[0]) == "QTY":
                            a = []
                            qty = True
                            partNo = False
                            notes = False
                            partname = False
                            see = False
                            table = True
                            ending = False
                        #elif(element.get_text().strip())=="PART NAME":
                        #    a=[]
                        #partname=True
                        #partNo=False
                        #notes=False
                        #qty=False
                        #see=False
                        elif (element.get_text().strip().split()[0]) == "SEE":
                            #print(element.get_text().strip().split()[0])
                            a = []
                            see = True
                            partNo = False
                            notes = False
                            qty = False
                            partname = False
                            table = True
                            ending = False
                        # Non-table text before any header: collect it as the
                        # "Super" (heading) lines of the current block.
                        if table == False and element.get_text().split(
                                )[0] != "<END>":
                            a.append(element.get_text().strip())
                            new_dict["Metadata_%d" % counter]["Super"] = a
                            w = w + 1
                        # Route table text into whichever column is active.
                        if notes and table:
                            a.extend(element.get_text().strip().split('\n'))
                            #new_dict["Metadata_%d" %counter]["Notes"]=a
                        if partNo and table:
                            a.extend(element.get_text().strip().split('\n'))
                            new_dict["Metadata_%d" % counter]["Part Numbers"] = a
                        if qty and table:
                            a.extend(element.get_text().strip().split('\n'))
                            new_dict["Metadata_%d" % counter]["QTYs"] = a
                        if partname and table:
                            a.extend(element.get_text().strip().split('\n'))
                            new_dict["Metadata_%d" % counter]["PART NAMEs"] = a
                        if see and table:
                            a.append(element.get_text().strip().split('\n'))
                            #new_dict["Metadata_%d" %counter]["SEE PAGE"]=a
    # close open handles
    converter.close()
    fake_file_handle.close()
    if new_dict:
        return new_dict
def __init__(self):
    """Assemble the pdfminer analysis pipeline used by this instance."""
    manager = PDFResourceManager()
    params = LAParams()
    aggregator = PDFPageAggregator(rsrcmgr=manager, laparams=params)
    self.__resources_manager = manager
    self.__params_manager = params
    self.__aggregator = aggregator
    self.__interpreter = PDFPageInterpreter(rsrcmgr=manager, device=aggregator)
    # No document is attached yet; set by a later call.
    self.__analyzer = None
class PdfParser():
    """
    Works with PaperForSave; every concrete PdfParser inherits this class.
    """

    def __init__(self,
                 conference_name,
                 start_patterns={"all": re.compile(".*")},
                 end_patterns={"all": None},
                 title_position_number=2,
                 parse_page_numbers=[0],
                 column_number=2,
                 paper_data_class=PaperForSave()):
        """
        Parameters
        ----------
        conference_name: str
            Name of the conference or proceedings.
        start_patterns: dict of patterns
            Dict of start positions for the text kept on the Paper object.
        end_patterns: dict of patterns
            Dict of end positions for the text kept on the Paper object;
            None means "until the end".
        title_position_number: int
            Index (after sorting) of the textbox that holds the title.
        parse_page_numbers: list of int
            List of pages to parse; None means "until the end".
        paper_data_class: Paper class
            A paper-class instance supplied directly as a strategy object.

        NOTE(review): the mutable defaults (dict/list/PaperForSave()) are
        shared across calls — confirm no caller mutates them.
        """
        self.conference_name = conference_name
        if set(start_patterns.keys()) != set(end_patterns.keys()):
            raise ValueError(
                "start patterns and eend patterns are not correspondding")
        self.title_position_number = title_position_number
        self.parse_page_numbers = parse_page_numbers
        self.column_number = column_number
        self.paper_data_class = paper_data_class
        self.start_patterns = start_patterns
        self.end_patterns = end_patterns
        # Build the classes needed for parsing.
        # Configure Layout Analysis parameters; enable vertical-text detection.
        laparams = LAParams(detect_vertical=True)
        # Create the resource manager that handles shared resources.
        resource_manager = PDFResourceManager(caching=False)
        # Create the PageAggregator object that collects pages.
        self.device = PDFPageAggregator(resource_manager, laparams=laparams)
        # Create the Interpreter object.
        self.interpreter = PDFPageInterpreter(resource_manager, self.device)
        # Pick the textbox sort strategy for the column layout.
        if column_number == 1:
            self.SortFuncClass = SortTextbox  # keep the class as a variable
        elif column_number == 2:
            self.SortFuncClass = SortTextbox2Column
        else:
            raise ValueError("The column rather than two is not defined")

    def parse(self, pdf_file_path):
        """
        Overriding this method is prohibited in principle.
        Parses the configured pages and returns a Paper object built from
        the sorted textboxes of the last processed page.
        """
        self.pdf_file_name = str(pdf_file_path.stem)  # for reference from internal methods
        with open(pdf_file_path, "rb") as f:
            parse_text = ""
            parse_text_flag = False  # the part where this flag is True is treated as the introduction
            for page in PDFPage.get_pages(f, pagenos=self.parse_page_numbers):
                self.interpreter.process_page(page)  # process the page
                layout = self.device.get_result()  # obtain the LTPage object
                text_boxes = find_textboxes_recursively(layout)
                # Sort textboxes by their coordinates (multi-key sort).
                # At minimum the whole page must be read, so this is inefficient.
                sort_func = self.SortFuncClass(layout_x0=layout.x0,
                                               layout_x1=layout.x1)
                text_boxes.sort(key=sort_func)
                info_dict = self.parse_info()
                paper = self.paper_data_class.parse_by_textboxes(
                    text_boxes, info_dict)
        # NOTE(review): `paper` holds the result of the last iterated page —
        # confirm multi-page behaviour is intended (default parses page 0 only).
        return paper

    def parse_info(self):
        """
        Must be overridden depending on the Paper object.
        Returns the parser configuration the Paper object needs.
        """
        info_dict = {}
        info_dict["conf_name"] = self.conference_name
        info_dict["pdf_name"] = self.pdf_file_name
        info_dict["start_patterns"] = self.start_patterns
        info_dict["end_patterns"] = self.end_patterns
        info_dict["title_position_number"] = self.title_position_number
        return info_dict
def pdf2String(request):
    """Django view: convert an uploaded pdf to text and render it.

    On POST, reads the ``pdf2trans`` upload, extracts horizontal text boxes
    with pdfminer, normalises line breaks, and renders ``pdf2String.html``
    with the result; any failure falls back to a generic error string.
    """
    if request.method == 'POST':
        print("pdf2String start!")
        # Get the uploaded file; defaults to None when no file was sent.
        myFile = request.FILES.get("pdf2trans", None)
        print(myFile)
        transfered_str = '文件转换失败'
        if not myFile:
            return render(request, 'pdf2String.html',
                          {'transfered_str': transfered_str})
        try:
            transfered_str = ''
            from pdfminer.pdfparser import PDFParser, PDFDocument
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
            from pdfminer.converter import PDFPageAggregator
            from pdfminer.layout import LTTextBoxHorizontal, LAParams
            from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
            praser = PDFParser(myFile)
            # Create a PDF document
            doc = PDFDocument()
            # Connect the parser and the document object
            praser.set_document(doc)
            doc.set_parser(praser)
            # Supply the initialization password
            # (an empty call when there is no password)
            doc.initialize()
            # Check whether the document allows txt extraction; abort if not
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed
            else:
                # Create a PDF resource manager to manage shared resources
                rsrcmgr = PDFResourceManager()
                # Create a PDF device object
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                # Create a PDF interpreter object
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Loop over the page list, handling one page at a time
                for page in doc.get_pages():
                    interpreter.process_page(page)  # receive the LTPage object
                    # layout is an LTPage holding the parsed objects of this
                    # page (LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal,
                    # ...); read the text via each object's text attribute.
                    layout = device.get_result()
                    for x in layout:
                        if (isinstance(x, LTTextBoxHorizontal)):
                            results = x.get_text()
                            # print(results)
                            transfered_str = transfered_str + results
            # Further processing once the text is extracted: the literal
            # '许相虎' is used as a temporary newline placeholder below.
            # re.sub(r'\r\n\s','许相虎',transfered_str)
            print("before change: \n" + transfered_str)
            transfered_str = re.sub(r'\n\s', '许相虎', transfered_str)
            transfered_str = re.sub(r'\s\n', '许相虎', transfered_str)
            print("after change: \n" + transfered_str)
            transfered_str = re.sub(r'\n|\r', '', transfered_str)
            transfered_str = re.sub(r'许相虎', '\n', transfered_str)
            transfered_str = re.sub(r'\s{4,}', '\n', transfered_str)
            print("finally: \n" + transfered_str)
            return render(request, 'pdf2String.html',
                          {'transfered_str': transfered_str})
        # NOTE(review): bare except hides all errors behind the generic
        # failure message — consider narrowing and logging.
        except:
            return render(request, 'pdf2String.html',
                          {'transfered_str': transfered_str})
    else:
        return render(request, 'pdf2String.html')