def main(argv):
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = False
    laparams = LAParams()
    using_optparse = False

    parser = ArgumentParser(prog='pdf2txt.py', description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    if using_optparse:
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()
    parser.add_argument('-d', dest='debuglevel', action='count', default=0,
                        help='Debug (repeat for more verbose debugging)')
    parser.add_argument('-p', '--pages', dest='pagenos', action='store', type=str, default='',
                        help='Comma-separated list of the page numbers to extract. '
                             'Page numbers start at one. By default, text is extracted '
                             'from all pages.')
    parser.add_argument('-c', '--codec', dest='codec', action='store', type=str,
                        default='utf-8', help='Output codec.')
    parser.add_argument('-t', '--type', dest='outtype', action='store', type=str,
                        default='shape', choices=['text', 'html', 'xml', 'tag', 'shape'],
                        help='Output format, one of: shape, text, html, xml, tag')
    parser.add_argument('-m', dest='maxpages', action='store', type=int, default=0,
                        help='Maximum number of pages to extract. By default (0), all '
                             'pages in the document are extracted.')
    parser.add_argument('-P', '--password', dest='password', action='store', type=str,
                        default='', help='User password to access the PDF contents.')
    parser.add_argument('-o', '--output', dest='outfile', action='store', type=str,
                        default=None,
                        help='Output file name. By default, the extracted contents are '
                             'printed to stdout in text format.')
    parser.add_argument('-C', '--no-caching', dest='caching', action='store_false',
                        default=True,
                        help='Suppress object caching. Reduces memory consumption but '
                             'slows down processing.')
    parser.add_argument('-n', '--no-layout', dest='layout', action='store_false',
                        default=True, help='Suppress layout analysis.')
    parser.add_argument('--show-pageno', dest='show_pageno', action='store_true',
                        default=False, help='Show page numbers.')
    parser.add_argument('-A', '--analyze-all', dest='all_texts', action='store_true',
                        default=False,
                        help='Force layout analysis for all text strings, including text '
                             'contained in figures.')
    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical',
                        action='store_true', default=False,
                        help='Allow vertical writing detection.')
    parser.add_argument('-M', dest='char_margin', action='store', type=float, default=2.0,
                        help='Two text chunks closer than the char_margin (M) are '
                             'considered continuous and grouped into one.')
    parser.add_argument('-L', dest='line_margin', action='store', type=float, default=0.5,
                        help='Two lines closer than the line_margin (L) are grouped into '
                             'a text box, a rectangular area that contains a "cluster" of '
                             'text portions.')
    parser.add_argument('-W', dest='word_margin', action='store', type=float, default=0.1,
                        help='Blank characters (spaces) may need to be inserted when the '
                             'distance between two words exceeds the word_margin (W), as '
                             'a blank between words might not be represented as a space, '
                             'but indicated by the positioning of each word.')
    parser.add_argument('-F', dest='boxes_flow', action='store', type=float, default=0.5,
                        help='How much the horizontal and vertical position of a text '
                             'matters when determining text order. Ranges from -1.0 (only '
                             'horizontal position matters) to +1.0 (only vertical '
                             'position matters).')
    parser.add_argument('-Y', '--layout-mode', dest='layoutmode', action='store', type=str,
                        default='normal', choices=['exact', 'normal', 'loose'],
                        help='How the page layout should be preserved. (Currently only '
                             'applies to HTML format.) One of: exact, normal, loose.')
    parser.add_argument('-O', '--image-writer', dest='imagewriter', action='store',
                        type=str, default=None,
                        help='Output directory for extracted images.')
    parser.add_argument('-R', '--rotation', dest='rotation', action='store', type=int,
                        default=0, help='Rotate pages by this many degrees.')
    parser.add_argument('-S', '--strip-control', dest='stripcontrol', action='store_true',
                        default=False, help='Strip control characters (XML output only).')
    parser.add_argument('-s', dest='scale', action='store', type=float, default=1,
                        help='Output scale. Can be used in HTML format only.')
    parser.add_argument('--draw-lines', dest='draw_lines', action='store_true',
                        help="Draw a crude page representation with coloured TextLines "
                             "(= short pieces of text). Valid only for the `shape' output.")
    parser.add_argument('--draw-boxes', dest='draw_boxes', action='store_true',
                        help="Draw a crude page representation with coloured TextBoxes "
                             "(= grouped text lines). Valid only for the `shape' output.")
    parser.add_argument('--draw-blocks', dest='draw_blocks', action='store_true',
                        help="Draw a crude page representation with coloured TextBlocks "
                             "(= grouped TextBoxes). Valid only for the `shape' output.")
    parser.add_argument('--shear-limit', dest='shear_limit', action='store', default=0.1,
                        type=float,
                        help="Reject text sheared above this limit. Valid only for the "
                             "`shape' output.")
    parser.add_argument('--rotation-limit', dest='rotation_limit', action='store',
                        default=2, type=float,
                        help="Reject text rotated above this angle (in degrees). Valid "
                             "only for the `shape' output.")
    parser.add_argument('--line-height-diff', dest='line_height_diff', action='store',
                        type=float, default=0.1,
                        help='Two lines whose vertical sizes differ by more than this '
                             'ratio are not considered part of the same paragraph (e.g. '
                             'one of them is a heading).')
    parser.add_argument('--heading-before', dest='heading_before', action='store',
                        type=str, default='',
                        help='String to put before each heading, e.g. <h1>')
    parser.add_argument('--heading-after', dest='heading_after', action='store',
                        type=str, default='',
                        help='String to put after each heading, e.g. </h1>')
    parser.add_argument('--box-separator', dest='box_separator', action='store', type=str,
                        default=r'\n\n',
                        help=r'Separate boxes with this string. Use \n for newline, \t '
                             r'for TAB; other escape sequences are not recognized.')
    parser.add_argument('--block-separator', dest='block_separator', action='store',
                        type=str, default=r'\n\n',
                        help=r'Separate blocks with this string. Use \n for newline, \t '
                             r'for TAB; other escape sequences are not recognized.')
    parser.add_argument('--indent-separator', dest='indent_separator', action='store',
                        type=str, default=r'\n\n',
                        help=r'Separate indented lines with this string. Use \n for '
                             r'newline, \t for TAB; other escape sequences are not recognized.')
    parser.add_argument('--indent-string', dest='indent_string', action='store', type=str,
                        default=r'\t',
                        help=r'Put this string in front of indented lines. Use \n for '
                             r'newline, \t for TAB; other escape sequences are not recognized.')
    parser.add_argument('--indent-limit', dest='indent_limit', action='store', type=float,
                        default=3,
                        help='If a line is indented by more than this (approximately in '
                             'characters), it is separated from the previous one by '
                             '--indent-separator.')
    parser.add_argument('--page-separator', dest='page_separator', action='store',
                        type=str, default=r'\n\n',
                        help=r'Separate pages with this string. Use \n for newline, \t '
                             r'for TAB; other escape sequences are not recognized.')
    parser.add_argument('--norm-whitespace', dest='norm_whitespace', action='store_true',
                        default=False,
                        help='Normalize whitespace (remove duplicate spaces, replace '
                             'line ends with spaces).')
    parser.add_argument('--print-stats', dest='print_stats', action='store_true',
                        default=False,
                        help='Instead of the text, output some simple statistics about '
                             'the file.')
    parser.add_argument('--max-blocks', dest='max_blocks', action='store', default=0,
                        type=int,
                        help='If there are more than this many blocks per page, do not '
                             'return any text. Use to discriminate abnormal files (run '
                             '--print-stats first to find out the number of boxes in a '
                             '"normal" file). 0 means no limit. 50 is maybe a good value.')
    parser.add_argument('--max-textlines', dest='max_textlines', action='store', default=0,
                        type=int,
                        help='If there are more than this many textlines in any block, do '
                             'not return any text. Use to discriminate abnormal files '
                             '(run --print-stats first to find out the number of boxes on '
                             'a "normal" page). 0 means no limit. 18 is maybe a good value.')
    parser.add_argument('--line-height-method', dest='line_height_method', action='store',
                        type=str, default='bbox', choices=['bbox', 'mean', 'median'],
                        help='Method to calculate the height of a line (relevant if there '
                             'are characters with uneven heights). bbox takes the bounding '
                             'box (rectangle encompassing the line), mean the arithmetic '
                             'mean of the character heights, median their median. Use mean '
                             'or median if there are outlier characters, e.g. one big '
                             'character at the beginning of a line.')
    parser.add_argument(dest='pdffile', help='List of PDF files to go through',
                        default=None, nargs='+')

    args, rest = parser.parse_known_args()
    global debuglevel
    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    DEBUG(3, 'optparse:', using_optparse)
    if args.pagenos:
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno
    if not args.layout:
        laparams = None
    if laparams and args.all_texts:
        laparams.all_texts = True
    if laparams and args.detect_vertical:
        laparams.detect_vertical = True
    if laparams:
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    layoutmode = args.layoutmode
    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)
    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)
    global options
    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')  # file() is Python 2 only; use open()
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')
    if outtype == 'shape':
        device = ShapeTextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                    showpageno=showpageno, imagewriter=imagewriter)
    elif outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter, stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in options.pdffile:
        DEBUG(2, 'processing', fname)
        fp = open(fname, 'rb')  # file() is Python 2 only; use open()
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()
    DEBUG(2, 'finished.')
    return
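# --- Note: the script above passes the separator options through an
# unescape_string() helper that it never defines. A minimal sketch of what it
# would have to do, judging only from the option help texts (which say \n and
# \t are the only escapes recognized); the body is an assumption, not the
# author's actual code:
def unescape_string(s):
    # expand only the two escapes the help texts promise; leave the rest alone
    return s.replace(r'\n', '\n').replace(r'\t', '\t')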
# pip install pdfminer.six
import io

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter, HTMLConverter, XMLConverter
from pdfminer.layout import LAParams

pdf_path = r'C:\Users\somepath\filename.pdf'  # path to your pdf file (raw string, so the backslashes are not escapes)
pdf = open(pdf_path, 'rb')
mem = io.StringIO()
lp = LAParams()
rm = PDFResourceManager()
cnv = TextConverter(rm, mem, laparams=lp)
ip = PDFPageInterpreter(rm, cnv)
for i in PDFPage.get_pages(pdf):
    ip.process_page(i)
text = mem.getvalue()
cnv.close()
pdf.close()

out = open(r'F:\AIB\convertedtext.txt', 'wb')  # path to your destination file
out.write(text.encode('utf-8'))
out.close()
print("DONE")
def parse(self, path, filename):
    print('----------------------------------------------------------')
    print('Searching document: ' + filename)
    writepath = self.copy_excel('s' + filename.rsplit('.', 1)[0])
    wb = copy(xlrd.open_workbook(writepath, formatting_info=True))
    ws = wb.get_sheet(0)
    fp = open(path, 'rb')  # open in binary read mode
    # create a pdf parser from the file object
    praser = PDFParser(fp)
    # create a PDF document
    doc = PDFDocument()
    # connect the parser and the document object
    praser.set_document(doc)
    doc.set_parser(praser)
    # supply the initial password
    # (with no password, pass an empty string)
    doc.initialize()
    # check whether the document allows txt extraction; abort if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # create a PDF resource manager to manage shared resources
        rsrcmgr = PDFResourceManager()
        # create a PDF device object
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # set up the parameters, then iterate over the pages one at a time
        flag = False
        count = 0
        _page_finance = 0
        _page_agency = 0
        _page_overview = 0
        _page_money = 0
        _count__read_page = 0
        layouts = []
        _mat_shiyi = False
        for page in doc.get_pages():  # doc.get_pages() returns the page list
            if _count__read_page == 3:
                ws = wb.get_sheet(7)
                self.findPartyConcernedMsg(layouts, writepath, wb, ws)
                break
            interpreter.process_page(page)
            layout = device.get_result()
            count += 1
            if _count__read_page != 0:
                _count__read_page += 1
                layouts.append(layout)
                continue
            # _read_row = 0
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # _read_row += 1
                    results = x.get_text().replace(" ", "").replace('\n', '').strip()
                    if self.locationYW(results):
                        print(results)
                    # if re.match(r'[\w\W]*第一节[\W]*释义', results):
                    #     _mat_shiyi = True
                    # if not _mat_shiyi and _read_row > 5:
                    #     break
                    # if filename == '晶丰明源.pdf':
                    #     print(results)
                    if re.match('[\w\W]*目录+[\w\W]*', results):
                        break
                    if self.locationOverview(results):
                        print('Found the overview on page %d' % count)
                        _page_overview = count
                        self.findOverviewMsf(layout, writepath, wb, ws)
                        break
                    if self.findPartyConcerned(results):
                        print('Found the intermediaries on page %d' % count)
                        _count__read_page += 1
                        layouts.append(layout)
                        _page_agency = count
                        self.split_pdf(_page_finance, _page_money, _page_agency,
                                       _page_overview, path,
                                       self.create_spdf_dir(filename))
                        break
                    mat = re.search(r'[二|三|四|五]+\、[\w\W]*主要财务数据[\w]*', results)
                    _mat = re.search(r'[\w\W]+\、募集资金[\w\W]*(用途|运用)[\w\W]*', results)
                    if _mat is not None:
                        _page_money = count
                        print("Found the raised funds on page %d" % count)
                        # use of raised funds: comes after the financial data in section three
                    if mat is not None:
                        if self.findCurrentLiabilities(layout):
                            # locationSuccess = True
                            _page_finance = count
                            # split_pdf(count, path, create_spdf_dir(filename))
                            print('Found the financial information on page %d' % count)
                            break
                        else:
                            flag = True
                        break  # finish the current page
            else:
                if flag:
                    if self.findCurrentLiabilities(layout):
                        # locationSuccess = True
                        _page_finance = count
                        # split_pdf(count, path, create_spdf_dir(filename))
                        print('Found the financial information on page %d' % count)
                        break
                    else:
                        flag = False
def get_report_startpage(pdf):
    """Get the page on which the financial statements start.

    Arguments:
        pdf {str} -- path to the pdf file

    Returns:
        start_page {int} -- first page of the financial statements
    """
    getp = pdfplumber.open(pdf)
    total = len(getp.pages)
    # used to check whether the current page is within the first 10 pages
    count = 0
    # stores the start page of the statements
    start_page = 1
    # flag marking whether this looks like an annual-report-style document
    flag = False
    # create a pdf resource manager object to store shared resources
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    codec = 'utf-8'
    outfp = StringIO()
    # create the device object
    device = TextConverter(rsrcmgr=rsrcmgr, outfp=outfp, codec=codec, laparams=laparams)
    if total > 30:
        print('total pages:', total)
        with open(pdf, 'rb') as fp:
            # process the page contents
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            # iterate over every page of the pdf
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                count += 1
                teststr = ''
                interpreter.process_page(page)
                teststr = outfp.getvalue()
                # does the first page mention an annual/quarterly report?
                # if not, there is no need to look for a start page
                rs = re.search('(年\s*|季\s*)度?\s*报\s*告?', teststr)
                # print(teststr)
                if rs is not None and count == 1:
                    # found annual-report wording on page 1; look for the table
                    # of contents on the next page
                    flag = True
                    continue
                elif rs is None and count == 1:
                    # no annual/quarterly-report wording on page 1; check page 2
                    # (some reports carry a seal on page 1, so the extracted text is incomplete)
                    print('No annual/quarterly-report wording detected on page 1; checking page 2')
                    continue
                elif rs is not None and count == 2:
                    # found the wording on page 2; look for the table of contents on page 3
                    flag = True
                    continue
                elif rs is None and count == 2:
                    # if neither page 1 nor page 2 mentions an annual/quarterly
                    # report, assume this is not such a document
                    if flag is False:
                        device.close()
                        outfp.close()
                        print('The financial statements of this file start on page', start_page)
                        return start_page
                # if page 1 or 2 mentions an annual/quarterly report, search the
                # first 10 pages for the table of contents
                if flag is True:
                    # 1. handle the first 10 pages
                    if count < 11:
                        # look for the table-of-contents page
                        if re.search('目\s*录', teststr, flags=0):
                            # check whether the page containing "目录" has a
                            # table-of-contents entry for the financial statements
                            # reg_stmt = re.compile(r'财务报告\D{10,}(\d{1,3})')
                            ret = re.search('财务报告\s*(.)*\d', teststr)
                            if ret is not None:
                                ret = ret.group()
                                # strip whitespace; the first value is the entry
                                # name, the second the page number
                                tstr = [y.strip() for y in re.split(r'[…¨ .]', ret)
                                        if len(y) != 0]
                                start_page = int(tstr[1])
                                device.close()
                                outfp.close()
                                print('The financial statements of this file start on page', start_page)
                                return start_page
                            else:
                                # the "目录" page has no financial-statements
                                # entry; process the next page
                                count += 1
                                continue
                        else:
                            # no table of contents on this page; keep looking
                            print('Page', count, 'has no table of contents; checking the next page')
                            continue
                    else:
                        print('No table of contents found within 10 pages')
                        # no table of contents in the first 10 pages; give up
                        break
    else:
        # files of 30 pages or fewer are not processed
        print('The financial statements of this file start on page', start_page)
        return start_page
    device.close()
    outfp.close()
    print('The financial statements of this file start on page', start_page)
    return start_page
def createDeviceInterpreter():
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return device, interpreter
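# --- For context, a device/interpreter pair like this is normally driven page
# by page: the interpreter renders each page into the aggregator, and
# get_result() hands back the laid-out LTPage. A minimal usage sketch (the
# file name is a placeholder):
from pdfminer.pdfpage import PDFPage

device, interpreter = createDeviceInterpreter()
with open('example.pdf', 'rb') as fp:       # placeholder path
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)      # render the page into the aggregator
        layout = device.get_result()        # LTPage with the analyzed layout
        print(layout.pageid, len(layout._objs))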
import os
import urllib2  # Python 2 snippet; urllib2 was used but never imported in the original
from subprocess import call

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

url = 'http://www.ird.gov.hk/chi/pdf/c_s88list.pdf'
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
pdfdata = opener.open(url).read()
out = open('document.pdf', 'wb')
out.write(pdfdata)
out.close()

# strip the (empty) password and decrypt with qpdf before parsing
call('qpdf --password= --decrypt {0}/document.pdf {0}/decrypted.pdf'.format(os.getcwd()).split())

outfp = open('modifiedla.txt', 'w')
parser = PDFParser(open('decrypted.pdf', 'rb'))
document = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams(char_margin=10)
device = TextConverter(rsrcmgr, outfp, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
outfp.close()
def main(argv):
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
               % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if debug:
        set_debug_logging()

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                               laparams=laparams, outdir=outdir, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        fp = io.open(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    password=password, caching=caching, check_extractable=True)
        fp.close()
    device.close()
    if close_outfp:
        outfp.close()
def request_pdf(url, case_id, court_name):
    try:
        response = requests.request("GET", url, verify=False, proxies=proxy_dict)
        if response.status_code == 200:
            res = response.text
            if res is None:
                logging.error("No data for: " + str(case_id))
                return "NULL"
            file_path = module_directory + "/../Data_Files/PDF_Files/" + \
                court_name + "_" + slugify(case_id) + ".pdf"
            fw = open(file_path, "wb")
            fw.write(response.content)
            fw.close()  # flush the PDF to disk before reading it back below
            text_data = ""
            pdf_manager = PDFResourceManager()
            string_io = StringIO()
            pdf_to_text = TextConverter(pdf_manager, string_io, codec='utf-8',
                                        laparams=LAParams())
            interpreter = PDFPageInterpreter(pdf_manager, pdf_to_text)
            for page in PDFPage.get_pages(open(file_path, 'rb')):
                interpreter.process_page(page)
            text_data = string_io.getvalue()
            file_path = module_directory + "/../Data_Files/Text_Files/" + \
                court_name + "_" + slugify(case_id) + ".txt"
            fw = open(file_path, "w")
            fw.write(str(text_data))
            fw.close()
            return str(text_data)
        else:
            logging.error("Failed to get text file for: " + str(case_id))
            return "NULL"
    except Exception as e:
        logging.error("Failed to get pdf file for: " + str(case_id) + ". Error: %s", e)
        return "NULL"
def __init__(self, pdf_stream, password='', pagenos=[], maxpages=0):
    ReaderBackend.__init__(self)
    self.pdf_stream = pdf_stream

    # Extract Metadata
    parser = PDFParser(pdf_stream)
    doc = PDFDocument(parser, password=password, caching=True)
    if doc.info:
        for k in doc.info[0]:
            v = doc.info[0][k]
            # print(repr(v), type(v))
            if isinstance(v, (bytes, str, unicode)):
                self.metadata[k] = make_compat_str(v)
            elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
                self.metadata[k] = make_compat_str(v.name)

    # Secret Metadata
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        # print(metadata)  # the raw XMP metadata
        # print(xmp_to_dict(metadata))
        self.metadata.update(xmp_to_dict(metadata))
        # print("---")

    # Extract Content
    text_io = BytesIO()
    rsrcmgr = PDFResourceManager(caching=True)
    converter = TextConverter(rsrcmgr, text_io, codec="utf-8",
                              laparams=LAParams(), imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, converter)
    self.metadata["Pages"] = 0
    self.curpage = 0
    for page in PDFPage.get_pages(self.pdf_stream, pagenos=pagenos,
                                  maxpages=maxpages, password=password,
                                  caching=True, check_extractable=False):
        # Read page contents
        interpreter.process_page(page)
        self.metadata["Pages"] += 1
        self.curpage += 1

        # Collect URL annotations
        # try:
        if page.annots:
            refs = self.resolve_PDFObjRef(page.annots)
            if refs:
                if isinstance(refs, list):
                    for ref in refs:
                        if ref:
                            self.references.add(ref)
                elif isinstance(refs, Reference):
                    self.references.add(refs)
        # except Exception as e:
        #     logger.warning(str(e))

    # Remove empty metadata entries
    self.metadata_cleanup()

    # Get text from stream
    self.text = text_io.getvalue().decode("utf-8")
    text_io.close()
    converter.close()
    # print(self.text)

    # Extract URL references from text
    for url in extractor.extract_urls(self.text):
        self.references.add(Reference(url, self.curpage))
    for ref in extractor.extract_arxiv(self.text):
        self.references.add(Reference(ref, self.curpage))
    for ref in extractor.extract_doi(self.text):
        self.references.add(Reference(ref, self.curpage))
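# --- The resolve_PDFObjRef helper called above is defined elsewhere in the
# class (this snippet resembles pdfx's backend). A rough sketch of the idea is
# below: dereference indirect annotation objects until a /URI action surfaces.
# The structure is an assumption for illustration, not the library's actual code.
from pdfminer.pdftypes import PDFObjRef, resolve1

def resolve_PDFObjRef(self, obj):
    # Walk lists, dereference indirect objects, and dig into annotation
    # action dictionaries looking for a /URI entry.
    if isinstance(obj, list):
        return [self.resolve_PDFObjRef(item) for item in obj]
    if isinstance(obj, PDFObjRef):
        return self.resolve_PDFObjRef(resolve1(obj))
    if isinstance(obj, dict):
        if 'A' in obj:  # action dictionary of a link annotation
            return self.resolve_PDFObjRef(obj['A'])
        if 'URI' in obj:
            uri = obj['URI']
            if isinstance(uri, bytes):
                uri = uri.decode('utf-8', 'ignore')
            return Reference(uri, self.curpage)  # Reference comes from the host module
    return None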
def __init__(
        self,
        file,
        merge_tags=('LTChar', 'LTAnno'),
        round_floats=True,
        round_digits=3,
        input_text_formatter=None,
        normalize_spaces=True,
        resort=True,
        parse_tree_cacher=None,
        laparams={'all_texts': True, 'detect_vertical': True},
):
    # store input
    self.merge_tags = merge_tags
    self.round_floats = round_floats
    self.round_digits = round_digits
    self.resort = resort

    # set up input text formatting function, if any
    if input_text_formatter:
        self.input_text_formatter = input_text_formatter
    elif normalize_spaces:
        r = re.compile(r'\s+')
        self.input_text_formatter = lambda s: re.sub(r, ' ', s)
    else:
        self.input_text_formatter = None

    # open doc
    if not hasattr(file, 'read'):
        try:
            file = open(file, 'rb')
        except TypeError:
            raise TypeError("File must be file object or filepath string.")
    parser = PDFParser(file)
    if hasattr(QPDFDocument, 'set_parser'):
        # pdfminer < 20131022
        doc = QPDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
    else:
        # pdfminer >= 20131022
        doc = QPDFDocument(parser)
        parser.set_document(doc)
    if hasattr(doc, 'initialize'):
        # as of pdfminer==20140328, "PDFDocument.initialize() method is
        # removed and no longer needed."
        doc.initialize()
    self.doc = doc
    self.parser = parser
    self.tree = None
    self.pq = None
    self.file = file

    if parse_tree_cacher:
        self._parse_tree_cacher = parse_tree_cacher
        self._parse_tree_cacher.set_hash_key(self.file)
    else:
        self._parse_tree_cacher = DummyCache()

    # set up layout parsing
    rsrcmgr = PDFResourceManager()
    if type(laparams) == dict:
        laparams = LAParams(**laparams)
    self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

    # caches
    self._pages = []
    self._pages_iter = None
    self._elements = []
def pdf_to_csv(filename):
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda: {})
            for child in self.cur_item._objs:  # <-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    line[x] = child._text.encode(self.codec)  # <-- changed
            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    # because my test documents are utf-8 (note: utf-8 is the default codec)
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    # doc.set_parser(parser)
    # doc.initialize('')
    interpreter = PDFPageInterpreter(rsrc, device)

    pagenos = set()
    rotation = 0
    i = 1
    for page in PDFPage.get_pages(fp, pagenos):
        page.rotate = (page.rotate + rotation) % 360
        outfp.write("START PAGE %d\n" % i)
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)
        i += 1
    # for i, page in enumerate(doc.get_pages()):
    #     outfp.write("START PAGE %d\n" % i)
    #     if page is not None:
    #         interpreter.process_page(page)
    #     outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()
    return outfp.getvalue()
def parse():
    # open the file in binary read mode
    fb = open(path, 'rb')
    # create a pdf parser
    parser = PDFParser(fb)
    # create a pdf document object
    doc = PDFDocument()
    # connect the parser and the document object
    parser.set_document(doc)
    doc.set_parser(parser)
    # supply the initial password
    # (with no password, pass an empty string)
    doc.initialize()
    obj = {}
    amount = 0
    # check whether the document allows txt extraction; abort if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # pdf resource manager to manage shared resources
        resource = PDFResourceManager()
        # layout parameter analyzer
        laparam = LAParams()
        # aggregator
        device = PDFPageAggregator(resource, laparams=laparam)
        # create the PDF interpreter
        interpreter = PDFPageInterpreter(resource, device)
        # iterate over the pages, one page at a time;
        # doc.get_pages() returns the page list
        for index, page in enumerate(doc.get_pages()):
            # if index < 3:
            #     continue
            # if index == 4:
            #     break
            # read the page with the interpreter
            interpreter.process_page(page)
            # fetch the content from the aggregator
            layout = device.get_result()
            for out in layout:
                if hasattr(out, "get_text"):
                    # print(out.get_text())
                    # remove the (cid:12)-style codes produced by unrecognizable glyphs
                    t = re.sub(r'\(cid:[\d]*\)', '', out.get_text())
                    # remove special content such as digits and 's, 'm, 're, n't
                    tx = re.sub(r'(\d+|\'s|\'m|\'re|n\'t)', '', t)
                    # remove punctuation and collapse runs of spaces into one space
                    txt = re.sub(
                        r'[\s+\?\.\!\/_,`:;\-$%^*\[\]\{\})(+\"\']+|[+——!,。?、‘’“”~@#¥%……&*():]+',
                        ' ', tx)
                    for word in txt.split():
                        # skip non-English words
                        if not is_english(word):
                            continue
                        # lower-case the word
                        w = word.lower()
                        amount = amount + 1
                        if obj.__contains__(w):
                            obj[w] = obj[w] + 1
                        else:
                            obj[w] = 1
        db = connect()
        # get a session cursor
        cursor = db.cursor()
        # create the table
        cursor.execute('CREATE TABLE IF NOT EXISTS ' + tablename +
                       '(word varchar(255) NOT NULL, ' +
                       'count int NOT NULL, probability float NOT NULL, ' +
                       'PRIMARY KEY (word))')
        # empty the words table so earlier runs do not skew the result
        cursor.execute('truncate table ' + tablename)
        for key in obj:
            # build a sql statement
            sql = ('REPLACE INTO ' + tablename +
                   ' (word, count, probability) VALUES(%s, %s, %s)')
            # execute it
            cursor.execute(sql, (key, obj[key], round(obj[key] / amount * 10000, 2)))
        # commit
        db.commit()
        # disconnect from the database
        db.close()
        print("Total words: %s" % amount)
def main(argv):
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input options
    password = b''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                              imagewriter=imagewriter, stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                               laparams=laparams, imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    device.close()
    outfp.close()
    return
def get_text_from_pdf(path, page_nums=None):
    r = []
    fp = open(path, 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp, pagenos=page_nums)

    def parse_obj(lt_objs):
        # https://stackoverflow.com/questions/31819862/python-pdf-mining-get-position-of-text-on-every-line
        # loop over the object list
        for obj in lt_objs:
            if isinstance(obj, LTTextLine):
                x1, y1, x2, y2 = obj.bbox
                assert x1 < x2
                assert y1 < y2
                y1 = 1400 - y1
                y2 = 1400 - y2
                y1, y2 = y2, y1
                text = obj.get_text()
                width = obj.width
                height = obj.height
                text = text.replace('東久留米武蔵村山', '東久留米 武蔵村山')  # HACK!
                for line_i, line in enumerate(text.split('\n')):  # CHECK WHETHER THIS IS NEEDED!
                    for word_j, word in enumerate(line.split()):
                        each_height = height / text.count('\n')
                        i_y1 = y1 + each_height * line_i
                        i_y2 = y2 + each_height * (line_i + 1)
                        each_width = width / len(line.split())
                        i_x1 = x1 + each_width * word_j
                        i_x2 = x2 + each_width * (word_j + 1)
                        r.append(TextItem(text=word, x1=i_x1, y1=i_y1, x2=i_x2, y2=i_y2,
                                          width=each_width, height=each_height))
            # if it's a textbox, also recurse
            if isinstance(obj, LTTextBoxHorizontal):
                parse_obj(obj._objs)
            # if it's a container, recurse
            elif isinstance(obj, LTFigure):
                parse_obj(obj._objs)

    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                parse_obj(lobj)

    for xx in range(5):
        dists = []
        for x in range(len(r)):
            for y in range(len(r)):
                text_item_1 = r[x]
                text_item_2 = r[y]
                dists.append((abs(text_item_1.y1 - text_item_2.y1), x, y))
        merged = set()
        for dist, x, y in sorted(dists):
            text_item_1 = r[x]
            text_item_2 = r[y]
            text_1_num = all(i.isnumeric() or i in ',()' for i in text_item_1.text.strip())
            text_2_num = all(i.isnumeric() or i in ',()' for i in text_item_2.text.strip())
            if not dist:
                continue
            elif text_1_num != text_2_num:
                continue
            elif y in merged:
                continue
            merged.add(y)
            if dist <= 18:  # NOTE ME: this threshold may need to be tuned!!!
                r[y] = TextItem(text=text_item_2.text,
                                x1=text_item_2.x1, y1=text_item_1.y1,
                                x2=text_item_2.x2, y2=text_item_1.y1 + text_item_2.height,
                                width=text_item_2.width, height=text_item_2.height)
    r.sort(key=lambda x: (x.y1, x.x1, x.x2, x.y2))
    # for i in r:
    #     print(i)
    return r
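# --- The TextItem record this function builds is not defined in the snippet.
# A minimal stand-in that matches the fields actually used (a hypothetical
# reconstruction, not the original definition) would be:
from typing import NamedTuple

class TextItem(NamedTuple):
    # fields mirror exactly the keyword arguments used in get_text_from_pdf
    text: str
    x1: float
    y1: float
    x2: float
    y2: float
    width: float
    height: float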
def arc():
    destino = str(formato.get())
    if destino == "Arquivo do Word":
        destino = "docx"
    if destino == "Arquivo do Power-Point":
        destino = "ppt"
    if destino == "Arquivo do Excel":
        destino = "xlsx"
    if destino == "Arquivo de Texto":
        destino = "txt"
    import win32com.client as win32
    from os import path
    in_file = path.abspath(diretorio)
    out_file = path.abspath(filename)
    if destino == "docx":
        if (file_extension in ArqDOCX or file_extension.lower() == ".pdf"
                or file_extension.lower() == ".txt"):
            word = win32.DispatchEx("Word.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=16)
            doc.Close()
            word.Quit()
    elif destino.lower() == "pdf":
        if file_extension.lower() in ArqPPT:
            word = win32.DispatchEx("PowerPoint.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Presentations.Open(in_file)
            doc.SaveAs(out_file, FileFormat=32)
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqXLSX:
            word = win32.DispatchEx("Excel.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Workbooks.Open(in_file)
            doc.ExportAsFixedFormat(0, out_file)
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqDOCX or file_extension.lower() == ".txt":
            word = win32.Dispatch('Word.Application')  # was win32com.client.Dispatch; only win32 is bound here
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=17)  # was in_file, which would overwrite the source
            doc.Close()
            word.Quit()
    elif destino.lower() == "xlsx":
        if file_extension.lower() == ".pdf":
            import pdftables_api
            c = pdftables_api.Client('to7jluln0hvr')
            c.xlsx(diretorio, filename + '.xlsx')
        elif file_extension.lower() == ".txt" or file_extension.lower() in ArqDOCX:
            import pandas as pd
            df = pd.read_csv(diretorio, header=None, delim_whitespace=True)
            df.to_excel(filename + '.xlsx', index=False, header=None)
    elif destino.lower() == "txt":
        if file_extension in ArqDOCX:
            import docx2txt
            text = docx2txt.process(diretorio)
            with open(filename + ".txt", "w") as file:
                print(text, file=file)
        elif file_extension.lower() == ".pdf":
            from io import StringIO
            from pdfminer.pdfparser import PDFParser
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
            from pdfminer.converter import TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfpage import PDFPage
            output_string = StringIO()
            with open(diretorio, 'rb') as in_file:
                parser = PDFParser(in_file)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
            with open(filename + ".txt", "w") as final:
                final.write(output_string.getvalue())
        elif file_extension.lower() in ArqXLSX:
            import pandas as pd
            read_file = pd.read_excel(diretorio, header=None)
            read_file.to_csv(filename + ".txt", index=None, header=True)
    messagebox.showinfo(
        "Format converted",
        "File format converted successfully.\n\n" + file_extension[1:].upper() +
        " to " + destino.upper() + "\n\nSaved at: " + out_file + "." + destino)
    root.destroy()
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams

from MoinMoin import log
logging = log.getLogger(__name__)

LAPARAMS = LAParams(
    # each value is specified not as an actual length, but as a proportion of
    # the length to the size of each character in question.
    # two text chunks whose distance is closer than the char_margin are
    # considered continuous and get grouped into one.
    char_margin=0.3,
    # it may be required to insert blank characters (spaces) as necessary if
    # the distance between two words is greater than the word_margin, as a
    # blank between words might not be represented as a space, but indicated
    # by the positioning of each word.
    word_margin=0.2,
    # two lines whose distance is closer than the line_margin are grouped as a
    # text box, which is a rectangular area that contains a "cluster" of text
    # portions.
    line_margin=0.3,
)


class UnicodeConverter(TextConverter):
    # as result, we want a unicode object;
    # TextConverter only provides encoded output into a file-like object
    def __init__(self, rsrcmgr, pageno=1, laparams=None, showpageno=False):
        # the original snippet is truncated here; forwarding the constructor's
        # own parameters is the evident intent
        TextConverter.__init__(self, rsrcmgr, None, pageno=pageno,
                               laparams=laparams, showpageno=showpageno)
def parse():
    # open the local pdf file in binary read ('rb') mode
    fn = open('半监督模糊聚类及其应用_杨昔阳.pdf', 'rb')
    # create a pdf parser
    parser = PDFParser(fn)
    # create a PDF document
    doc = PDFDocument()
    # connect the parser and the document object
    parser.set_document(doc)
    doc.set_parser(parser)
    # supply the initial password, e.g. doc.initialize("lianxipython");
    # with no password, pass an empty string
    doc.initialize("")
    # check whether the document allows txt extraction; abort if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # create the PDF resource manager
        resource = PDFResourceManager()
        # create a PDF layout parameter analyzer
        laparams = LAParams()
        # create the aggregator used to read the document's objects
        device = PDFPageAggregator(resource, laparams=laparams)
        # create the interpreter, which decodes the document into a form Python can recognize
        interpreter = PDFPageInterpreter(resource, device)
        # iterate over the pages, one page at a time;
        # doc.get_pages() returns the page list
        for page in doc.get_pages():
            # parse each individual page with the interpreter's process_page() method
            interpreter.process_page(page)
            # fetch the content with the aggregator's get_result() method;
            # layout is an LTPage object holding the objects parsed from this page
            layout = device.get_result()
            for out in layout:
                # check for a get_text() method to find the text we want
                if hasattr(out, "get_text"):
                    print(out.get_text())
                    with open('test.txt', 'a') as f:
                        f.write(out.get_text() + '\n')


if __name__ == '__main__':
    parse()
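# --- As an aside, the snippet above uses the old pdfminer API (PDFDocument()
# plus set_parser/initialize). With current pdfminer.six, the same plain-text
# dump needs none of that wiring; a minimal equivalent sketch:
from pdfminer.high_level import extract_text

# one call replaces the parser/document/aggregator/interpreter boilerplate
text = extract_text('半监督模糊聚类及其应用_杨昔阳.pdf')
with open('test.txt', 'a', encoding='utf-8') as f:
    f.write(text)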
def Converting_Function(Path_To_TXTs, new_file):
    """
    :param Path_To_TXTs: path to PDFs or/and XML files
    :param new_file: the path to save the TXT format
    """
    files_short = np.array([f for f in os.listdir(Path_To_TXTs)
                            if os.path.isfile(os.path.join(Path_To_TXTs, f))])
    files = np.array([Path_To_TXTs + '/' + f for f in files_short])
    for file in files:
        if file.endswith('.pdf'):
            Not_Good = False
            Prob = False
            try:
                fp = open(file, 'rb')
                parser_pdf = PDFParser(fp)
                doc = PDFDocument(parser_pdf)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    device.get_result()
                rows = device.rows
                lines = [item[5] for item in rows]
                if average_len(lines) >= 20:
                    try:
                        text_all = convert_pdf_to_txt(file, pages=[0])
                        rows_pages = [item for item in rows if item[0] != 0]
                        words = [item[1] for item in rows_pages]
                        words_1 = [item for item in words if item <= 200]
                        words_2 = [item for item in words if item > 200]
                        first = most_common(words_1)
                        second = most_common(words_2)
                        pages = [item[0] for item in rows_pages]
                        pages = list(set(pages))
                        pages.sort()
                        for page in pages:
                            page_lines = [line for line in rows_pages if line[0] == page]
                            text1 = ''
                            text2 = ''
                            text_middle = ''
                            for item in page_lines:
                                if item[1] <= (first + 20) and not (
                                        item[5].isdigit() and not item[5].endswith('.')):
                                    text1 = text1 + '\n' + item[5]
                                elif item[1] >= (second - 20) and item[1] <= 500 and not (
                                        item[5].isdigit() and not item[5].endswith('.')):
                                    text2 = text2 + '\n' + item[5]
                                else:
                                    if not (item[5].isdigit() and not item[5].endswith('.')):
                                        text_middle = text_middle + '\n' + item[5]
                            if len(text1 + text2) > len(text_middle):
                                text_all = text_all + text1 + text_middle + text2
                            else:
                                Not_Good = True
                        if len(text_all) >= 1500 and Not_Good == False:
                            text_all = text_all.replace(' ac.', '~').replace(
                                ' a.c.', '~').replace(' a.c', '~')
                            name = file.split('/')[-1][:-4]
                            path = new_file + '/' + name + '.txt'
                            with open(path, 'w', encoding='utf8') as f:
                                f.write(text_all)
                            # print('Article ', name, ' is successfully converted')
                        elif len(text_all) >= 1500 and Not_Good == True:
                            rawText = parser.from_file(file)
                            text = rawText['content']
                            text = os.linesep.join([s for s in text.splitlines() if s])
                            text_all = text.replace(' ac.', '~').replace(
                                ' a.c.', '~').replace(' a.c', '~')
                            text_all = " ".join(text_all.split())
                            name = file.split('/')[-1][:-4]
                            path = new_file + '/' + name + '.txt'
                            with open(path, 'w', encoding='utf8') as f:
                                f.write(text_all)
                            # print('Article ', name, ' is successfully converted')
                        else:
                            raw = parser.from_file(file)
                            text_all = raw['content']
                            text_all = "\n".join([ll.rstrip() for ll in text_all.splitlines()
                                                  if ll.strip()])
                            if len(text_all) >= 1500:
                                text_all = text_all.replace(' ac.', '~').replace(
                                    ' a.c.', '~').replace(' a.c', '~')
                                name = file.split('/')[-1][:-4]
                                path = new_file + '/' + name + '.txt'
                                with open(path, 'w', encoding='utf8') as f:
                                    f.write(text_all)
                                # print('Article ', name, ' is successfully converted')
                            else:
                                pass
                                # print('The PDF "' + file + '" contains fewer than 1500 characters !!!')
                    except:
                        Prob = True
                elif average_len(lines) < 20 or Prob == True:
                    raw = parser.from_file(file)
                    text_all = raw['content']
                    text_all = "\n".join([ll.rstrip() for ll in text_all.splitlines()
                                          if ll.strip()])
                    if len(text_all) >= 1500:
                        text_all = text_all.replace(' ac.', '~').replace(
                            ' a.c.', '~').replace(' a.c', '~')
                        name = file.split('/')[-1][:-4]
                        path = new_file + '/' + name + '.txt'
                        with open(path, 'w', encoding='utf8') as f:
                            f.write(text_all)
                        # print('Article ', name, ' is successfully converted')
                    else:
                        pass
                        # print('The PDF "' + file + '" contains fewer than 1500 characters !!!')
            except:
                Prob = True
            if Prob == True:
                raw = parser.from_file(file)
                text_all = raw['content']
                text_all = "\n".join([ll.rstrip() for ll in text_all.splitlines()
                                      if ll.strip()])
                if len(text_all) >= 1500:
                    text_all = text_all.replace(' ac.', '~').replace(
                        ' a.c.', '~').replace(' a.c', '~')
                    name = file.split('/')[-1][:-4]
                    path = new_file + '/' + name + '.txt'
                    with open(path, 'w', encoding='utf8') as f:
                        f.write(text_all)
                    # print('Article ', name, ' is successfully converted')
                else:
                    pass
                    # print('The PDF "' + file + '" contains fewer than 1500 characters !!!')
        elif file.endswith('.xml'):
            text_all = get_text_from_XML_without_saving(file)
            text_all = text_all.split('competing financial interest')[0]
            text_all = text_all.replace(' ac.', '~').replace(' a.c.', '~').replace(' a.c', '~')
            name = file.split('/')[-1][:-4]
            path = new_file + '/' + name + '.txt'
            with open(path, 'w', encoding='utf8') as f:
                f.write(text_all)
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as a string and yields table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logical tables in the PDF format, so this parses PDF drawing
    instructions and tries to find rectangles, arrange them in rows, then
    arrange the text in the rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """
    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    if LOGGER.isEnabledFor(DEBUGFILES):
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random
        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        texts = sum([list(lttext_to_multilines(obj, page_layout))
                     for obj in page_layout._objs
                     if isinstance(obj, (LTTextBox, LTTextLine, LTChar))], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255),
                         random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        if not miner_layout:
            texts.sort(key=lambda t: (t.y0, t.x0))

        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [lt_to_coords(obj, page_layout)
                 for obj in page_layout._objs
                 if isinstance(obj, (LTRect, LTLine, LTCurve))]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255),
                         random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        lines = list(uniq_lines(lines))
        LOGGER.debug('found %d unique lines', len(lines))

        rows = build_rows(lines)
        LOGGER.debug('built %d rows (%d boxes)', len(rows),
                     sum(len(row) for row in rows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for r in rows:
                for b in r:
                    color = (random.randint(127, 255), random.randint(127, 255),
                             random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1),
                                   outline=color)
            fpath = '%s/3rows-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        textrows = arrange_texts_in_rows(rows, texts)
        LOGGER.debug('assigned %d strings',
                     sum(sum(len(c) for c in r) for r in textrows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for row, trow in zip(rows, textrows):
                for b, tlines in zip(row, trow):
                    color = (random.randint(127, 255), random.randint(127, 255),
                             random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1),
                                   outline=color)
                    draw.text((b.x0 + 1, b.y0 + 1),
                              '\n'.join(tlines).encode('utf-8'), color)
            fpath = '%s/4cells-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        yield textrows
    device.close()
def parse_webpages(webpages):
    for page in webpages:
        # obtain the robots.txt url
        r = Robots.robots_url(page)
        robots = Robots.fetch(r)
        if robots.allowed(page, '*'):
            # sitemaps is a list of all the sitemaps for a website
            sitemaps = robots.sitemaps
            sitemaps_list = list(sitemaps)
            html = requests.get(page)  # html of the webpage
            soup = bs4.BeautifulSoup(html.text, "html.parser")
            outlinks = soup.find_all("a")  # all the outlinks
            links = [str(i.get('href')) for i in outlinks]
            outlinks = [str(i) for i in outlinks]
            docs = []  # the documents on the page
            for file in links:
                directory = page.rsplit("/", 1)[0]
                link = directory + '/' + file
                # can be expanded to other file types with a comma
                if file.endswith(('txt', 'md')):
                    if file.startswith(('http://', 'www.')):
                        text = bs4.BeautifulSoup(requests.get(file).text, "html.parser")
                        ext = file.rsplit(".", 1)[-1]
                        text = [file, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                    else:
                        text = bs4.BeautifulSoup(requests.get(link).text, "html.parser")
                        ext = link.rsplit(".", 1)[-1]
                        text = [link, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                elif file.endswith(('pdf')):
                    # special case if PDF
                    x = file
                    try:
                        if file.startswith(('http://', 'www.')):
                            pdf = file.rsplit("/", 1)[-1]
                            response = urlopen(file)
                        else:
                            pdf = file.rsplit("/", 1)[-1]
                            # must first check if the pdf can be found
                            response = urlopen(link)
                    except urllib.error.HTTPError as e:
                        # if 404 error, put 404 as the text
                        text = [link, "pdf", "404"]
                        # text = {'link': link, 'ext': 'pdf', 'text': "404"}
                        docs.append(text)
                    else:
                        # otherwise save the pdf so pdfminer can be run on it
                        file = open(pdf, 'wb')
                        file.write(response.read())
                        file.close()
                        if x.startswith('http://'):
                            link = x
                        txt = ""
                        file = open(pdf, 'rb')
                        parser = PDFParser(file)
                        document = PDFDocument(parser)
                        rsrcmgr = PDFResourceManager()
                        laparams = LAParams()
                        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for p in PDFPage.create_pages(document):
                            # the interpreter processes the page stored in the PDFDocument object
                            interpreter.process_page(p)
                            # the device renders the layout from the interpreter
                            layout = device.get_result()
                            # of the many LT objects within the layout, we are
                            # interested in LTTextBox and LTTextLine
                            for lt_obj in layout:
                                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                                    txt += lt_obj.get_text()
                        # close the pdf file
                        file.close()
                        name = [link, "pdf", txt]
                        # name = {'link': link, 'ext': 'pdf', 'text': txt}
                        os.remove(pdf)  # remove the saved file when done
                        docs.append(name)
            docs = [[str(i) for i in lis] for lis in docs]
            timestamp = datetime.datetime.now().isoformat()
            output = {
                'url': page,
                'timestamp': timestamp,
                'outlinks': outlinks,
                'html': html.text,
                'docs': docs,
                'sitemaps': sitemaps_list,
            }
            with Crawling_L_REST.app.app_context():
                Crawling_L_REST.add_webpage(output)
    return output
dic = {'文件名': file, '证券简称': name, '证券代码': code}
# get the document object
fp = open('pdf/' + file, 'rb')
# create a parser tied to the document
parser = PDFParser(fp)
# the pdf document object
doc = PDFDocument()
# connect the parser and the document object
parser.set_document(doc)
doc.set_parser(parser)
# initialize the document
doc.initialize('')
# create the PDF resource manager
resource = PDFResourceManager()
# layout parameter analyzer
las = LAParams()
# create an aggregator
device = PDFPageAggregator(resource, laparams=las)
# create the PDF page interpreter
interpreter = PDFPageInterpreter(resource, device)
# use the document object to get the collection of pages
# full pdf text
pdf_text = ''
for page in doc.get_pages():
    # text of this page
    page_text = ''
    # read the page with the interpreter
    interpreter.process_page(page)
    # fetch the content from the aggregator
    layout = device.get_result()
def get_text_box(pdf_path):
    """
    Returns a list of boxes per page, shaped as below: "region" is the list of
    coordinates of the text blocks and "media_box" is the size of each page;
    the origin is x0=0, y0=0.
    [
        {   # page 1
            "region": [
                {"cordinate": [x0, y0, x1, y1], "text": "some text"},
                {"cordinate": [x0, y0, x1, y1], "text": "some text"},
                {"cordinate": [x0, y0, x1, y1], "text": "some text"}
            ],
            "media_box": [x1, y1]
        },
        {   # page 2
            "region": [
                {"cordinate": [x0, y0, x1, y1], "text": "some text"},
                {"cordinate": [x0, y0, x1, y1], "text": "some text"},
                {"cordinate": [x0, y0, x1, y1], "text": "some text"}
            ],
            "media_box": [x1, y1]
        }
    ]
    """
    fp = open(pdf_path, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    device = PDFDevice(rsrcmgr)
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    list_all_box = []
    # loop over all pages in the document
    for page in PDFPage.create_pages(document):
        list_item = {}
        interpreter.process_page(page)
        layout = device.get_result()
        media_texbox = (int(page.mediabox[2]), int(page.mediabox[3]))
        MEDIA_Y1 = int(page.mediabox[3])
        sub_box = parse_obj(layout._objs, MEDIA_Y1)
        list_item['region'] = sub_box
        list_item['media_box'] = media_texbox
        list_all_box.append(list_item)
    return list_all_box
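# --- The parse_obj helper that fills "region" is not shown in this snippet.
# A plausible sketch follows: collect each horizontal text box's bbox (flipped
# so y is measured from the top of the page, which is what passing MEDIA_Y1
# suggests) together with its text. The body is an assumption; the key is
# spelled 'cordinate' to match the docstring above.
from pdfminer.layout import LTTextBoxHorizontal

def parse_obj(lt_objs, media_y1):
    # one dict per horizontal text box, with the bbox converted from
    # pdfminer's bottom-left origin to a top-left origin
    boxes = []
    for obj in lt_objs:
        if isinstance(obj, LTTextBoxHorizontal):
            x0, y0, x1, y1 = obj.bbox
            boxes.append({
                'cordinate': [x0, media_y1 - y1, x1, media_y1 - y0],
                'text': obj.get_text(),
            })
    return boxes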
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.layout import LAParams  # needed for the LAParams() calls below

from AIDataConverter import AIDataConverter
from LegalDocMLconverter import LegalDocMLconverter
from XMLconverter import XMLConverter
from PDFMinerconverter import PDFMinerConverter

input_path = "../sample/NZBC-G4#3.4_13.pdf"
output_path = "../output/output.xml"
output_path2 = "../output/sample.xml"
img_output_path = "../output/pdfminer_page1.jpg"

outfp = open(output_path, "wb")
with open(input_path, 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    #device = AIDataConverter(rsrcmgr, outfp, laparams=LAParams())
    device = LegalDocMLconverter(rsrcmgr, outfp, laparams=LAParams())
    #device = XMLConverter(rsrcmgr, outfp, laparams=LAParams())
    #device = PDFMinerConverter(rsrcmgr, outfp, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
#device.draw_layout(input_path, img_output_path)
device.close()
outfp.close()
def convert_to_text(fname):
    pages = None
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()  # was output.close (missing parentheses), which never closed it

    text_list = text.split('\n')
    txt = text_list[:3]
    text = ' '.join(text_list[3:])
    print("###################")
    print(txt)

    # splitting words out of the string
    word_list = text.split(' ')
    string_input = ""
    flag = 0
    for word in word_list:
        # print("*********")
        # print(word)
        if word.lower() == 'tran':
            break
        else:
            if (word.lower() == 'customer' or word.lower() == 'scheme'
                    or word.lower() == 'currency' or word.lower() == 'for'):
                word = '\n' + word
            elif word.lower() == 'statement':
                word = '\n' + word
                flag = 1
            elif word.lower() == 'account' and flag == 1:
                word = '\n' + word
            string_input += word + " "
    print("::::::::::::::::::::::")
    # print(string_input)

    file_name = fname.split('/')[-1]
    file_name = file_name.split('.')[0]
    # print(file_name)

    # write content to .txt
    text_file = open("/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/txt/output_"
                     + file_name + ".txt", "w")
    text = re.sub("\s\s+", " ", text)
    text_file.write("%s" % text)
    text_file.close()

    file_name_main = "output_" + file_name + ".csv"
    csv_file = open("/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/csv/"
                    + file_name_main, "w")
    text = re.sub("\s\s+", " ", string_input)
    csv_file.write("%s" % string_input)
    csv_file.close()
    length_lines = len(string_input.split('\n'))
    # print("-----------", length_lines)
    convert_to_table(fname, string_input, txt)
def parse_pdf(pdfFile, debug, stats, current_task=None):
    stats[0] = 0
    stats[1] = 0
    reader = PdfReader(pdfFile)
    if debug:
        debugFile = os.path.splitext(pdfFile)[0] + "-debug.pdf"
        pdf_canvas = canvas.Canvas(debugFile)
        form = pdf_canvas.acroForm
        pdf_canvas.setStrokeColor(black)
        pdf_canvas.setLineWidth(0.1)
        colors = [black, red, green, blue]
    pages = []
    for page_layout in extract_pages(pdf_file=pdfFile,
                                     laparams=LAParams(line_margin=0,
                                                       char_margin=0.5)):
        pageNum = page_layout.pageid
        pageWidth = page_layout.bbox[2]
        pageHeight = page_layout.bbox[3]
        if current_task is not None:
            current_task.update_state(
                state='PROGRESS',
                meta="Extracting page " + str(pageNum) + " of "
                + str(len(reader.pages)))
        # Extract texts and lines.
        textLines = []
        lines = []
        for element in page_layout:
            extract_elements(textLines, lines, element)
        # Process lines.
        merge_lines(lines)
        split_lines(lines)
        extract_line_features(lines, textLines, pageHeight)
        # Match fields.
        if pageNum > len(reader.pages):
            return pages
        pdf_page = reader.pages[pageNum - 1]
        if pdf_page.Annots and len(pdf_page.Annots) > 0:
            match_fields(pdf_page, lines, stats)
        pages.append(lines)
        # Dump debugging info.
        if debug:
            pdf_canvas.setPageSize((pageWidth, pageHeight))
            # Dump text lines.
            for textLine in textLines:
                left = textLine.Position.Left
                right = textLine.Position.Right
                top = textLine.Position.Top
                bottom = textLine.Position.Bottom
                form.textfield(value=textLine.Text, x=left, y=bottom,
                               width=right - left, height=top - bottom,
                               borderWidth=0, fontSize=7)
            # Dump lines.
            ci = 0
            i = 1
            for line in lines:
                pdf_canvas.setStrokeColor(colors[ci])
                ci = (ci + 1) % len(colors)
                pdf_canvas.setLineWidth(line.LineWidth)
                pdf_canvas.rect(line.Position.Left, line.Position.Bottom,
                                line.Position.Right - line.Position.Left,
                                line.Position.Top - line.Position.Bottom)
                # Dump fields.
                if line.IsHorizontal:
                    value = "line_" + str(i)
                    if line.IsMarkupField:
                        value = line.FieldCode
                    form.textfield(value=value,
                                   x=line.FieldPosition.Left,
                                   y=line.FieldPosition.Bottom,
                                   width=line.FieldPosition.Right
                                   - line.FieldPosition.Left,
                                   height=line.FieldPosition.Top
                                   - line.FieldPosition.Bottom,
                                   borderWidth=0, fontSize=7)
                i += 1
            pdf_canvas.showPage()
    if debug:
        pdf_canvas.save()
    return pages
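# A minimal driver sketch for parse_pdf; the input file name is hypothetical
# and the meaning of the two stats counters is inferred from match_fields.
if __name__ == '__main__':
    stats = [0, 0]  # counters updated by match_fields (exact meaning assumed)
    pages = parse_pdf('form.pdf', debug=True, stats=stats)
    print('%d pages parsed, stats=%s' % (len(pages), stats))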
def main(argv):
    for arg in argv[1:]:
        fd = open(arg, 'rb')
        parser = PDFParser(fd)
        document = PDFDocument(parser)
        if not document.is_extractable:
            print("Document not extractable.")
            return 1
        params = LAParams(char_margin=1)
        resMan = PDFResourceManager(caching=True)
        device = PDFPageAggregator(resMan, laparams=params)
        interpreter = PDFPageInterpreter(resMan, device)
        # Renamed from `parser`, which shadowed the PDFParser above.
        man_parser = x86ManParser("html", params)
        i = 1
        for page in PDFPage.get_pages(fd, set(), caching=True,
                                      check_extractable=True):
            print("Processing page %i" % i)
            interpreter.process_page(page)
            page = device.get_result()
            man_parser.process_page(page)
            i += 1
        man_parser.flush()
        fd.close()
        # RG: We get the parse in just one file: html/AAA.html
        # Looks like the original repo does not create all the separate pages
        # for all the instructions nor the index.html.
        # So it is just one big parse in one file, named after the first
        # instruction. Later he pulled the separate pages back out of it by
        # searching for <h3> and adding the closing tags, and likewise built
        # an index.html and added his own style.css.
        # NOTE_: we are getting 3 sorts of Description: <p>, <table> and <svg>.
        # On his website it is only <p>, so he certainly did post-processing,
        # such as adding <pre> and <h2> etc.
        # So this is only a rough parse to get the text and tables out.
        # Given the issues (e.g. problems with footnotes in MOV), it is
        # probably better to work with the html that pdf2txt produces. Not as
        # clean, but it causes fewer problems.
        # Looking at the results of all the pdf2html conversions, it seems
        # this is not so easy to do programmatically after all. zneak/felix
        # did his best and generates clean html, but there are still many
        # errors in it (see issues). pdf2txt produces a neat layout, but
        # tables often go wrong and need a lot of post-processing. It is
        # comparable to pdf2music: sometimes it works, but usually it does
        # not look good and you are better off doing everything by hand.
        # A lot of work, but it gives the best result.
        # pdf2txt uses spans for tables, by the way. Ugly.
        # DONE: checked out pdftohtml from Xpdf. This produces the best
        # looking pages. But also no real tables. It uses a png file as
        # background for the tables and then lays everything out with
        # absolutely positioned divs. For exact positioning that seems the
        # way to go. But also slight mistakes in the table layout. Faster
        # (C++, Qt) and better than pdfminer.six.
        # But no real tables is faking it...
        # NOTE_: at autoclose we are getting mismatch: th strong when parsing
        # the full vol2a.pdf. Something goes wrong.
        # Figures are extracted as svg but often look warped (e.g. Figure
        # 3-18 and 3-19 at HADDPS).
        # PDF parsing is like unscrambling scrambled eggs...
        # DONE: checked out pdf2htmlEX. It creates perfectly looking html 5
        # pages. It can be done! It is fast and puts everything in one html
        # page.
        # TODO_: check out https://github.com/fmalina/unilex-transcript which
        # promises to create clean (semantic) html from pdf2htmlEX output.
        # NOTE_: Conversion result always prints 0/0 because we never reach
        # the code where success and fail are incremented, so they are dead
        # variables.
        print("Conversion result: %i/%i"
              % (man_parser.success, man_parser.success + man_parser.fail))
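# Entry-point sketch (assumed): pass one or more PDF paths on the command line.
if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))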
def get_text_from_pdf(pdfname, limit=1000):
    if pdfname == '':
        return ''
    # Open the PDF file to process; give up if it cannot be opened.
    try:
        fp = open(pdfname, 'rb')
    except:
        return ''
    # Extract the text from the PDF.
    rsrcmgr = PDFResourceManager()
    out_fp = StringIO()
    la_params = LAParams()
    la_params.detect_vertical = True
    device = TextConverter(rsrcmgr, out_fp, codec='utf-8', laparams=la_params)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos=None, maxpages=0, password=None,
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)
    text = out_fp.getvalue()
    fp.close()
    device.close()
    out_fp.close()
    # Split on line breaks.
    #lines = text.splitlines()
    lines = []
    lines.append(text)
    outputs = []
    output = ""
    # UTF-8 characters to strip.
    replace_strs = [b'\x00']
    is_blank_line = False
    # Loop over the split lines.
    for line in lines:
        # Convert to a byte string.
        line_utf8 = line.encode('utf-8')
        # Strip the unwanted characters.
        for replace_str in replace_strs:
            line_utf8 = line_utf8.replace(replace_str, b'')
        # Convert back to str.
        line = line_utf8.decode()
        # Collapse consecutive spaces into one.
        line = re.sub("[ ]+", " ", line)
        # Strip leading and trailing whitespace.
        line = line.strip()
        #print("aft:[" + line + "]")
        # Ignore blank lines.
        if len(line) == 0:
            is_blank_line = True
            continue
        # Ignore lines that are only a number.
        if is_float(line):
            continue
        # Ignore lines with a single word that does not end in a period.
        # (The original compared the bound method `.count` to 1, which is
        # always False; use len() instead.)
        if len(line.split(" ")) == 1 and not line.endswith("."):
            continue
        # At a break between sentences:
        if is_blank_line or output.endswith("."):
            # If the length exceeds limit, cut the chunk off here.
            if len(output) > limit:
                outputs.append(output)
                output = ""
            else:
                output += "\r\n"
        # If this line continues the previous one (hyphenation):
        elif not is_blank_line and output.endswith("-"):
            output = output[:-1]
        # Otherwise insert a space as a word separator.
        else:
            output += " "
        #print("[" + str(line) + "]")
        output += str(line)
        is_blank_line = False
    outputs.append(output)
    outputs.append('\n')
    return outputs
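# Usage sketch, assuming a local PDF path: the function returns a list of
# text chunks of at most roughly `limit` characters each.
if __name__ == '__main__':
    for chunk in get_text_from_pdf('paper.pdf', limit=1000):  # hypothetical file
        print(chunk)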
def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData:
    """
    Parse CAS pdf and return line data.

    :param filename: CAS pdf file (CAMS or Kfintech)
    :param password: CAS pdf password
    :return: array of lines from the CAS.
    """
    file_type: Optional[FileType] = None

    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif hasattr(filename, "read") and hasattr(filename, "close"):
        # file-like object
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect:
            raise IncorrectPasswordError("Incorrect PDF password!")
        except PDFSyntaxError:
            raise CASParseError("Unhandled error while opening file")

        line_margin = {
            FileType.KFINTECH: 0.1,
            FileType.CAMS: 0.2,
        }.get(detect_pdf_source(document), 0.2)

        rsrc_mgr = PDFResourceManager()
        laparams = LAParams(line_margin=line_margin, detect_vertical=True)
        device = PDFPageAggregator(rsrc_mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrc_mgr, device)

        pages: List[Iterator[LTTextBoxHorizontal]] = []
        investor_info = None

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            text_elements = filter(
                lambda x: isinstance(x, LTTextBoxHorizontal), layout)
            if file_type is None:
                for el in filter(
                        lambda x: isinstance(x, LTTextBoxVertical), layout):
                    if re.search("CAMSCASWS", el.get_text()):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", el.get_text()):
                        file_type = FileType.KFINTECH
            if investor_info is None:
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            pages.append(text_elements)

        lines = group_similar_rows(pages)
        return PartialCASData(file_type=file_type,
                              investor_info=investor_info,
                              lines=lines)
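# Usage sketch for cas_pdf_to_text; the file name and password are
# placeholders, and the PartialCASData fields follow the function above.
if __name__ == '__main__':
    data = cas_pdf_to_text('cas.pdf', password='secret')
    print(data.file_type, len(data.lines))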
parser = ArgumentParser(
    description='This program replaces the citation links inside a pdf, '
                'which just go to the page, with the ADS abstract link')
parser.add_argument('input', help='The input pdf file')
parser.add_argument('output', help='The processed output pdf file')
args = parser.parse_args()
inputPDFDocName = args.input
outputPDFDocName = args.output

# Standard recipe
document = open(inputPDFDocName, 'rb')
# Create resource manager
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
parser = PDFParser(document)
doc = PDFDocument(parser)

# Get links and their positions, and put that info into custom objects.
curPage = 0
documentParsed = {}
pageObjIds = []  # the original used this list without initialising it
for page in PDFPage.get_pages(document):
    interpreter.process_page(page)
    # Record the pageid order in the document.
    pageObjIds.append(page.pageid)
    curPage = getPageNumWithPageObj(page)
def get_pdf_file_content(path_to_pdf):
    '''
    path_to_pdf: path of the PDF file whose content we want to extract.
    '''
    # PDFResourceManager stores shared resources, such as fonts or images,
    # that we might encounter in the files.
    resource_manager = PDFResourceManager(caching=True)
    # Create a string object that will contain the final text representation
    # of the pdf.
    out_text = StringIO()
    # UTF-8 is one of the most commonly used encodings, and Python often
    # defaults to it. We specify it explicitly here to avoid some encoding
    # errors.
    codec = 'utf-8'
    # LAParams holds the layout parameters, with sensible default values.
    laParams = LAParams()
    # Create a TextConverter object, taking:
    # - resource_manager,
    # - out_text,
    # - layout parameters.
    # (The codec is now passed along; the original defined it but never
    # used it.)
    text_converter = TextConverter(resource_manager, out_text,
                                   codec=codec, laparams=laParams)
    fp = open(path_to_pdf, 'rb')
    # Create a PDF interpreter object taking:
    # - resource_manager
    # - text_converter
    interpreter = PDFPageInterpreter(resource_manager, text_converter)
    # Process the content of each page of the original PDF file.
    for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="",
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)
    # Retrieve the entire contents of the "file" before the StringIO
    # object's close() method is called.
    text = out_text.getvalue()
    # Close all the resources we previously opened.
    fp.close()
    text_converter.close()
    out_text.close()
    # Return the final variable containing all the text of the PDF.
    return text
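# A short usage sketch, assuming a local PDF path:
if __name__ == '__main__':
    content = get_pdf_file_content('example.pdf')  # hypothetical file
    print(content[:500])  # preview the first 500 characters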