def extract_layout_by_page(pdf_path):
    """Extract LTPage objects from a PDF file, one per page.

    Slightly modified from
    https://euske.github.io/pdfminer/programming.html

    Args:
        pdf_path: Path of the PDF file to analyse.

    Returns:
        A list holding the result of ``device.get_result()`` for every
        page, in document order.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    laparams = LAParams()
    laparams.detect_vertical = True

    # The context manager closes the file on every path, including the
    # "not extractable" raise; the handle used to be left open.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layouts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())
    return layouts
def read_by_page(path):
    """Read the raw text of a PDF and return it page by page.

    Args:
        path: Path of the PDF file.

    Returns:
        ["page1_text", "page2_text", ...]
    """
    result = []
    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in tqdm(PDFPage.get_pages(fp)):
                interpreter.process_page(page)
                # Take this page's text, then reset the shared buffer so
                # the next page starts from an empty buffer.
                result.append(outfp.getvalue())
                outfp.truncate(0)
                outfp.seek(0)
    finally:
        # Release the converter and buffer even when a page fails.
        device.close()
        outfp.close()
    return result
def get_kondate_from_pdf(dir, year, month):
    """Collect the menu data parsed from the PDF of one year and month.

    The file is looked up at ``<dir>/PDFData/<year>/<MM>.pdf`` where
    ``MM`` is the month zero-padded to two digits.

    Args:
        dir: Base directory (string) that contains ``PDFData``.
        year: Year; used in the path and passed to
            ``get_kondate_from_parsed_data``.
        month: Month; used in the file name.

    Returns:
        A list extended with the result of
        ``get_kondate_from_parsed_data`` for every page.

    Raises:
        FileNotFoundError: If the PDF file does not exist.
    """
    # Objects required to analyse the PDF.
    manager = PDFResourceManager()
    params = LAParams()
    params.detect_vertical = True
    aggregator = PDFPageAggregator(manager, laparams=params)

    # Build the path of the PDF for the requested year/month.
    pdf_name = str(month).zfill(2) + ".pdf"
    file_path = "/".join([dir + "/PDFData", str(year), pdf_name])
    if not os.path.exists(file_path):
        raise FileNotFoundError("指定された年月のPDFファイルが存在しません.")

    def reading_order(box):
        # Top of the page first, then left to right.
        return (-box.y1, box.x0)

    kondate_data_all = []
    with open(file_path, 'rb') as pdf_file:
        interpreter = PDFPageInterpreter(manager, aggregator)
        pages = PDFPage.get_pages(pdf_file, maxpages=0, caching=True,
                                  check_extractable=True)
        for page in pages:
            interpreter.process_page(page)
            text_boxes = find_textbox_recursively(aggregator.get_result())
            text_boxes.sort(key=reading_order)
            parsed_data = parse_textboxes(text_boxes)
            kondate_data_all.extend(
                get_kondate_from_parsed_data(year, parsed_data))
    aggregator.close()
    return kondate_data_all
def parse_pdf_to_txt(pdf_handle, write_file):
    """Convert a PDF to XML with ``XMLConverter`` and write it to a file.

    Args:
        pdf_handle: PDF file object handed to ``process_pdf``.
        write_file: Path of the output file. It is opened as UTF-8 text
            with ``errors='ignore'``.
    """
    encoding = 'utf-8'
    use_cache = True
    layout = LAParams()
    layout.detect_vertical = True
    # Resource manager that manages the shared resources.
    manager = PDFResourceManager(caching=use_cache)
    print("ready to open out file ........")
    with open(write_file, "wt", encoding=encoding, errors='ignore') as sink:
        converter = XMLConverter(manager, sink, laparams=layout)
        print("ready to converte pdf to xml ........")
        process_pdf(
            manager,
            converter,
            pdf_handle,
            set(),
            maxpages=0,
            password='',
            caching=use_cache,
            check_extractable=True,
        )
        # Closed while the output file is still open.
        converter.close()
def _convert(input_file, page_id):
    """Return the text boxes found on one page of a PDF.

    Args:
        input_file: Path of the PDF file.
        page_id: Index of the wanted page in the list of all pages.

    Returns:
        The list produced by ``_findAndGetBoxes`` for that page, sorted
        by descending ``y1`` and then ascending ``x0`` (boxes nearer the
        top-left come first).
    """
    # Parameters for the layout analysis.
    layout_params = LAParams()
    layout_params.detect_vertical = True

    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=layout_params)
    interpreter = PDFPageInterpreter(manager, aggregator)

    def reading_order(box):
        return (-box.y1, box.x0)

    with open(input_file, 'rb') as pdf_file:
        # Materialise every page so page_id can be used as a list index.
        page = list(PDFPage.get_pages(pdf_file))[page_id]
        interpreter.process_page(page)
        content = aggregator.get_result()

    boxes = _findAndGetBoxes(content)
    boxes.sort(key=reading_order)
    return boxes
def outputText(inputPDFFile, outputTXTFile):
    """Extract text from a PDF and write it to a UTF-8 text file.

    Every layout element of every page is passed to ``checkLtFigure``
    together with the 1-based page number and a shared list of text
    fragments; two newlines are appended after each page.

    Args:
        inputPDFFile: Path of the PDF file to read.
        outputTXTFile: Path of the text file to write.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Output vertical writing characters horizontally
    laparams.detect_vertical = True
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    charBuf = []

    try:
        # Context manager: the PDF handle is closed even if parsing fails.
        with open(inputPDFFile, 'rb') as fp:
            document = PDFDocument(PDFParser(fp), password)
            pages = PDFPage.create_pages(document)
            for pageNum, page in enumerate(pages, start=1):
                interpreter.process_page(page)
                for element in device.get_result():
                    checkLtFigure(element, pageNum, charBuf)
                # Separator before the next page.
                charBuf.append("\n\n")
    finally:
        device.close()

    # The output file is only created once the whole PDF was processed.
    with open(outputTXTFile, 'wt', encoding='UTF-8') as wfp:
        wfp.write(''.join(charBuf))
def convert_pdf_to_txt(path):
    """Return the text of all pages of a PDF as a single string.

    Args:
        path: Path of the PDF file.

    Returns:
        The text written to the output buffer by ``TextConverter``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True  # setting this gives cleaner text output
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # The buffer is never reset, so it already holds the text of every
        # page. Read it once: appending it after each page would repeat
        # the text of all earlier pages.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
def pdf2str(path):
    """Return the text of a PDF file as one string.

    Layout analysis runs with ``all_texts`` and ``detect_vertical``
    enabled.

    Args:
        path: Path of the PDF file.

    Returns:
        The contents of the output buffer after all pages were processed.
    """
    # Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    # Set parameters
    codec = 'utf-8'
    laparams.all_texts = True
    laparams.detect_vertical = True
    # Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        try:
            # open() replaces the Python 2-only file() builtin; the context
            # manager closes the handle even when a page fails.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
def get_result_from_file(filename):
    """Analyse a PDF and describe every page as a dictionary.

    Args:
        filename: Path of the PDF file.

    Returns:
        ``{"filename": filename, "pages": [...]}`` where each page entry
        holds ``index`` (0-based), ``bounding_box`` (from
        ``get_bounding_box``) and ``labels`` (from ``get_text_labels``).

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}

    # The context manager also closes the file on the "not extractable"
    # raise and on any processing error.
    with open(filename, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = PDFPage.create_pages(document)
        for page_index, page in enumerate(pages):
            interpreter.process_page(page)
            layout = device.get_result()
            result["pages"].append({
                "index": page_index,
                "bounding_box": get_bounding_box(layout),
                "labels": get_text_labels(layout),
            })
    return result
def pdfsearch(path):
    """Find the pages on which the match count of ``searchword`` grows.

    The text of all pages read so far is kept in one buffer. After each
    page the whitespace is removed from that text and the matches of the
    module-level ``searchword`` pattern are counted. A page is reported
    when the count is higher than the highest count seen before it.

    Args:
        path: Path of the PDF file.

    Returns:
        List of 0-based page indices. The list is also printed.
    """
    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    pagelist = []
    matchtime = 0
    try:
        with open(path, 'rb') as fp:  # open a pdf file
            # The interpreter processes the content of each page object
            # handed to it and sends the output to the device.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(fp, maxpages=0)
            for pagecount, page in enumerate(pages):
                # After this call the page's text has been added to outfp.
                interpreter.process_page(page)
                text = re.sub(r"\s+", "", outfp.getvalue())
                found = len(searchword.findall(text))  # counted once
                if matchtime < found:
                    pagelist.append(pagecount)
                    matchtime = found
                # NOTE(review): the source formatting was lost; this
                # 1-based progress output is assumed to run for every
                # page rather than only for matching pages -- confirm.
                print(str(pagecount + 1), end=" ")
                sys.stdout.flush()
    finally:
        # Release the converter and buffer even when a page fails.
        device.close()
        outfp.close()
    print(pagelist)
    return pagelist
def convert(fname, pages=None, M=1.0, L=0.3, W=0.2, F=0.5):
    """Convert a PDF file into plain text.

    The layout parameters are not absolute lengths; each is a proportion
    relative to the size of the characters in question.

    In a PDF, text comes in several chunks. Chunks closer to each other
    than the char_margin (M) are regarded as continuous and grouped into
    one block. Two lines belong to the same text if they are closer than
    the line_margin (L). If the distance between two words is greater
    than the word_margin (W), blank characters are inserted to keep the
    format. boxes_flow (F) specifies how much the horizontal and vertical
    position of a text matters when determining the text flow order; it
    should be within -1.0 (only the horizontal position matters) and
    +1.0 (only the vertical position matters).

    Args:
        fname: PDF file name (string).
        pages: Pages to extract; any iterable, turned into a set. An
            empty or missing value is passed on as an empty set.
        M: char_margin (float).
        L: line_margin (float).
        W: word_margin (float).
        F: boxes_flow (float).

    Returns:
        The contents of the ``BytesIO`` output buffer.
    """
    pagenums = set(pages) if pages else set()

    output = BytesIO()
    codec = "utf-8"
    manager = PDFResourceManager()

    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = False
    laparams.char_margin = M
    laparams.line_margin = L
    laparams.word_margin = W
    laparams.boxes_flow = F

    converter = TextConverter(manager, output, codec=codec,
                              laparams=laparams)
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        try:
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()
    finally:
        # The original wrote ``output.close`` without parentheses, which
        # never closed the buffer.
        output.close()
def readText(self, path, outtype='text', opts={}):
    """Convert a PDF to HTML with ``HTMLConverter`` and print the result.

    The layout parameters are printed before the conversion and the
    generated HTML afterwards. Nothing is returned.

    Args:
        path: Path of the PDF file.
        outtype: Only used to derive a default output file name, which
            the conversion below does not use.
        opts: pdf2txt-style options, either an iterable of
            ``(flag, value)`` pairs or a dict mapping flag to value.
            The flags -p, -m, -P, -n, -A, -V, -M, -L, -W, -F and -c
            affect the conversion; -d, -o, -Y, -O, -t and -s are accepted
            but have no effect here.
    """
    outfile = path[:-3] + outtype
    outdir = None
    # ``debug`` and ``password`` were read before assignment: the function
    # failed unless -P was given, and -d always failed.
    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()

    # ``opts`` is only read, never mutated, so the shared default is safe.
    # Iterating a dict directly would yield its keys, not (flag, value).
    pairs = opts.items() if isinstance(opts, dict) else opts
    for (k, v) in pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    # Single-argument print() behaves the same on Python 2 and 3.
    print(laparams)

    # The debug level comes from the instance, not from the -d option.
    CMapDB.debug = self.debug
    PDFResourceManager.debug = self.debug
    PDFPageInterpreter.debug = self.debug
    PDFDevice.debug = self.debug

    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    try:
        # open() replaces the Python 2-only file() builtin.
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, check_extractable=True)
        # Close the converter before the buffer is read.
        device.close()
        print(outfp.getvalue())
    finally:
        outfp.close()
    return
def to_text(self):
    """Return the text of ``self._doc`` decoded as UTF-8.

    Every page from ``self._doc.get_pages()`` is processed with
    ``all_texts`` and ``detect_vertical`` enabled and a word margin of
    0.4. Bytes that cannot be decoded are ignored.
    """
    manager = PDFResourceManager()
    sink = StringIO()

    params = LAParams()
    params.detect_vertical = True
    params.all_texts = True
    params.word_margin = 0.4

    converter = TextConverter(manager, sink, laparams=params)
    page_interpreter = PDFPageInterpreter(manager, converter)
    pages = self._doc.get_pages()
    for current in pages:
        page_interpreter.process_page(current)

    raw = sink.getvalue()
    return raw.decode('utf-8', 'ignore')
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """Convert a PDF (BORME PDF format) to a text file.

    Args:
        filename_in: Path of the PDF file to read.
        filename_out: Path of the text file to write. If it is a
            directory, the output is written inside it under the base
            name of ``filename_in``.
        rewrite: When false, an existing output file is left untouched.

    Returns:
        False if the output already exists and ``rewrite`` is false,
        True after a conversion.
    """
    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out,
                                    os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is '
                     'disabled!', filename_out)
        return False

    # conf
    codec = 'utf-8'
    imagewriter = None
    pagenos = set()
    maxpages = 0
    password = ''
    rotation = 0
    caching = True

    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    rsrcmgr = PDFResourceManager(caching=caching)

    # The input is opened first, so a missing PDF no longer leaves an
    # empty output file behind. Both handles are closed on every path.
    with open(filename_in, 'rb') as fp, open(filename_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # https://github.com/euske/pdfminer/issues/72
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            device.close()
    return True
def pdf_to_txt(path):
    """Write the text of a PDF to ``<path>.txt``.

    Args:
        path: Path of the PDF file. The output path is this path with
            ``.txt`` appended.
    """
    rsrcmgr = PDFResourceManager(caching=True)
    laparams = LAParams()
    laparams.detect_vertical = True

    # Both files are closed on every path, and the device is closed
    # before the file it writes to (the original closed them in the
    # opposite order).
    with open(path, 'rb') as fp, open(path + '.txt', 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                               laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        finally:
            device.close()
def gettext(filepath):
    """Read a PDF file and return its text as a string.

    Mostly copied from
    https://arakan-pgm-ai.hatenablog.com/entry/2018/01/07/080000

    The extracted text is also written, encoded as cp932 with
    unencodable characters dropped, to ``mirai_output2(gettext).txt``
    under ``ABS_DIRNAME``.

    Args:
        filepath: Path of the PDF file. An empty string returns ``''``
            without any further work.

    Returns:
        The extracted text.

    Raises:
        SystemExit: If the file does not exist (after printing the error).
    """
    # No file name given: return an empty string and stop.
    if filepath == '':
        return ''

    # Open the PDF; exit the program if it does not exist.
    try:
        fp = open(filepath, 'rb')
    except FileNotFoundError as e:
        print(e)
        print('Press Enter to exit.')
        sys.exit()

    # ``with`` closes the PDF handle even when extraction fails.
    with fp:
        rsrcmgr = PDFResourceManager()
        outfp = StringIO()
        laparams = LAParams()
        # Output vertically written characters horizontally.
        laparams.detect_vertical = True
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                               laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # maxpages=0 means all pages.
            pages = PDFPage.get_pages(fp, pagenos=None, maxpages=0,
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
            # Read everything that was extracted.
            ret = outfp.getvalue()
        finally:
            device.close()
            outfp.close()

    path = ABS_DIRNAME + '/mirai_output2(gettext).txt'
    with open(path, mode='wb') as f:
        f.write(ret.encode('cp932', 'ignore'))
    return ret
def getPdfAsText(pdfPages = None, fileDescriptor = None):
    """Return the text of the given PDF pages as one string.

    Args:
        pdfPages: Iterable of pages to process. If it is None, the pages
            are obtained from ``getPdfPages(fileDescriptor)``.
        fileDescriptor: Source handed to ``getPdfPages`` when no pages
            are given.

    Returns:
        The contents of the output buffer after all pages were processed.

    Raises:
        TypeError: If neither ``pdfPages`` nor ``fileDescriptor`` is
            given.
    """
    if pdfPages is None and fileDescriptor is not None:
        pdfPages = getPdfPages(fileDescriptor)
    if pdfPages is None:
        # Previously this failed later with "'NoneType' object is not
        # iterable"; same exception type, raised before allocating
        # anything.
        raise TypeError('getPdfAsText() needs pdfPages or fileDescriptor')

    resourceManager = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True

    outputStream = StringIO.StringIO()
    try:
        # The device is created outside its own try block: the old
        # ``finally`` referenced ``device`` even when creating it had
        # failed, which masked the real error with a NameError.
        device = TextConverter(resourceManager, outputStream,
                               laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(resourceManager, device)
            for pdfPage in pdfPages:
                interpreter.process_page(pdfPage)
            return outputStream.getvalue()
        finally:
            device.close()
    finally:
        outputStream.close()
def test(pdfpath, txtpath, buf=True):
    """Extract the text of a PDF, print it and save it as UTF-8.

    Runs of the characters in the ``[ ]+`` pattern are removed from the
    extracted text first.

    Args:
        pdfpath: Path of the PDF file to read.
        txtpath: Path of the text file to write.
        buf: Unused.
    """
    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    laparams = LAParams()
    laparams.detect_vertical = True
    device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
    try:
        with open(pdfpath, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # maxpages: page limit (0 means all pages)
            for page in PDFPage.get_pages(fp, pagenos=None, maxpages=0,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # The buffer holds the text of all pages, so it is cleaned once
        # after the loop; this also defines ``text`` for a PDF without
        # pages.
        text = re.sub(re.compile(r"[ ]+"), "", outfp.getvalue())
    finally:
        device.close()
        outfp.close()

    print(text)
    with open(txtpath, 'w', encoding='utf-8') as f:
        f.write(text)
def convert_pdf_to_txt(path, txtname, buf=True):
    """Extract the text of a PDF into a buffer or into a text file.

    Args:
        path: Path of the PDF file.
        txtname: Path of the text file written when ``buf`` is false.
        buf: When true, the text is collected in memory, the matches of
            the module-level ``space`` pattern are removed and the
            result is printed. When false, the text is written to
            ``txtname``.
    """
    rsrcmgr = PDFResourceManager()
    outfp = StringIO() if buf else open(txtname, 'w')
    try:
        codec = 'utf-8'
        laparams = LAParams()
        laparams.detect_vertical = True
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams)
        try:
            # The PDF handle is closed even when a page fails.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
        finally:
            device.close()

        if buf:
            text = re.sub(space, "", outfp.getvalue())
            print(text)
    finally:
        # Always release the buffer or output file.
        outfp.close()
def gettext(pdfname):
    """Return the text of a PDF with all whitespace removed.

    Args:
        pdfname: Path of the PDF file.

    Returns:
        The extracted text as one block without whitespace, or ``''``
        when no file name is given or the file cannot be opened.
    """
    # No file name given: return an empty string and stop.
    if pdfname == '':
        return ''

    # Open the PDF. Only failures of open() itself are treated as "no
    # text"; the former bare ``except:`` also swallowed KeyboardInterrupt
    # and SystemExit.
    try:
        fp = open(pdfname, 'rb')
    except (IOError, OSError, TypeError, ValueError):
        return ''

    # ``with`` closes the PDF handle even when extraction fails.
    with fp:
        rsrcmgr = PDFResourceManager()
        outfp = StringIO()
        laparams = LAParams()
        # Output vertically written characters horizontally.
        laparams.detect_vertical = True
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                               laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # maxpages=0 means all pages.
            for page in PDFPage.get_pages(fp, pagenos=None, maxpages=0,
                                          password=None, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            # Read everything that was extracted.
            ret = outfp.getvalue()
        finally:
            device.close()
            outfp.close()

    # Strip blanks and line breaks and return one block of text.
    return re.sub(r"\s| ", '', ret)
def convert_pdf_to_txt(path, txtname, buf=True):
    """Extract the text of a PDF into a buffer or into a text file.

    Args:
        path: Path of the PDF file.
        txtname: Path of the text file written when ``buf`` is false.
        buf: When true, the text is collected in memory, the matches of
            the module-level ``space`` pattern are removed and the
            result is printed. When false, the text is written to
            ``txtname``.
    """
    rsrcmgr = PDFResourceManager()
    # open() replaces the Python 2-only file() builtin.
    outfp = StringIO() if buf else open(txtname, 'w')
    try:
        laparams = LAParams()
        laparams.detect_vertical = True
        # No codec is passed to the converter here.
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            # The PDF handle is closed even when a page fails.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
        finally:
            device.close()

        if buf:
            text = re.sub(space, "", outfp.getvalue())
            print (text)
    finally:
        # Always release the buffer or output file.
        outfp.close()
def convert_pdf_to_txt(path):
    """Return the text of all pages of a PDF as a single string.

    Args:
        path: Path of the PDF file.

    Returns:
        The text written to the output buffer by ``TextConverter``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # maxpages=0 is passed through unchanged.
            for page in PDFPage.get_pages(fp, maxpages=0):
                interpreter.process_page(page)
        # The buffer is never reset, so it already holds the text of every
        # page. Read it once: appending it after each page would repeat
        # the text of all earlier pages.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
def convert_pdf_to_txt(path):
    """Return the text of all pages of a PDF as a single string.

    Args:
        path: Path of the PDF file.

    Returns:
        The text written to the output buffer by ``TextConverter``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True  # setting this gives cleaner text output
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        # Open the target PDF; the handle is closed even on failure.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            maxpages = 0  # maximum number of pages
            for page in PDFPage.get_pages(fp, maxpages=maxpages):
                # process_page() converts one page into text in retstr.
                interpreter.process_page(page)
        # The buffer is never reset, so it already holds the text of every
        # page. Read it once: appending it after each page would repeat
        # the text of all earlier pages.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
def parse_pdf(path):
    """Return the text collected by ``MyTextConverter`` as lines.

    Args:
        path: Path of the PDF file.

    Returns:
        The joined ``device.text_output``, stripped and split on
        newlines, or an empty list when there is no text.
    """
    retstr = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    rmngr = PDFResourceManager(caching=True)
    device = MyTextConverter(rmngr, retstr, laparams=laparams,
                             imagewriter=None)
    interpreter = PDFPageInterpreter(rmngr, device)

    # The context manager closes the PDF even when a page fails.
    with open(path, 'rb') as fd:
        for page in PDFPage.get_pages(fd, set(), check_extractable=True):
            interpreter.process_page(page)

    fulltext = ''.join(device.text_output).strip()
    if not fulltext:
        return []
    return fulltext.split("\n")
def convert_pdf(target_fn):
    """Convert a PDF file into a string of text.

    Args:
        target_fn: Path of the PDF file.

    Returns:
        The converter output decoded as UTF-8.
    """
    params = LAParams()
    params.all_texts = True
    params.detect_vertical = True

    manager = PDFResourceManager(caching=True)
    sink = StringIO.StringIO()
    converter = TextConverter(manager, sink, codec='utf-8',
                              laparams=params, imagewriter=None)
    page_interpreter = PDFPageInterpreter(manager, converter)

    with open(target_fn, 'rb') as pdf_file:
        for current in PDFPage.get_pages(pdf_file):
            page_interpreter.process_page(current)
    converter.close()

    # getvalue() returns the whole buffer regardless of the position.
    raw = sink.getvalue()
    return raw.decode('utf-8')
def pdf_to_txt(path: str) -> list:
    """Parse a PDF file and return the text of its first page as lines.

    Only the first page is processed; the loop stops after it.

    Args:
        path (str): Path of the PDF file.

    Returns:
        list: The extracted text split on newlines.
    """
    resource_manager = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True  # setting this gives cleaner text output
    device = TextConverter(resource_manager, retstr, codec,
                           laparams=laparams)
    try:
        # The PDF handle is closed even when processing fails.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(resource_manager, device)
            pages = PDFPage.get_pages(fp, set(), maxpages=0, caching=True,
                                      check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
                # Stop after the first page, as the original loop did.
                break
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text.split('\n')
def get_result_from_file(filename):
    """Analyse a PDF and describe every page as a dictionary.

    Args:
        filename: Path of the PDF file.

    Returns:
        ``{"filename": filename, "pages": [...]}`` where each page entry
        holds ``index`` (0-based), ``bounding_box`` (from
        ``get_bounding_box``) and ``labels`` (from ``get_text_labels``).

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}

    # The context manager also closes the file on the "not extractable"
    # raise and on any processing error.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = PDFPage.create_pages(document)
        for page_index, page in enumerate(pages):
            interpreter.process_page(page)
            layout = device.get_result()
            result["pages"].append({
                "index": page_index,
                "bounding_box": get_bounding_box(layout),
                "labels": get_text_labels(layout),
            })
    return result
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Options are parsed with ``getopt``. If no output type is given it
    defaults to text, or is derived from the extension of the output
    file. With ``-f`` and an output type ending in ``ml``, the markup is
    buffered in memory and prettified with BeautifulSoup (when it can be
    imported) before it is written to the real output.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name shown
            in the usage message.

    Returns:
        100 after printing the usage message (bad option, no input file
        or unknown output type), otherwise None.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] [-r] '
               '[-S] [-f] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(
            argv[1:], 'fSrdp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    roundCoords = False
    simplifyOutput = False
    formatOutput = False
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
        elif k == '-r':
            roundCoords = True
        elif k == '-S':
            simplifyOutput = True
        elif k == '-f':
            formatOutput = True

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Buffer in memory only when markup will be prettified. The same flag
    # guards the post-processing below: the original tested formatOutput
    # alone there, so ``-f`` with text or tag output called getvalue() on
    # the real output stream.
    buffered = formatOutput and outtype.endswith('ml')
    if buffered:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        outfp = StringIO()
    else:
        outfp = getRealOutput(outfile)

    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec,
                              laparams=laparams, imagewriter=imagewriter,
                              layoutmode=layoutmode, scale=scale,
                              roundCoords=roundCoords,
                              simplifyOutput=simplifyOutput)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()

    for fname in args:
        # open() replaces the Python 2-only file() builtin; ``with``
        # closes each input even when processing fails.
        with open(fname, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, caching=caching,
                        check_extractable=True)
    device.close()

    if buffered:
        root = outfp.getvalue()
        with getRealOutput(outfile) as realOutput:
            try:
                from bs4 import BeautifulSoup as bs
            except ImportError:
                sys.stderr.write('Could not import BeautifulSoup, skipping output formatting')
                realOutput.write(root)
            else:
                soup = bs(root)
                prettyHTML = soup.prettify()
                realOutput.write(prettyHTML)
    outfp.close()
    return
def main(argv):
    """Convert PDF files, then store three fields from ``cv.txt`` in the DB.

    The first part is a pdf2txt-style converter driven by ``getopt``
    options. The second part reads ``cv.txt``, writes the lines that
    contain none of a fixed list of words to ``cv_new.txt``, takes the
    text after the first ``': '`` of every line that has one, and inserts
    the first three such values into the ``entries`` table through the
    module-level ``mycursor`` / ``mydb``.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name shown
            in the usage message.

    Returns:
        100 after printing the usage message (bad option, no input file
        or unknown output type), otherwise None.

    Raises:
        IndexError: If fewer than three ``': '`` values are found.
    """
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()

        for fname in args:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # Close only a file opened here. Closing sys.stdout made the
        # final print() below fail with "I/O operation on closed file".
        if outfp is not sys.stdout:
            outfp.close()

    # Drop every line of cv.txt that contains one of these words.
    bad_words = [
        'Personal', 'Information', 'Projects', 'Internship', 'Technologies'
    ]
    with open('cv.txt') as oldfile, open('cv_new.txt', 'w') as newfile:
        for line in oldfile:
            if not any(bad_word in line for bad_word in bad_words):
                newfile.write(line)

    with open("cv_new.txt", "r") as cv_file:
        lines = cv_file.read().split('\n')

    # Keep the text after the first ': ' of each line that has one.
    # Empty, single-space and form-feed lines contain no ': ', so they
    # need no separate removal.
    details = []
    for line in lines:
        parts = line.split(': ')
        if len(parts) > 1:
            details.append(parts[1])

    sql = "INSERT INTO entries (name, post, exp) VALUES (%s, %s, %s)"
    val = (details[0], details[1], details[2])
    mycursor.execute(sql, val)
    mydb.commit()
    print(mycursor.rowcount, "record inserted.")
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name shown
            in the usage message.

    Returns:
        100 after printing the usage message (bad option, no input file
        or unknown output type), otherwise None.
    """
    # getopt retrieves the options supplied on the command line; see
    # http://www.16kan.com/post/207647.html for details.
    import getopt

    def usage():
        # Prints a usage example; called when arguments are missing or
        # invalid.
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(argument list, short options[, long options]).
        # A colon after a short option means it requires an argument
        # (p, m, P, o, M, L, W, F, Y, O, t, c, s). An equals sign after a
        # long option means the same. Returns opts and args.
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''        # option -P
    pagenos = set()      # option -p
    maxpages = 0         # option -m
    # output option
    outfile = None       # option -o, output
    outtype = None       # option -t, output type
    outdir = None        # option -O, output directory
    layoutmode = 'normal'  # option -Y
    codec = 'utf-8'      # option -c
    scale = 1            # option -s
    caching = True
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Determine the output format.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python 2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout

    try:
        if outtype == 'text':
            # TextConverter apparently cannot take an outdir argument.
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec,
                                   scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()

        for fname in args:
            # ``with`` closes each input even when processing fails.
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos,
                            maxpages=maxpages, password=password,
                            caching=caching, check_extractable=True)
        device.close()
    finally:
        # Close only a file opened here; sys.stdout is left open so the
        # caller can keep printing.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """Command-line entry point of the extended pdf2txt converter.

    Parses the options, publishes them through the module globals
    ``debuglevel`` and ``options``, builds the converter device selected by
    ``-t`` and runs every listed PDF through it.

    Args:
        argv: Full argument vector (``argv[0]`` is the program name).  When
            it is empty or None, argparse falls back to ``sys.argv``.

    Returns:
        None.
    """
    pagenos = set()
    imagewriter = None
    laparams = LAParams()
    using_optparse = False

    parser = ArgumentParser(prog='pdf2txt.py', description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    if using_optparse:
        # Compatibility shim mapping the argparse method names onto optparse.
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()

    parser.add_argument('-d', dest='debuglevel', action='count', default=0,
                        help='Debug (repeat for more verbose debugging)')
    parser.add_argument('-p', '--pages', dest='pagenos', action='store', type=str, default='',
                        help='Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.')
    parser.add_argument('-c', '--codec', dest='codec', action='store', type=str, default='utf-8',
                        help='Specifies the output codec.')
    parser.add_argument('-t', '--type', dest='outtype', action='store', type=str, default='shape',
                        choices=['text', 'html', 'xml', 'tag', 'shape'],
                        help='Specifies the output format, one of: shape, text, html, xml, tag')
    parser.add_argument('-m', dest='maxpages', action='store', type=int, default=0,
                        help='Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.')
    parser.add_argument('-P', '--password', dest='password', action='store', type=str, default='',
                        help='Provides the user password to access PDF contents.')
    parser.add_argument('-o', '--output', dest='outfile', action='store', type=str, default=None,
                        help='Specifies the output file name. By default, it prints the extracted contents to stdout in text format.')
    parser.add_argument('-C', '--no-caching', dest='caching', action='store_false', default=True,
                        help='Suppress object caching. This will reduce the memory consumption but also slows down the process.')
    parser.add_argument('-n', '--no-layout', dest='layout', action='store_false', default=True,
                        help='Suppress layout analysis.')
    parser.add_argument('--show-pageno', dest='show_pageno', action='store_true', default=False,
                        help='Show page numbers.')
    parser.add_argument('-A', '--analyze-all', dest='all_texts', action='store_true', default=False,
                        help='Forces to perform layout analysis for all the text strings, including text contained in figures.')
    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical', action='store_true', default=False,
                        help='Allows vertical writing detection.')
    parser.add_argument('-M', dest='char_margin', action='store', type=float, default=2.0,
                        help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')
    parser.add_argument('-L', dest='line_margin', action='store', type=float, default=0.5,
                        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')
    parser.add_argument('-W', dest='word_margin', action='store', type=float, default=0.1,
                        help='It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.')
    parser.add_argument('-F', dest='boxes_flow', action='store', type=float, default=0.5,
                        help='Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).')
    parser.add_argument('-Y', '--layout-mode', dest='layoutmode', action='store', type=str, default='normal',
                        choices=['exact', 'normal', 'loose'],
                        help='Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.')
    parser.add_argument('-O', '--image-writer', dest='imagewriter', action='store', type=str, default=None,
                        help='imagewriter')
    parser.add_argument('-R', '--rotation', dest='rotation', action='store', type=int, default=0,
                        help='rotation')
    parser.add_argument('-S', '--strip-control', dest='stripcontrol', action='store_true', default=False,
                        help='stripcontrol')
    parser.add_argument('-s', dest='scale', action='store', type=float, default=1,
                        help='Specifies the output scale. Can be used in HTML format only.')
    parser.add_argument('--draw-lines', dest='draw_lines', action='store_true',
                        help="Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output.")
    parser.add_argument('--draw-boxes', dest='draw_boxes', action='store_true',
                        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output.")
    parser.add_argument('--draw-blocks', dest='draw_blocks', action='store_true',
                        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output.")
    parser.add_argument('--shear-limit', dest='shear_limit', action='store', default=0.1, type=float,
                        help="If the text is sheared above this limit, reject it. Valid only for the `shape' output.")
    parser.add_argument('--rotation-limit', dest='rotation_limit', action='store', default=2, type=float,
                        help="If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output.")
    parser.add_argument('--line-height-diff', dest='line_height_diff', action='store', type=float, default=0.1,
                        help='Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).')
    parser.add_argument('--heading-before', dest='heading_before', action='store', type=str, default='',
                        help='String to put before each heading, e.g. <h1>')
    parser.add_argument('--heading-after', dest='heading_after', action='store', type=str, default='',
                        help='String to put after each heading, e.g. </h1>')
    parser.add_argument('--box-separator', dest='box_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--block-separator', dest='block_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-separator', dest='indent_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-string', dest='indent_string', action='store', type=str, default=r'\t',
                        help=r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-limit', dest='indent_limit', action='store', type=float, default=3,
                        help='If the line is indented more then this (approximately characters), it will separated by --indent-separator from the previous one.')
    parser.add_argument('--page-separator', dest='page_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--norm-whitespace', dest='norm_whitespace', action='store_true', default=False,
                        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    parser.add_argument('--print-stats', dest='print_stats', action='store_true', default=False,
                        help='Instead of the text, output some simple statistics about the file.')
    parser.add_argument('--max-blocks', dest='max_blocks', action='store', default=0, type=int,
                        help='If there is more than this blocks per page, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')
    parser.add_argument('--max-textlines', dest='max_textlines', action='store', default=0, type=int,
                        help='If there is more than this textlines per any block, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')
    parser.add_argument('--line-height-method', dest='line_height_method', action='store', type=str, default='bbox',
                        choices=['bbox', 'mean', 'median'],
                        help='Method to calculate height of line (relevant if there are characters with uneven height). bbox takes the bounding box (rectangle encompassing the line), mean the arithmetic mean of the height of all the characters, median is the median of the height of all the characters. Use mean or median if there are outlier characters, e.g. one big character at the beginning of line.')
    parser.add_argument(dest='pdffile', help='List of PDF files to go through', default=None, nargs='+')

    # Honour the argv handed to main(); previously it was ignored and
    # sys.argv was always parsed.  Passing None keeps that fallback.
    cli_args = argv[1:] if argv else None
    args, rest = parser.parse_known_args(cli_args)

    global debuglevel
    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    DEBUG(3, 'optparse:', using_optparse)

    if args.pagenos:
        # Page numbers are one-based on the command line, zero-based inside.
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno
    if not args.layout:
        laparams = None
    if laparams and args.all_texts:
        laparams.all_texts = True
    if laparams and args.detect_vertical:
        laparams.detect_vertical = True
    if laparams:
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    layoutmode = args.layoutmode
    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)
    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    # Separators arrive as raw text (e.g. the two characters "\" "n") and are
    # converted by the module's unescape_string helper.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)

    # Publish the parsed options for the rest of the module.
    global options
    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        # open() replaces the Python-2-only file() builtin.
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')

    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                        showpageno=showpageno, imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter, stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # Not reachable while -t is restricted by `choices`; the previous
            # fallback called a usage() helper this function never defined.
            parser.error('unknown output type: %r' % (outtype,))
        try:
            for fname in options.pdffile:
                DEBUG(2, 'processing', fname)
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                                  password=password, caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; only files we opened.
        if outfp is not sys.stdout:
            outfp.close()
    DEBUG(2, 'finished.')
    return
def main(argv=None):
    """Convert PDF files (or standard input) to text, HTML, XML or tags.

    Args:
        argv: Argument list without the program name; None lets argparse
            read ``sys.argv``.

    Returns:
        None.
    """
    # PDFs are binary: use the byte stream behind stdin where one exists.
    stdin_binary = getattr(sys.stdin, 'buffer', sys.stdin)

    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # The default must be a list: with the bare stream, the loop below would
    # iterate over stdin's lines instead of treating it as one input file.
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=[stdin_binary], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')

    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')

    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so that an explicit 0.0 is not discarded.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)

    # Without -t, infer the output type from the output file's extension.
    outtype = args.t
    if not outtype:
        if args.o:
            if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'

    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                              imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                               layoutmode=args.Y, laparams=laparams,
                               imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                               imagewriter=args.O)

    try:
        for fp in args.file:
            try:
                # Page numbers are one-based on the command line and passed
                # to process_pdf zero-based.
                process_pdf(rsrcmgr, device, fp, [i - 1 for i in args.p],
                            maxpages=args.m, password=args.P,
                            caching=args.cache, check_extractable=True)
            finally:
                # Leave the process-wide stdin open.
                if fp is not stdin_binary and fp is not sys.stdin:
                    fp.close()
    finally:
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
# Running `pip freeze` in cmd lists the installed libraries.
# pip freeze
# pip install

# How to extract all the text from a PDF:
# https://tech.bita.jp/article/18
for pdf in PDF_LIST:
    print(pdf)

    # --- boilerplate: pdfminer setup ---
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True  # True gives cleaner text extraction.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # --- end of boilerplate ---

    maxpages = 0
    caching = True
    pagenos = set()

    fstr = ''
    with open(pdf, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      caching=caching, check_extractable=True):
            interpreter.process_page(page)

    # retstr accumulates the text of every processed page, so it is read once
    # after the loop; appending it per page would duplicate earlier pages.
    string = retstr.getvalue()
    fstr += string
    device.close()
def _values_seen_more_than(values, threshold):
    """Return the distinct items of *values* occurring more than *threshold* times."""
    from collections import Counter
    return [value for value, count in Counter(values).items() if count > threshold]


def main(files=None):
    """Split the text of each PDF into body, figures, tables, headers and footers.

    Every file is rendered through ``TextCon``; the rows it collects are then
    sorted heuristically (by position and line spacing) into body text,
    figure captions, tables, headers, footers and supporting information,
    and written to a ``.txt`` file named after the PDF.

    NOTE(review): each row appears to be (page, x0, y0, x1, y1, text),
    judging by how the indexes are used below - confirm against TextCon.

    Args:
        files: Iterable of PDF paths; defaults to ``get_datafiles()``.

    Returns:
        None.
    """
    if files is None:
        files = get_datafiles()
    # debug option level
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    # Layout analysis parameters
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    laparams.line_overlap = 0.3  # Line overlap
    laparams.char_margin = 2.0  # Letter spacing
    laparams.line_margin = 0.5  # Line spacing
    laparams.word_margin = 0.1  # Word spacing
    # +-1.0: how much horizontal vs. vertical position matters for line
    # continuation
    laparams.boxes_flow = 0.5

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    for fname in files:
        fname = str(fname)
        imagedir = os.path.abspath(os.path.join(os.path.dirname(fname), 'img'))
        imagewriter = ImageWriter(imagedir)  # output folder for images
        name = os.path.splitext(os.path.basename(fname))[0]
        print(name)
        outfile = fname[:-4] + '.txt'
        device = TextCon(rsrcmgr, laparams=laparams, imagewriter=imagewriter,
                         imagename=name)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            # open() replaces the Python-2-only file() builtin; the context
            # manager closes the handle even when a page fails.
            with open(fname, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        except Exception as err:
            # Best effort: an unreadable PDF must not stop the batch.
            print('Skipping %s: %s' % (fname, err))
            device.close()
            continue

        rows = [list(row) for row in device.rows]
        if not rows:
            # No text at all (e.g. a scanned PDF): nothing to classify.
            device.close()
            continue

        pages = max([row[0] for row in rows])
        max_y = max([row[4] for row in rows])
        min_y = min([row[2] for row in rows])
        # Positions that recur on (nearly) every page, for diagnostics only.
        recurring = _values_seen_more_than([int(row[4]) for row in rows], pages - 1)
        max_y2 = max(recurring) if recurring else max_y
        recurring = _values_seen_more_than([int(row[2]) for row in rows], pages - 1)
        min_y2 = min(recurring) if recurring else min_y
        print('max_ys:', max_y - max_y2)
        print('min_ys:', min_y - min_y2)

        # Get max and min the hard way because of headers: only trust x
        # positions that occur more than 10 times.
        recurring = _values_seen_more_than([int(row[3]) for row in rows], 10)
        if recurring:
            max_x = max(recurring)
        else:
            max_x = max([int(row[3]) for row in rows])
        recurring = _values_seen_more_than([int(row[1]) for row in rows], 10)
        if recurring:
            min_x = min(recurring)
        else:
            # The left edge is row[1]; the fallback used to read row[3].
            min_x = min([int(row[1]) for row in rows])

        # Averaging the row centres errs if there are more pictures on one
        # side than the other, so take the midpoint of the extremes instead.
        # (595/2 would be the centre of A4 at 72px/in; Letter would be 612/2.)
        mid_x = (max_x + min_x) / 2
        l_height = sum([row[4] - row[2] for row in rows]) / len(rows)
        print('l_height:', l_height)

        column2 = []
        lines = []
        pagenumber = 0
        table_caps = ['\n']
        table_data = []
        table = False
        for i, row in enumerate(rows):
            # NOTE: for i == 0, rows[i - 1] wraps around to the last row.
            l_space = rows[i - 1][2] - row[4]
            if row[0] == pagenumber + 1:
                # New page: flush the pending right-hand column.
                lines += column2
                column2 = []
                pagenumber += 1
            if row[0] == pagenumber:
                if (max_y - min_y) * 0.95 > l_space > 0.8 * l_height:
                    # capture table (assuming tables will span all columns)
                    if re.match(r"^table", str(row[5]), re.I):
                        table = True
                        table_caps.append(str(row[5]))
                        table_data.append('\n')
                        table_data.append(str(row[5]))
                        table_data.append('\n')
                        continue
                    else:
                        table = False
                # capture multi-line table captions
                elif (table_caps[-1] == str(rows[i - 1][5]) and
                        -2 * l_height < l_space < 0.5 * l_height):
                    table_caps[-1] += str(row[5])
                    table_data[-2] += str(row[5])
                    continue
                if table:
                    # capture table data; cells on the same baseline are
                    # joined with tabs
                    if int(rows[i - 1][2]) == int(rows[i][2]):
                        table_data[-1] += '\t' + str(row[5])
                        continue
                    else:
                        table_data.append(str(row[5]))
                        continue
                elif int(row[1]) > mid_x and (
                        (int(rows[i - 1][1]) < mid_x and int(rows[i - 1][3]) < mid_x) or
                        (int(rows[i - 1][1]) > mid_x and int(rows[i - 1][3]) > mid_x) or
                        rows[i - 1][3] > max_x * 0.9 or
                        l_space > 2.5 * l_height):
                    # Row belongs to the second (right-hand) column.
                    if len(column2) > 0:
                        if 1 > (row[2] - column2[-1][2]) > -1:
                            # join if on same line
                            if int(row[1]) < int(column2[-1][1]):
                                column2[-1][5] = row[5] + " " + column2[-1][5]
                            else:
                                column2[-1][5] = column2[-1][5] + " " + row[5]
                        else:
                            column2.append(row)
                    else:
                        column2.append(row)
                else:
                    # Row belongs to the first column / full-width text.
                    if len(lines) > 0:
                        if 1 > (row[2] - lines[-1][2]) > -1:
                            # join if on same line
                            if int(row[1]) < int(lines[-1][1]):
                                lines[-1][5] = row[5] + " " + lines[-1][5]
                            else:
                                lines[-1][5] = lines[-1][5] + " " + row[5]
                        else:
                            lines.append(row)
                    else:
                        lines.append(row)
        # add final column
        lines += column2

        fig_caps = ['\n']
        headers = ['\n']
        footers = ['\n']
        supp_info = ['\n']
        new_lines = []
        supp_re = re.compile(
            r"Corresponding author|Electronic mail|email"
            "|E-mail|^doi|doi:|^keywords|^pacs|^apc", re.I)
        for i, line in enumerate(lines):
            l_space = lines[i - 1][2] - lines[i][4]
            l_space_below = 0
            l_space_2below = 0
            if i + 1 < len(lines):
                l_space_below = lines[i][2] - lines[i + 1][4]
            if i + 2 < len(lines):
                l_space_2below = lines[i + 1][2] - lines[i + 2][4]
            print(l_space, l_space_below, l_space_2below, lines[i][2],
                  lines[i][4], str(line[5]))
            # capture multi-line figure captions
            if (fig_caps[-1] == str(lines[i - 1][5]) and
                    -2 * l_height < l_space < 0.5 * l_height):
                fig_caps.append(str(line[5]))
                continue
            # capture headers (up to two lines)
            if (lines[i][2] > max_y * 0.95 and
                    (l_space_below > 0.5 * l_height or
                     l_space_2below > 0.5 * l_height)):
                headers.append('\n')
                headers.append(str(line[5]))
                if supp_re.search(str(line[5])):
                    headers.append('\n')
                    headers.append(str(line[5]))
                else:
                    continue
            # capture supporting info
            if supp_re.search(str(line[5])):
                print(str(line[5]))
                supp_info.append('\n')
                supp_info.append(str(line[5]))
                continue
            if (max_y - min_y) * 0.95 > l_space > 0.5 * l_height:
                # capture figure captions
                if re.match(r"^fig", str(line[5]), re.I):
                    fig_caps.append('\n')
                    fig_caps.append(str(line[5]))
                    continue
                # capture footers
                elif lines[i][2] < min_y + max_y * 0.015:
                    footers.append('\n')
                    footers.append(str(line[5]))
                    continue
                else:
                    # Start a new paragraph unless the previous line was
                    # already filed as a figure caption or header.
                    string = str(lines[i - 1][5])
                    if (any(string in s for s in fig_caps) or
                            any(string in s for s in headers)):
                        pass
                    else:
                        new_lines.append('\n')
            new_lines.append(str(line[5]))

        with open(outfile, 'w') as f:
            f.write(' '.join(new_lines))
            f.write('\n\nFigures')
            f.write(' '.join(fig_caps))
            f.write('\n\nTables')
            f.write('\n'.join(table_data))
            f.write('\n\nHeaders')
            f.write(' '.join(headers))
            f.write('\n\nFooters')
            f.write(' '.join(footers))
            f.write('\n\nSupporting Info')
            f.write(' '.join(supp_info))
        device.close()
    print('Done')
    return
def pdfminerr(argv):
    """Convert the PDF files named in *argv* to text, HTML, XML or tags.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name; the rest
            are getopt-style options followed by one or more PDF paths.

    Returns:
        100 (after printing the usage message) when the arguments are
        invalid or the output type is unknown, otherwise None.
    """
    global pdfminerr, install
    import getopt

    def usage():
        print ("usage: just put the path to the pdf file in pdf.txt, and make sure you create a seprate folder and put nothing there except for this repository.")
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is recorded in a flag and applied after parsing, so that layout
    # options given after -n do not crash on a None laparams.
    layout_enabled = True

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are one-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_enabled:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Default to text, or infer the type from the output file's extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; only files we opened.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(fname, k, v):
    """Convert one PDF, applying a single pdf2txt-style option.

    Args:
        fname: Path of the PDF file to convert.
        k: Option flag such as ``'-o'`` or ``'-t'``; unrecognised flags are
            ignored.
        v: Value for the option (ignored by flags that take none).

    Returns:
        None on success; for an unknown output type, whatever the
        module-level ``usage()`` returns.
    """
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    if k == '-d':
        debug += 1
    elif k == '-p':
        # Page numbers are one-based in the option value.
        pagenos.update(int(x) - 1 for x in v.split(','))
    elif k == '-m':
        maxpages = int(v)
    elif k == '-P':
        password = v
    elif k == '-o':
        outfile = v
    elif k == '-C':
        caching = False
    elif k == '-n':
        laparams = None
    elif k == '-A':
        laparams.all_texts = True
    elif k == '-V':
        laparams.detect_vertical = True
    elif k == '-M':
        laparams.char_margin = float(v)
    elif k == '-L':
        laparams.line_margin = float(v)
    elif k == '-W':
        laparams.word_margin = float(v)
    elif k == '-F':
        laparams.boxes_flow = float(v)
    elif k == '-Y':
        layoutmode = v
    elif k == '-O':
        imagewriter = ImageWriter(v)
    elif k == '-R':
        rotation = int(v)
    elif k == '-S':
        stripcontrol = True
    elif k == '-t':
        outtype = v
    elif k == '-c':
        codec = v
    elif k == '-s':
        scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Default to text, or infer the type from the output file's extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # NOTE(review): usage() is not defined in this function and must
            # exist at module level - confirm.
            return usage()
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; only files we opened.
        if outfp is not sys.stdout:
            outfp.close()
    return
def convert_pdf_To_Txt(path, opts={}):
    """Extract the text of a PDF and return it as a string.

    Follows the usage example from the pdfminer ``pdfinterp`` documentation.

    Args:
        path: Path of the PDF file to read.
        opts: pdf2txt-style options, either a mapping (``{'-p': '1,2'}``) or
            an iterable of ``(flag, value)`` pairs as produced by getopt.
            The default mapping is never mutated.

    Returns:
        The extracted text, as accumulated in an in-memory buffer.

    Raises:
        ValueError: If the options select an output type other than text.
    """
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is recorded in a flag and applied after parsing, so that layout
    # options given after -n do not crash on a None laparams.
    layout_enabled = True

    # Iterating a dict directly yields only its keys, so take items() there.
    pairs = opts.items() if isinstance(opts, dict) else opts
    for (k, v) in pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are one-based in the option value.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            # Accepted for pdf2txt compatibility; unused by text output.
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            # Accepted for pdf2txt compatibility; unused by text output.
            scale = float(v)
    if not layout_enabled:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # -o only takes part in inferring the output type.  The file itself is
    # not opened: the text is returned, and the previous code merely
    # truncated the file and leaked the handle.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype != 'text':
        # Previously this fell through to an unbound `device` variable.
        raise ValueError('only text output is supported, got %r' % (outtype,))

    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        # open() replaces the Python-2-only file() builtin.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        txt2Pdf = retstr.getvalue()
    finally:
        device.close()
    return txt2Pdf
def main(argv):
    """Convert every PDF matched by the given wildcard specs.

    Each matched file is written next to the input, with the input's
    extension replaced by the one for the selected output type
    (``.htm``, ``.tag``, ``.txt`` or ``.xml``).  The default type is
    ``tag``.  ``-o`` is accepted for command-line compatibility, but output
    names are always derived from the input names.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 after printing the usage text (bad options, no file spec, or an
        unknown output type); otherwise None.
    """
    import getopt
    import os

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    if outtype not in extensions:
        # Checked up front: the extension lookup used to raise KeyError
        # before the usage fallback could be reached.
        return usage()
    ext = '.' + extensions[outtype]

    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in matches:
            # splitext copes with extensions that are not three letters long.
            outfile = os.path.splitext(pdf)[0] + ext
            print(outfile)
            with open(outfile, 'wb') as outfp:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                           layoutmode=layoutmode, laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                                      password=password, caching=caching,
                                                      check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    # Close the device while the output file is still open.
                    device.close()
    print('Done')
    return
def main(argv=None):
    """Convert PDF files to text, HTML, XML or tag output (argparse CLI).

    When no input file is named, the PDF is read from standard input and
    buffered in memory before being processed.

    Args:
        argv: Argument list without the program name; ``None`` lets
            argparse read ``sys.argv``.
    """
    import io

    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # No default here: with nargs='*' a bare ``sys.stdin`` default made the
    # processing loop iterate over text lines of stdin instead of files.
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    laparams = LAParams()
    if args.n:
        laparams = None
    else:
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # ``is not None`` so an explicit 0 is honoured rather than dropped.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        # Infer the type from the output name; stdout has no useful suffix.
        outname = str(getattr(args.o, 'name', ''))
        if outname.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'

    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                              imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                               layoutmode=args.Y, laparams=laparams,
                               imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                               imagewriter=args.O)

    inputs = list(args.file)
    if not inputs:
        # Buffer stdin so the parser gets a seekable binary stream.
        raw_stdin = getattr(sys.stdin, 'buffer', sys.stdin)
        inputs = [io.BytesIO(raw_stdin.read())]

    pagenos = [i - 1 for i in args.p]
    try:
        for fp in inputs:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=args.m,
                        password=args.P, caching=args.cache,
                        check_extractable=True)
            fp.close()
    finally:
        # Release everything even when a file fails part-way through.
        for fp in inputs:
            fp.close()
        try:
            device.close()
        finally:
            if args.o is not sys.stdout:
                args.o.close()
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# Script: print the text of every page of 'calender.pdf'.
rsrcmgr = PDFResourceManager()
rettxt = StringIO()
laparams = LAParams()
# Output vertically written text laid out horizontally.
laparams.detect_vertical = True

device = TextConverter(rsrcmgr, rettxt, codec='utf-8', laparams=laparams)
try:
    # Open the PDF to process; the ``with`` block closes it even on error.
    with open('calender.pdf', 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # maxpages: page limit (0 means all pages).
        for page in PDFPage.get_pages(fp, pagenos=None, maxpages=0, password=None,
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        print(rettxt.getvalue())
finally:
    device.close()
    rettxt.close()
def get_text_from_pdf(pdfname, limit=1000):
    """Extract the text of a PDF and regroup it into chunks of joined lines.

    Blank lines, lines that ``is_float`` accepts, and single-word lines that
    do not end with a period are dropped.  The remaining lines are joined:
    with a CRLF at a sentence or paragraph break, with nothing after a
    trailing hyphen (which is removed), and with a single space otherwise.
    A new chunk is started at a break once the current chunk is longer than
    ``limit`` characters.

    Args:
        pdfname: Path of the PDF file.
        limit: Chunk length, in characters, above which the next break
            starts a new chunk.

    Returns:
        The list of text chunks, or the empty string ``''`` when ``pdfname``
        is empty or the file cannot be opened.
    """
    # No file name given: return an empty string and stop.
    if pdfname == '':
        return ''
    # Open the PDF; give up quietly when it cannot be opened.
    try:
        fp = open(pdfname, 'rb')
    except (OSError, TypeError, ValueError):
        return ''

    # Extract the text from the PDF.
    out_fp = StringIO()
    try:
        rsrcmgr = PDFResourceManager()
        la_params = LAParams()
        la_params.detect_vertical = True
        device = TextConverter(rsrcmgr, out_fp, codec='utf-8', laparams=la_params)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos=None, maxpages=0, password=None,
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
            text = out_fp.getvalue()
        finally:
            device.close()
    finally:
        fp.close()
        out_fp.close()

    outputs = []
    output = ""
    is_blank_line = False

    for line in text.splitlines():
        # Strip NUL characters.
        line = line.replace("\x00", "")
        # Collapse runs of spaces, then trim both ends.
        line = re.sub("[ ]+", " ", line)
        line = line.strip()

        # Skip empty lines, remembering that a break was seen.
        if len(line) == 0:
            is_blank_line = True
            continue

        # Skip lines that are only a number.
        if is_float(line):
            continue

        # Skip single-word lines that do not end with a period.  (This used
        # to compare the bound method ``.count`` with 1, which is never
        # true, so the filter never fired.)
        if len(line.split(" ")) == 1 and not line.endswith("."):
            continue

        if is_blank_line or output.endswith("."):
            # Sentence or paragraph break: cut the chunk here once it has
            # grown past the limit, otherwise insert a line break.
            if len(output) > limit:
                outputs.append(output)
                output = ""
            else:
                output += "\r\n"
        elif output.endswith("-"):
            # Word hyphenated across lines: drop the hyphen and join.
            output = output[:-1]
        else:
            # Ordinary continuation: separate the words with one space.
            output += " "

        output += line
        is_blank_line = False

    outputs.append(output)
    return outputs
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 after printing the usage line (bad options, no input file, or
        an unknown output type); otherwise None.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()
        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Also reached on the usage() return above and on errors, so an
        # opened output file is never left dangling.
        if close_outfp:
            outfp.close()
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    Pages can be rotated by a fixed angle with ``-R`` before conversion.
    All inputs are written to one output: the ``-o`` file, or standard
    output when ``-o`` is absent.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 after printing the usage line (bad options, no input file, or
        an unknown output type); otherwise None.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # ``open`` replaces the Python-2-only ``file`` builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                                  password=password, caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Close only a file opened here; sys.stdout is left usable.
        if outfile:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    Supports page rotation (``-R``) and, for XML output, stripping of
    control characters (``-S``).  All inputs are written to one output: the
    ``-o`` file (opened in binary mode), or standard output.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 after printing the usage line (bad options, no input file, or
        an unknown output type); otherwise None.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]"
            " [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]"
            " [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]"
            " [-t text|html|xml|tag] [-c codec] [-s scale]"
            " file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # input options
    password = b""
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == "-d":
            logging.getLogger().setLevel(logging.DEBUG)
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            laparams = None
        elif k == "-A":
            laparams.all_texts = True
        elif k == "-V":
            laparams.detect_vertical = True
        elif k == "-M":
            laparams.char_margin = float(v)
        elif k == "-L":
            laparams.line_margin = float(v)
        elif k == "-W":
            laparams.word_margin = float(v)
        elif k == "-F":
            laparams.boxes_flow = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            imagewriter = ImageWriter(v)
        elif k == "-R":
            rotation = int(v)
        elif k == "-S":
            stripcontrol = True
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith(".htm") or outfile.endswith(".html"):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"

    if outfile:
        outfp = open(outfile, "wb")
    else:
        outfp = sys.stdout
        # A stream that reports an encoding does its own encoding, so the
        # converters must not apply a codec as well.
        if outfp.encoding is not None:
            codec = None
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == "xml":
            device = XMLConverter(
                rsrcmgr, outfp, codec=codec, laparams=laparams,
                imagewriter=imagewriter, stripcontrol=stripcontrol
            )
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode,
                laparams=laparams, imagewriter=imagewriter
            )
        elif outtype == "tag":
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, "rb") as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(
                        fp, pagenos, maxpages=maxpages, password=password,
                        caching=caching, check_extractable=True
                    ):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Close only a file opened here; sys.stdout is left usable.
        if outfile:
            outfp.close()
    return
def __init__(self, filename):
    """Initialise the base class with vertical-text detection switched on.

    Args:
        filename: Passed through unchanged to the base-class initialiser.
    """
    layout_params = LAParams()
    # Turn on detection of vertical text in the layout parameters.
    setattr(layout_params, 'detect_vertical', True)
    super().__init__(filename, layout_params)
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    All inputs are written to one output: the ``-o`` file, or standard
    output when ``-o`` is absent.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 after printing the usage line (bad options, no input file, or
        an unknown output type); otherwise None.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] "
            "[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] "
            "[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ..."
            % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ""
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == "-d":
            debug += 1
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            laparams = None
        elif k == "-A":
            laparams.all_texts = True
        elif k == "-V":
            laparams.detect_vertical = True
        elif k == "-M":
            laparams.char_margin = float(v)
        elif k == "-L":
            laparams.line_margin = float(v)
        elif k == "-W":
            laparams.word_margin = float(v)
        elif k == "-F":
            laparams.boxes_flow = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            outdir = v
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)

    # The debug flags for PDFDocument and PDFParser were already disabled
    # in the original and stay disabled here.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith(".htm") or outfile.endswith(".html"):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"

    # ``open`` replaces the Python-2-only ``file`` builtin.
    outfp = open(outfile, "w") if outfile else sys.stdout
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == "xml":
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode,
                laparams=laparams, outdir=outdir
            )
        elif outtype == "tag":
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, "rb") as fp:
                    process_pdf(
                        rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, caching=caching, check_extractable=True
                    )
        finally:
            device.close()
    finally:
        # Close only a file opened here; sys.stdout is left usable.
        if outfile:
            outfp.close()
    return
def document(self):
    """Return the parsed document, extracting it from the PDF on first use.

    The result is stored in ``self._document``; later calls return the
    stored object without reading the PDF again.

    Returns:
        The object produced by ``Document().initWithDocumentInfo`` from the
        merged text lines of the PDF at ``self._pdfDocument``.

    Raises:
        pdfminer.pdfparser.PDFTextExtractionNotAllowed: If the PDF does not
            permit text extraction.
    """

    def mergeSameParagraphLines(lines):
        """Concatenate consecutive lines until one ends a paragraph."""

        def isEndOfParagraph(line):
            # A line closes a paragraph when it ends in sentence punctuation
            # or is shorter than 60 characters.
            return line[-1:] in ['.', '?', '!'] or len(line) < 60

        result = []
        currentLine = ''
        for line in lines:
            currentLine += line
            if isEndOfParagraph(line):
                result.append(currentLine)
                currentLine = ''
        if currentLine != '':
            result.append(currentLine)
        return result

    if not self._document:
        pdfFile = open(self._pdfDocument, 'rb')
        try:
            pdfParser = PDFParser(pdfFile)
            document = PDFDocument()
            pdfParser.set_document(document)
            document.set_parser(pdfParser)
            document.initialize()
            if not document.is_extractable:
                raise pdfminer.pdfparser.PDFTextExtractionNotAllowed

            resourceManger = PDFResourceManager()
            debug = 1
            PDFDocument.debug = debug
            PDFParser.debug = debug
            PDFResourceManager.debug = debug
            PDFPageInterpreter.debug = debug
            PDFDevice.debug = debug

            pdfContent = StringIO()
            try:
                laparams = LAParams()
                laparams.all_texts = True
                laparams.detect_vertical = True
                device = TextConverter(resourceManger, pdfContent, codec='utf-8',
                                       laparams=laparams)
                try:
                    interpreter = PDFPageInterpreter(resourceManger, device)
                    for page in document.get_pages():
                        interpreter.process_page(page)
                    content = mergeSameParagraphLines(pdfContent.getvalue().split('\n'))
                finally:
                    device.close()

                # NOTE(review): the outline is collected but not passed on to
                # Document below; a missing or broken outline is tolerated.
                toc = []
                try:
                    for (level, title, destination, a, se) in document.get_outlines():
                        toc.append((level, title))
                except Exception:
                    pass
            finally:
                pdfContent.close()
        finally:
            # The handle used to stay open for the life of the process.
            pdfFile.close()
        self._document = Document().initWithDocumentInfo(content, None, None)
    return self._document
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    Pages can be rotated by a fixed angle with ``-R`` before conversion.
    All inputs are written to one output: the ``-o`` file, or standard
    output when ``-o`` is absent.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 after printing the usage line (bad options, no input file, or
        an unknown output type); otherwise None.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    opened_here = bool(outfile)
    # ``open`` replaces the Python-2-only ``file`` builtin.
    outfp = open(outfile, 'w') if opened_here else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                                  password=password, caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # sys.stdout is not ours to close; only close a file opened above.
        if opened_here:
            outfp.close()
    return
def pdf2txt(argv):
    """Convert PDF files to text, XML, HTML or tags from an option list.

    Unlike a command-line ``main``, every element of ``argv`` is parsed as
    an option or file name; there is no leading program name.

    Args:
        argv: pdf2txt-style options followed by the input file names.

    Raises:
        getopt.GetoptError: If ``argv`` contains an unrecognised option.
        ValueError: If the selected output type is not one of ``text``,
            ``xml``, ``html`` or ``tag``.
    """
    import getopt

    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # The original fell through to ``usage()``, which this function
        # never defines; fail with a clear message before opening any file.
        raise ValueError('unknown output type: %r' % (outtype,))

    # ``open`` replaces the Python-2-only ``file`` builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Close only a file opened here; sys.stdout is left usable.
        if outfile:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    All inputs are written to one output: the ``-o`` file (opened as text
    in the selected codec), or standard output when ``-o`` is absent.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 after printing the usage line (bad options, no input file, or
        an unknown output type); otherwise None.
    """
    def usage():
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    close_outfp = bool(outfile)
    if close_outfp:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
    else:
        outfp = sys.stdout
    device = None
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()
        for fname in args:
            with io.open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
    finally:
        # Runs on success, on errors and on the usage() return above.  The
        # device is closed first, while the output stream is still open.
        try:
            if device is not None:
                device.close()
        finally:
            if close_outfp:
                outfp.close()
def readPDF2HTML(pdfFile, opts=None):
    """Convert an open PDF file object to HTML and return the markup.

    Args:
        pdfFile: Object with a ``read()`` method returning the PDF data.
        opts: Optional pdf2txt-style options, given either as an iterable of
            ``(flag, value)`` pairs or as a mapping of flag to value.  Page
            selection (``-p``, ``-m``), password (``-P``), codec (``-c``),
            scale (``-s``), layout mode (``-Y``) and the layout-analysis
            flags are honoured.  ``-d``, ``-o``, ``-O`` and ``-t`` are
            accepted and ignored.

    Returns:
        The contents of the in-memory HTML output buffer.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text
            extraction.
    """
    if opts is None:
        option_pairs = []
    elif hasattr(opts, 'items'):
        # Iterating a mapping directly would yield only its keys.
        option_pairs = list(opts.items())
    else:
        option_pairs = list(opts)

    # Every option needs a default before the loop: the original updated
    # some of these before they existed and overwrote others afterwards.
    password = ''
    pagenos = set()
    maxpages = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    for (k, v) in option_pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
        # '-d', '-o', '-O' and '-t' have no effect on in-memory HTML output.

    # Load the PDF into memory.
    # NOTE(review): this assumes the StringIO in scope accepts whatever
    # pdfFile.read() returns (bytes for a binary file) - confirm.
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    try:
        # Create a parser for the stream and the document it describes.
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        # Check that the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager holding the shared resources.
        rsrcmgr = PDFResourceManager()
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each selected page of the document.
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password):
                interpreter.process_page(page)
            # NOTE(review): as in the original, the content is captured
            # before device.close(); anything the converter writes on close
            # is not included - confirm this is intended.
            content = retstr.getvalue()
        finally:
            device.close()
    finally:
        fp.close()
        retstr.close()
    return content