def parse_data(self):
    """Extract the text of the PDF open on ``self.fp`` and return a ``PDFFile``.

    Every page yielded by ``PDFPage.get_pages`` is rendered through a
    ``TextConverter`` (``boxes_flow`` = 0.5) into an in-memory buffer.

    Side effects: sets ``self.pdf_page_count``, ``self.file_name`` (taken
    from ``self.fp.name``) and ``self.pdf_extracted_text``, and closes
    ``self.fp``.

    Returns:
        ``PDFFile(self.file_name, self.pdf_extracted_text,
        self.pdf_page_count)``.
    """
    rsrcmgr = PDFResourceManager()
    text_stream = StringIO()
    laparams = LAParams()
    laparams.boxes_flow = 0.5
    device = TextConverter(rsrcmgr, text_stream, codec='utf-8',
                           laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        self.pdf_page_count = 0
        for page in PDFPage.get_pages(self.fp, set(), maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True):
            self.pdf_page_count += 1
            interpreter.process_page(page)
        self.file_name = self.fp.name
        self.pdf_extracted_text = text_stream.getvalue()
    finally:
        # Release the input file and the buffer even when a page fails to
        # parse; previously an exception left all of them open.
        self.fp.close()
        device.close()
        text_stream.close()

    return PDFFile(self.file_name, self.pdf_extracted_text,
                   self.pdf_page_count)
def convert_pdf_to_txt(fp):
    """Return the text of the PDF readable from *fp*, then close *fp*.

    Args:
        fp: an open binary file object positioned on a PDF document.  It is
            always closed before this function returns or raises.

    Returns:
        The contents of the in-memory buffer the ``TextConverter``
        (``boxes_flow`` = 0.5) wrote to while processing every page.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    laparams.boxes_flow = 0.5
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Clean up on the error path as well; the original only closed these
        # when every page parsed successfully.
        fp.close()
        device.close()
        retstr.close()
def convert(fname, pages=None, M=1.0, L=0.3, W=0.2, F=0.5):
    """Convert a PDF file into plain text.

    The margin parameters drive pdfminer's layout analysis.  Each value is
    specified not as an actual length but as a proportion of the size of the
    characters in question.

    In a PDF, text is stored as several chunks of various types.  Text
    extraction has to recover those chunks: characters are regarded as
    continuous, and grouped into one block, when the distance between them is
    smaller than the char_margin (M).  Two lines belong to the same text box
    when they are closer than the line_margin (L).  When the distance between
    two words is greater than the word_margin (W), blank characters (spaces)
    are inserted as necessary to keep the format.

    Boxes flow (F) specifies how much the horizontal and vertical position of
    a text matters when determining the text flow order.  The value should be
    within the range from -1.0 (only horizontal position matters) to +1.0
    (only vertical position matters).

    Keyword arguments:
    fname -- PDF file name (string)
    pages -- iterable of page numbers to extract; falsy means no filter
    M -- char_margin (float)
    L -- line_margin (float)
    W -- word_margin (float)
    F -- boxes_flow (float)

    Return:
    The contents of the ``BytesIO`` buffer the converter wrote to
    (``output.getvalue()``), produced with codec "utf-8".
    """
    pagenums = set(pages) if pages else set()

    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = False
    laparams.char_margin = M
    laparams.line_margin = L
    laparams.word_margin = W
    laparams.boxes_flow = F

    manager = PDFResourceManager()
    output = BytesIO()
    try:
        converter = TextConverter(manager, output, codec="utf-8",
                                  laparams=laparams)
        interpreter = PDFPageInterpreter(manager, converter)
        # ``with`` guarantees the input file is closed if a page fails.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        converter.close()
        return output.getvalue()
    finally:
        # The original wrote ``output.close`` without parentheses, which
        # never actually closed the buffer.
        output.close()
def readText(self, path, outtype='text', opts=None):
    """Convert the PDF at *path* to HTML and print it to standard output.

    Args:
        path: path of the PDF file to read.
        outtype: kept for backward compatibility; it has no effect because
            an ``HTMLConverter`` is always used.
        opts: optional iterable of ``(flag, value)`` pairs in ``getopt``
            form.  A dict mapping flag to value is accepted as well.
            Effective flags: ``-p`` page numbers (1-based, comma separated),
            ``-m`` maxpages, ``-P`` password, ``-n`` disable layout
            analysis, ``-A``/``-V``/``-M``/``-L``/``-W``/``-F`` layout
            parameters, ``-c`` codec.  ``-d``, ``-o``, ``-Y``, ``-O``,
            ``-t`` and ``-s`` are accepted and ignored (the values they set
            were never used).

    Returns:
        None.  The laparams object and the generated HTML are printed.
    """
    if isinstance(opts, dict):
        # The old default was ``{}`` but the loop unpacks pairs, so a dict
        # would have been iterated by key and mis-parsed.
        opts = opts.items()

    # ``password`` used to be unbound unless ``-P`` was supplied, which made
    # every default call fail with UnboundLocalError.
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    laparams = LAParams()
    for (k, v) in opts or ():
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-c':
            codec = v
    print(laparams)

    # Debug verbosity comes from the instance, not from ``opts``.
    CMapDB.debug = self.debug
    PDFResourceManager.debug = self.debug
    PDFPageInterpreter.debug = self.debug
    PDFDevice.debug = self.debug

    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    with open(path, 'rb') as fp:
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    password=password, check_extractable=True)
    device.close()
    print(outfp.getvalue())
    outfp.close()
    return
def GetScript(filename):
    """Parse the PDF *filename* and pass its text blocks to ``ParseLine``.

    Resets module state via ``ResetGlobals()``, stores *filename* in the
    global ``scriptName``, then runs layout analysis (``boxes_flow`` = 2) on
    every page except the first.  For each page, every layout element that
    yields non-blank text becomes a ``TextBlock(x0, y1, text)``; the blocks
    are sorted by descending ``y`` and handed to
    ``ParseLine(block.text, block.x)``.

    If the document does not allow text extraction a message is printed and
    the function returns without parsing.

    Returns:
        None.
    """
    global scriptName
    import sys

    ResetGlobals()
    scriptName = filename
    password = ""

    # ``with`` closes the file on every exit path; the original never
    # closed it.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        if not document.is_extractable:
            print("---Not translatable---")
            return

        # The original first interpreted every page with a bare PDFDevice
        # and discarded the result; that redundant pass has been removed.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.boxes_flow = 2
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(document)):
            if pgnum == 0:
                continue  # the first page is deliberately skipped
            interpreter.process_page(page)
            layout = device.get_result()

            text = []
            for element in layout:
                try:
                    content = element.get_text().strip()
                    if content:
                        text.append(TextBlock(element.x0, element.y1, content))
                except Exception:
                    # Best effort: elements that cannot provide text are
                    # skipped.  Narrowed from a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    continue

            # Progress marker (was the Python 2 statement ``print ".",``).
            sys.stdout.write(". ")

            text.sort(key=lambda row: -row.y)
            for line in text:
                ParseLine(line.text, line.x)
def crawl(self):
    """Read per-district values from the PDF report at the hard-coded URL.

    The PDF text is obtained through ``utils.get_pdfminer(url, laparams)``
    with ``boxes_flow`` disabled; lines that are empty or contain only
    spaces are dropped.  Starting at the first line beginning with 'Děčín',
    seven entries are read at a stride of three lines: the entry's first
    line is the district name and the following line is its integer value.

    Returns:
        A list of dicts with keys ``'okres'`` (district name), ``'kraj'``
        (``self.kraj``) and ``'hodnota'`` (the integer value).

    Raises:
        ValueError: if no line starts with 'Děčín', or a value line is not
            an integer.
    """
    url = 'http://www.khsusti.cz/php/kousky/covid19/pocet_testovanych_osob_na_covid19_ustecky_kraj.pdf'
    pocet_okresu = 7  # number of district entries read from the report
    laparams = LAParams()
    laparams.boxes_flow = None

    lines = [line for line in utils.get_pdfminer(url, laparams)
             if line.replace(' ', '')]

    start_index = next(
        (i for i, line in enumerate(lines) if line.startswith('Děčín')),
        None,
    )
    if start_index is None:
        # Previously this fell through to ``range(None, ...)`` and failed
        # with an opaque TypeError.
        raise ValueError("anchor line starting with 'Děčín' not found in " + url)

    results = []
    for i in range(start_index, start_index + pocet_okresu * 3, 3):
        value = int(lines[i + 1].strip())
        name = lines[i].strip()
        results.append({'okres': name, 'kraj': self.kraj, 'hodnota': value})
    return results
def _get_content(fname):
    """Return the text of the PDF at *fname*, decoded as UTF-8.

    The pages are rendered by a ``TextConverter`` into an in-memory bytes
    buffer using ``line_margin`` = 1.0 and ``boxes_flow`` = 1.0.

    ``caching``, ``pagenos``, ``maxpages`` and ``password`` are not defined
    in this function; they are read from an enclosing or module scope.
    """
    resources = PDFResourceManager(caching=caching)

    layout = LAParams()
    layout.line_margin = 1.0
    layout.boxes_flow = 1.0

    with io.BytesIO() as buffer:
        converter = TextConverter(
            resources, buffer, laparams=layout, imagewriter=None
        )
        page_interpreter = PDFPageInterpreter(resources, converter)
        with open(fname, "rb") as pdf:
            page_iter = PDFPage.get_pages(
                pdf,
                pagenos,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
            )
            for pdf_page in page_iter:
                page_interpreter.process_page(pdf_page)
        # Grab the bytes while the buffer is still open.
        raw = buffer.getvalue()

    return raw.decode("utf-8")
def from_pdf(pdfFile):
    """Run pdfminer text extraction over the PDF at path *pdfFile*.

    Pages are rendered by a ``TextConverter`` (``line_margin`` = 20,
    ``boxes_flow`` = -1) into an in-memory buffer.  Any exception raised
    along the way is printed together with its traceback instead of being
    propagated.

    NOTE(review): as visible here, the extracted text is never returned or
    stored, so the function always returns None.  Confirm whether
    ``return strfp.getvalue()`` was intended.
    """
    try:
        strfp = StringIO()
        laparams = LAParams()
        laparams.line_margin = 20
        laparams.boxes_flow = -1
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, strfp, codec='utf-8',
                               laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``open`` replaces the Python 2-only ``file`` builtin, and ``with``
        # closes the handle, which the original never did.
        with open(pdfFile, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                interpreter.process_page(page)
    except Exception as e:
        # Deliberate best effort: report the failure and carry on.
        print(e)
        traceback.print_exc()
def pdfminerr(argv):
    """Command-line style PDF converter (text, XML, HTML or tag output).

    Args:
        argv: argument vector; ``argv[0]`` is skipped and the rest is parsed
            with ``getopt`` using ``'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:'``.
            Remaining positional arguments are the PDF files to convert.

    Returns:
        100 after printing the usage text when the options are invalid, no
        input file is given, or the output type is unknown; otherwise None.

    The output type defaults to ``'text'`` unless ``-t`` is given; without
    ``-t`` it is inferred from the ``-o`` file extension (.htm/.html, .xml,
    .tag).  Output goes to the ``-o`` file or to ``sys.stdout``.
    """
    import getopt

    def usage():
        print ("usage: just put the path to the pdf file in pdf.txt, and make sure you create a seprate folder and put nothing there except for this repository.")
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()

        for fname in args:
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
        device.close()
    finally:
        # Only close what this function opened: the original also closed
        # sys.stdout, breaking any later printing by the caller.
        if close_outfp:
            outfp.close()
    return
def convert_pdf_to_text(self):
    """Convert the PDF at ``self.input_path`` to a text file.

    Two-column page layouts are supported: text boxes entirely left of
    ``self.border`` are collected in ``self.text_l``, boxes entirely right
    of it in ``self.text_r``, and ``self.write2text`` is called to flush
    them to ``self.output_path`` (overwritten if it exists).
    """
    # Layout-analysis parameters.
    layout_params = LAParams()
    # -1.0 (only horizontal position matters) to +1.0 (only vertical
    # position matters); default 0.5.
    layout_params.boxes_flow = None
    layout_params.word_margin = 0.2  # default 0.1
    layout_params.char_margin = 2.0  # default 2.0
    layout_params.line_margin = 0  # default 0.5

    def reading_order(box):
        # Top edge (y1) descending, then left edge (x0) ascending.
        return (-box.y1, box.x0)

    with open(self.output_path, "w", encoding="utf-8") as out:
        self.text_l = ""  # text collected for the left column
        self.text_r = ""  # text collected for the right column

        print("Analyzing from {} page to {} page(0:to last)".format(
            self.start_page, self.last_page))

        # extract_pages cannot be told to begin at a given page, so every
        # page is requested (maxpages=0) and the range is filtered here.
        all_pages = extract_pages(
            self.input_path, maxpages=0, laparams=layout_params)
        for layout in all_pages:
            page_no = layout.pageid
            if page_no < self.start_page:
                continue  # before the requested first page
            if self.last_page and self.last_page < page_no:
                break  # past the requested last page

            # Derive the column boundary from the page width (half of it)
            # unless one has already been set.
            if self.border == 0:
                self.border = int(layout.width / 2)

            if page_no == self.start_page:
                print("Check on page #{}".format(page_no))
                print("Page Info width:{}, heght:{}".format(
                    layout.width, layout.height))
                print("Calc result border: {}, footer: {}".format(
                    self.border, self.footer))

            # Flatten nested layout elements into LTTextBox items and put
            # them into reading order.
            boxes = list(self.flatten_lttext(layout, LTTextBox))
            boxes.sort(key=reading_order)

            for box in boxes:
                # Skip anything located in the footer or header area.
                if box.y1 < self.footer or box.y0 > self.header:
                    continue

                chunk = box.get_text()
                if box.x1 < self.border:
                    # Entirely on the left side.
                    self.text_l += chunk
                elif box.x0 >= self.border:
                    # Entirely on the right side.
                    self.text_r += chunk
                else:
                    # The box straddles the boundary: if right-column text
                    # is pending, write it out first.
                    if self.text_r:
                        self.write2text(out)
                    self.text_l += chunk

            # Write once each page has been processed.
            self.write2text(out)
def main(argv):
    """Command-line PDF converter (text, XML, HTML or tag output).

    Args:
        argv: argument vector; ``argv[0]`` is the program name shown in the
            usage text, the rest is parsed with ``getopt`` using
            ``'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:'``.  Remaining positional
            arguments are the PDF files to convert.

    Returns:
        100 after printing the usage text when the options are invalid, no
        input file is given, or the output type is unknown; otherwise None.

    The output type defaults to ``'text'`` unless ``-t`` is given; without
    ``-t`` it is inferred from the ``-o`` file extension.  Output goes to
    the ``-o`` file or to ``sys.stdout``.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        # win32 keeps its binary-mode handle.  Every other platform gets a
        # text handle; previously only 'linux' did, so e.g. macOS left
        # ``outfp`` unbound and crashed with UnboundLocalError.
        if sys.platform == 'win32':
            outfp = open(outfile, 'wb')
        else:
            outfp = open(outfile, 'w', encoding=encoding)
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()

        for fname in args:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # Never close sys.stdout on the caller's behalf.
        if close_outfp:
            outfp.close()
    return
def convert_pdf_To_Txt(path, opts=None):
    """Return the text extracted from the PDF at *path*.

    Adapted from the pdfminer ``pdf2txt`` command-line tool.

    Args:
        path: path of the PDF file to read.
        opts: optional iterable of ``(flag, value)`` pairs in ``getopt``
            form (``-d -p -m -P -o -C -n -A -V -M -L -W -F -Y -O -t -c -s``).
            A dict mapping flag to value is accepted as well.  ``-Y`` and
            ``-s`` are accepted and ignored because only text output is
            produced.

    Returns:
        The contents of the in-memory buffer written by ``TextConverter``.

    Raises:
        ValueError: if the resolved output type is not ``'text'``.  The
            original failed with a NameError in that case because no device
            was ever created.
    """
    if isinstance(opts, dict):
        # The old default was ``{}`` but the loop unpacks pairs, so a dict
        # would have been iterated by key and mis-parsed.
        opts = opts.items()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    for (k, v) in opts or ():
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype != 'text':
        raise ValueError('unsupported output type: %r' % (outtype,))

    if outfile:
        # NOTE(review): the original opened (and thereby truncated) the -o
        # file but never wrote to it or closed it.  The truncation is kept
        # for compatibility; the handle is now closed.  Confirm whether the
        # text was meant to be written here.
        open(outfile, 'w').close()

    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # ``with`` closes the input file, which the original left open.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
    txt2Pdf = retstr.getvalue()
    return txt2Pdf
def _frequent_values(values, min_count):
    """Return the distinct items of *values* occurring more than *min_count* times."""
    from collections import Counter

    return [value for value, count in Counter(values).items()
            if count > min_count]


def _append_or_join(bucket, row):
    """Append *row* to *bucket*, or merge its text into the last entry when
    both start within one unit of the same ``row[2]`` coordinate."""
    if bucket and 1 > (row[2] - bucket[-1][2]) > -1:
        # Same line: keep the fragment with the smaller row[1] first.
        if int(row[1]) < int(bucket[-1][1]):
            bucket[-1][5] = row[5] + " " + bucket[-1][5]
        else:
            bucket[-1][5] = bucket[-1][5] + " " + row[5]
    else:
        bucket.append(row)


def main(files=None):
    """Convert PDF files to structured ``.txt`` files next to the originals.

    Args:
        files: iterable of PDF paths; when None, ``get_datafiles()`` is used.

    For each file the pages are rendered through ``TextCon`` (images go to
    an ``img`` directory beside the PDF) and ``device.rows`` is analysed.
    Rows are presumably ``(page, x0, y0, x1, y1, text)``; that is inferred
    from how the indices are used below, so confirm against ``TextCon``.
    The rows are split into body text, figure captions, table data,
    headers, footers and supporting info, which are written to
    ``fname[:-4] + '.txt'``.

    A file whose pages fail to parse, or that yields no rows, is skipped.
    A ValueError is raised if no ``row[4]`` / ``row[2]`` value recurs on
    every page, because ``max()`` / ``min()`` then receive an empty list.

    Returns:
        None.
    """
    if files is None:
        files = get_datafiles()
    # debug option level
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    # Layout-analysis parameters
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    laparams.line_overlap = 0.3  # Line overlap
    laparams.char_margin = 2.0  # Letter spacing
    laparams.line_margin = 0.5  # Line spacing
    laparams.word_margin = 0.1  # Word spacing
    laparams.boxes_flow = 0.5  # +-1.0: how much horizontal vs. vertical
    #                            position matters for line continuation

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    supp_re = re.compile(
        r"Corresponding author|Electronic mail|email"
        "|E-mail|^doi|doi:|^keywords|^pacs|^apc", re.I)

    for fname in files:
        fname = str(fname)
        imagedir = os.path.abspath(os.path.join(os.path.dirname(fname), 'img'))
        imagewriter = ImageWriter(imagedir)  # output folder for images
        name = os.path.splitext(os.path.basename(fname))[0]
        print(name)
        outfile = fname[:-4] + '.txt'
        device = TextCon(rsrcmgr, laparams=laparams, imagewriter=imagewriter,
                         imagename=name)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # ``open`` replaces the Python 2-only ``file`` builtin and the
        # handle is now closed (it used to leak, once per input file).
        with open(fname, 'rb') as fp:
            try:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
            except Exception:
                # Best-effort batch run: skip PDFs that cannot be parsed.
                # Narrowed from a bare ``except`` so Ctrl-C still works.
                continue

        rows = [list(row) for row in device.rows]
        if not rows:
            # Nothing extracted; the statistics below would fail on empty
            # sequences.
            device.close()
            continue

        pages = max([row[0] for row in rows])
        max_y = max([row[4] for row in rows])
        min_y = min([row[2] for row in rows])

        # Coordinates that recur on every page.
        max_y2 = max(_frequent_values([int(row[4]) for row in rows], pages - 1))
        min_y2 = min(_frequent_values([int(row[2]) for row in rows], pages - 1))
        print('max_ys:', max_y - max_y2)
        print('min_ys:', min_y - min_y2)

        # Get max and min the hard way because of headers: prefer
        # coordinates seen more than 10 times, fall back to the extreme.
        frequent_right = _frequent_values([int(row[3]) for row in rows], 10)
        if frequent_right:
            max_x = max(frequent_right)
        else:
            max_x = max([int(row[3]) for row in device.rows])
        frequent_left = _frequent_values([int(row[1]) for row in rows], 10)
        if frequent_left:
            min_x = min(frequent_left)
        else:
            # Fixed: the fallback used row[3], the column used for max_x,
            # instead of row[1], which the frequent values are drawn from.
            min_x = min([int(row[1]) for row in device.rows])

        # Midpoint between the columns (errors if there are more pictures
        # on one side than the other).
        mid_x = (max_x + min_x) / 2
        l_height = sum([row[4] - row[2] for row in rows]) / len(rows)
        print('l_height:', l_height)

        column2 = []
        lines = []
        pagenumber = 0
        table_caps = ['\n']
        table_data = []
        table = False
        for i, row in enumerate(rows):
            # For i == 0 this compares against the last row (index -1).
            l_space = rows[i - 1][2] - row[4]
            if row[0] == pagenumber + 1:
                # New page: flush the second column collected so far.
                lines += column2
                column2 = []
                pagenumber += 1
            if row[0] == pagenumber:
                if (max_y - min_y) * 0.95 > l_space > 0.8 * l_height:
                    # capture Table (assuming tables will span all columns)
                    if re.match(r"^table", str(row[5]), re.I):
                        table = True
                        table_caps.append(str(row[5]))
                        table_data.append('\n')
                        table_data.append(str(row[5]))
                        table_data.append('\n')
                        continue
                    else:
                        table = False
                # capture table captions spanning multiple lines
                elif (table_caps[-1] == str(rows[i - 1][5]) and
                      -2 * l_height < l_space < 0.5 * l_height):
                    table_caps[-1] += str(row[5])
                    table_data[-2] += str(row[5])
                    continue
                if table:
                    # capture table data; cells on one line are joined by tabs
                    if int(rows[i - 1][2]) == int(rows[i][2]):
                        table_data[-1] += '\t' + str(row[5])
                    else:
                        table_data.append(str(row[5]))
                    continue
                elif int(row[1]) > mid_x and (
                        (int(rows[i - 1][1]) < mid_x and
                         int(rows[i - 1][3]) < mid_x) or
                        (int(rows[i - 1][1]) > mid_x and
                         int(rows[i - 1][3]) > mid_x) or
                        rows[i - 1][3] > max_x * 0.9 or
                        l_space > 2.5 * l_height):
                    _append_or_join(column2, row)
                else:
                    _append_or_join(lines, row)
        # add final column
        lines += column2

        fig_caps = ['\n']
        headers = ['\n']
        footers = ['\n']
        supp_info = ['\n']
        new_lines = []
        for i, line in enumerate(lines):
            l_space = lines[i - 1][2] - lines[i][4]
            l_space_below = 0
            l_space_2below = 0
            if i + 1 < len(lines):
                l_space_below = lines[i][2] - lines[i + 1][4]
            if i + 2 < len(lines):
                l_space_2below = lines[i + 1][2] - lines[i + 2][4]
            print(l_space, l_space_below, l_space_2below, lines[i][2],
                  lines[i][4], str(line[5]))
            # capture figure captions spanning multiple lines
            if (fig_caps[-1] == str(lines[i - 1][5]) and
                    -2 * l_height < l_space < 0.5 * l_height):
                fig_caps.append(str(line[5]))
                continue
            # capture headers (up to two lines)
            if (lines[i][2] > max_y * 0.95 and
                    (l_space_below > 0.5 * l_height or
                     l_space_2below > 0.5 * l_height)):
                headers.append('\n')
                headers.append(str(line[5]))
                # NOTE(review): a header that also matches supp_re is added
                # to ``headers`` twice and then falls through to the
                # supporting-info capture below.  Kept as found; confirm.
                if supp_re.search(str(line[5])):
                    headers.append('\n')
                    headers.append(str(line[5]))
                else:
                    continue
            # capture supporting info
            if supp_re.search(str(line[5])):
                print(str(line[5]))
                supp_info.append('\n')
                supp_info.append(str(line[5]))
                continue
            if (max_y - min_y) * 0.95 > l_space > 0.5 * l_height:
                # capture figure captions
                if re.match(r"^fig", str(line[5]), re.I):
                    fig_caps.append('\n')
                    fig_caps.append(str(line[5]))
                    continue
                # capture footers
                elif lines[i][2] < min_y + max_y * 0.015:
                    footers.append('\n')
                    footers.append(str(line[5]))
                    continue
                else:
                    # Paragraph break, unless the previous line was captured
                    # as a figure caption or header.
                    string = str(lines[i - 1][5])
                    if not (any(string in s for s in fig_caps) or
                            any(string in s for s in headers)):
                        new_lines.append('\n')
            new_lines.append(str(line[5]))

        with open(outfile, 'w') as f:
            f.write(' '.join(new_lines))
            f.write('\n\nFigures')
            f.write(' '.join(fig_caps))
            f.write('\n\nTables')
            f.write('\n'.join(table_data))
            f.write('\n\nHeaders')
            f.write(' '.join(headers))
            f.write('\n\nFooters')
            f.write(' '.join(footers))
            f.write('\n\nSupporting Info')
            f.write(' '.join(supp_info))
        device.close()
    print('Done')
    return
def ConvertPdf(pdfpath, outfp, opts=None):
    """Convert the PDF at *pdfpath* and write the result to *outfp*.

    Args:
        pdfpath: path of the PDF file to read.
        outfp: writable file-like object handed to the pdfminer converter;
            it is not closed here.
        opts: optional iterable of ``(flag, value)`` pairs in ``getopt``
            form (``-d -p -m -P -o -C -n -A -V -M -L -W -F -Y -O -R -t -c
            -s``).  A dict mapping flag to value is accepted as well.
            ``-o`` is accepted and ignored because the destination is
            *outfp*.  ``-t`` selects ``'txt'`` (default), ``'xml'``,
            ``'html'`` or ``'tag'``.

    Returns:
        True once every page has been processed.

    Raises:
        ValueError: if ``-t`` names an unknown output type.  The original
            failed later with an unbound ``device`` variable.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    if isinstance(opts, dict):
        # The old default was ``{}`` but the loop unpacks pairs, so a dict
        # would have been iterated by key and mis-parsed.
        opts = opts.items()

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts or ():
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
    if outtype == 'txt':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        raise ValueError('unsupported output type: %r' % (outtype,))

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # ``open`` replaces the Python 2-only ``file`` builtin; ``with`` closes
    # the input even when a page fails to parse.
    with open(pdfpath, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()
    return True
def main(argv):
    """Command-line PDF converter (text, XML, HTML or tag output).

    Args:
        argv: argument vector; ``argv[0]`` is the program name shown in the
            usage text, the rest is parsed with ``getopt`` using
            ``"dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:"``.  Remaining positional
            arguments are the PDF files to convert.

    Returns:
        100 after printing the usage text when the options are invalid, no
        input file is given, or the output type is unknown; otherwise None.

    The output type defaults to ``"text"`` unless ``-t`` is given; without
    ``-t`` it is inferred from the ``-o`` file extension.  Output goes to
    the ``-o`` file or to ``sys.stdout``.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] "
            "[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] "
            "[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ..."
            % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == "-d":
            debug += 1
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            laparams = None
        elif k == "-A":
            laparams.all_texts = True
        elif k == "-V":
            laparams.detect_vertical = True
        elif k == "-M":
            laparams.char_margin = float(v)
        elif k == "-L":
            laparams.line_margin = float(v)
        elif k == "-W":
            laparams.word_margin = float(v)
        elif k == "-F":
            laparams.boxes_flow = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            outdir = v
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith(".htm") or outfile.endswith(".html"):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"

    if outfile:
        # ``open`` replaces the Python 2-only ``file`` builtin.
        outfp = open(outfile, "w")
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == "xml":
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode,
                laparams=laparams, outdir=outdir
            )
        elif outtype == "tag":
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()

        for fname in args:
            with open(fname, "rb") as fp:
                process_pdf(
                    rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    password=password, caching=caching, check_extractable=True
                )
        device.close()
    finally:
        # Only close what this function opened: the original also closed
        # sys.stdout, breaking any later printing by the caller.
        if close_outfp:
            outfp.close()
    return
def main(argv):
    """Convert PDFs like ``pdf2txt``, then load CV details into the database.

    Args:
        argv: argument vector; ``argv[0]`` is the program name shown in the
            usage text, the rest is parsed with ``getopt`` using
            ``'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:'``.  Remaining positional
            arguments are the PDF files to convert.

    Returns:
        100 after printing the usage text when the options are invalid, no
        input file is given, or the output type is unknown; otherwise None.

    After the conversion the function reads ``cv.txt`` from the current
    directory, copies every line that contains none of the section words
    ('Personal', 'Information', 'Projects', 'Internship', 'Technologies') to
    ``cv_new.txt``, takes the part after the first ``': '`` of each
    remaining line, and inserts the first three such values into the
    ``entries`` table through ``mycursor`` / ``mydb``.  Those two names are
    not defined here and are read from an enclosing or module scope.

    NOTE(review): ``cv.txt`` is hard-coded and independent of ``-o``;
    presumably callers always pass ``-o cv.txt``.  Confirm.

    Raises:
        IndexError: if fewer than three ``key: value`` lines are found.
    """
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()

        for fname in args:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # The original closed sys.stdout here when no -o was given, which
        # made the final print() below fail on a closed file.
        if close_outfp:
            outfp.close()

    bad_words = [
        'Personal', 'Information', 'Projects', 'Internship', 'Technologies'
    ]
    with open('cv.txt') as oldfile, open('cv_new.txt', 'w') as newfile:
        for line in oldfile:
            if not any(bad_word in line for bad_word in bad_words):
                newfile.write(line)

    # ``with`` closes the handle (it used to leak and shadowed ``file``).
    with open("cv_new.txt", "r") as cleaned:
        text_lines = cleaned.read().split('\n')

    # Keep the value part of every "key: value" line.  Blank, single-space
    # and form-feed lines contain no ': ' and are skipped by the same test,
    # so the former quadratic ``remove`` loops are unnecessary.
    details = []
    for text_line in text_lines:
        parts = text_line.split(': ')
        if len(parts) > 1:
            details.append(parts[1])

    # Parameterized query: values are passed separately from the SQL text.
    sql = "INSERT INTO entries (name, post, exp) VALUES (%s, %s, %s)"
    val = (details[0], details[1], details[2])
    mycursor.execute(sql, val)
    mydb.commit()
    print(mycursor.rowcount, "record inserted.")
    return
def readPDF2HTML(pdfFile, opts=None):
    """Convert a PDF read from ``pdfFile`` to HTML and return the markup.

    Args:
        pdfFile: object with a ``read()`` method returning the PDF content.
        opts: optional getopt-style options, either an iterable of
            ``(flag, value)`` pairs or a mapping of flag to value.
            Honoured flags: ``-p`` (comma-separated 1-based page numbers),
            ``-m`` (max pages), ``-P`` (password), ``-c`` (codec),
            ``-s`` (scale), ``-Y`` (layout mode), ``-n`` (no layout
            analysis) and the layout flags ``-A -V -M -L -W -F``.
            ``-d``, ``-o``, ``-O`` and ``-t`` are accepted and ignored.

    Returns:
        The content of the converter's output buffer, captured before the
        converter is closed.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    if opts is None:
        pairs = ()
    elif hasattr(opts, 'items'):
        pairs = opts.items()
    else:
        pairs = opts

    # Defaults are set before the option loop so no flag can hit an unbound
    # name or be overwritten afterwards.
    laparams = LAParams()
    pagenos = set()
    codec = 'utf-8'
    # Keyword arguments are only forwarded when the caller supplied the flag,
    # so a call without options behaves exactly as before.
    document_kwargs = {}
    converter_kwargs = {}
    page_kwargs = {}

    for (k, v) in pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            page_kwargs['maxpages'] = int(v)
        elif k == '-P':
            document_kwargs['password'] = v
            page_kwargs['password'] = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            converter_kwargs['scale'] = float(v)
        elif k == '-Y':
            converter_kwargs['layoutmode'] = v
        elif k == '-n':
            laparams = None
        elif k in ('-A', '-V', '-M', '-L', '-W', '-F'):
            if laparams is None:
                # Layout analysis was disabled by an earlier -n.
                continue
            if k == '-A':
                laparams.all_texts = True
            elif k == '-V':
                laparams.detect_vertical = True
            elif k == '-M':
                laparams.char_margin = float(v)
            elif k == '-L':
                laparams.line_margin = float(v)
            elif k == '-W':
                laparams.word_margin = float(v)
            else:
                laparams.boxes_flow = float(v)
        # -d, -o, -O and -t have no meaning for an in-memory HTML conversion.

    # NOTE(review): the PDF content is wrapped in StringIO as in the original;
    # on Python 3 a binary stream (BytesIO) is presumably required -- confirm
    # against the StringIO import used by this file.
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    device = None
    try:
        # Parse the document once up front to check extraction permission.
        parser = PDFParser(fp)
        document = PDFDocument(parser, **document_kwargs)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # The resource manager stores resources shared between pages.
        rsrcmgr = PDFResourceManager()
        device = HTMLConverter(rsrcmgr, retstr, codec=codec,
                               laparams=laparams, **converter_kwargs)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, **page_kwargs):
            interpreter.process_page(page)

        # NOTE(review): as in the original, the buffer is read before
        # device.close(); anything the converter writes on close is not part
        # of the returned value -- confirm that this is intended.
        content = retstr.getvalue()
    finally:
        fp.close()
        if device is not None:
            device.close()
        retstr.close()
    return content
def main(argv):
    """Convert the PDF files named in ``argv`` to text, HTML, XML or tags.

    Options follow the usage string printed by ``usage()``.  When ``-t`` is
    not given, the output type defaults to ``text`` unless the ``-o`` file
    name ends in ``.htm``/``.html``, ``.xml`` or ``.tag``.

    Args:
        argv: full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 when the command line is invalid, otherwise None.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k in ('-A', '-V', '-M', '-L', '-W', '-F'):
            if laparams is None:
                # Layout analysis was disabled by an earlier -n; there is no
                # LAParams object left to configure.
                continue
            if k == '-A':
                laparams.all_texts = True
            elif k == '-V':
                laparams.detect_vertical = True
            elif k == '-M':
                laparams.char_margin = float(v)
            elif k == '-L':
                laparams.line_margin = float(v)
            elif k == '-W':
                laparams.word_margin = float(v)
            else:
                laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject an unknown type before the output file is opened (and truncated).
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                # The context manager closes the input even if parsing fails.
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
def _build_arg_parser():
    """Return the ArgumentParser describing every pdf2txt command-line option."""
    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-d', dest='debuglevel', action='count', default=0,
                        help='Debug (repeat for more verbose debugging)')
    parser.add_argument('-p', '--pages', dest='pagenos', action='store', type=str, default='',
                        help='Specifies the comma-separated list of the page numbers to be extracted. '
                             'Page numbers start at one. By default, it extracts text from all the pages.')
    parser.add_argument('-c', '--codec', dest='codec', action='store', type=str, default='utf-8',
                        help='Specifies the output codec.')
    parser.add_argument('-t', '--type', dest='outtype', action='store', type=str, default='shape',
                        choices=['text', 'html', 'xml', 'tag', 'shape'],
                        help='Specifies the output format, one of: shape, text, html, xml, tag')
    parser.add_argument('-m', dest='maxpages', action='store', type=int, default=0,
                        help='Specifies the maximum number of pages to extract. '
                             'By default (0), it extracts all the pages in a document.')
    parser.add_argument('-P', '--password', dest='password', action='store', type=str, default='',
                        help='Provides the user password to access PDF contents.')
    parser.add_argument('-o', '--output', dest='outfile', action='store', type=str, default=None,
                        help='Specifies the output file name. '
                             'By default, it prints the extracted contents to stdout in text format.')
    parser.add_argument('-C', '--no-caching', dest='caching', action='store_false', default=True,
                        help='Suppress object caching. '
                             'This will reduce the memory consumption but also slows down the process.')
    parser.add_argument('-n', '--no-layout', dest='layout', action='store_false', default=True,
                        help='Suppress layout analysis.')
    parser.add_argument('--show-pageno', dest='show_pageno', action='store_true', default=False,
                        help='Show page numbers.')
    parser.add_argument('-A', '--analyze-all', dest='all_texts', action='store_true', default=False,
                        help='Forces to perform layout analysis for all the text strings, '
                             'including text contained in figures.')
    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical', action='store_true', default=False,
                        help='Allows vertical writing detection.')
    parser.add_argument('-M', dest='char_margin', action='store', type=float, default=2.0,
                        help='Two text chunks whose distance is closer than the char_margin (shown as M) '
                             'is considered continuous and get grouped into one.')
    parser.add_argument('-L', dest='line_margin', action='store', type=float, default=0.5,
                        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, '
                             'which is a rectangular area that contains a "cluster" of text portions.')
    parser.add_argument('-W', dest='word_margin', action='store', type=float, default=0.1,
                        help='It may be required to insert blank characters (spaces) as necessary '
                             'if the distance between two words is greater than the word_margin (W), '
                             'as a blank between words might not be represented as a space, '
                             'but indicated by the positioning of each word.')
    parser.add_argument('-F', dest='boxes_flow', action='store', type=float, default=0.5,
                        help='Specifies how much a horizontal and vertical position of a text matters '
                             'when determining a text order. '
                             'The value should be within the range of -1.0 (only horizontal position matters) '
                             'to +1.0 (only vertical position matters).')
    parser.add_argument('-Y', '--layout-mode', dest='layoutmode', action='store', type=str, default='normal',
                        choices=['exact', 'normal', 'loose'],
                        help='Specifies how the page layout should be preserved. '
                             '(Currently only applies to HTML format.) One of: exact, normal, loose.')
    parser.add_argument('-O', '--image-writer', dest='imagewriter', action='store', type=str, default=None,
                        help='imagewriter')
    parser.add_argument('-R', '--rotation', dest='rotation', action='store', type=int, default=0,
                        help='rotation')
    parser.add_argument('-S', '--strip-control', dest='stripcontrol', action='store_true', default=False,
                        help='stripcontrol')
    parser.add_argument('-s', dest='scale', action='store', type=float, default=1,
                        help='Specifies the output scale. Can be used in HTML format only.')
    parser.add_argument('--draw-lines', dest='draw_lines', action='store_true',
                        help="Draw crude page representation, coloured TextLines (= short pieces of text). "
                             "Valid only for the `shape' output.")
    parser.add_argument('--draw-boxes', dest='draw_boxes', action='store_true',
                        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). "
                             "Valid only for the `shape' output.")
    parser.add_argument('--draw-blocks', dest='draw_blocks', action='store_true',
                        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). "
                             "Valid only for the `shape' output.")
    parser.add_argument('--shear-limit', dest='shear_limit', action='store', default=0.1, type=float,
                        help="If the text is sheared above this limit, reject it. "
                             "Valid only for the `shape' output.")
    parser.add_argument('--rotation-limit', dest='rotation_limit', action='store', default=2, type=float,
                        help="If the text is rotated above this angle (in degrees), reject it. "
                             "Valid only for the `shape' output.")
    parser.add_argument('--line-height-diff', dest='line_height_diff', action='store', type=float, default=0.1,
                        help='Two lines whose vertical sizes differ more than this ratio are not to be considered '
                             'of the same paragraph (but e.g. one of them is a heading).')
    parser.add_argument('--heading-before', dest='heading_before', action='store', type=str, default='',
                        help='String to put before each heading, e.g. <h1>')
    parser.add_argument('--heading-after', dest='heading_after', action='store', type=str, default='',
                        help='String to put after each heading, e.g. </h1>')
    parser.add_argument('--box-separator', dest='box_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate boxes with this string. '
                             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--block-separator', dest='block_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate blocks with this string. '
                             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-separator', dest='indent_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate indented lines with this string. '
                             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-string', dest='indent_string', action='store', type=str, default=r'\t',
                        help=r'Put this string in front of indented lines. '
                             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-limit', dest='indent_limit', action='store', type=float, default=3,
                        help='If the line is indented more then this (approximately characters), '
                             'it will separated by --indent-separator from the previous one.')
    parser.add_argument('--page-separator', dest='page_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate pages with this string. '
                             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--norm-whitespace', dest='norm_whitespace', action='store_true', default=False,
                        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    parser.add_argument('--print-stats', dest='print_stats', action='store_true', default=False,
                        help='Instead of the text, output some simple statistics about the file.')
    parser.add_argument('--max-blocks', dest='max_blocks', action='store', default=0, type=int,
                        help='If there is more than this blocks per page, do not return any text. '
                             'Use to discriminate abnormal files (run --print-stats first to find out '
                             'the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')
    parser.add_argument('--max-textlines', dest='max_textlines', action='store', default=0, type=int,
                        help='If there is more than this textlines per any block, do not return any text. '
                             'Use to discriminate abnormal files (run --print-stats first to find out '
                             'the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')
    parser.add_argument('--line-height-method', dest='line_height_method', action='store', type=str,
                        default='bbox', choices=['bbox', 'mean', 'median'],
                        help='Method to calculate height of line (relevant if there are characters with uneven height). '
                             'bbox takes the bounding box (rectangle encompassing the line), '
                             'mean the arithmetic mean of the height of all the characters, '
                             'median is the median of the height of all the characters. '
                             'Use mean or median if there are outlier characters, '
                             'e.g. one big character at the beginning of line.')
    parser.add_argument(dest='pdffile', help='List of PDF files to go through', default=None, nargs='+')
    return parser


def main(argv):
    """Convert the PDF files named in ``argv`` using the selected converter.

    The parsed options are published in the module globals ``options`` and
    ``debuglevel``; the separator options are passed through
    ``unescape_string`` first.

    Args:
        argv: full argument vector; ``argv[0]`` is the program name and is
            skipped.  ``None`` falls back to ``sys.argv``.

    Returns:
        100 if the output type is not supported, otherwise None.  Invalid
        command lines make argparse exit the process.
    """
    global debuglevel, options

    parser = _build_arg_parser()
    # Parse the vector that was passed in; the previous code ignored ``argv``
    # and always read sys.argv.
    args, rest = parser.parse_known_args(argv[1:] if argv is not None else None)

    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    # The optparse fallback was never enabled; the message is kept so the
    # debug log stays the same.
    DEBUG(3, 'optparse:', False)

    # input options
    pagenos = set()
    if args.pagenos:
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    password = args.password
    caching = args.caching

    # layout analysis options
    laparams = LAParams()
    if not args.layout:
        laparams = None
    if laparams:
        if args.all_texts:
            laparams.all_texts = True
        if args.detect_vertical:
            laparams.detect_vertical = True
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow

    # output options
    outfile = args.outfile
    showpageno = args.show_pageno
    layoutmode = args.layoutmode
    imagewriter = None
    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)
    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    for name in ('box_separator', 'block_separator', 'indent_separator',
                 'indent_string', 'page_separator'):
        setattr(args, name, unescape_string(getattr(args, name)))
    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # argparse ``choices`` normally makes this unreachable; it replaces a call
    # to a ``usage()`` function that this main never defined.
    if outtype not in ('shape', 'text', 'xml', 'html', 'tag'):
        parser.print_usage()
        return 100

    if outfile:
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')

    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr, outfp, codec=codec,
                                        laparams=laparams,
                                        showpageno=showpageno,
                                        imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args.pdffile:
                DEBUG(2, 'processing', fname)
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process; only flush it.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    DEBUG(2, 'finished.')
    return
def main(argv):
    """Parse ``argv`` with argparse and convert the listed PDF files.

    The namespace returned by argparse is stored in the module global
    ``options`` (after ``unescape_string`` has been applied to the separator
    options) and the ``-d`` count in the module global ``debuglevel``.

    Args:
        argv: full argument vector; ``argv[0]`` is the program name and is
            skipped.  ``None`` falls back to ``sys.argv``.

    Returns:
        100 if the output type is not supported, otherwise None.  Invalid
        command lines make argparse exit the process.
    """
    global debuglevel, options

    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    add = parser.add_argument
    add('-d', dest='debuglevel', action='count', default=0,
        help='Debug (repeat for more verbose debugging)')
    add('-p', '--pages', dest='pagenos', action='store', type=str, default='',
        help='Specifies the comma-separated list of the page numbers to be extracted. '
             'Page numbers start at one. By default, it extracts text from all the pages.')
    add('-c', '--codec', dest='codec', action='store', type=str, default='utf-8',
        help='Specifies the output codec.')
    add('-t', '--type', dest='outtype', action='store', type=str, default='shape',
        choices=['text', 'html', 'xml', 'tag', 'shape'],
        help='Specifies the output format, one of: shape, text, html, xml, tag')
    add('-m', dest='maxpages', action='store', type=int, default=0,
        help='Specifies the maximum number of pages to extract. '
             'By default (0), it extracts all the pages in a document.')
    add('-P', '--password', dest='password', action='store', type=str, default='',
        help='Provides the user password to access PDF contents.')
    add('-o', '--output', dest='outfile', action='store', type=str, default=None,
        help='Specifies the output file name. '
             'By default, it prints the extracted contents to stdout in text format.')
    add('-C', '--no-caching', dest='caching', action='store_false', default=True,
        help='Suppress object caching. '
             'This will reduce the memory consumption but also slows down the process.')
    add('-n', '--no-layout', dest='layout', action='store_false', default=True,
        help='Suppress layout analysis.')
    add('--show-pageno', dest='show_pageno', action='store_true', default=False,
        help='Show page numbers.')
    add('-A', '--analyze-all', dest='all_texts', action='store_true', default=False,
        help='Forces to perform layout analysis for all the text strings, '
             'including text contained in figures.')
    add('-V', '--detect-vertical', dest='detect_vertical', action='store_true', default=False,
        help='Allows vertical writing detection.')
    add('-M', dest='char_margin', action='store', type=float, default=2.0,
        help='Two text chunks whose distance is closer than the char_margin (shown as M) '
             'is considered continuous and get grouped into one.')
    add('-L', dest='line_margin', action='store', type=float, default=0.5,
        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, '
             'which is a rectangular area that contains a "cluster" of text portions.')
    add('-W', dest='word_margin', action='store', type=float, default=0.1,
        help='It may be required to insert blank characters (spaces) as necessary '
             'if the distance between two words is greater than the word_margin (W), '
             'as a blank between words might not be represented as a space, '
             'but indicated by the positioning of each word.')
    add('-F', dest='boxes_flow', action='store', type=float, default=0.5,
        help='Specifies how much a horizontal and vertical position of a text matters '
             'when determining a text order. '
             'The value should be within the range of -1.0 (only horizontal position matters) '
             'to +1.0 (only vertical position matters).')
    add('-Y', '--layout-mode', dest='layoutmode', action='store', type=str, default='normal',
        choices=['exact', 'normal', 'loose'],
        help='Specifies how the page layout should be preserved. '
             '(Currently only applies to HTML format.) One of: exact, normal, loose.')
    add('-O', '--image-writer', dest='imagewriter', action='store', type=str, default=None,
        help='imagewriter')
    add('-R', '--rotation', dest='rotation', action='store', type=int, default=0,
        help='rotation')
    add('-S', '--strip-control', dest='stripcontrol', action='store_true', default=False,
        help='stripcontrol')
    add('-s', dest='scale', action='store', type=float, default=1,
        help='Specifies the output scale. Can be used in HTML format only.')
    add('--draw-lines', dest='draw_lines', action='store_true',
        help="Draw crude page representation, coloured TextLines (= short pieces of text). "
             "Valid only for the `shape' output.")
    add('--draw-boxes', dest='draw_boxes', action='store_true',
        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). "
             "Valid only for the `shape' output.")
    add('--draw-blocks', dest='draw_blocks', action='store_true',
        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). "
             "Valid only for the `shape' output.")
    add('--shear-limit', dest='shear_limit', action='store', default=0.1, type=float,
        help="If the text is sheared above this limit, reject it. "
             "Valid only for the `shape' output.")
    add('--rotation-limit', dest='rotation_limit', action='store', default=2, type=float,
        help="If the text is rotated above this angle (in degrees), reject it. "
             "Valid only for the `shape' output.")
    add('--line-height-diff', dest='line_height_diff', action='store', type=float, default=0.1,
        help='Two lines whose vertical sizes differ more than this ratio are not to be considered '
             'of the same paragraph (but e.g. one of them is a heading).')
    add('--heading-before', dest='heading_before', action='store', type=str, default='',
        help='String to put before each heading, e.g. <h1>')
    add('--heading-after', dest='heading_after', action='store', type=str, default='',
        help='String to put after each heading, e.g. </h1>')
    add('--box-separator', dest='box_separator', action='store', type=str, default=r'\n\n',
        help=r'Separate boxes with this string. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--block-separator', dest='block_separator', action='store', type=str, default=r'\n\n',
        help=r'Separate blocks with this string. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--indent-separator', dest='indent_separator', action='store', type=str, default=r'\n\n',
        help=r'Separate indented lines with this string. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--indent-string', dest='indent_string', action='store', type=str, default=r'\t',
        help=r'Put this string in front of indented lines. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--indent-limit', dest='indent_limit', action='store', type=float, default=3,
        help='If the line is indented more then this (approximately characters), '
             'it will separated by --indent-separator from the previous one.')
    add('--page-separator', dest='page_separator', action='store', type=str, default=r'\n\n',
        help=r'Separate pages with this string. '
             r'Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--norm-whitespace', dest='norm_whitespace', action='store_true', default=False,
        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    add('--print-stats', dest='print_stats', action='store_true', default=False,
        help='Instead of the text, output some simple statistics about the file.')
    add('--max-blocks', dest='max_blocks', action='store', default=0, type=int,
        help='If there is more than this blocks per page, do not return any text. '
             'Use to discriminate abnormal files (run --print-stats first to find out '
             'the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')
    add('--max-textlines', dest='max_textlines', action='store', default=0, type=int,
        help='If there is more than this textlines per any block, do not return any text. '
             'Use to discriminate abnormal files (run --print-stats first to find out '
             'the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')
    add('--line-height-method', dest='line_height_method', action='store', type=str,
        default='bbox', choices=['bbox', 'mean', 'median'],
        help='Method to calculate height of line (relevant if there are characters with uneven height). '
             'bbox takes the bounding box (rectangle encompassing the line), '
             'mean the arithmetic mean of the height of all the characters, '
             'median is the median of the height of all the characters. '
             'Use mean or median if there are outlier characters, '
             'e.g. one big character at the beginning of line.')
    add(dest='pdffile', help='List of PDF files to go through', default=None, nargs='+')

    # Parse the vector that was passed in; the previous code ignored ``argv``
    # and always read sys.argv.
    cli_args = None if argv is None else argv[1:]
    args, rest = parser.parse_known_args(cli_args)

    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    # The optparse fallback was never enabled; the message is kept so the
    # debug log stays the same.
    DEBUG(3, 'optparse:', False)

    pagenos = set()
    if args.pagenos:
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))

    laparams = LAParams()
    if not args.layout:
        laparams = None
    if laparams and args.all_texts:
        laparams.all_texts = True
    if laparams and args.detect_vertical:
        laparams.detect_vertical = True
    if laparams:
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow

    imagewriter = ImageWriter(args.imagewriter) if args.imagewriter else None
    outfile = args.outfile
    outtype = args.outtype
    codec = args.codec

    # Turn the literal "\n" / "\t" sequences typed on the command line into
    # real characters before publishing the options.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)
    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    rsrcmgr = PDFResourceManager(caching=args.caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # argparse ``choices`` normally makes this unreachable; it replaces a call
    # to a ``usage()`` function that this main never defined.
    if outtype not in ('shape', 'text', 'xml', 'html', 'tag'):
        parser.print_usage()
        return 100

    opened_here = bool(outfile)
    if opened_here:
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')

    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr, outfp, codec=codec,
                                        laparams=laparams,
                                        showpageno=args.show_pageno,
                                        imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=args.stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec,
                                   scale=args.scale,
                                   layoutmode=args.layoutmode,
                                   laparams=laparams, imagewriter=imagewriter,
                                   debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args.pdffile:
                DEBUG(2, 'processing', fname)
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=args.maxpages,
                                                  password=args.password,
                                                  caching=args.caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + args.rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the rest of the process; only flush it.
        if opened_here:
            outfp.close()
        else:
            outfp.flush()
    DEBUG(2, 'finished.')
    return
def main(argv):
    """Batch-convert every PDF matching the given names or wildcards.

    Each positional argument is expanded with ``glob``; every match is
    converted into a file next to it whose extension is replaced according to
    the output type (``htm``, ``tag``, ``txt`` or ``xml``).  The ``-o`` option
    is only consulted to infer the output type when ``-t`` is given as an
    empty string; nothing is written to that file.

    Args:
        argv: full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 when the command line or the output type is invalid, otherwise
        None.
    """
    import getopt
    import os

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k in ('-A', '-V', '-M', '-L', '-W', '-F'):
            if laparams is None:
                # Layout analysis was disabled by an earlier -n; there is no
                # LAParams object left to configure.
                continue
            if k == '-A':
                laparams.all_texts = True
            elif k == '-V':
                laparams.detect_vertical = True
            elif k == '-M':
                laparams.char_margin = float(v)
            elif k == '-L':
                laparams.line_margin = float(v)
            elif k == '-W':
                laparams.word_margin = float(v)
            else:
                laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Output extension per type; an unknown type is rejected here instead of
    # failing later with a KeyError.
    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    if outtype not in extensions:
        return usage()

    # The -o file is deliberately not opened: every PDF gets its own output
    # file below, so that handle was never written to nor closed.
    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in matches:
            # splitext copes with extensions that are not exactly 3 characters.
            target = os.path.splitext(pdf)[0] + '.' + extensions[outtype]
            print(target)
            with open(target, 'wb') as outfp:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr, outfp, codec=codec,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                          laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr, outfp, codec=codec,
                                           scale=scale, layoutmode=layoutmode,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, pagenos,
                                                      maxpages=maxpages,
                                                      password=password,
                                                      caching=caching,
                                                      check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    device.close()
    print('Done')
    return
def main(argv):
    """Extract the text of the given PDFs and print the profile sections.

    The text of all input files is concatenated, stripped of a fixed set
    of decoration tokens, split into lines and scanned for the headings
    'Contact', 'Top Skills', 'Certifications', 'Summary', 'Languages',
    'Experience' and 'Education', plus any line containing
    'www.linkedin.com'.  The collected sections are printed to stdout.

    Parameters:
        argv -- full argument vector; argv[0] is the program name.

    Returns:
        100 after printing the usage text when the command line is
        invalid; None otherwise.
    """
    import getopt

    def usage():
        print('usage: %s [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...' % argv[0])
        return 100

    def take_until_blank(lines, start):
        # Lines from `start` up to (not including) the next empty line.
        # Bounded by the list length, so a section that runs to the end of
        # the text no longer raises IndexError.
        taken = []
        pos = start
        while pos < len(lines):
            taken.append(lines[pos])
            pos += 1
            if pos < len(lines) and lines[pos] == '':
                break
        return taken

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    caching = True
    # Layout options are applied after the loop so "-n" can be combined
    # with -A/-V/-M/-W/-L/-F in any order without crashing on None.
    use_layout = True
    layout_opts = {}
    flag_attrs = {'-A': 'all_texts', '-V': 'detect_vertical'}
    float_attrs = {'-M': 'char_margin', '-W': 'word_margin',
                   '-L': 'line_margin', '-F': 'boxes_flow'}
    for (k, v) in opts:
        if k == '-P':
            password = v.encode('ascii')
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-s':
            float(v)  # value is unused here, but still validated
        elif k == '-R':
            int(v)  # value is unused here, but still validated
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k in flag_attrs:
            layout_opts[flag_attrs[k]] = True
        elif k in float_attrs:
            layout_opts[float_attrs[k]] = float(v)
        # -d, -o, -t, -c, -Y and -S are accepted for command-line
        # compatibility but have no effect in this text-only variant.

    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_opts.items():
            setattr(laparams, attr, value)

    retstr = io.StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = TextConverter(rsrcmgr, retstr, laparams=laparams, imagewriter=imagewriter)
    try:
        for fname in args:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        data = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    # Tokens removed from the raw text, in the original order.
    for token in ("\xa0", "\uf0da", "\x0c", "• ", "* ", "(LinkedIn)",
                  " (LinkedIn)", "\uf0a7", "(Mobile)", "- "):
        data = data.replace(token, "")
    result_list = data.split('\n')

    skills = []
    languages = []
    summary = []
    certifications = []
    contact = []
    linkedin = []
    experience = []
    education = []
    exp_dict = {}
    edu_dict = {}
    for line in result_list:
        if line == 'Contact':
            start = result_list.index(line)
            contact.extend(s.strip() for s in take_until_blank(result_list, start))
        if 'www.linkedin.com' in line:
            start = result_list.index(line)
            linkedin.extend(take_until_blank(result_list, start))
            if len(linkedin) >= 2:
                # A profile URL wrapped over two lines is re-joined.
                linkedin = [linkedin[0] + linkedin[1].strip()]
        if line == 'Top Skills':
            start = result_list.index(line)
            skills.extend(take_until_blank(result_list, start))
        if 'Certifications' in line:
            start = result_list.index(line)
            certifications.extend(take_until_blank(result_list, start))
        if 'Summary' in line:
            start = result_list.index(line)
            summary.extend(take_until_blank(result_list, start))
        if line == 'Languages':
            start = result_list.index(line)
            languages.extend(take_until_blank(result_list, start))
        if line == 'Experience':
            # Skip the heading and the line after it, then collect lines
            # until one containing '-' or a blank line is reached.
            pos = result_list.index(line) + 2
            while pos < len(result_list):
                experience.append(result_list[pos])
                pos += 1
                if pos >= len(result_list):
                    break
                following = result_list[pos]
                if '-' in following:
                    parts = following.split('-')
                    print('start:', parts[0], 'end:', parts[1])
                    break
                if following == '':
                    break
            exp_fields = ["company", "position", "period", "place", "description"]
            exp_dict = dict(zip(exp_fields, experience))
        if line == 'Education':
            start = result_list.index(line) + 1
            education.extend(take_until_blank(result_list, start))
            edu_dict = dict(zip(["school", "degree"], education))

    print('###############')
    print(contact, '\n', linkedin, '\n', summary, '\n', skills, '\n',
          certifications, '\n', languages, '\n', exp_dict, '\n', edu_dict)
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Parameters:
        argv -- full argument vector; argv[0] is the program name, followed
                by options and one or more input PDF paths.

    Returns:
        100 after printing the usage text when the command line is invalid
        (unknown option, no input file, unsupported -t value); None after
        all inputs have been processed.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout options are collected here and applied after the loop, so that
    # "-n" combined with -A/-V/-M/-L/-W/-F works in any order instead of
    # raising AttributeError on a None laparams.
    use_layout = True
    layout_opts = {}
    flag_attrs = {'-A': 'all_texts', '-V': 'detect_vertical'}
    float_attrs = {'-M': 'char_margin', '-L': 'line_margin',
                   '-W': 'word_margin', '-F': 'boxes_flow'}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k in flag_attrs:
            layout_opts[flag_attrs[k]] = True
        elif k in float_attrs:
            layout_opts[float_attrs[k]] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_opts.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output so a bad -t value neither truncates
    # the target file nor leaks an open handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    # open() behaves like the Python 2-only file() builtin and also exists
    # on Python 3.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    device = None
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
    finally:
        if device is not None:
            device.close()
        # Leave sys.stdout open: closing it breaks any later print by the
        # caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Parameters:
        argv -- full argument vector; argv[0] is the program name, followed
                by options and one or more input PDF paths.

    Returns:
        100 after printing the usage text when the command line is invalid
        (unknown option, no input file, unsupported -t value); None after
        all inputs have been processed.
    """
    # getopt parses the options that accompany the command line.
    import getopt

    def usage():
        # Prints a usage example; called when arguments are missing or wrong.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(argument_list, short_options[, long_options]).
        # A colon after a short option means it requires a value, so
        # p, m, P, o, M, L, W, F, Y, O, t, c and s all take one.
        # Returns the parsed (option, value) pairs and the leftover args.
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''        # option -P
    pagenos = set()      # option -p
    maxpages = 0         # option -m
    # output option
    outfile = None       # option -o: output file
    outtype = None       # option -t: output type
    outdir = None        # option -O: output directory
    layoutmode = 'normal'  # option -Y
    codec = 'utf-8'      # option -c
    scale = 1            # option -s
    caching = True
    # Layout options are collected here and applied after the loop, so that
    # "-n" combined with -A/-V/-M/-L/-W/-F works in any order instead of
    # raising AttributeError on a None laparams.
    use_layout = True
    layout_opts = {}
    flag_attrs = {'-A': 'all_texts', '-V': 'detect_vertical'}
    float_attrs = {'-M': 'char_margin', '-L': 'line_margin',
                   '-W': 'word_margin', '-F': 'boxes_flow'}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k in flag_attrs:
            layout_opts[flag_attrs[k]] = True
        elif k in float_attrs:
            layout_opts[float_attrs[k]] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_opts.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Settle the output format, falling back to the -o file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output so a bad -t value neither truncates
    # the target file nor leaks an open handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    # open() behaves like the Python 2-only file() builtin and also exists
    # on Python 3.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    device = None
    try:
        if outtype == 'text':
            # No outdir is passed to TextConverter (the original author
            # noted it did not seem to accept one).
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
    finally:
        if device is not None:
            device.close()
        # Leave sys.stdout open: closing it breaks any later print by the
        # caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
def pdf2txt(argv):
    """Convert PDF files to text, HTML, XML or tags from an option list.

    Parameters:
        argv -- list of options followed by input PDF paths.  Unlike a
                command line, every element is parsed (there is no leading
                program name).

    Returns:
        None.

    Raises:
        getopt.GetoptError -- on an unknown option or a missing value.
        ValueError -- when the output type is not text, xml, html or tag
                      (the original called an undefined usage() here and
                      failed with NameError).
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout options are collected here and applied after the loop, so that
    # "-n" combined with -A/-V/-M/-L/-W/-F works in any order instead of
    # raising AttributeError on a None laparams.
    use_layout = True
    layout_opts = {}
    flag_attrs = {'-A': 'all_texts', '-V': 'detect_vertical'}
    float_attrs = {'-M': 'char_margin', '-L': 'line_margin',
                   '-W': 'word_margin', '-F': 'boxes_flow'}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k in flag_attrs:
            layout_opts[flag_attrs[k]] = True
        elif k in float_attrs:
            layout_opts[float_attrs[k]] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_opts.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output so a bad -t value neither truncates
    # the target file nor leaks an open handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))

    # open() behaves like the Python 2-only file() builtin and also exists
    # on Python 3.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    device = None
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
    finally:
        if device is not None:
            device.close()
        # Leave sys.stdout open: closing it breaks any later print by the
        # caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv=None):
    """Convert PDF input to text, HTML, XML or tags (argparse front end).

    Parameters:
        argv -- list of command-line arguments without the program name;
                None makes argparse read sys.argv[1:].

    Returns:
        None.  argparse exits the process on invalid arguments.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # With nargs='*' the default must itself be a list: a bare sys.stdin
    # default made the processing loop iterate over the *lines* of stdin.
    # The binary buffer is used when available because PDFs are read as
    # bytes.
    stdin_bytes = getattr(sys.stdin, 'buffer', sys.stdin)
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=[stdin_bytes], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='',
                        help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    laparams = None
    if not args.n:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so an explicit 0 (e.g. "-F 0") is honoured
        # instead of being mistaken for "option not given".
        for (attr, value) in (('char_margin', args.M),
                              ('line_margin', args.L),
                              ('word_margin', args.W),
                              ('boxes_flow', args.F)):
            if value is not None:
                setattr(laparams, attr, value)

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        # No -t given: infer the type from the output file name.
        if args.o:
            if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'

    # -p takes 1-based page numbers; process_pdf receives 0-based ones.
    pagenos = [i - 1 for i in args.p]
    device = None
    try:
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, args.o, codec=args.c,
                                  laparams=laparams, imagewriter=args.O)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                                   layoutmode=args.Y, laparams=laparams,
                                   imagewriter=args.O)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, args.o, codec=args.c)
        else:
            # Any other (or missing) type falls back to plain text.
            device = TextConverter(rsrcmgr, args.o, codec=args.c,
                                   laparams=laparams, imagewriter=args.O)
        for fp in args.file:
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=args.m,
                            password=args.P, caching=args.cache,
                            check_extractable=True)
            finally:
                fp.close()
    finally:
        if device is not None:
            device.close()
        if args.o is not sys.stdout:
            args.o.close()
import sys import io from string import punctuation import re # pdf path document = open('C:/Users/Vincent/Dropbox/writeProgram/python/20180509.pdf', 'rb') #crate pdf manager rsrcmgr = PDFResourceManager() # Set parameters for analysis. laparams = LAParams() laparams.char_margin = 3.2 #5 4best laparams.line_margin = 8 #5-8 laparams.word_margin = 10 laparams.boxes_flow = 0.5 # Create a PDF page aggregator object device = PDFPageAggregator(rsrcmgr, laparams=laparams) #create pdf interpreter interpreter = PDFPageInterpreter(rsrcmgr , device) checkPoint = 0 #初始狀態 檢查各個段落 partOne = "General Information" partTwo = "Deal History" partThree = "Investors (" # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象 一般包括LTTextBox, LTFigure, LTImage, # LTTextBoxHorizontal 等等 想要获取文本就获得对象的text属性, for page in PDFPage.get_pages(document): interpreter.process_page(page) layout = device.get_result() for obj in layout:
def pdf2txt(argv):
    """Convert PDF files to text, HTML, XML or tags from an option list.

    Parameters:
        argv -- list of options followed by input PDF paths.  Every
                element is parsed; no leading program name is expected.

    Returns:
        None.

    Raises:
        getopt.GetoptError -- on an unknown option or a missing value.
        ValueError -- when the output type is not text, xml, html or tag
                      (the original called an undefined usage() here and
                      failed with NameError).
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout options are deferred until all options are read, so that "-n"
    # may be combined with -A/-V/-M/-L/-W/-F in any order without raising
    # AttributeError on a None laparams.
    layout_enabled = True
    pending_layout = {}
    boolean_layout = {'-A': 'all_texts', '-V': 'detect_vertical'}
    numeric_layout = {'-M': 'char_margin', '-L': 'line_margin',
                      '-W': 'word_margin', '-F': 'boxes_flow'}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k in boolean_layout:
            pending_layout[boolean_layout[k]] = True
        elif k in numeric_layout:
            pending_layout[numeric_layout[k]] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for (attr, value) in pending_layout.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject a bad type before the output file is created or truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))

    # open() replaces the Python 2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    device = None
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
    finally:
        if device is not None:
            device.close()
        # sys.stdout stays open so the caller can keep printing.
        if outfp is not sys.stdout:
            outfp.close()
    return
import lxml.etree as etree import re from xml.dom import minidom from xml.etree import ElementTree as ET import pandas as pd from bs4 import BeautifulSoup def convert(case, pdfpath, targetfilepath, pages=100): if not pages: pagenums = set() else: pagenums = set(pages) manager = PDFResourceManager() codec = 'utf-8' caching = True laparams = LAParams(all_texts=True) laparams.boxes_flow = -0.5 # laparams.paragraph_indent = 0.2 laparams.detect_vertical = True # laparams.heuristic_word_margin = 0.03 laparams.line_overlap = 0.2 laparams.word_margin = 0.2 laparams.line_margin = 0.5 laparams.char_margin = 1000.0 if case == 'text': output = io.StringIO() converter = TextConverter(manager, output, codec=codec, laparams=LAParams()) if case == 'HTML':
def main(argv=None):
    """Convert PDF input to text, HTML, XML or tags (argparse front end).

    Parameters:
        argv -- list of command-line arguments without the program name;
                None makes argparse read sys.argv[1:].

    Returns:
        None.  argparse exits the process on invalid arguments.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # A positional with nargs='*' needs a list default; the bare sys.stdin
    # default caused the loop below to iterate over stdin line by line.
    # Prefer the binary buffer, since PDF data is bytes.
    binary_stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=[binary_stdin], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='',
                        help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # "is not None" keeps an explicit zero such as "-F 0"; a plain
        # truthiness test silently dropped it.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        # No -t given: infer the type from the output file name.
        if args.o:
            if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'

    # -p takes 1-based page numbers; process_pdf receives 0-based ones.
    zero_based_pages = [i - 1 for i in args.p]
    device = None
    try:
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, args.o, codec=args.c,
                                  laparams=laparams, imagewriter=args.O)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                                   layoutmode=args.Y, laparams=laparams,
                                   imagewriter=args.O)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, args.o, codec=args.c)
        else:
            # Any other (or missing) type falls back to plain text.
            device = TextConverter(rsrcmgr, args.o, codec=args.c,
                                   laparams=laparams, imagewriter=args.O)
        for fp in args.file:
            try:
                process_pdf(rsrcmgr, device, fp, zero_based_pages,
                            maxpages=args.m, password=args.P,
                            caching=args.cache, check_extractable=True)
            finally:
                fp.close()
    finally:
        if device is not None:
            device.close()
        if args.o is not sys.stdout:
            args.o.close()
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Parameters:
        argv -- full argument vector; argv[0] is the program name, followed
                by options and one or more input PDF paths.

    Returns:
        100 after printing the usage text when the command line is invalid
        (unknown option, no input file, unsupported -t value); None after
        all inputs have been processed.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]"
            " [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]"
            " [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]"
            " [-t text|html|xml|tag] [-c codec] [-s scale]"
            " file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # input option
    password = b""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    # Layout options are collected here and applied after the loop, so that
    # "-n" combined with -A/-V/-M/-L/-W/-F works in any order instead of
    # raising AttributeError on a None laparams.
    use_layout = True
    layout_opts = {}
    flag_attrs = {"-A": "all_texts", "-V": "detect_vertical"}
    float_attrs = {"-M": "char_margin", "-L": "line_margin",
                   "-W": "word_margin", "-F": "boxes_flow"}
    for (k, v) in opts:
        if k == "-d":
            logging.getLogger().setLevel(logging.DEBUG)
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            use_layout = False
        elif k in flag_attrs:
            layout_opts[flag_attrs[k]] = True
        elif k in float_attrs:
            layout_opts[float_attrs[k]] = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            imagewriter = ImageWriter(v)
        elif k == "-R":
            rotation = int(v)
        elif k == "-S":
            stripcontrol = True
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)

    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_opts.items():
            setattr(laparams, attr, value)
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith(".htm") or outfile.endswith(".html"):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"
    # Validate before opening the output so a bad -t value neither truncates
    # the target file nor leaks an open handle.
    if outtype not in ("text", "xml", "html", "tag"):
        return usage()

    if outfile:
        outfp = open(outfile, "wb")
    else:
        outfp = sys.stdout
        # When stdout reports an encoding, no codec is handed to the
        # converter.
        if outfp.encoding is not None:
            codec = None
    device = None
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == "xml":
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == "html":
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            with open(fname, "rb") as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
    finally:
        if device is not None:
            device.close()
        # Leave sys.stdout open: closing it breaks any later print by the
        # caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Parameters:
        argv -- full argument vector; argv[0] is the program name, followed
                by options and one or more input PDF paths.

    Returns:
        100 after printing the usage text when the command line is invalid
        (unknown option, no input file, unsupported -t value); None after
        all inputs have been processed.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout options are deferred until all options are read, so that "-n"
    # may be combined with -A/-V/-M/-L/-W/-F in any order without raising
    # AttributeError on a None laparams.
    layout_enabled = True
    pending_layout = {}
    boolean_layout = {'-A': 'all_texts', '-V': 'detect_vertical'}
    numeric_layout = {'-M': 'char_margin', '-L': 'line_margin',
                      '-W': 'word_margin', '-F': 'boxes_flow'}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k in boolean_layout:
            pending_layout[boolean_layout[k]] = True
        elif k in numeric_layout:
            pending_layout[numeric_layout[k]] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for (attr, value) in pending_layout.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject a bad type before the output file is created or truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    # open() replaces the Python 2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    device = None
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
    finally:
        if device is not None:
            device.close()
        # sys.stdout stays open so the caller can keep printing.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(fname, k, v):
    """Convert a single PDF file, applying at most one pdf2txt-style option.

    Parameters:
        fname -- path of the PDF file to convert
        k     -- one option flag such as '-t', '-o' or '-p'; any value not
                 in the chain below leaves every default in place
        v     -- the option's value as a string (ignored by flag-only
                 options)

    Returns:
        None.

    Raises:
        ValueError -- when the output type is not text, xml, html or tag
                      (the original called an undefined usage() here and
                      failed with NameError); int()/float() conversion
                      errors from a malformed value also surface as
                      ValueError.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    if k == '-d':
        debug += 1
    elif k == '-p':
        pagenos.update(int(x) - 1 for x in v.split(','))
    elif k == '-m':
        maxpages = int(v)
    elif k == '-P':
        password = v
    elif k == '-o':
        outfile = v
    elif k == '-C':
        caching = False
    elif k == '-n':
        laparams = None
    elif k == '-A':
        laparams.all_texts = True
    elif k == '-V':
        laparams.detect_vertical = True
    elif k == '-M':
        laparams.char_margin = float(v)
    elif k == '-L':
        laparams.line_margin = float(v)
    elif k == '-W':
        laparams.word_margin = float(v)
    elif k == '-F':
        laparams.boxes_flow = float(v)
    elif k == '-Y':
        layoutmode = v
    elif k == '-O':
        imagewriter = ImageWriter(v)
    elif k == '-R':
        rotation = int(v)
    elif k == '-S':
        stripcontrol = True
    elif k == '-t':
        outtype = v
    elif k == '-c':
        codec = v
    elif k == '-s':
        scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output so a bad type neither truncates the
    # target file nor leaks an open handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))

    # open() behaves like the Python 2-only file() builtin and also exists
    # on Python 3.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    device = None
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        if device is not None:
            device.close()
        # Leave sys.stdout open: closing it breaks any later print by the
        # caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """
    Command-line entry point: convert the PDF files named in ``argv``.

    Options are parsed with ``getopt``; every remaining argument is treated
    as a PDF file name and fed through the same output device, so all inputs
    end up in a single output stream (the ``-o`` file, or ``sys.stdout``).

    Keyword arguments:
    argv -- argument list; argv[0] is the program name shown in the usage text

    Return:
    100 after printing the usage text (bad option, no input file, or an
    unknown output type); None after a conversion.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            # Page numbers are given 1-based and stored 0-based.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    if debug:
        set_debug_logging()

    rsrcmgr = PDFResourceManager(caching=caching)

    # Without an explicit -t, default to text unless the output file's
    # extension selects another format.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()

        try:
            for fname in args:
                # The with-block closes each input even if processing fails.
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Release the output file on every exit path, including the early
        # usage() return above; sys.stdout is never closed here.
        if close_outfp:
            outfp.close()
def main(argv):
    """
    Command-line entry point: convert PDF files, optionally prettifying markup.

    Options are parsed with ``getopt``; every remaining argument is treated
    as a PDF file name and fed through the same output device. With ``-f``
    and an html/xml output type, the converter writes into an in-memory
    buffer that is then run through BeautifulSoup's ``prettify()`` (when
    ``bs4`` is importable) before being written to the real output.

    Keyword arguments:
    argv -- argument list; argv[0] is the program name shown in the usage text

    Return:
    100 after printing the usage text (bad option, no input file, or an
    unknown output type); None after a conversion.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] [-r] '
              '[-S] [-f] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'fSrdp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    roundCoords = False
    simplifyOutput = False
    formatOutput = False
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are given 1-based and stored 0-based.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
        elif k == '-r':
            roundCoords = True
        elif k == '-S':
            simplifyOutput = True
        elif k == '-f':
            formatOutput = True

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Without an explicit -t, default to text unless the output file's
    # extension selects another format.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Decide once whether output is buffered for prettifying. The same flag
    # gates the post-processing below, so '-f' with text/tag output no longer
    # calls getvalue() on a stream that is not an in-memory buffer.
    buffer_output = formatOutput and outtype.endswith('ml')
    if buffer_output:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        outfp = StringIO()
    else:
        outfp = getRealOutput(outfile)

    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter, layoutmode=layoutmode,
                              scale=scale, roundCoords=roundCoords,
                              simplifyOutput=simplifyOutput)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()

    for fname in args:
        # open() replaces the Python-2-only file() constructor; the
        # with-block closes each input even if processing fails.
        with open(fname, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, caching=caching,
                        check_extractable=True)
    device.close()

    if buffer_output:
        root = outfp.getvalue()
        with getRealOutput(outfile) as realOutput:
            try:
                from bs4 import BeautifulSoup as bs
            except ImportError:
                # Formatting is best-effort: fall back to the raw markup.
                sys.stderr.write('Could not import BeautifulSoup, skipping output formatting')
                realOutput.write(root)
            else:
                soup = bs(root)
                prettyHTML = soup.prettify()
                realOutput.write(prettyHTML)
    outfp.close()
    return