def get_text_rows(path):
    """Yield ``(page, y, cells)`` for every text row of the PDF at *path*.

    Rows are keyed by (page, baseline-y) and yielded sorted by page and
    top-to-bottom position; each row's cells are ``(x, text)`` pairs sorted
    left-to-right.

    :param path: path to the PDF file to parse
    """
    rows = defaultdict(list)

    # Shared resources (fonts, etc.) for the interpreter.
    rsrcmgr = PDFResourceManager()

    # Layout-analysis parameters: very tight line grouping, modest word merge.
    laparams = LAParams()
    laparams.line_overlap = 0.01
    laparams.line_margin = 0.01
    laparams.word_margin = 0.15

    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    def parse_obj(lt_objs, page):
        # Walk the layout tree; key by (page, -y) so that an ascending sort
        # over the keys gives top-to-bottom reading order.
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                rows[(page, -int(obj.bbox[1]))].append(
                    (int(obj.bbox[0]), sanitize(obj.get_text())))
            elif isinstance(obj, pdfminer.layout.LTFigure):
                # Figures can nest further text boxes; recurse into them.
                parse_obj(obj._objs, page)

    # Fix: the original opened the file and never closed it (fd leak) and
    # created a throwaway PDFDevice that was immediately overwritten.
    with open(path, 'rb') as fp:
        for page_num, page in enumerate(PDFPage.get_pages(fp)):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_obj(layout._objs, page_num)

    for key in sorted(rows):
        rows[key] = sorted(rows[key])  # left-to-right within the row
        page, y = key
        yield (page, -y, rows[key])
def extract_layout_by_page(pdf_path, page_number):
    """Run pdfminer layout analysis on one page of a PDF.

    :param pdf_path: path to the PDF file
    :param page_number: 1-based index of the page to parse
    :return: the LTPage layout object for that page
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    :raises IndexError: if ``page_number`` is out of range
    """
    # Fix: the original never closed the file handle (fd leak); a context
    # manager guarantees release on every exit path.
    with open(pdf_path, 'rb') as fp:
        # Cross-link parser and document (legacy pdfminer API).
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()  # initialize with an empty password

        # Abort if the document does not permit text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources, aggregator device and
        # interpreter configured for fairly loose line/word grouping.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.line_overlap = 0.3
        laparams.char_margin = 3
        laparams.word_margin = 0.3
        laparams.line_margin = 0.01
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Materialize the page list (kept from the original so that the
        # indexing behavior, including IndexError, is unchanged) and
        # process only the requested page.
        pages = list(doc.get_pages())
        interpreter.process_page(pages[page_number - 1])
        return device.get_result()
def main(files=None): if files is None: files = get_datafiles() # debug option level debug = 0 # input option password = '' pagenos = set() # pagenos.update( int(x)-1 for x in v.split(',') ) maxpages = 0 # output option rotation = 0 stripcontrol = False layoutmode = 'normal' codec = 'utf-8' pageno = 1 scale = 1 caching = True rsrcmgr = PDFResourceManager(caching=caching) showpageno = True # Line Agumentation ? Parameters laparams = LAParams() laparams.all_texts = True laparams.detect_vertical = True laparams.line_overlap = 0.3 # Line overlap laparams.char_margin = 2.0 # Letter Spacing laparams.line_margin = 0.5 # Line Spacing laparams.word_margin = 0.1 # Word spacing laparams.boxes_flow = 0.5 # +-1.0 how much hor vs. vertical matters # position maters for line continuation # PDFDocument.debug = debug PDFParser.debug = debug CMapDB.debug = debug PDFPageInterpreter.debug = debug # for fname in files: fname = str(fname) imagedir = os.path.abspath(os.path.join(os.path.dirname(fname), 'img')) # print(imagedir) imagewriter = None imagewriter = ImageWriter(imagedir) # output folder for images name = os.path.splitext(os.path.basename(fname))[0] print(name) outfile = fname[:-4] + '.txt' device = TextCon(rsrcmgr, laparams=laparams, imagewriter=imagewriter, imagename=name) interpreter = PDFPageInterpreter(rsrcmgr, device) fp = file(fname, 'rb') try: for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True): page.rotate = (page.rotate + rotation) % 360 interpreter.process_page(page) except: continue rows = [list(row) for row in device.rows] pages = max([row[0] for row in rows]) max_y = max([row[4] for row in rows]) min_y = min([row[2] for row in rows]) list_0 = [int(row[4]) for row in rows] list_1 = [] [ list_1.append(obj) for obj in list_0 if obj not in list_1 and list_0.count(obj) > pages - 1 ] max_y2 = max(list_1) list_0 = [int(row[2]) for row in rows] list_1 = [] [ list_1.append(obj) for obj in list_0 if obj 
not in list_1 and list_0.count(obj) > pages - 1 ] min_y2 = min(list_1) print('max_ys:', max_y - max_y2) print('min_ys:', min_y - min_y2) # Get max and min the hard way because of stupid headers list_0 = [int(row[3]) for row in rows] list_1 = [] [ list_1.append(obj) for obj in list_0 if obj not in list_1 and list_0.count(obj) > 10 ] if list_1: max_x = max(list_1) else: max_x = max([int(row[3]) for row in device.rows]) list_0 = [int(row[1]) for row in rows] list_1 = [] [ list_1.append(obj) for obj in list_0 if obj not in list_1 and list_0.count(obj) > 10 ] if list_1: min_x = min(list_1) else: min_x = min([int(row[3]) for row in device.rows]) # Errors if more pics on one side then other # mid_x = (sum([(float(row[1]) + float(row[3]))/2 for row in # device.rows])/len(device.rows)) mid_x = (max_x + min_x) / 2 # mid_x = 595/2 # center of A4 at 72px/in Letter would be 612/2 l_height = sum([row[4] - row[2] for row in rows]) / len(rows) # print('max_x:', max_x) # print('min_x:', min_x) # print('mid_x:', mid_x) print('l_height:', l_height) column2 = [] lines = [] pagenumber = 0 table_caps = ['\n'] table_data = [] table = False for i, row in enumerate(rows): #l_height = row[4]-row[2] l_space = rows[i - 1][2] - row[4] #print(l_height, l_space, rows[i-1][2], rows[i][4], str(row[5])) if row[0] == pagenumber + 1: lines += column2 column2 = [] pagenumber += 1 if row[0] == pagenumber: if (max_y - min_y) * 0.95 > l_space > 0.8 * l_height: # capture Table (assuming tables will span all columns) if re.match(r"^table", str(row[5]), re.I): table = True table_caps.append(str(row[5])) table_data.append('\n') table_data.append(str(row[5])) table_data.append('\n') continue else: table = False # capture table captions multi lines elif (table_caps[-1] == str(rows[i - 1][5]) and -2 * l_height < l_space < 0.5 * l_height): table_caps[-1] += str(row[5]) table_data[-2] += str(row[5]) continue if table: # capture table data if int(rows[i - 1][2]) == int(rows[i][2]): table_data[-1] += '\t' + 
str(row[5]) continue else: table_data.append(str(row[5])) continue elif int(row[1]) > mid_x and ((int(rows[i - 1][1]) < mid_x and int(rows[i - 1][3]) < mid_x) or (int(rows[i - 1][1]) > mid_x and int(rows[i - 1][3]) > mid_x) or rows[i - 1][3] > max_x * 0.9 or l_space > 2.5 * l_height): """ r_space > c_space or previous[3] > max_x * 0.9 or l_space > 2 * l_height):""" if len(column2) > 0: if 1 > (row[2] - column2[-1][2]) > -1: # join if on same line if int(row[1]) < int(column2[-1][1]): column2[-1][5] = row[5] + " " + column2[-1][5] else: column2[-1][5] = column2[-1][5] + " " + row[5] else: column2.append(row) else: column2.append(row) # print(2, str(row[5])) else: if len(lines) > 0: if 1 > (row[2] - lines[-1][2]) > -1: # join if on same line if int(row[1]) < int(lines[-1][1]): lines[-1][5] = row[5] + " " + lines[-1][5] else: lines[-1][5] = lines[-1][5] + " " + row[5] else: lines.append(row) else: lines.append(row) # print(3, str(row[5])) # add final column lines += column2 fig_caps = ['\n'] headers = ['\n'] footers = ['\n'] supp_info = ['\n'] new_lines = [] supp_re = re.compile( r"Corresponding author|Electronic mail|email" "|E-mail|^doi|doi:|^keywords|^pacs|^apc", re.I) for i, line in enumerate(lines): #l_height = lines[i][4]-lines[i][2] l_space = lines[i - 1][2] - lines[i][4] l_space_below = 0 l_space_2below = 0 if i + 1 < len(lines): l_space_below = lines[i][2] - lines[i + 1][4] if i + 2 < len(lines): l_space_2below = lines[i + 1][2] - lines[i + 2][4] fig = fig_caps[-1] print(l_space, l_space_below, l_space_2below, lines[i][2], lines[i][4], str(line[5])) # capture figure captions multi lines if (fig_caps[-1] == str(lines[i - 1][5]) and -2 * l_height < l_space < 0.5 * l_height): fig_caps.append(str(line[5])) continue # capture headers (up to two lines) if (lines[i][2] > max_y * 0.95 and (l_space_below > 0.5 * l_height or l_space_2below > 0.5 * l_height)): headers.append('\n') headers.append(str(line[5])) if supp_re.search(str(line[5])): headers.append('\n') 
headers.append(str(line[5])) else: continue # capture supporting info if supp_re.search(str(line[5])): print(str(line[5])) supp_info.append('\n') supp_info.append(str(line[5])) continue if (max_y - min_y) * 0.95 > l_space > 0.5 * l_height: # capture figure captions if re.match(r"^fig", str(line[5]), re.I): fig_caps.append('\n') fig_caps.append(str(line[5])) continue # capture footers elif lines[i][2] < min_y + max_y * 0.015: footers.append('\n') footers.append(str(line[5])) continue else: string = str(lines[i - 1][5]) if (any(string in s for s in fig_caps) or any(string in s for s in headers)): # or #string == footers[-1] or string == supp_info[-1]): pass else: new_lines.append('\n') new_lines.append(str(line[5])) with open(outfile, 'w') as f: f.write(' '.join(new_lines)) f.write('\n\nFigures') f.write(' '.join(fig_caps)) f.write('\n\nTables') #f.write(' '.join(table_caps)) f.write('\n'.join(table_data)) f.write('\n\nHeaders') f.write(' '.join(headers)) f.write('\n\nFooters') f.write(' '.join(footers)) f.write('\n\nSupporting Info') f.write(' '.join(supp_info)) # the histogram of the data # n, bins, patches = plt.hist(x_data, 50) # plt.show() device.close() print('Done') return
import pandas as pd from bs4 import BeautifulSoup def convert(case, pdfpath, targetfilepath, pages=100): if not pages: pagenums = set() else: pagenums = set(pages) manager = PDFResourceManager() codec = 'utf-8' caching = True laparams = LAParams(all_texts=True) laparams.boxes_flow = -0.5 # laparams.paragraph_indent = 0.2 laparams.detect_vertical = True # laparams.heuristic_word_margin = 0.03 laparams.line_overlap = 0.2 laparams.word_margin = 0.2 laparams.line_margin = 0.5 laparams.char_margin = 1000.0 if case == 'text': output = io.StringIO() converter = TextConverter(manager, output, codec=codec, laparams=LAParams()) if case == 'HTML': output = io.BytesIO() converter = HTMLConverter(manager, output, codec=codec,