def process_pdf(file):
    """Collect the text of a PDF, one string per page.

    Parameters
    ----------
    file:
        Object handed straight to ``PDFPage.get_pages`` (presumably a PDF
        opened in binary mode -- confirm against the caller).

    Returns
    -------
    list
        One string per page, in page order. Each string is the
        concatenation of ``get_text()`` for every ``LTTextBox`` found at
        the top level of that page's layout; other layout elements
        (e.g. images) are skipped.
    """
    manager = PDFResourceManager()
    aggregator = PDFPageAggregator(manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, aggregator)

    pages_text = []
    for pdf_page in PDFPage.get_pages(file):
        interpreter.process_page(pdf_page)
        page_layout = aggregator.get_result()
        # Only text boxes contribute; anything else on the page is ignored.
        pages_text.append(
            "".join(
                item.get_text()
                for item in page_layout
                if isinstance(item, LTTextBox)
            )
        )
    return pages_text
def __init__(self, pdf, codec='utf-8'):
    """Parse *pdf*, collecting its text lines and handing images to helpers.

    Parameters
    ----------
    pdf:
        Passed directly to ``PDFParser``.
        NOTE(review): the original docstring calls this a path, but the
        code hands it to ``PDFParser`` unopened -- confirm what callers pass.
    codec:
        Encoding name forwarded to ``TextConverter``; default ``'utf-8'``.

    Attributes
    ----------
    records:
        Lines of the extracted text (``splitlines()`` of the converter
        output).
    text:
        Initialised to ``""``; not filled in by this method.
    didascalies:
        Initialised to an empty list; not filled in by this method.
    nimages, images:
        Initialised to ``0`` and ``[]``; presumably updated by
        ``save_image`` / ``find_images_in_thing`` -- verify there.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document reports that it is not extractable.
    """
    self.pdf = pdf
    self.text = ""
    self.records = []
    self.didascalies = []
    self.nimages = 0
    self.images = []

    document = PDFDocument(PDFParser(pdf))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Shared resources, output buffer and layout parameters.
    resources = PDFResourceManager()
    text_buffer = StringIO()
    layout_params = LAParams()
    self.codec = codec

    # First pass: render every page as text into the buffer.
    text_device = TextConverter(resources, text_buffer, codec=codec,
                                laparams=layout_params)
    text_interpreter = PDFPageInterpreter(resources, text_device)
    for page in PDFPage.create_pages(document):
        text_interpreter.process_page(page)

    # Second pass: walk each page layout looking for images and figures.
    image_device = PDFPageAggregator(resources, laparams=layout_params)
    image_interpreter = PDFPageInterpreter(resources, image_device)
    for page in PDFPage.create_pages(document):
        image_interpreter.process_page(page)
        page_layout = image_device.get_result()
        if page_layout is None:
            continue
        for item in page_layout:
            if isinstance(item, LTImage):
                self.save_image(item)
            if isinstance(item, LTFigure):
                self.find_images_in_thing(item)

    self.records.extend(text_buffer.getvalue().splitlines())
def _extract_pdf_scores(stream):
    """Read the result table on the first page of a PDF into a dict.

    Only the first page of the document is processed. The table is found
    by looking for a horizontal text box containing the header word
    ``"Reihenfolge"``; the text of the first box sharing that box's ``y0``
    is then split on whitespace and consumed six columns at a time.

    Parameters
    ----------
    stream:
        Passed to ``PDFParser``.

    Returns
    -------
    dict
        For every group of six columns, maps column index 2 to column
        index 4 (question title to score, per the original comments).
    """
    # These layout parameters seem to work with the ILIAS default PDF
    # formatting as well as with UR custom styling.
    # See pdf/tests/default_style.pdf and pdf/tests.ur_style.pdf
    layout_params = LAParams(
        line_overlap=0,
        char_margin=20,
        word_margin=0.1,
        boxes_flow=0,
        detect_vertical=False)

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=layout_params)
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    first_page = next(PDFPage.create_pages(PDFDocument(PDFParser(stream))))
    page_interpreter.process_page(first_page)

    order_name = "Reihenfolge"  # FIXME localize
    table_head_y = None  # y position of the result table header
    text_boxes = []
    for element in aggregator.get_result():
        if not isinstance(element, LTTextBoxHorizontal):
            continue
        text_boxes.append(element)
        if order_name in element.get_text().strip():
            table_head_y = element.y0

    head_boxes = [box for box in text_boxes if box.y0 == table_head_y]

    # With the layout parameters above, the header box is expected to
    # carry the text of the whole results table.
    table = head_boxes[0].get_text().replace('\t', '')
    table = table[table.find(order_name):]

    # Question titles might lack spaces; per the original note this is
    # fine because titles and scores are compared only through
    # Result.normalize_question_title() later.
    scores = {}
    pending = []
    for row in table.split("\n")[1:]:
        pending.extend(re.split(r'\s+', row))
        if len(pending) >= 6:
            scores[pending[2]] = pending[4]
            pending = pending[6:]
    return scores
def __init__(self, ofile):
    """Create the pdfminer device/interpreter pair and reset parser state.

    Parameters
    ----------
    ofile:
        Stored as ``self.ofile`` (presumably the writable output the
        parser emits to -- confirm against the methods that use it).
    """
    resources = PDFResourceManager()
    self.device = PDFPageAggregator(resources, laparams=LAParams())
    self.interpreter = PDFPageInterpreter(resources, self.device)

    # Parsing state: everything starts cleared.
    self.last_font = None
    self.in_rule = False
    self.font_print_pending = False
    self.header_footer_skipping = False

    self.ofile = ofile
def convert(pdffile):
    """Extract the text of a PDF and write it, UTF-8 encoded, to ``log_file``.

    ``password`` and ``log_file`` are not defined inside this function;
    they are read from the enclosing (module) scope.

    Parameters
    ----------
    pdffile:
        Path of the PDF file to read.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document reports that it is not extractable.
    """
    text_parts = []

    # The context manager guarantees the PDF is closed even when parsing
    # fails or extraction is refused.
    with open(pdffile, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # The resource manager stores resources shared between pages.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # The aggregator device exposes each page as a tree of LT* objects.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Of the LT objects in the layout only text boxes and text
            # lines carry text we care about.
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())

    extracted_text = "".join(text_parts)

    with open(log_file, "wb") as my_log:
        my_log.write(extracted_text.encode("utf-8"))
    print("Done !!")
def main(args):
    """Parse a PDF file and report pdfminer layout-object statistics.

    Parameters
    ----------
    args:
        Sequence expected to hold exactly one item, the PDF filename.

    Returns
    -------
    ``1`` when the argument count is wrong (after printing usage via
    ``msg``); ``None`` otherwise.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document reports that it is not extractable.
    """
    msg(SCRIPT, args)
    if len(args) != 1:
        msg('Parse a PDF file and print some pdfminer-specific stats')
        msg('Usage:', SCRIPT, '<PDF-filename>')
        return 1

    infilename, = args
    lt_types = collections.Counter()
    # Pre-set so the report below works for a document with no pages.
    page_count = 0

    with open(infilename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        # Supply the (empty) password when building the document.
        password = ''
        document = PDFDocument(parser, password)
        if not document.is_extractable:
            # Report the file we actually opened.
            raise PDFTextExtractionNotAllowed(infilename)

        pages = PDFPage.create_pages(document)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            detect_vertical=True,
            all_texts=True,
        )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Count the type of every (nested) layout object on each page.
        for page_count, page in enumerate(pages, 1):
            # The device only holds the layout of the page most recently
            # processed, so it must be read before the next iteration.
            interpreter.process_page(page)
            layout = device.get_result()
            lt_types.update(type(item).__name__ for item in flat_iter(layout))

    msg('page_count', page_count)
    msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
def pdfparser(filename):
    """Convert a PDF to a text file written next to it.

    The text of each page is split into stripped, non-empty lines. When
    writing, the first line of a page is dropped if it equals the first
    line of the preceding page (treated as a repeated header). Output goes
    to ``filename`` with its extension replaced by ``.txt``.

    Parameters
    ----------
    filename:
        Path of the PDF file to read.

    Returns
    -------
    None
    """
    paginas = []

    with open(filename, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(fp, check_extractable=False):
            interpreter.process_page(page)
            layout = device.get_result()

            pagina = []
            for element in layout:
                if not isinstance(element, (LTTextBox, LTText)):
                    continue
                texto = element.get_text()
                if not texto:
                    continue
                if len(texto) < 2 and pagina:
                    # A lone character is glued onto the previous line.
                    pagina[-1] += texto
                else:
                    # Also covers a lone character arriving first on a
                    # page, where there is no previous line to extend.
                    pagina += texto.split("\n")

            # Drop empty lines and surrounding whitespace.
            pagina_limpa = [linha.strip() for linha in pagina if linha.strip()]
            if pagina_limpa:
                paginas.append(pagina_limpa)

    text_filename = os.path.splitext(filename)[0] + '.txt'
    with open(text_filename, 'w') as textfile:
        # Remove repeated headers: a page whose first line matches the
        # current header candidate loses that line.
        header_candidato = ''
        for pagina in paginas:
            if pagina[0].strip() != header_candidato:
                header_candidato = pagina[0].strip()
            else:
                pagina.pop(0)
            for linha in pagina:
                textfile.write("%s\n" % linha)
def convert_pdf_to_txt(path):
    """Return the text of the PDF at *path* as one string.

    Text is taken from every ``LTTextBox`` / ``LTTextLine`` found at the
    top level of each page layout, in page order.

    NOTE(review): this uses the ``PDFDocument()`` / ``set_document`` /
    ``set_parser`` / ``initialize`` / ``get_pages`` call sequence exactly
    as the original did -- confirm it matches the installed pdfminer
    version.

    Parameters
    ----------
    path:
        Path of the PDF file to read.

    Returns
    -------
    str
        The concatenated text.
    """
    parts = []

    # Context manager so the file is closed on success and on error.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    parts.append(lt_obj.get_text())

    return "".join(parts)
def parse(path):
    """Return the text of the PDF at *path*, one string per page.

    Each page string is the concatenation of ``get_text()`` for every
    ``LTTextBoxHorizontal`` at the top level of that page's layout.

    NOTE(review): this uses the ``PDFDocument()`` / ``set_document`` /
    ``set_parser`` / ``initialize`` / ``get_pages`` call sequence exactly
    as the original did -- confirm it matches the installed pdfminer
    version.

    Parameters
    ----------
    path:
        Path of the PDF file to read.

    Returns
    -------
    list
        One string per page, in page order.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document reports that it is not extractable.
    """
    # Open in binary read mode; the context manager closes the file even
    # if parsing fails.
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        # Create a PDF document object.
        doc = PDFDocument()
        # Connect the parser and the document object to each other.
        praser.set_document(doc)
        doc.set_parser(praser)
        # Initialise without supplying a password.
        doc.initialize()

        # Refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for resources shared between pages.
        rsrcmgr = PDFResourceManager()
        # Aggregating device that exposes each page layout.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Interpreter that feeds pages to the device.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        fulltext = []
        # Handle one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Fetch the layout object for the page just processed.
            layout = device.get_result()
            fulltext.append("".join(
                x.get_text()
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ))

    return fulltext
from pdfminer3.layout import LAParams, LTTextBox, LTLine, LTTextLine
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator

# Script: print the text of one hard-coded PDF file.
extracted_text = ''

rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# The context manager closes the PDF once all pages have been read.
with open('C:\\Users\\Ritvik\\Desktop\\Tekoaly\\PDF\\768686236423.pdf', 'rb') as fp:
    pages = PDFPage.get_pages(fp)
    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        # Only text boxes and text lines contribute text.
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                extracted_text += lt_obj.get_text()

print(extracted_text)

# Alternative kept from the original, printing each text box instead:
# for lobj in layout:
#     if isinstance(lobj, LTTextBox):
#         x, y, text = lobj.bbox[0], lobj.bbox[3], lobj
#         print(' text: %s' % (text), end=' ')
class VhdlSpecParser():
    """Write grammar-rule text found in PDF pages to an output file.

    Lines containing ``"::="`` start a rule; following lines that begin
    with a space continue it. Characters of rule lines are written to
    ``ofile``, wrapped in ``<it>`` / ``<b>`` tags according to
    ``FONT_TRANSLATION``. Lines from the footer string up to and including
    the next line starting with ``"Std 1076-"`` are skipped.
    """

    # Maps a PDF font name to the tag name written around text in that
    # font; None means "no tag". Unknown font names raise KeyError.
    FONT_TRANSLATION = {"HEFBHG+TimesNewRomanPS-ItalicMT": "it",
                        "HEFBAE+TimesNewRomanPS-BoldMT": "b",
                        "HEFBBF+TimesNewRomanPSMT": None,
                        None: None}
    # A line starting with this text begins header/footer skipping.
    FOOTER_STR = 'Copyright © 2009 IEEE. All rights reserved.'

    def __init__(self, ofile):
        """Create the pdfminer device/interpreter and reset parser state.

        ofile: object whose ``write`` method receives the output text.
        """
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        self.last_font = None
        self.in_rule = False
        self.font_print_pending = False
        self.header_footer_skipping = False
        self.ofile = ofile

    def parse_page(self, page):
        """Run *page* through the interpreter and parse its layout objects."""
        self.interpreter.process_page(page)
        layout = self.device.get_result()
        self.parse_obj(layout._objs)

    def parse_obj(self, objs):
        """Parse layout objects, recursing into figures.

        objs: iterable of layout objects. Text lines inside text boxes are
        processed; figures are searched recursively; everything else is
        ignored.
        """
        for obj in objs:
            if isinstance(obj, pdfminer3.layout.LTTextBox):
                for o in obj._objs:
                    if isinstance(o, pdfminer3.layout.LTTextLine):
                        self._parse_text_line(o)
            # if it's a container, recurse
            elif isinstance(obj, pdfminer3.layout.LTFigure):
                # Must go through self: parse_obj is a method, not a
                # module-level function.
                self.parse_obj(obj._objs)

    def _parse_text_line(self, line):
        """Apply header/footer and rule filtering to one text line."""
        text = line.get_text()

        if self.header_footer_skipping:
            # Skip everything up to and including the page header line.
            if text.startswith("Std 1076-"):
                self.header_footer_skipping = False
            return

        if text.startswith(self.FOOTER_STR):
            self.header_footer_skipping = True
            return

        is_rule_header = "::=" in text
        if not (is_rule_header or self.in_rule):
            return

        self.in_rule = True
        # A non-indented, non-empty line that is not itself a rule header
        # ends the current rule and is not written.
        if not is_rule_header and text and not text.startswith(" "):
            self.in_rule = False
            return

        if text.strip():
            for c in line._objs:
                self._write_char(c)

    def _write_char(self, c):
        """Write one character, emitting font tags when the font changes."""
        font_translation = self.FONT_TRANSLATION
        char_text = c.get_text()

        if isinstance(c, pdfminer3.layout.LTChar) and self.last_font != c.fontname:
            # This character has a different font; the change still has
            # to be propagated to the output.
            self.font_print_pending = True

        if char_text.isspace():
            # Close the open font tag directly after the word, before any
            # trailing whitespace. The tag is looked up from last_font
            # rather than a leftover local, so it is always defined even
            # when font state carries over from an earlier call.
            closing = font_translation[self.last_font]
            if closing is not None:
                self.font_print_pending = True
                self.ofile.write("</%s>" % closing)
                self.last_font = None

        if self.font_print_pending and not char_text.isspace():
            self.font_print_pending = False
            f = font_translation[self.last_font]
            if f:
                self.ofile.write("</%s>" % f)
            f = font_translation[c.fontname]
            if f:
                self.ofile.write("<%s>" % f)
            self.last_font = c.fontname

        self.ofile.write(char_text)
for child in layout_obj: boxes.extend(find_textboxes_recursively(child)) return boxes return [] # その他の場合は空リストを返す。 # Layout Analysisのパラメーターを設定。縦書きの検出を有効にする。 laparams = LAParams(detect_vertical=True) # 共有のリソースを管理するリソースマネージャーを作成。 resource_manager = PDFResourceManager() # ページを集めるPageAggregatorオブジェクトを作成。 device = PDFPageAggregator(resource_manager, laparams=laparams) # Interpreterオブジェクトを作成。 interpreter = PDFPageInterpreter(resource_manager, device) # 出力用のテキストファイル # output_txt = open('output.txt', 'w') def print_and_write(txt): print(txt) # output_txt.write(txt) # output_txt.write('\n') with open(sys.argv[1], 'rb') as f:
def _createDeviceInterpreter(self):
    """Build a new page-aggregator device and an interpreter bound to it.

    Both share one freshly created ``PDFResourceManager``; the device uses
    default ``LAParams``.

    Returns
    -------
    tuple
        ``(device, interpreter)``.
    """
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    return aggregator, PDFPageInterpreter(resources, aggregator)