# Extract per-page titles and content from '1a_Foundations.pdf' using
# pdfminer layout analysis, then dump the collected titles to
# 'title111.txt' (one title per line).
#
# NOTE(review): parse_obj / parse_obj_title / parse_obj_content are project
# helpers defined elsewhere in this file/project.
with open('1a_Foundations.pdf', 'rb') as document:  # FIX: file was never closed
    # Create resource manager and layout-analysis parameters.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Create a PDF page aggregator object; it yields an LTPage per page.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Kept for compatibility with the original script; not used below.
    pages = PDFPage.create_pages(document)

    title = []
    content = []
    for page in PDFPage.get_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        # Only pages after the first are scanned for a title/content pair.
        if layout.pageid > 1:
            if parse_obj(layout._objs):  # FIX: idiomatic truthiness, not "== True"
                # Strip non-ASCII characters.  FIX: decode back to str —
                # the original kept bytes, and f.write(bytes) on a
                # text-mode file raises TypeError.
                text_title = (parse_obj_title(layout._objs)
                              .encode('ascii', 'ignore')
                              .decode('ascii'))
                # Drop the trailing character (presumably the layout's
                # trailing newline — TODO confirm).
                text_title = text_title[:-1]
                title.append(text_title)
                text_content = parse_obj_content(layout._objs)
                content.append(text_content)

with open('title111.txt', 'w', encoding='utf-8') as f:
    for i in title:
        f.write(i)
        f.write('\n')
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

# Extract the full text of 'test.pdf' into an in-memory buffer and print it.
resource_manager = PDFResourceManager()
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager,
                          fake_file_handle,
                          laparams=LAParams())
page_interpreter = PDFPageInterpreter(resource_manager, converter)

# Feed every page through the interpreter; the TextConverter appends each
# page's text to the StringIO buffer as a side effect.
with open('test.pdf', 'rb') as pdf_file:
    for pdf_page in PDFPage.get_pages(pdf_file,
                                      caching=True,
                                      check_extractable=True):
        page_interpreter.process_page(pdf_page)
    text = fake_file_handle.getvalue()

# Release the converter and the in-memory buffer now that the text is copied.
converter.close()
fake_file_handle.close()

print(text)
def run(self):
    """Convert the already-produced output file to XML.

    For xml_type == 'letter', runs the whole PDF through pdfminer's
    XMLConverter and stores the raw XML text.  Otherwise, extracts the
    text page-by-page, wraps each page in a <Page_N> element, sanitizes
    characters that are illegal in XML 1.0, and stores a pretty-printed
    document.  In both branches the intermediate output file is deleted
    afterwards via the superclass hook.
    """
    super(XML, self).run()
    if self.xml_type == 'letter':
        resource_manager = PDFResourceManager()
        fake_file_handle = io.BytesIO()
        converter = XMLConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(self.output_file_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
            text = fake_file_handle.getvalue().decode('utf-8')
            # NOTE(review): the converter apparently leaves the root
            # element unterminated, so the closing tag is appended by
            # hand — confirm against the XMLConverter output.
            text += '\n</pages>'
            self.output_xml_content = text
        # close open handles
        converter.close()
        fake_file_handle.close()
        super(XML, self).deleteOutputFile()
    else:
        def __extract_text_by_page(pdf_path):
            # Generator: yields the plain text of each page.  A fresh
            # converter/buffer pair is created per page so pages don't
            # accumulate in one buffer.
            with open(pdf_path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    resource_manager = PDFResourceManager()
                    fake_file_handle = io.StringIO()
                    converter = TextConverter(resource_manager,
                                              fake_file_handle,
                                              codec='utf-8')
                    page_interpreter = PDFPageInterpreter(
                        resource_manager, converter)
                    page_interpreter.process_page(page)
                    text = fake_file_handle.getvalue()
                    # text = text.encode('utf-8')
                    yield text
                    # close open handles (runs only when the consumer
                    # advances the generator past this yield)
                    converter.close()
                    fake_file_handle.close()

        def __replace_nontext(text, replacement=u'\uFFFD'):
            # Replace characters not allowed in XML 1.0 with the Unicode
            # replacement character; on wide builds the valid range is
            # extended to the supplementary planes.
            _char_tail = ''
            if sys.maxunicode > 0x10000:
                _char_tail = u'%s-%s' % (
                    chr(0x10000), chr(min(sys.maxunicode, 0x10FFFF)))
            _nontext_sub = re.compile(
                r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD%s]' % _char_tail,
                re.U).sub
            return _nontext_sub(replacement, text)

        # NOTE(review): '(unknown)'.format(filename='Result') is a no-op
        # (no placeholder in the string) and '(unknown)' is not a valid
        # XML tag name — this looks like a mangled template; verify the
        # intended root element name.
        root = xml.Element('(unknown)'.format(filename='Result'))
        pages = xml.Element('Pages')
        root.append(pages)
        counter = 1
        # One <Page_N> child per extracted page of text.
        for page in __extract_text_by_page(self.output_file_path):
            text = xml.SubElement(pages, 'Page_{}'.format(counter))
            text.text = page[:]
            counter += 1
        #root.append(pages)
        tree = xml.ElementTree(root)  # NOTE(review): unused; tostring(root) is used instead
        #xml_string = xml.tostring(tree, 'utf-8', method='xml')
        xml_string = xml.tostring(root, 'utf-8', method='xml')
        # Sanitize, then re-encode for minidom parsing below.
        xml_string = __replace_nontext(
            xml_string.decode('utf-8'),
            replacement=u'\uFFFD').encode('utf-8')
        #xml_string += '\n</pages>'
        parsed_string = minidom.parseString(xml_string)
        pretty_string = parsed_string.toprettyxml(indent=' ')
        #pretty_string += '\n</pages>'
        # with open(output_file_path, 'w', encoding="utf-8") as f:
        #     f.write(pretty_string)
        self.output_xml_content = pretty_string
        super(XML, self).deleteOutputFile()
# Dump every text box of the PDF given as argv[1], page by page, sorted
# top-to-bottom / left-to-right, with coordinates.
interpreter = PDFPageInterpreter(resource_manager, device)

# Output text file (disabled; uncomment to also persist the dump).
# output_txt = open('output.txt', 'w')


def print_and_write(txt):
    """Print one line to stdout (and, when enabled, mirror it to output.txt)."""
    print(txt)
    # output_txt.write(txt)
    # output_txt.write('\n')


with open(sys.argv[1], 'rb') as f:
    # Iterate PDFPage objects from the file.  For slow files, restrict
    # processing with the pagenos keyword (0-based page numbers).
    for page in PDFPage.get_pages(f):
        print_and_write('\n====== ページ区切り ======\n')
        interpreter.process_page(page)   # process the page
        layout = device.get_result()     # obtain the LTPage object
        # Collect all text boxes on the page (recursively).
        boxes = find_textboxes_recursively(layout)
        # Sort by top-left corner.  y1 grows upward in PDF coordinates,
        # so it is negated to get visual top-to-bottom order.
        boxes.sort(key=lambda b: (-b.y1, b.x0))
        for box in boxes:
            print_and_write('-' * 10)               # separator for readability
            print_and_write(box.get_text().strip())  # the box's text
            # FIX: original call was missing its closing parenthesis.
            print_and_write(f"{box.x0}, {box.y0} - {box.x1}, {box.y1}")
def scanDoublePages(parent, galleryprices, dailyprices):
    """Split the PDFs under *parent* into single pages, classify each order
    page as Daily / Gallery / Sweet-Deal by matching price strings in its
    text, and record pages lacking a reference number in numberlist.

    Writes classification results to daily.txt, gallery.txt, sweet.txt and
    duplicatestest.txt, then re-splits via splitterCustom using the
    collected double-page info.  Relies on project helpers: cleanFolder,
    list_files_walk, splitter, splitterCustom, find_between.
    """
    # Result files (left open for the whole scan; closed at the end).
    daily = open("daily.txt", "w+")
    gal = open("gallery.txt", "w+")
    sweet = open("sweet.txt", "w+")
    duplicatestest = open("duplicatestest.txt", "w+")
    number = 0          # running page counter, reset at each page_001.pdf
    numberlist = []     # "pageNumber;originalFileNumber" for pages w/o ref #
    originalfile = []   # original-file numbers of those pages
    folder = "temp"
    cleanFolder(folder)
    # listing the files inside the folder
    parentnew = list_files_walk(parent)
    # creating a temporary folder
    os.mkdir(folder)
    # splitting the temporary files
    splitter(parentnew, folder)
    # getting the temporary files
    parentnew2 = list_files_walk(folder)
    # sorting the files by name
    parentnew2.sort()
    # iterate over all the files in directory 'parent'
    for file_name in parentnew2:
        # Fresh converter/buffer per file so text does not accumulate.
        resource_manager = PDFResourceManager()
        handle = io.StringIO()
        converter = TextConverter(resource_manager, handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        arquivo = open(file_name, 'rb')
        if "page_001.pdf" in file_name:
            number = 0
        # The with-statement closes 'arquivo' when the page loop ends.
        with arquivo as fh:
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                booleangal = True    # still a candidate for Gallery
                booleanSweet = True  # still a candidate for Sweet Deal
                page_interpreter.process_page(page)
                text = handle.getvalue()
                # Drop the trailing char and append a sentinel so
                # find_between can locate the end of the product list.
                text = text[:-1]
                text = text + "¬¬¬"
                #print(text)
                # searching the reference number
                search = find_between(text, "#", "Order")
                # searching the order number
                search2 = find_between(text, "# ", "Order Date")
                # Searching the design name
                name = find_between(text, "SKUPrice1", "$")
                # Prices
                price = find_between(text, name, ",")
                # Products
                products = find_between(text, "SKUPrice1", "¬¬¬")
                #print(products)
                originalfilenumber = find_between(file_name, "_file_",
                                                  "_page")
                print(originalfilenumber)
                if search == "":
                    # No reference number on this page: treat it as the
                    # second page of a double-page order.
                    numberlist.append(str(number) + ";" +
                                      originalfilenumber)
                    originalfile.append(originalfilenumber)
                    # print(result[number-1])
                    # f.write(result[number - 1] + "\n")
                else:
                    duplicatestest.write(search2 + "\n")
                    # Daily classification: any daily price substring.
                    for daprices in dailyprices:
                        if products.find(daprices) != -1:
                            print(search2 + " Daily Shirt")
                            booleangal = False
                            daily.write(name + "^" + file_name + "^" +
                                        search2 + "\n")
                            break
                    if booleangal:
                        # Gallery classification: any gallery price.
                        for gaprices in galleryprices:
                            if products.find(gaprices) != -1:
                                print(search2 + " Gallery Shirt")
                                gal.write(name + "^" + file_name + "^" +
                                          search2 + "\n")
                                booleanSweet = False
                                break
                    # NOTE(review): booleanSweet is only cleared by a
                    # Gallery match, so a Daily match also falls through
                    # to Sweet Deal — confirm this is intended.
                    if booleanSweet:
                        sweet.write(name + "^" + file_name + "^" +
                                    search2 + "\n")
                        print(search2 + " Sweet Deal")
                number = number + 1
        converter.close()
        handle.close()
    daily.close()
    gal.close()
    duplicatestest.close()
    sweet.close()
    cleanFolder(folder)
    print(originalfile)
    print("Files with double pages: ")
    print(numberlist)
    os.mkdir(folder)
    splitterCustom(parentnew, folder, numberlist, originalfile)