def convert_pdf2txt(input_path: str, output_path: str, verbose=params["DEFAULT_VERBOSE"]) -> None:
    """Convert every PDF matched by ``input_path + '*.pdf'`` into a text file.

    Each result is written to ``output_path + <pdf file name> + '.txt'``; a
    text file that already exists is left untouched.  A failure on one PDF is
    printed and the remaining files are still processed.

    Args:
        input_path: Prefix placed in front of the ``*.pdf`` glob pattern.
        output_path: Prefix placed in front of each output file name.
        verbose: When truthy, the extracted text is echoed to stdout.
    """
    # Local imports: these two names are not used anywhere else in this block.
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    for file in tqdm(glob.glob(input_path + '*.pdf'), ascii=True, desc='pdf->txt'):
        try:
            with open(file, 'rb') as fp:
                parser = PDFParser(fp)
                document = PDFDocument(parser)
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed
                rsrcmgr = PDFResourceManager()
                retstr = StringIO()
                # A bare PDFDevice discards what it renders and has no
                # get_result(); TextConverter writes the page text to retstr.
                device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Process each page contained in the document.
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                data = retstr.getvalue()
            if verbose:
                print("DATA:", data)
            txt_file = output_path + os.path.basename(file) + '.txt'
            # Check the full path; os.listdir() only yields bare file names.
            if not os.path.exists(txt_file):
                with open(txt_file, "w") as txt_out:
                    txt_out.write(data)
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(e)
            print("Text document could not be created from %s" % (file))
def get_text_rows(path):
    """Yield the horizontal text boxes of a PDF grouped into rows.

    Boxes are grouped by ``(page index, int(bbox[1]))``.  Inside a group the
    cells are ``(int(bbox[0]), sanitize(text))`` tuples in sorted order.

    Args:
        path: Path of the PDF file to read.

    Yields:
        ``(page, y, cells)`` tuples, ordered by page and then by descending
        ``y``.
    """
    rows = defaultdict(list)

    def parse_obj(lt_objs, page):
        # Collect text boxes; descend into figures, which are containers.
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                # y is stored negated so that sorting the keys ascending
                # walks each page by descending y.
                rows[(page, -int(obj.bbox[1]))].append(
                    (int(obj.bbox[0]), sanitize(obj.get_text())))
            elif isinstance(obj, pdfminer.layout.LTFigure):
                parse_obj(obj._objs, page)

    # The file is only needed while pages are interpreted; close it afterwards.
    with open(path, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        # Layout analysis parameters.
        laparams = LAParams()
        laparams.line_overlap = 0.01
        laparams.line_margin = 0.01
        laparams.word_margin = 0.15
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_num, page in enumerate(PDFPage.get_pages(fp)):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_obj(layout._objs, page_num)

    for key in sorted(rows):
        page, neg_y = key
        yield (page, -neg_y, sorted(rows[key]))
def __init__(self, rsrc, outfp, codec='utf-8'):
    """Set up the device on top of ``PDFDevice``.

    Args:
        rsrc: Resource manager handed to ``PDFDevice.__init__``.
        outfp: Stored on the instance as ``outfp``.
        codec: Stored on the instance as ``codec``.
    """
    PDFDevice.__init__(self, rsrc)
    # Bookkeeping state starts out empty.
    self.pageno = 0
    self.tag = None
    # Output target and its codec, exactly as given by the caller.
    self.codec = codec
    self.outfp = outfp
def text_extraction(self):
    """Run layout analysis on ``self.pdf_name`` and collect one entry per page.

    Returns:
        A list with, for every page in document order, the value returned by
        ``self.parse_layout_obj_page_wise(layout._objs, [])``.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    complete_report = []
    # Keep the file open only while pages are being interpreted.
    with open(self.pdf_name, 'rb') as open_pdf_file:
        document = PDFDocument(PDFParser(open_pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Every page starts from a fresh, empty list.
            pages_list = self.parse_layout_obj_page_wise(layout._objs, [])
            complete_report.append(pages_list)
    return complete_report
def handle_files(pdf_file, uTextList):
    """Save an uploaded PDF, look for the given terms in it and redact them.

    The file is stored through ``Document``, every page is laid out and passed
    to ``parse_obj``, and the module-level ``cord_list`` / ``text_list`` are
    then zipped into ``text_cord_dict``.  Each term of ``uTextList`` that
    occurs in any collected text becomes one content filter replacing it with
    ``"XXXXXX"``, and ``redactor`` is called with those filters.

    Args:
        pdf_file: File object of the uploaded PDF.
        uTextList: Iterable of strings to redact.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    global docfileName, text_cord_dict, coord, text_list
    print(pdf_file)
    newdoc = Document(docfile=pdf_file)
    newdoc.save()
    docfileName = newdoc.docfile.name.rsplit('/', 1)[-1]

    parser = PDFParser(pdf_file)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        # presumably fills the module-level cord_list / text_list - confirm
        parse_obj(layout._objs)

    text_cord_dict = dict(zip(cord_list, text_list))

    options = RedactorOptions()
    options.content_filters = []
    # One filter per term: a repeated filter would run again over text that
    # has already been replaced by "XXXXXX".
    seen_terms = set()
    for coord, textlist in text_cord_dict.items():
        for term in uTextList:
            if term in seen_terms:
                continue
            if any(term in text for text in textlist):
                seen_terms.add(term)
                # NOTE(review): the term is compiled as a regular expression,
                # so regex metacharacters in it are not matched literally.
                options.content_filters.append(
                    (re.compile(term), lambda m: "XXXXXX"))

    redactor(options, docfileName)
def readPDFMinerTexts(fileObj):
    """Return the text of every top-level horizontal text box of a PDF.

    Args:
        fileObj: Path of the PDF file (it is passed to ``open``).

    Returns:
        A dict mapping a running index (0, 1, 2, ...) to the text of each
        ``LTTextBoxHorizontal``, in page order.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    text_dict = {}
    with open(fileObj, 'rb') as file_pointer:
        document = PDFDocument(PDFParser(file_pointer))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        resourceManager = PDFResourceManager()
        device = PDFPageAggregator(resourceManager, laparams=LAParams())
        interpreter = PDFPageInterpreter(resourceManager, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            for layout_obj in device.get_result():
                if isinstance(layout_obj, LTTextBoxHorizontal):
                    # The current size of the dict is the next free index.
                    text_dict[len(text_dict)] = layout_obj.get_text()
    return text_dict
def parse_page(self):
    """Lay out the pages of ``self.document`` and hand them on for parsing.

    Every processed page is passed to ``self.parse_layout_objects`` together
    with its 1-based page number.  When ``self.pages`` is truthy, only that
    many pages are processed.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for i, page in enumerate(PDFPage.create_pages(self.document)):
        # Test the limit before interpreting, so that no page is laid out
        # only to be thrown away.
        if self.pages and i == self.pages:
            break
        interpreter.process_page(page)
        layout = device.get_result()
        self.parse_layout_objects(layout._objs, i + 1)
def main():
    """Lay out the first ten pages of the book PDF under ``<project>/input``.

    The project directory is two levels above the current working directory.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Local import: PDFDevice takes no laparams and offers no get_result();
    # the aggregator is the device that produces layout objects.
    from pdfminer.converter import PDFPageAggregator

    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_book = os.path.join(
        path_project, "input", "Contemporary Fixed Prosthodontics, 5ed.pdf")

    with open(path_book, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for cnt, page in enumerate(PDFPage.create_pages(document)):
            if cnt >= 10:
                break
            interpreter.process_page(page)
            # Layout of the current page; nothing consumes it yet.
            layout = device.get_result()
def parse_pdf(self, file_name, start_page, end_page, save_folder):
    '''Parse pages of a PDF into two tables and save each non-empty one as CSV.

    Pages whose 0-based index lies in ``[start_page, end_page]`` are laid out
    and passed to ``self.parse_obj``.  The rows found in the module-level
    ``ListOfStrings`` are then sorted by descending y and ascending x, and
    scanned for table rows by their x coordinate.

    Args:
        file_name: Path of the PDF; its base name without extension is used
            as prefix of the ``_A.csv`` / ``_B.csv`` output names.
        start_page: First 0-based page index to parse (inclusive).
        end_page: Last 0-based page index to parse (inclusive).
        save_folder: Passed through to ``self.create_csv``.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
        IndexError: If a table row is not followed by the rows it refers to.
    '''
    first_table = []
    second_table = []
    origen = ''
    with open(file_name, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            if start_page <= i <= end_page:
                interpreter.process_page(page)
                layout = device.get_result()
                # presumably appends ((x, y), text) rows to ListOfStrings
                # - confirm against parse_obj
                self.parse_obj(layout._objs, i)
            ListOfStrings.sort(key=lambda x: (-x[0][1], x[0][0]))
            required_section = False
            # enumerate() gives the position of the current row directly;
            # list.index() would return the first equal row instead.
            for idx, a in enumerate(ListOfStrings):
                x_pos = a[0][0]
                y_pos = a[0][1]
                text = a[1]
                if x_pos == 17 and y_pos == 482:
                    origen = text
                # NOTE(review): a separator row at x == 17 always leaves the
                # flag set; it is never cleared again within a page.  Kept
                # as-is because existing output depends on it - confirm
                # whether a real toggle was intended.
                if x_pos == 17 and '----' in text:
                    required_section = True
                if 70 < x_pos < 72 and '----' not in text and 'Container' not in text:
                    head = text.split()
                    following = ListOfStrings[idx + 1][1].split()
                    first_table.append([
                        head[0],
                        head[1],
                        following[0],
                        following[1],
                        ListOfStrings[idx + 2][1],
                        origen,
                    ])
                if x_pos == 17 and '----' not in text and 'Freight' not in text and required_section:
                    temp_table_B = [
                        text,
                        ListOfStrings[idx + 1][1],
                        ListOfStrings[idx + 2][1].split('.')[0],
                        "",
                        ListOfStrings[idx + 3][1].split('.')[0],
                    ]
                    # Skip rows in which every cell is empty.
                    if temp_table_B != [''] * 5:
                        second_table.append(temp_table_B)
            # The shared list is emptied before the next page is collected.
            ListOfStrings.clear()

    base_name = file_name.split('/')[-1].split('.')[0]
    if first_table:
        self.create_csv(self.add_quotes_to_list(first_table), save_folder,
                        base_name + '_A.csv',
                        'sep=,\ncontainer,seall number,tare,type,packages,ORIGEN\n')
    if second_table:
        self.create_csv(self.add_quotes_to_list(second_table), save_folder,
                        base_name + '_B.csv',
                        'sep=,\nFreight/Charge ,Basis,Rated as,Prepaid,Collect\n')
def layout_pdf(self):
    """Lay out every page of ``self.fp`` and segment it into sections.

    ``self.headersDict`` is rebuilt from the next four items of
    ``self.headers``, mapping their entries to ``'career'``, ``'education'``,
    ``'skill'`` and ``'interest'`` in that order.  A page with more than three
    ``LTLine`` objects is segmented by lines first, falling back to header
    segmentation when ``self.is_valid()`` is false; any other page goes
    straight to header segmentation.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Headers
    self.headersDict = {
        **dict.fromkeys(next(self.headers), 'career'),
        **dict.fromkeys(next(self.headers), 'education'),
        **dict.fromkeys(next(self.headers), 'skill'),
        **dict.fromkeys(next(self.headers), 'interest'),
    }

    document = PDFDocument(PDFParser(self.fp))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        line_count = sum(isinstance(x, LTLine) for x in layout)
        if line_count > 3:
            # Segmentation by line separators (LTLine).
            self.line_segmentation(layout)
            if not self.is_valid():
                self.header_segmentation(layout)
        else:
            self.header_segmentation(layout)
def parse_page(document, images_folder):
    """Lay out the first page of ``document`` and parse its layout objects.

    Args:
        document: A parsed PDF document object.
        images_folder: Passed through to ``parse_lt_objs``.

    Returns:
        The value of ``parse_lt_objs(layout._objs, 1, images_folder)`` for the
        first page, or ``None`` when the document yields no pages.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Debug output; the call form prints the same on Python 2 and 3.
    print(dir(document))

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for i, page in enumerate(PDFPage.create_pages(document)):
        interpreter.process_page(page)
        layout = device.get_result()
        # Only the first page is parsed: the function returns from inside
        # the first iteration.
        return parse_lt_objs(layout._objs, i + 1, images_folder)
    return None
def textExtract(self, pdfFile, excelFile, lstPageNum):
    """Write the text boxes of selected PDF pages into an Excel workbook.

    Every selected page gets its own worksheet named ``"Page <n>"``, where
    ``n`` is a running counter starting at 1.  Each horizontal text box is
    written as one row: the four integer bounding-box values followed by the
    text with newlines removed.

    Args:
        pdfFile: Path of the PDF to read.
        excelFile: Path of the workbook to create.
        lstPageNum: 1-based page numbers to extract; must not be empty.

    Returns:
        ``[success, msg]`` - a bool and a status message.

    Raises:
        IndexError: If ``lstPageNum`` is empty.
    """
    success = True
    lstSortedPageNum = sorted(lstPageNum)
    lastPage = lstSortedPageNum[-1]
    # The page selection handed to pdfminer is zero-based.
    lstSortedPageNum = [x - 1 for x in lstSortedPageNum]
    # Bound before the try block so that "finally" can tell whether the
    # workbook was ever created.
    workbook = None
    try:
        workbook = xlsxwriter.Workbook(excelFile)
        with open(pdfFile, "rb") as pdf_file:
            pdf_reader = PdfFileReader(pdf_file)
            totalPDFPages = pdf_reader.numPages
            if lastPage > int(totalPDFPages):
                success = False
                msg = "Entered page number doesnot exist"
                return [success, msg]

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            selected_pages = PDFPage.get_pages(pdf_file, lstSortedPageNum)
            for pageNum, pagecontent in enumerate(selected_pages, start=1):
                interpreter.process_page(pagecontent)
                worksheet = workbook.add_worksheet("Page " + str(pageNum))
                layout = device.get_result()
                # Rows are numbered by position in the layout, so objects
                # that are not text boxes leave their row empty.
                for excelrowNum, obj in enumerate(layout, start=1):
                    if isinstance(obj, LTTextBoxHorizontal):
                        df_extracted_Text = [
                            int(obj.bbox[0]),
                            int(obj.bbox[1]),
                            int(obj.bbox[2]),
                            int(obj.bbox[3]),
                            obj.get_text().replace('\n', ''),
                        ]
                        for col_num, data in enumerate(df_extracted_Text):
                            worksheet.write(excelrowNum, col_num, data)
        msg = "Text Extraction successfull and saved to excel"
    except Exception:
        # Any failure is reported through the return value, not raised.
        msg = "Text Extraction! Throwing error"
        success = False
    finally:
        if workbook is not None:
            workbook.close()
    return [success, msg]
def __init__(self, rsrcmgr):
    """Initialise the device with no blocks collected yet.

    Args:
        rsrcmgr: Resource manager handed to ``PDFDevice.__init__``.
    """
    PDFDevice.__init__(self, rsrcmgr)
    # Finished blocks - presumably (font, font_size, string) entries; confirm.
    self.blocks = []
    # Block being built - presumably font, font size, glyph y, [chars].
    self.current_block = None
    self.last_state = None
def process_pdf_path(fname, page_num='all'):
    """Extract the rectangles and lines drawn on one page of a PDF.

    These paths might be part of a math expression, such as a fraction line.

    :param fname: path of the PDF file
    :param page_num: 0-based index of the page to process; the default
        ``'all'`` is not supported
    :return: list of ``LTRect`` / ``LTLine`` objects of that page, after
        ``adjust_element_bbox`` was applied to each with the page crop box
    :raises Exception: if ``page_num`` is ``'all'``
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    if page_num == 'all':
        raise Exception("Not support get all at once")

    path_list = []

    def print_layout(l):
        """Collect all paths, such as fraction lines and radical lines."""
        for e in l:
            if isinstance(e, (LTTextLineHorizontal, LTTextBoxHorizontal)):
                # Text containers are searched recursively.
                print_layout(e)
            elif isinstance(e, (LTRect, LTLine)):
                # LTLine may also belong to a table.
                # TODO: store as candidates for the fraction.
                path_list.append(e)
            elif debug:
                # LTCurve might be related to figures and drawings.
                print("%s %s" % (e, type(e)))

    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            if page_num == i:
                interpreter.process_page(page)
                print_layout(device.get_result())
                break

    crop_bbox = get_pdf_page_bbox_abandon(fname, page_num)
    # Adjust each element bbox based on the crop bbox.
    for path in path_list:
        adjust_element_bbox(path, crop_bbox)
    return path_list
def parsiraj():
    """Append the text of every page of the PDF at ``path`` to ``pages``.

    ``path`` and ``pages`` are module-level names.  The module-level
    ``tekst`` is reset to ``""`` and afterwards holds the text of the last
    page processed.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    global tekst
    tekst = ""

    def collect_text(lt_objs):
        # Concatenate the text boxes; descend into figures (containers).
        parts = []
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                parts.append(obj.get_text())
            elif isinstance(obj, pdfminer.layout.LTFigure):
                parts.append(collect_text(obj._objs))
        return "".join(parts)

    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            tekst = collect_text(layout._objs)
            pages.append(tekst)
def parse_pdf(self, pdf_file_name_with_path, text_dump_filename):
    """Parse a PDF once and load its layout objects into this instance.

    Every page is laid out and passed to ``self._parse_obj``; afterwards the
    sanity check and the data-structure dump are run, and the text is dumped
    when a dump file name is given.

    Args:
        pdf_file_name_with_path: Path of the PDF file.
        text_dump_filename: Target of ``self._dump_text``; ``None`` skips it.

    Raises:
        Exception: If a PDF was already parsed by this instance.
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    Logger.getLogger().info("Parsing file " + pdf_file_name_with_path)
    if self._pdf_parsed:
        raise Exception('Error! PDF already parsed and loaded.')

    with open(pdf_file_name_with_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            self._parse_obj(layout._objs)

        self._perform_sanity_check()
        self._dump_data_structures()
        if text_dump_filename is not None:
            self._dump_text(text_dump_filename)

    # Reaching this point means the parse succeeded.
    self._pdf_parsed = True
def _construct_thumbnail(filename, thumbnail_Width, thumbnail_Height, destination_foldername):
    """Render a thumbnail for each of the first ``PAGE_COUNT`` pages of a PDF.

    For every page a ``(width, height, 3)`` integer matrix filled with
    ``BACKGROUND_COLOR`` is created, each top-level layout object is drawn
    into it by the matching ``_*_processing`` helper, and the result is handed
    to ``_construct_thumbnail_image``.

    Args:
        filename: Path of the PDF file.
        thumbnail_Width: Passed through to ``_construct_thumbnail_image``.
        thumbnail_Height: Passed through to ``_construct_thumbnail_image``.
        destination_foldername: Passed through to
            ``_construct_thumbnail_image``.
    """
    with open(filename, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pageNomber, page in enumerate(PDFPage.get_pages(fp), start=1):
            # Stop before interpreting a page that would not be rendered.
            if pageNomber > PAGE_COUNT:
                break
            interpreter.process_page(page)
            layout = device.get_result()

            # Page dimensions are floats; array shapes must be integers.
            size = (int(layout.width), int(layout.height))
            Matrix = ones((size[0], size[1], 3), int)
            Matrix = multiply(Matrix, BACKGROUND_COLOR)

            # hbox can be: LTTextBox, LTFigure, LTLine, LTRect, LTImage
            for hbox in layout:
                if isinstance(hbox, pdfminer.layout.LTTextBoxHorizontal):
                    _text_processing(hbox, Matrix)
                if isinstance(hbox, pdfminer.layout.LTRect):
                    _rect_processing(hbox, Matrix)
                if isinstance(hbox, pdfminer.layout.LTLine):
                    _line_processing(hbox, Matrix)
                if isinstance(hbox, pdfminer.layout.LTImage):
                    _logo_processing(hbox, Matrix)
                if isinstance(hbox, pdfminer.layout.LTFigure):
                    _figure_processing(hbox, Matrix)

            _construct_thumbnail_image(size, Matrix, thumbnail_Width,
                                       thumbnail_Height, destination_foldername)
def process_pdf_internal(fname, page_num='all'):
    """Get the raw characters of a PDF page, using a pickle cache on disk.

    Renamed from the original ``process_pdf``.

    :param fname: path of the PDF file
    :param page_num: 0-based page index, or ``'all'`` for every page
    :return: the list filled by ``print_layout``; every ``LTChar`` in it has
        had ``adjust_element_bbox`` applied with the page crop box
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    tmp_path = get_tmp_path(fname)
    cache_path = "%s.chars.%s.pkl" % (tmp_path, str(page_num))
    if os.path.isfile(cache_path):
        try:
            # Pickle data is binary; text mode breaks loading on Python 3.
            # NOTE(review): unpickling executes code from the cache file, so
            # the cache directory must not be writable by untrusted users.
            with open(cache_path, 'rb') as cache_file:
                return pickle.load(cache_file)
        except Exception:
            # A broken cache is not fatal: fall through and rebuild it.
            print("load failed, get again")

    char_list = []
    if debug:
        print(fname)

    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            if page_num == 'all' or page_num == i:
                interpreter.process_page(page)
                layout = device.get_result()
                print_layout(layout, char_list)
            if page_num == i:
                break

    crop_bbox = get_pdf_page_bbox_abandon(fname, page_num)
    for char in char_list:
        if isinstance(char, LTChar):
            adjust_element_bbox(char, crop_bbox)

    with open(cache_path, 'wb') as f:
        pickle.dump(char_list, f)
    return char_list
def _construct_thumbnail(filename, thumbnail_Width, thumbnail_Height, destination_foldername):
    """Render a thumbnail for each of the first ``PAGE_COUNT`` pages of a PDF.

    For every page a ``(width, height, 3)`` integer matrix filled with
    ``BACKGROUND_COLOR`` is created, each top-level layout object is drawn
    into it by the matching ``_*_processing`` helper, and the result is handed
    to ``_construct_thumbnail_image``.

    Args:
        filename: Path of the PDF file.
        thumbnail_Width: Passed through to ``_construct_thumbnail_image``.
        thumbnail_Height: Passed through to ``_construct_thumbnail_image``.
        destination_foldername: Passed through to
            ``_construct_thumbnail_image``.
    """
    with open(filename, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pageNomber, page in enumerate(PDFPage.get_pages(fp), start=1):
            # Stop before interpreting a page that would not be rendered.
            if pageNomber > PAGE_COUNT:
                break
            interpreter.process_page(page)
            layout = device.get_result()

            # Page dimensions are floats; array shapes must be integers.
            size = (int(layout.width), int(layout.height))
            Matrix = ones((size[0], size[1], 3), int)
            Matrix = multiply(Matrix, BACKGROUND_COLOR)

            # hbox can be: LTTextBox, LTFigure, LTLine, LTRect, LTImage
            for hbox in layout:
                if isinstance(hbox, pdfminer.layout.LTTextBoxHorizontal):
                    _text_processing(hbox, Matrix)
                if isinstance(hbox, pdfminer.layout.LTRect):
                    _rect_processing(hbox, Matrix)
                if isinstance(hbox, pdfminer.layout.LTLine):
                    _line_processing(hbox, Matrix)
                if isinstance(hbox, pdfminer.layout.LTImage):
                    _logo_processing(hbox, Matrix)
                if isinstance(hbox, pdfminer.layout.LTFigure):
                    _figure_processing(hbox, Matrix)

            _construct_thumbnail_image(size, Matrix, thumbnail_Width,
                                       thumbnail_Height, destination_foldername)
def parse_document(pdfname):
    """Pass every horizontal text line of a PDF to ``important``.

    Newlines in a line's text are replaced by ``' _'`` before the call.

    Args:
        pdfname: Path of the PDF file.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    def parse_obj(lt_objs):
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextLineHorizontal):
                important(obj.get_text().replace('\n', ' _'))
            elif isinstance(obj, (pdfminer.layout.LTFigure,
                                  pdfminer.layout.LTTextBox)):
                # Figures and text boxes are containers: recurse into them.
                parse_obj(obj._objs)

    with open(pdfname, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_obj(layout._objs)
def parsepdf(self, filename, startpage, endpage):
    """Collect positioned text from a page range of a PDF into a DataFrame.

    Pages whose 0-based index lies in ``[startpage, endpage]`` are laid out
    and passed to ``self.parse_obj`` together with the position list and the
    page index.

    Args:
        filename: Path of the PDF file.
        startpage: First 0-based page index to parse (inclusive).
        endpage: Last 0-based page index to parse (inclusive).

    Returns:
        A ``pandas.DataFrame`` with the columns ``pos_x``, ``pos_y``,
        ``page`` and ``text``; it has no rows when nothing was collected.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    column_names = ["pos_x", "pos_y", "page", "text"]
    position_list = []
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            if i > endpage:
                # Nothing beyond the requested range is needed.
                break
            if i >= startpage:
                interpreter.process_page(page)
                layout = device.get_result()
                self.parse_obj(layout._objs, position_list, i)

    frame = pd.DataFrame(position_list)
    if frame.empty:
        # Naming the columns of a frame without columns would raise.
        return pd.DataFrame(columns=column_names)
    frame.columns = column_names
    return frame
def extract_block_text(filename, pages=None):
    """Collect the text blocks of a PDF through ``parse_obj``.

    Args:
        filename: Path of the PDF file.
        pages: 1-based page numbers to extract, processed in the given order;
            ``None`` or an empty sequence selects every page.

    Returns:
        The list that ``parse_obj`` was given to fill.

    Raises:
        FileNotFoundError: If ``filename`` is not an existing file.
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    if not os.path.isfile(filename):
        raise FileNotFoundError(filename)

    data = []
    with open(filename, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        if not pages:
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                parse_obj(layout._objs, data)
        else:
            for page_i in pages:
                for j, page in enumerate(PDFPage.create_pages(document)):
                    if j + 1 == page_i:
                        interpreter.process_page(page)
                        layout = device.get_result()
                        parse_obj(layout._objs, data)
                        # A page number matches once; skip the rest.
                        break
    return data
def GetScript(filename):
    """Parse the text of a script PDF, line by line, from page 2 onwards.

    The module-level state is reset through ``ResetGlobals`` and
    ``scriptName`` is set to ``filename``.  The first page is skipped.  On
    every other page the non-empty text objects are wrapped in ``TextBlock``,
    sorted by descending ``y`` and passed to ``ParseLine``.  When the document
    forbids text extraction a message is printed and nothing is parsed.

    Args:
        filename: Path of the PDF file.
    """
    global scriptName
    import sys

    ResetGlobals()
    scriptName = filename
    password = ""

    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        # Supply the password for initialization.
        document = PDFDocument(parser, password)
        if not document.is_extractable:
            print("---Not translatable---")
            return

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.boxes_flow = 2
        # One pass with the aggregator is enough: interpreting the pages with
        # a bare PDFDevice first produces nothing that is used.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(document)):
            if pgnum == 0:
                continue
            interpreter.process_page(page)
            layout = device.get_result()

            text = []
            for element in layout:
                try:
                    content = element.get_text().strip()
                    if content:
                        text.append(TextBlock(element.x0, element.y1, content))
                except Exception:
                    # Best effort: layout objects that cannot be turned into
                    # a text block are skipped.
                    pass
            # Progress indicator, one dot per page.
            sys.stdout.write(". ")

            text.sort(key=lambda row: -row.y)
            # Parse all of the "line" objects of this page.
            for line in text:
                ParseLine(line.text, line.x)
def calculate_locations(filename, keywords):
    """Find where each keyword occurs on each page of a PDF.

    Every page is laid out and ``get_location`` is called for every keyword
    with that layout and the first two values of the page trim box.

    Args:
        filename: Path of the PDF file.
        keywords: Iterable of keywords to look for.

    Returns:
        A list of ``LocationKeeper(keyword, location, pagenum)`` objects,
        with ``pagenum`` being the 0-based page index.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    locations = []
    # pdfminer and the PdfFileReader each read from their own handle.
    with open(filename, 'rb') as fp, open(filename, 'rb') as reader_fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        # One pass with the aggregator is enough: interpreting the pages with
        # a bare PDFDevice first produces nothing that is used.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        reader = PdfFileReader(reader_fp)

        for pagenum, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()

            trim_box = reader.getPage(pagenum).trimBox
            x = trim_box[0].as_numeric()
            y = trim_box[1].as_numeric()
            # Handling special case
            if x > 0 and y < 0:
                x = 0

            for keyword in keywords:
                print('********************************')
                co_ordinates = get_location(keyword, layout, x, y)
                print('Keyword %s , location %s' % (keyword, co_ordinates))
                print('********************************')
                if co_ordinates is not None:
                    for location in co_ordinates:
                        print("PageNum-->%s" % pagenum)
                        locations.append(
                            LocationKeeper(keyword, location, pagenum))
    return locations
def parsepdf(self, full_filename, dirname, filename, startpage, endpage):
    """Lay out every page of a PDF and build a tree from its collected items.

    After ``self.parse_obj`` has handled a page, the module-level list ``li``
    is sorted by the third field of its entries.  If it is not empty it is
    passed to ``extract_tree`` with the 0-based page index, and it is emptied
    before the next page.

    Args:
        full_filename: Path of the PDF file.
        dirname: Passed through to ``extract_tree``.
        filename: Passed through to ``extract_tree``.
        startpage: Currently unused; all pages are processed.
        endpage: Currently unused; all pages are processed.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    with open(full_filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for i, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            # presumably appends the items of this page to li - confirm
            self.parse_obj(layout._objs)
            li.sort(key=lambda x: x[2])
            if li:
                extract_tree(full_filename, dirname, filename, li, i)
            # Emptied in place: li is shared module-level state.
            del li[:]
def parsepdf(self):
    """Lay out every page of ``self.filename`` and feed it to parse_page.

    ``self.parse_page`` receives each page's layout objects together with a
    page number counted from 1.

    :return: ``self.word_array`` after all pages have been processed.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # The context manager closes the file even when extraction fails.
    with open(self.filename, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            print('extraction not allowed')
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # Layout analysis: the aggregator exposes one layout per page.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Page numbers start at 1.
        for page_number, page in enumerate(
                PDFPage.create_pages(document), 1):
            # read the page into a layout object
            interpreter.process_page(page)
            layout = device.get_result()
            # extract text from this object
            self.parse_page(layout._objs, page_number)
    return self.word_array

#test = PDFpos("FinancialAccounting1.pdf")
#test.parsepdf()
def readPdf(self, filePath):
    """Extract the text of a PDF into ``self.result`` and a sibling .txt file.

    Every layout object that has a ``get_text`` method contributes its text
    followed by a newline. The text is written to ``<filePath stem>.txt``
    only when that file does not exist yet; ``self.result`` is set either way.

    :param filePath: path of the PDF file to read.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    self.result = ''
    pieces = []
    # Read the file in binary mode; the context manager closes it on exit.
    with open(filePath, 'rb') as pdf_file:
        # PDF parser and document objects.
        doc = PDFDocument(PDFParser(pdf_file))
        # Check whether the document allows text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager, layout-aware device and interpreter.
        resource_manager = PDFResourceManager()
        device = PDFPageAggregator(resource_manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, device)
        # Process one page at a time.
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            for row in device.get_result():
                if hasattr(row, "get_text"):
                    pieces.append(str(row.get_text()) + '\n')
    # Join once rather than growing a string inside the loop.
    self.result = ''.join(pieces)

    txt_path = os.path.splitext(filePath)[0] + '.txt'
    # Never overwrite or extend an existing text file.
    if os.path.exists(txt_path):
        return
    with open(txt_path, 'a') as f:
        f.write(self.result)
def get_pdf_page_bbox_abandon(fname, pid=0):
    """
    Return the crop box of one page of a PDF file.

    NOTE(review): the earlier description gave the result as
    ``(left, xx, right, xx)`` with only the last two values reliable --
    confirm against callers before depending on the first two.

    :param fname: path of the PDF file.
    :param pid: 0-based index of the page whose crop box is wanted.
    :return: ``page.cropbox`` of page ``pid``, or None when the document has
        no page with that index.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # The context manager closes the file on every exit path, including the
    # early return inside the loop.
    with open(fname, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        interpreter = PDFPageInterpreter(rsrcmgr, PDFDevice(rsrcmgr))
        for i, page in enumerate(PDFPage.create_pages(document)):
            if i == pid:
                # Interpreting the page is kept from the original flow; the
                # returned crop box is read straight from the page object.
                interpreter.process_page(page)
                return page.cropbox
    return None
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed for layout analysis of a PDF.

    :param fh: open file object for the PDF; it is not closed here.
    :return: tuple ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0``.
    :raises ValueError: if the document does not allow text extraction.
    """
    # Create the parser and document and connect them to each other.
    parser = PDFParser(fh)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization (empty string: no password).
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")

    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Only the aggregator and its interpreter are returned; the plain
    # PDFDevice and interpreter formerly built first were never used.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def init(filename, verbose=True):
    '''Initiate analysis objs

    :param filename: path of the PDF file to open.
    :param verbose: accepted for compatibility; not used by this function.
    :return: tuple ``(document, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` with default ``LAParams``.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    '''
    # The file is deliberately left open: the returned document is built on
    # this handle and the caller still has to process its pages.
    fp = open(filename, 'rb')
    document = PDFDocument(PDFParser(fp))
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    # Only the aggregator and its interpreter are returned; the plain
    # PDFDevice and interpreter formerly built first were never used.
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return document, interpreter, device
def readPdf(self):
    """Collect the horizontal text boxes of the PDF named by this object.

    The file is ``os.path.join(self.fPath, self.fileName)``. Extraction is
    best effort: an object or page that raises is skipped.

    Sets ``self.content`` to a list of box texts with newlines removed, and
    ``self.coordcont`` to a list of
    ``[stripped text, int(bbox[0]), int(bbox[1]), int(bbox[3]), page]``
    entries with ``page`` counted from 1.

    :return: ``self.content``.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    pdf_path = os.path.join(self.fPath, self.fileName)
    texts = []
    positioned = []

    def parse_obj(lt_objs, pageNo):
        # Record every horizontal text box found in ``lt_objs``.
        for obj in lt_objs:
            try:
                if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                    text = obj.get_text().replace('\n', '')
                    texts.append(text)
                    positioned.append([
                        text.strip(),
                        int(obj.bbox[0]),
                        int(obj.bbox[1]),
                        int(obj.bbox[3]),
                        pageNo + 1,
                    ])
            except Exception:
                # Best effort: skip this object. ``Exception`` rather than a
                # bare except so Ctrl-C and interpreter exit still propagate.
                continue

    # The context manager closes the file even when extraction fails.
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
            try:
                interpreter.process_page(page)
                layout = device.get_result()
                parse_obj(layout._objs, pageNumber)
            except Exception:
                # Best effort: a page that cannot be processed is skipped.
                continue

    self.coordcont = positioned
    self.content = texts
    return self.content
def with_pdf(path_book, images_folder):
    """Run ``parse_lt_objs`` over the layout of every page of a PDF.

    :param path_book: path of the PDF file to open.
    :param images_folder: forwarded unchanged to ``parse_lt_objs``.
    :return: list with one ``parse_lt_objs`` result per page, in page order;
        ``parse_lt_objs`` receives page numbers counted from 1.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Function-scope imports keep this block self-contained.
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    # The context manager closes the file even when extraction fails.
    with open(path_book, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # A plain PDFDevice has no get_result(); the layout has to come from
        # a PDFPageAggregator configured with layout parameters.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        text_content = []
        for page_number, page in enumerate(
                PDFPage.create_pages(document), 1):
            interpreter.process_page(page)
            # receive the LTPage object for this page
            layout = device.get_result()
            text_content.append(
                parse_lt_objs(layout, page_number, images_folder))
    return text_content
def parse_pdf(pdf):
    """Gather page sizes and annotation fields for ``pdf``.

    One interpreter, backed by a plain ``PDFDevice``, is shared by both
    helper calls.

    :param pdf: object passed through to ``parse_all_page_sizes`` and
        ``parse_all_annotations``.
    :return: dict with the page sizes under ``'pages'`` and the annotations
        under ``'fields'``.
    """
    resources = PDFResourceManager()
    page_interpreter = PDFPageInterpreter(resources, PDFDevice(resources))
    # Page sizes are computed before annotations, as in the dict literal
    # this replaces.
    page_sizes = parse_all_page_sizes(pdf, page_interpreter)
    annotations = parse_all_annotations(pdf, page_interpreter)
    return {'pages': page_sizes, 'fields': annotations}
def pdf_to_txt(path):
    """Append the text of a PDF's horizontal text boxes to a sibling .txt file.

    The output path is ``path`` with its extension replaced by ``.txt``.
    Each text box is written followed by a newline, encoded as UTF-8.
    Nothing is written, and no file is created, when the document contains
    no horizontal text boxes or when extraction fails part-way.

    :param path: path of the PDF file to convert.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Function-scope imports keep this block self-contained.
    import io
    import os
    from pdfminer.layout import LAParams

    chunks = []
    # The context manager closes the PDF even when extraction fails.
    with open(path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        # ``is_extractable`` is an attribute, not a method.
        if not document.is_extractable:
            # Raise the extraction error used throughout this file, not the
            # PDFPageAggregator device class.
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # A plain PDFDevice has no get_result(); text boxes only exist after
        # layout analysis by a PDFPageAggregator.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            for x in device.get_result():
                if isinstance(x, LTTextBoxHorizontal):
                    chunks.append(x.get_text() + '\n')

    if chunks:
        txt_path = os.path.splitext(path)[0] + '.txt'
        # Open the output once and let the file object do the UTF-8
        # encoding, instead of reopening per box and mixing bytes with str.
        with io.open(txt_path, 'a', encoding='utf-8') as f:
            f.write(''.join(chunks))
def parsePDF(f):
    """Yield the layout of each page of a PDF, one page at a time.

    :param f: open file object for the PDF; it is not closed here.
    :return: generator yielding ``device.get_result()`` after each page has
        been interpreted. Nothing is parsed until iteration starts.
    """
    from pdfminer.layout import LAParams
    from pdfminer.converter import PDFPageAggregator

    # Create the parser and document and connect them to each other.
    parser = PDFParser(f)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Empty string: no password.
    doc.initialize('')

    rsrcmgr = PDFResourceManager()
    # Only the aggregator is needed; the plain PDFDevice and interpreter
    # formerly built first were overwritten before any use.
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pobj in doc.get_pages():
        interpreter.process_page(pobj)
        yield device.get_result()
def main():
    """Run ``get_text`` on the leading page layouts of 'Divani_Kebir-1.pdf'.

    Pages are interpreted in order; the loop stops at the first layout whose
    ``pageid`` is 2 or more, so only layouts with a smaller ``pageid`` reach
    ``get_text``. Prints a message and returns early when the document does
    not allow text extraction.
    """
    # The context manager closes the file on every exit path.
    with open('Divani_Kebir-1.pdf', 'rb') as fp:
        # Create the parser and document and connect them to each other.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize without supplying a password.
        doc.initialize()
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('not extraction')
            return

        rsrcmgr = PDFResourceManager()
        # Only the aggregator is needed; the plain PDFDevice and interpreter
        # formerly built first were overwritten before any use.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            if layout.pageid >= 2:
                break
            get_text(layout)
def pdf_to_text(pdf):
    """
    Takes pdfminer PDFDocument and converts to plaintext.

    Only top-level layout elements that are ``LTTextBox`` or ``LTTextLine``
    instances contribute text; their texts are concatenated in page order
    with no separator added.

    :param pdf: document object providing ``get_pages()``.
    :return: the concatenated text as a string ("" when nothing matched).
    """
    # create PDFMiner objects for data extraction; only the aggregator is
    # needed, the plain PDFDevice formerly built first was never used.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    parts = []
    # iterate over all pages, select textbox objects and extract plaintext
    for page in pdf.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        parts.extend(
            element.get_text()
            for element in layout
            if isinstance(element, (LTTextBox, LTTextLine))
        )
    # Join once rather than growing a string inside the loop.
    return "".join(parts)
def getPdfPages(path):
    """Return the layout of every page of the PDF at ``path``.

    :param path: path of the PDF file to read.
    :return: list with one ``{'layout': <layout>}`` dict per page, in page
        order, where the layout is what ``device.get_result()`` returned.
    """
    # PDFs are binary data: the file must be opened with 'rb', not 'r'.
    with open(path, 'rb') as fd:
        document = PDFDocument(PDFParser(fd))
        rsrcmgr = PDFResourceManager()
        # Only the aggregator is needed; the plain PDFDevice and interpreter
        # formerly built first were overwritten before any use.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            pages.append({'layout': device.get_result()})
        return pages
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pickle

# Open a PDF file. The handle stays open at module level because the pages
# listed below are built on top of it.
fp = open('full.pdf', 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
document = PDFDocument(parser)
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object and its interpreter. The plain
# PDFDevice and interpreter formerly bound to these names first were
# overwritten before any use.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# List of (index, page) pairs for the whole document.
pages = list(enumerate(PDFPage.create_pages(document)))
pages_length = len(pages)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Created page list of ' + str(pages_length) + ' pages')


def sort_text(text):
    """Return a sort key built from ``text.y1`` and ``text.x0``.

    The key is ``(height - text.y1) * 1000000 + text.x0``, so a larger
    ``y1`` gives a smaller key and ``x0`` breaks ties.
    NOTE(review): ``height`` is a module-level name that is not defined in
    this part of the file -- confirm where it is set.
    """
    return (height - text.y1) * 1000000 + text.x0