def pdfextract(pdf):
    """Extract text from the PDF at path *pdf*.

    For documents longer than two pages the first page (abstract/header)
    is dropped; very short documents are extracted in full. Returns the
    extracted text as a single string.

    Raises PDFTextExtractionNotAllowed for PDFs that forbid extraction.
    """
    with open(pdf, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        # Print the file name before failing so the offending PDF is identifiable.
        if not doc.is_extractable:
            print(pdf)
            raise PDFTextExtractionNotAllowed
        output_string = io.StringIO()
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string,
                               laparams=LAParams(detect_vertical=True))
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            # Resolve the page tree once (it was resolved twice before) and
            # use `is not None` instead of `!= None`.
            pages_node = resolve1(doc.catalog['Pages'])
            if pages_node is not None:
                doclength = pages_node['Count']
                for page_number, page in enumerate(PDFPage.get_pages(in_file)):
                    # Skip page 0 unless the document is very short
                    # (same condition as the original implementation).
                    if 0 < page_number < doclength and doclength > 2:
                        interpreter.process_page(page)
            else:
                for page in PDFPage.get_pages(in_file):
                    interpreter.process_page(page)
            return output_string.getvalue()
        finally:
            # Release converter resources (previously leaked).
            device.close()
def page_length(fd):
    """Return the number of pages of the PDF read from the open file *fd*."""
    doc = PDFDocument(PDFParser(fd))
    page_tree = resolve1(doc.catalog["Pages"])
    return page_tree["Count"]
def pdf_converter(self, input_filename, output_filename):
    """Extract the text of *input_filename* (PDF) into *output_filename*.

    Double newlines in the extracted text are collapsed into single
    newlines before writing.
    """
    string_io = io.StringIO()
    # Context managers close both files even when extraction raises
    # (the explicit close() calls were skipped on error before).
    with open(input_filename, "rb") as input_file:
        pdf_document = PDFDocument(PDFParser(input_file))
        page_total = resolve1(pdf_document.catalog["Pages"])["Count"]
        # An explicit set of every page number; covers the old
        # `if not page_count` branch too (range(0) -> empty set).
        page_number_set = set(range(page_total))
        resource_manager = PDFResourceManager()
        converter = TextConverter(resource_manager, string_io, laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            for page in PDFPage.get_pages(input_file, page_number_set,
                                          caching=True, check_extractable=True):
                page_interpreter.process_page(page)
        finally:
            converter.close()
    with open(output_filename, "w") as output_file:
        # Collapse blank lines, as before.
        output_file.write(string_io.getvalue().replace("\n\n", "\n"))
def extract_text(file_path):
    """Extract and normalise the text of the PDF at *file_path*.

    The text is lowercased, runs of whitespace are collapsed, newlines are
    replaced with spaces, and all punctuation (plus '®') is removed.
    """
    out = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, out, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        # `with` closes the PDF even if extraction raises (previously the
        # handle leaked on error).
        with open(file_path, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            document = PDFDocument(parser)
            pages = set(range(resolve1(document.catalog['Pages'])['Count']))
            for page in PDFPage.get_pages(pdf_file, pages):
                interpreter.process_page(page)
    finally:
        converter.close()
    text = out.getvalue()
    out.close()
    # Raw string fixes the invalid-escape warning of '\s\s+'.
    text = re.sub(r'\s\s+', ' ', text)
    text = text.lower()
    text = text.replace('\n', ' ')
    exclude = set(string.punctuation)
    exclude.add('®')
    return ''.join(ch for ch in text if ch not in exclude)
def convert(fname, pages=None):
    """Extract text from the PDF at *fname*.

    Parameters
    ----------
    fname : str
        Path of the PDF file.
    pages : iterable of int, optional
        Zero-based page numbers to extract; all pages when omitted.
    """
    # `file()` was Python 2 only — `open()` works everywhere.
    infile = open(fname, 'rb')
    parser = PDFParser(infile)
    document = PDFDocument(parser)
    # Bug fix: the original assigned the page *count* (an int) to
    # `pagenums`, ignoring the `pages` argument entirely; get_pages
    # expects a container of page numbers.
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    print('converting......')
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()  # was `output.close` — the call parentheses were missing
    return text
def count_pages():
    """Return the page count of the PDF named by sys.argv[1]."""
    # `with` guarantees the handle is closed (it was leaked before).
    with open(sys.argv[1], 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        document = PDFDocument(parser)
        # This will give you the count of pages
        return resolve1(document.catalog['Pages'])['Count']
def page_count(pdf_path):
    """Print diagnostics and the page count of *pdf_path*; also return the count."""
    # `with` guarantees the handle is closed (it was leaked before).
    with open(pdf_path, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        document = PDFDocument(parser)
        # NOTE(review): this prints the bound method object itself, not its
        # result — kept as-is in case the output is relied upon, but it
        # looks like a missing call.
        print(document.get_dest)
        count = resolve1(document.catalog['Pages'])['Count']
        print(count)
        # Returning the value is backward compatible (the function
        # previously returned None).
        return count
def getNumberPages(self, pdf_path: str) -> int:
    """Return the number of pages of the PDF file at *pdf_path*."""
    # Context manager guarantees the file is closed even if parsing raises
    # (the explicit close() was skipped on error before).
    with open(pdf_path, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        return resolve1(document.catalog['Pages'])['Count']
def prepare_to_parsing(file_name, folder):
    """Parse the last two pages of *file_name* with PdfPositionHandling."""
    pdf_handler = PdfPositionHandling()
    # `with` closes the handle (previously leaked) and the local no longer
    # shadows the old `file` builtin name. Only the page count is read
    # here; parse_pdf reopens the document by path itself.
    with open(file_name, 'rb') as fh:
        parser = PDFParser(fh)
        document = PDFDocument(parser)
        len_of_pdf = resolve1(document.catalog['Pages'])['Count']
    pdf_handler.parse_pdf(file_name, len_of_pdf - 2, len_of_pdf - 1, folder)
def extract_text(pub):
    '''Extract the text content of a publication's PDF with pdfminer.six,
    downloading the PDF first if it is not present locally.

    :param pub: publication (article) record from the database; reads
        ``pub['url']`` and ``pub['page count']`` and may write the latter.
    :returns: the extracted text (also cached to a miner text file).
    '''
    # PDF file name is the last path segment of the publication URL;
    # `pdf_src`/`text_src` are module-level directories (defined elsewhere).
    pdf_fn = pub['url'].split('/')[-1]
    pdf_path = pdf_src + pdf_fn
    # Allows for override of corrupted pdfs
    if os.path.isfile(pdf_path):
        pass
    else:
        # doesnt exist - download
        download_pdf(pdf_path, pub)
    # Fill in the page count for records that lack one.
    if pub['page count'] == 'N/A':
        pdf = open(pdf_path, 'rb')
        check = False
        while True:
            # First parse attempt; on failure, repair once with qpdf (via
            # pikepdf) and retry. A second failure is fatal.
            try:
                parser = PDFParser(pdf)
                document = PDFDocument(parser)
            except Exception as e:
                if check is True:
                    raise PSSyntaxError(
                        f'{pdf_path} appears to be malformed and qpdf cannot repair it.'
                    )
                pa_print.tprint(str(e))
                pa_print.tprint(f'Attempting to repair {pdf_path}')
                # Rewrites the PDF in place.
                pike = pikepdf.Pdf.open(pdf_path, allow_overwriting_input=True)
                pike.save(pdf_path)
                check = True
                continue
            break
        pub['page count'] = resolve1(document.catalog['Pages'])['Count']
    fn = pdf_fn.split('.')[0]
    miner_text_file = f'{text_src}miner/miner_{fn}.txt'
    # Return the cached miner text if it already exists...
    if os.path.isfile(miner_text_file):
        with open(miner_text_file, 'r') as f:
            doc = f.read()
        return doc
    else:
        # ...otherwise extract it now and cache it for next time.
        pa_print.tprint(f'\nExtracting: {pdf_fn}')
        laparams = LAParams()
        # all_texts=True also extracts text inside figures.
        setattr(laparams, 'all_texts', True)
        doc = extract_pdf(pdf_path, laparams=laparams)
        with open(miner_text_file, 'w') as f:
            f.write(doc)
        return doc
def get_pdf_pages_and_sizes(filename: str):
    """Return ``(page_count, [(width, height), ...])`` for the given PDF.

    Ref https://stackoverflow.com/a/47686921
    """
    with open(filename, "rb") as handle:
        document = PDFDocument(PDFParser(handle))
        num_pages = resolve1(document.catalog["Pages"])["Count"]
        page_sizes = []
        for page in PDFPage.create_pages(document):
            page_sizes.append((int(page.mediabox[2]), int(page.mediabox[3])))
        return num_pages, page_sizes
def count_pages(path):
    '''Count the pages of every PDF directly inside *path*.

    Returns a dict mapping PDF file name to its page count.
    '''
    page_count_dict = {}
    # Bug fix: the `path` parameter was ignored in favour of a global
    # `mypath`, and files were opened by bare name (only correct when the
    # directory happened to be the CWD). The redundant f.close() inside
    # the `with` block is gone too.
    for file_name in os.listdir(path):
        if file_name.endswith(".pdf"):
            with open(os.path.join(path, file_name), 'rb') as f:
                parser = PDFParser(f)
                book = PDFDocument(parser)
                page_count_dict[file_name] = resolve1(book.catalog['Pages'])['Count']
    return page_count_dict
def get_page_count(pdf: PDFQuery) -> int:
    """Get the total page count of a PDF.

    Parameters
    ----------
    pdf : PDFQuery
        The PDF

    Returns
    -------
    int
        Total page count
    """
    page_tree = resolve1(pdf.doc.catalog['Pages'])
    return page_tree['Count']
def read_text():
    '''Read every PDF in the current directory into PDFDocument objects,
    printing each file name together with its page count.
    '''
    all_books_text = []
    mypath = os.getcwd()
    # `file_name` avoids shadowing the builtin name `file`.
    for file_name in os.listdir(mypath):
        if file_name.endswith(".pdf"):
            with open(file_name, 'rb') as f:
                parser = PDFParser(f)
                book = PDFDocument(parser)
                all_books_text.append(book)
                print(file_name, resolve1(book.catalog['Pages'])['Count'])
            # The redundant f.close() inside the `with` block was removed.
    # NOTE(review): the returned PDFDocument objects hold parsers whose
    # underlying files are closed when each `with` block exits; lazy page
    # access on them may fail — confirm with callers.
    return all_books_text
def __init__(self, cb, ia, term, file_path):
    """Open *file_path* as a PDF and record its total page count.

    Exits the process with status 1 when the PDF cannot be opened or does
    not allow text extraction.
    """
    self.cb = cb
    self.ia = ia
    self.search_term = term
    self.fp = open(file_path, 'rb')
    try:
        self.parser = PDFParser(self.fp)
        self.document = PDFDocument(self.parser)
        # Check if the document allows text extraction. If not, abort.
        if not self.document.is_extractable:
            raise PDFTextExtractionNotAllowed
        self.total_page_num = (resolve1(
            self.document.catalog['Pages'])['Count'])
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        print("ERROR: Cannot open PDF File")
        self.fp.close()
        exit(1)
def parse(file):
    """Return ``"<page-count> <extracted text>"`` for the PDF at *file*."""
    buffer = StringIO()
    with open(file, 'rb') as in_file:
        doc = PDFDocument(PDFParser(in_file))
        manager = PDFResourceManager()
        device = TextConverter(manager, buffer,
                               laparams=LAParams(detect_vertical=True))
        interpreter = PDFPageInterpreter(manager, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
        pages = str(resolve1(doc.catalog["Pages"])["Count"])
        content = buffer.getvalue()
    return f"{pages} {content}"
def read_pdf_text(path, retured_value):
    """Extract text from the PDF at *path*, printing progress per page.

    Returns a StringIO holding the extracted text (assigned through
    *retured_value* to preserve the original signature), or None when
    *path* is falsy.
    """
    output_string = StringIO()
    if path:
        # `with` closes the PDF handle when extraction finishes.
        with open(path, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            file_doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            file_len = resolve1(file_doc.catalog['Pages'])['Count']
            # enumerate() replaces the manual counter variable.
            for counter, page in enumerate(PDFPage.create_pages(file_doc)):
                precent = int(round(((counter + 1) / file_len) * 100))
                print(f"reading at {precent}%")
                interpreter.process_page(page)
        retured_value = output_string
        return retured_value
def convert(self):
    """Render every PDF page to a PNG and assemble them into slides.

    Returns the populated presentation; the temporary image directory is
    removed afterwards.
    """
    # Only the page count is read from the PDF; the handle is closed
    # immediately after.
    with open(self._pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser, '')  # '' = empty password
        n_pages = pdfinterp.resolve1(document.catalog['Pages'])['Count']
    # Render pages concurrently; self._task presumably writes
    # '<tempdir>/page-<i>.png' for page i — confirm in its definition.
    # Exiting the executor context waits for all submitted tasks.
    with ThreadPoolExecutor() as executor:
        executor.map(self._task, range(n_pages))
    # One slide per rendered page, in page order.
    for i in range(n_pages):
        slide = self.presentation.slides.add_slide(self._layout)
        slide.shapes.add_picture(
            '{}/page-{}.png'.format(self._tempdir, i),
            0, 0, self._width, self._height,
        )
    shutil.rmtree(self._tempdir)
    return self.presentation
def get_page_num(fpath):
    """
    Get the total page count of the given pdf file, caching it as JSON.

    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    # Fast path: a previous run already cached the count.
    if os.path.isfile(cache_path):
        tmp_dict = load_general(cache_path)
        return tmp_dict['page_num']
    # `with` closes the PDF handle (it was previously left open).
    with open(fpath, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        c = resolve1(document.catalog['Pages'])['Count']
    tmp_dict = {'page_num': c}
    dump_general(tmp_dict, cache_path)
    return c
def number_of_pages(self):
    """Total page count of the wrapped PDF document."""
    pages_node = resolve1(self.document.catalog["Pages"])
    return pages_node["Count"]
def get_pages_in_pdf(file):
    """Return the page count of the PDF open in *file*."""
    parser = PDFParser(file)
    document = PDFDocument(parser)
    pages_node = resolve1(document.catalog['Pages'])
    return pages_node['Count']
# NOTE(review): fragment of a larger loop body — `my_path`, `infile`,
# `interpreter`, `output`, `converter`, `subdir`, `my_dataframe`, `bar`
# and `parent_dir` are defined outside this view; confirm against the
# enclosing loop.
name = os.path.basename(my_path)[:-4]  # file name minus a 4-char extension
size = os.stat(my_path).st_size  # file size in bytes
parser = PDFParser(infile)
doc = PDFDocument(parser)
metadata = doc.info
my_metadata = []
# Collect selected metadata keys (missing keys become None via .get()).
for key in [
        'Author', 'Category', 'Company', 'CreationDate', 'Subject', 'Title'
]:
    my_metadata.append(metadata[0].get(key))
page_count = resolve1(doc.catalog['Pages'])['Count']
# Only the first page's text is extracted — the loop breaks immediately.
for page in PDFPage.get_pages(infile, caching=False):
    interpreter.process_page(page)
    break
infile.close()
converter.close()
text = output.getvalue()
output.close()
word_count = len(text.split())
# One dataframe row: basic stats, metadata, then the raw text.
my_info = [name, subdir, size, page_count, word_count
           ] + my_metadata
my_info.append(text)
my_dataframe.loc[len(my_dataframe)] = my_info
bar.next()
my_dataframe.to_csv(parent_dir + "_extracted_text.csv")
def get_meta_data(self):
    """Open self.file_path and record the parser, document and page count.

    Note: the file object is kept open on self.file for later use.
    """
    self.file = open(self.file_path, 'rb')
    self.parser = PDFParser(self.file)
    self.document = PDFDocument(self.parser)
    pages_node = resolve1(self.document.catalog['Pages'])
    self.total_pages = pages_node['Count']
def get_pdf_totalpage(file):
    """Return the total page count of the PDF at path *file*."""
    # `with` guarantees the handle is closed (it was leaked before) and
    # the local no longer shadows the parameter name.
    with open(file, 'rb') as fh:
        parser = PDFParser(fh)
        document = PDFDocument(parser)
        return resolve1(document.catalog['Pages'])['Count']
def contar_pags(caminho_arquivo: str) -> int:
    """Return the number of pages of the PDF at *caminho_arquivo*."""
    with open(caminho_arquivo, 'rb') as fh:
        doc = PDFDocument(PDFParser(fh))
        return resolve1(doc.catalog['Pages'])['Count']
def get_number_of_pages(pdf: PDFQuery):
    """Total page count of an already-loaded PDFQuery document."""
    catalog = pdf.doc.catalog
    return resolve1(catalog['Pages'])['Count']
# READING ALL DOWNLOADED PDF FILES IN PDF FOLDER for pdf_path in entries: try: images = pdf2image.convert_from_path('PDF/' + pdf_path) pil_im = images[ 0] # assuming that we're interested in the first page only ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT) text1 = " ".join(ocr_dict['text']) file = open('PDF/' + pdf_path, 'rb') parser = PDFParser(file) document = PDFDocument(parser) # This will give you the count of pages if resolve1(document.catalog['Pages'])['Count'] > 1: pil_im1 = images[1] ocr_dict1 = pytesseract.image_to_data(pil_im1, lang='eng', output_type=Output.DICT) text2 = " ".join(ocr_dict1['text']) else: text2 = '' # ocr_dict now holds all the OCR info including text and location on the image text = text1 + text2 regions = [ 'Dakar', 'Thiés', 'Diourbel', 'Fatick', 'Kaolack', 'Kaffrine', 'Touba', 'Kolda', 'Tamba', 'Ziguinchor', 'Saint-Louis', 'Matam', 'Sédhiou' ]
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import resolve1

# Print the page count of the PDF.
# this line found here:
# https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python?rq=1
# `with` closes the handle (it was previously left open for the life of
# the process).
with open('/home/meddlin/git/cpat/metagoofil/REVELLE.pdf', 'rb') as pdf_file:
    parser = PDFParser(pdf_file)
    document = PDFDocument(parser)
    print(resolve1(document.catalog['Pages'])['Count'])
def auto_table_extract(example_file):
    """Extract tables from every page of *example_file* into one Excel sheet.

    Pages are written one at a time to ``input_pdf.pdf`` (rotated upright
    first when needed), then parsed either via ruled lines (bordered
    tables) or, when no bordered table is found, via KMeans clustering of
    text x-positions (``table_without_border``). All tables are appended,
    nan-separated, into ``output.xlsx`` on the user's Desktop.

    NOTE(review): this code was recovered from a whitespace-collapsed
    source; the nesting chosen here (helpers defined inside the page
    loops, exact indentation of ambiguous statements, and exact spacing
    inside the blank-string literals in table_without_border) follows the
    most plausible reading of the flattened original — confirm against
    the upstream project before relying on it.
    """
    all_tables = list()
    #example_file = r"C:\Users\divesh.kubal\Downloads\H& B PDF\H& B PDF\16154.pdf"
    #my_pass = '******'
    file = open(example_file, 'rb')
    parser = PDFParser(file)
    #document = PDFDocument(parser,password=my_pass)
    document = PDFDocument(parser)
    total_pages = resolve1(document.catalog['Pages'])['Count']
    # print('page numbers: ', total_pages)
    total_pages = resolve1(document.catalog['Pages'])['Count']
    base_filename = basename(example_file)
    bs= base_filename
    #page_number1 = int(input('Enter Page Number: '))
    #page_number = page_number1 - 1
    #base_filename = base_filename.replace('.pdf','') + '_pg_' + str(page_number1)
    # Log of every text line's (x, y, text) seen during parsing.
    f = open('math_log.txt', 'a', encoding='utf-8')
    number_of_clusters_list = []
    # First pass: (re)define the position-collecting helpers per page.
    for page_number in range(0,total_pages):
        base_filename = base_filename.replace('.pdf', '') + '_pg_' + str(page_number)

        class pdfPositionHandling:
            # Class-level accumulators shared across instances/pages.
            xo = list()
            yo = list()
            text = list()

            def parse_obj(self, lt_objs):
                # loop over the object list
                for obj in lt_objs:
                    if isinstance(obj, pdfminer.layout.LTTextLine):
                        # Record the line's lower-left corner and its text.
                        pdfPositionHandling.xo.append(int(obj.bbox[0]))
                        pdfPositionHandling.yo.append(int(obj.bbox[1]))
                        pdfPositionHandling.text.append(str(obj.get_text()))
                        math_log = str(obj.bbox[0]) + ' ' + str(obj.bbox[1]) + ' ' + str(obj.get_text().replace('\n', '_'))
                        f.write(math_log + '\n')
                    # if it's a textbox, also recurse
                    if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                        self.parse_obj(obj._objs)
                    # if it's a container, recurse
                    elif isinstance(obj, pdfminer.layout.LTFigure):
                        self.parse_obj(obj._objs)

            def parsepdf(self, filename, startpage, endpage):
                # Open a PDF file.
                fp = open(filename, 'rb')
                # Create a PDF parser object associated with the file object.
                parser = PDFParser(fp)
                # Create a PDF document object that stores the document structure.
                # Password for initialization as 2nd parameter
                document = PDFDocument(parser)
                # Check if the document allows text extraction. If not, abort.
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed
                # Create a PDF resource manager object that stores shared resources.
                rsrcmgr = PDFResourceManager()
                # Create a PDF device object.
                device = PDFDevice(rsrcmgr)
                # BEGIN LAYOUT ANALYSIS
                # Set parameters for analysis.
                laparams = LAParams()
                # Create a PDF page aggregator object.
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                # Create a PDF interpreter object.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                i = 0
                # loop over all pages in the document
                for page in PDFPage.create_pages(document):
                    if i >= startpage and i <= endpage:
                        # read the page into a layout object
                        interpreter.process_page(page)
                        layout = device.get_result()
                        # extract text from this object
                        self.parse_obj(layout._objs)
                    i += 1

        def table_without_border():
            # Borderless-table fallback: cluster text x-positions with
            # KMeans to infer columns, group y-positions into rows.
            obj = pdfPositionHandling()
            obj.parsepdf(r'input_pdf.pdf', 0, 0)
            y0 = pdfPositionHandling.yo
            x0 = pdfPositionHandling.xo
            text = pdfPositionHandling.text
            from collections import defaultdict

            def list_duplicates(seq):
                # Yield (value, [indices]) pairs for every value in seq.
                tally = defaultdict(list)
                for i, item in enumerate(seq):
                    tally[item].append(i)
                return ((key, locs) for key, locs in tally.items())

            # Merge y-coordinates that differ by exactly 1 pixel.
            rep = list()
            for each_elem in y0:
                for each_elem2 in y0:
                    if (math.fabs(each_elem - each_elem2) == 1):
                        rep.append((each_elem, each_elem2))
            for t in rep:
                for n, i in enumerate(y0):
                    if i == t[0]:
                        y0[n] = t[1]
            l = []
            for dup in sorted(list_duplicates(y0), reverse=True):
                l.append(dup)
            table_df = pd.DataFrame([])
            res_table = list()
            final_table = list()
            temp_text = ''
            final_table2 = list()
            # First grouping pass: one row per distinct y, top to bottom.
            for dup in sorted(list_duplicates(y0), reverse=True):
                for each_dup in dup[1]:
                    text_append = str(text[each_dup]).replace('\n', '')
                    text_append = text_append
                    res_table.append(text_append)
                final_table.append(res_table)
                # Strip blank/'$' cells before keeping the row.
                while ' ' in res_table:
                    res_table.remove(' ')
                while ' ' in res_table:
                    res_table.remove(' ')
                while ' ' in res_table:
                    res_table.remove(' ')
                while '$' in res_table:
                    res_table.remove('$')
                final_table2.append(res_table)
                res_table = []
            for each_row in final_table:
                table_df = table_df.append(pd.Series(each_row), ignore_index=True)
            s_xo = list(set(x0))
            s_xo = sorted(s_xo)
            for row in final_table2:
                if len(row) == 1:
                    row.clear()
            # Column count = widest row; ad-hoc corrections follow.
            number_of_clusters = len(max(final_table2, key=len))
            if number_of_clusters<18 and number_of_clusters>15:
                number_of_clusters = 20
            number_of_clusters_list.append(number_of_clusters)
            # import math
            if (int(math.fabs(number_of_clusters_list[0]-number_of_clusters))==1):
                number_of_clusters = number_of_clusters_list[0]
            #print(number_of_clusters)
            import numpy as np
            # Cluster the x-positions into column centroids.
            kmeans = KMeans(n_clusters=number_of_clusters)
            arr = np.asarray(x0)
            arr = arr.reshape(-1, 1)
            kmeansoutput = kmeans.fit(arr)
            centroids = kmeansoutput.cluster_centers_
            new_centroids = list()
            centroids = centroids.tolist()
            for each_centroid in centroids:
                each_centroid = int(each_centroid[0])
                new_centroids.append(each_centroid)
            new_centroids = sorted(new_centroids)
            new_centroids = sorted(new_centroids)
            #new_centroids = [21, 42, 80, 150, 199, 278, 339, 406, 433, 460, 515, 551]
            #number_of_clusters = number_of_clusters+1
            # Merge y-coordinates closer than 6 into one row.
            rep = list()
            for each_elem in y0:
                for each_elem2 in y0:
                    if (math.fabs(each_elem - each_elem2) < 6):  #Minimum Distance for new Line
                        rep.append((each_elem, each_elem2))
            for t in rep:
                for n, i in enumerate(y0):
                    if i == t[0]:
                        y0[n] = t[1]
            l2 = list()
            table_df = pd.DataFrame([])
            res_table = list()
            final_table = list()
            for i in range(0, number_of_clusters):
                res_table.append(' ')
                l2.append(' ')
            # Second pass: place each text chunk into its nearest column.
            for dup in sorted(list_duplicates(y0), reverse=True):
                for each_dup in dup[1]:
                    text_append = str(text[each_dup]).replace('\n', '')
                    text_append = text_append.strip()
                    text_append = re.sub(' +',' ',text_append)
                    cluster = min(range(len(new_centroids)), key=lambda i: abs(new_centroids[i] - x0[each_dup]))
                    # print('clusterr: ', text_append, cluster)
                    # print ('res: ', res_table)
                    # A long run of leading spaces marks a multi-column chunk.
                    leading_sp = len(text_append) - len(text_append.lstrip())
                    if (leading_sp>5):
                        text_append = 'my_pdf_dummy' + ' '+text_append
                    text_append_split = text_append.split(' ')
                    text_append_split_res = []
                    for each_ss in text_append_split:
                        if each_ss!='':
                            each_ss = each_ss.replace('my_pdf_dummy',' ')
                            text_append_split_res.append(each_ss)
                    text_append = text_append.replace('my_pdf_dummy','')
                    # print('tsss: ', text_append_split_res)
                    if (res_table[cluster] != ' ' ):
                        # Column already occupied: concatenate.
                        # print ('tt: ', text_append)
                        # print ('tt: ', cluster)
                        app = str(res_table[cluster] + text_append)
                        res_table[cluster] = app
                    #elif(len(text_append_split_res)>1 and res_table[cluster] != ' '):
                    elif(len(text_append_split_res) > 1):
                        # Spread a multi-part chunk over successive columns.
                        ap = cluster
                        for each_ss in text_append_split_res:
                            try:
                                res_table[ap]=each_ss
                                ap = ap+1
                            except:
                                res_table.insert(ap,each_ss)
                                ap = ap + 1
                    else:
                        res_table[cluster]=text_append
                        #res_table.insert(cluster, text_append)
                # Pad, then flush non-blank rows and start a fresh padded row.
                for i in range(0, number_of_clusters):
                    res_table.append(' ')
                if not all(' ' == s or s.isspace() for s in res_table):
                    final_table.append(res_table)
                    res_table = []
                    for i in range(0, number_of_clusters):
                        res_table.append(' ')
            for each_row in final_table:
                table_df = table_df.append(pd.Series(each_row), ignore_index=True)
            all_tables.append(table_df)

    # Second pass over pages: isolate each page into input_pdf.pdf and
    # extract its table (bordered first, borderless as fallback).
    for page_number in range(0,total_pages):
        # print (page_number,"d")
        import PyPDF2  # to write contents of pdf to new pdf page by page
        pfr = PyPDF2.PdfFileReader(open(example_file, "rb"))
        orientation = pfr.getPage(0).get('/Rotate')
        # print(orientation,"ori")
        try:
            pfr.decrypt('')
        except:
            pass
        if orientation==180 or orientation==270 or orientation==90:
            # Rotated document: rewrite all pages upright first.
            # print("in if")
            pdf_in = open(example_file, 'rb')
            pdf_reader = PyPDF2.PdfFileReader(pdf_in)
            pdf_writer = PyPDF2.PdfFileWriter()
            for pagenum in range(pdf_reader.numPages):
                page = pdf_reader.getPage(pagenum)
                # print(pagenum)
                page.rotateClockwise(360-orientation)
                pdf_writer.addPage(page)
            pdf_out = open('rotated5.pdf', 'wb')
            pdf_writer.write(pdf_out)
            pdf_out.close()
            pdf_in.close()
            pfr = PyPDF2.PdfFileReader(open("rotated5.pdf", "rb"))
            pg9 = pfr.getPage(page_number)  #extract pg 8
            writer = PyPDF2.PdfFileWriter()  #create PdfFileWriter object
            #add pages
            writer.addPage(pg9)
            NewPDFfilename = "input_pdf.pdf"
            with open(NewPDFfilename, "wb") as outputStream:
                # create new PDF
                writer.write(outputStream)
        else:
            # print("in else")
            pg9 = pfr.getPage(page_number)  # extract pg 8
            writer = PyPDF2.PdfFileWriter()  # create PdfFileWriter object
            # add pages
            writer.addPage(pg9)
            NewPDFfilename = "input_pdf.pdf"
            with open(NewPDFfilename, "wb") as outputStream:
                # create new PDF
                writer.write(outputStream)

        def extract_layout_by_page(pdf_path):  #to get layouts of each page in pdf
            # print(pdf_path,"path")
            # print("hello")
            laparams = LAParams()
            fp = open(pdf_path, 'rb')
            parser = PDFParser(fp)
            document = PDFDocument(parser)
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            layouts = []
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layouts.append(device.get_result())
            # print(layouts,"layout")
            return layouts

        page_layouts = extract_layout_by_page(NewPDFfilename)
        # print(page_layouts[0],"ff")
        TEXT_ELEMENTS = [
            pdfminer.layout.LTTextBox,
            pdfminer.layout.LTTextBoxHorizontal,
            pdfminer.layout.LTTextLine,
            pdfminer.layout.LTTextLineHorizontal
        ]

        def flatten(lst):
            # print("list",lst)
            # print(lst)
            return [subelem for elem in lst for subelem in elem]

        def extract_characters(element):
            # Recursively collect LTChar leaves from a layout element.
            # print(element)
            # for i in element:
            #     print(i)
            if isinstance(element, pdfminer.layout.LTChar):
                # print("in char")
                # print("1st")
                # print(element)
                # print(element)
                return [element]
            if any(isinstance(element, i) for i in TEXT_ELEMENTS):
                return flatten([extract_characters(e) for e in element])
            # print(element)
            # if isinstance(element, list):
            # print(isinstance(element, list))
            return flatten([extract_characters(l) for l in element])
            return []

        final_result = list()
        current_page = page_layouts[0]
        #print('PROCESSING PAGE : ', page_number)
        texts = []
        rects = []
        # Separate text boxes from drawn rectangles (potential rules).
        for e in current_page:
            # print(current_page,"fg")
            # print(e,"ghhh")
            if isinstance(e, pdfminer.layout.LTTextBoxHorizontal):
                texts.append(e)
            elif isinstance(e, pdfminer.layout.LTRect):
                rects.append(e)
        # print(rects,"rects")
        characters = extract_characters(texts)
        import matplotlib.pyplot as plt
        from matplotlib import patches

        def draw_rect_bbox(a, ax, color):
            """
            Draws an unfilled rectable onto ax.
            """
            # print(a[0],"hjjkk")
            ax.add_patch(
                patches.Rectangle(
                    (a[0], a[1]),
                    a[2] - a[0],
                    a[3] - a[1],
                    fill=False,
                    color=color
                )
            )

        def draw_rect(rect, ax, color="black"):
            x0,y0,x1,y1=rect.bbox
            draw_rect_bbox((x0,y0,x1,y1), ax, color)

        # Debug visualisation of rects (black) and characters (red);
        # plt.show() is commented out, so nothing is displayed.
        xmin, ymin, xmax, ymax = current_page.bbox
        size = 6
        fig, ax = plt.subplots(figsize=(size, size * (ymax / xmax)))
        for rect in rects:
            # print(rect,"hi")
            draw_rect(rect, ax)
        for c in characters:
            draw_rect(c, ax, "red")
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)
        # plt.show()
        # print(characters)
        # print(rects)
        xmin, ymin, xmax, ymax = current_page.bbox
        # print(xmin, ymin, xmax, ymax)
        size = 6

        def width(rect):
            # Smaller side of the bbox (thin rects are ruled lines).
            x0, y0, x1, y1 = rect.bbox
            # print(( x0, y0, x1, y1))
            # print(min( x1 - x0, y1 - y0))
            return min(x1 - x0, y1 - y0)

        def area(rect):
            x0, y0, x1, y1 = rect.bbox
            # print((x0, y0, x1, y1))
            # print((x1 - x0) * (y1 - y0))
            return (x1 - x0) * (y1 - y0)

        # for r in rects:
        #     if width(r)<2:
        #         print(r)

        def cast_as_line(rect):
            # Collapse a thin rect to a (x0, y0, x1, y1, orientation) line.
            x0, y0, x1, y1 = rect.bbox
            # print( x0, y0, x1, y1)
            if x1 - x0 > y1 - y0:
                return (x0, y0, x1, y0, "H")
            else:
                return (x0, y0, x0, y1, "V")

        lines = [cast_as_line(r) for r in rects if width(r) < 2 and area(r) > 1]
        # print(lines)  #identify horizontal and vertical lines
        xmin, ymin, xmax, ymax = current_page.bbox
        # print( xmin, ymin, xmax, ymax)
        size = 6

        def does_it_intersect(x, xmin, xmax):
            #72.504 769.44 225.764 769.9200000000001
            #77
            return (x <= xmax and x >= xmin)

        def find_bounding_rectangle(x, y, lines):
            # Smallest cell of ruled lines enclosing point (x, y), or None.
            # print(lines)
            v_intersects=[]
            # for l in lines:
            #     if l[4]=="V" and does_it_intersect(y, l[1], l[3]):
            #         v_intersects.append(l)
            v_intersects = [l for l in lines if l[4]
                            == "V" and does_it_intersect(y, l[1], l[3])]
            # print(v_intersects,"v0")
            h_intersects = [l for l in lines if l[4]
                            == "H" and does_it_intersect(x, l[0], l[2])]
            # print(h_intersects, "h0")
            if len(v_intersects) < 2 or len(h_intersects) < 2:
                # print("ghjkl")
                return None
            v_left = [v[0] for v in v_intersects if v[0] < x]
            # print(v_left)
            v_right = [v[0] for v in v_intersects if v[0] > x]
            # print(v_right)
            if len(v_left) == 0 or len(v_right) == 0:
                return None
            x0, x1 = max(v_left), min(v_right)
            h_down = [h[1] for h in h_intersects if h[1] < y]
            h_up = [h[1] for h in h_intersects if h[1] > y]
            if len(h_down) == 0 or len(h_up) == 0:
                return None
            y0, y1 = max(h_down), min(h_up)
            return (x0, y0, x1, y1)

        from collections import defaultdict
        import math
        # Assign every character to the cell that bounds most of its
        # corners (lower-left, centre, upper-right vote).
        box_char_dict = {}
        for c in characters:
            bboxes = defaultdict(int)
            l_x, l_y = c.bbox[0], c.bbox[1]
            bbox_l = find_bounding_rectangle(l_x, l_y, lines)
            bboxes[bbox_l] += 1
            c_x, c_y = math.floor((c.bbox[0] + c.bbox[2]) / 2), math.floor((c.bbox[1] + c.bbox[3]) / 2)
            bbox_c = find_bounding_rectangle(c_x, c_y, lines)
            bboxes[bbox_c] += 1
            u_x, u_y = c.bbox[2], c.bbox[3]
            bbox_u = find_bounding_rectangle(u_x, u_y, lines)
            bboxes[bbox_u] += 1
            if max(bboxes.values()) == 1:
                bbox = bbox_c
            else:
                bbox = max(bboxes.items(), key=lambda x: x[1])[0]
            if bbox is None:
                continue
            if bbox in box_char_dict.keys():
                box_char_dict[bbox].append(c)
                continue
            box_char_dict[bbox] = [c]
        # Sweep a 10pt grid to register empty cells too.
        for x in range(int(xmin), int(xmax), 10):
            for y in range(int(ymin), int(ymax), 10):
                bbox = find_bounding_rectangle(x, y, lines)
                if bbox is None:
                    continue
                if bbox in box_char_dict.keys():
                    continue
                box_char_dict[bbox] = []

        def chars_to_string(chars):
            # Reassemble a cell's characters top-to-bottom, left-to-right.
            if not chars:
                return ""
            rows = sorted(list(set(c.bbox[1] for c in chars)), reverse=True)
            text = ""
            for row in rows:
                sorted_row = sorted([c for c in chars if c.bbox[1] == row], key=lambda c: c.bbox[0])
                text = text+' '+"".join(c.get_text() for c in sorted_row)
            return text

        def boxes_to_table(box_record_dict):
            # Order cells into rows (top-to-bottom) and columns (l-to-r).
            boxes = box_record_dict.keys()
            rows = sorted(list(set(b[1] for b in boxes)), reverse=True)
            table = []
            for row in rows:
                sorted_row = sorted([b for b in boxes if b[1] == row], key=lambda b: b[0])
                table.append([chars_to_string(box_record_dict[b]) for b in sorted_row])
            return table

        result = boxes_to_table(box_char_dict)
        final_result.extend(result)
        if (final_result):
            table_df = pd.DataFrame(final_result)
            all_tables.append(table_df)
        else:
            # No ruled table found: fall back to clustering.
            table_without_border()
    # Concatenate all page tables, separated by six nan rows each.
    import numpy as np
    all_table_df = pd.DataFrame([])
    for each_table in all_tables:
        all_table_df = all_table_df.append(each_table,ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
        all_table_df = all_table_df.append(pd.Series([np.nan]), ignore_index=True)
    try:
        all_tables = helper_anomaly(len(all_table_df.columns.values))
    except:
        pass
    # Write the combined table to Desktop\output.xlsx (Windows only).
    desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
    writer = pd.ExcelWriter(desktop + "\output.xlsx", engine='xlsxwriter')
    all_table_df.to_excel(writer, sheet_name='Sheet1', index=False)
    writer.save()
def pdf2txt(self):
    '''Convert self.input_path (PDF) to a UTF-8 text file.

    =============================
    return : str, text File path
    '''
    # Fixed extraction parameters (kept from the original implementation).
    password = ''
    pagenos = set()
    maxpages = 0
    imagewriter = None
    rotation = 0
    codec = 'UTF-8'
    caching = True
    laparams = LAParams()
    infp = open(self.input_path, "rb")
    # `== None` replaced with the idiomatic `is None`; both branches opened
    # the same file before, so the duplicated open() call is gone.
    if self.output_path is None:
        self.output_path = self.input_path[:-4] + '_trans.txt'
    outfp = open(self.output_path, "w", encoding='UTF8')
    # Total page count drives the progress bar.
    parser = PDFParser(infp)
    document = PDFDocument(parser)
    page_total_num = resolve1(document.catalog['Pages'])['Count']
    rsrcmgr = PDFResourceManager(caching=caching)
    # pdf -> text converter
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    # pdf -> text interpreter
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # pdf -> text start
    with tqdm(total=page_total_num) as pbar:
        for page in PDFPage.get_pages(infp, pagenos, maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
            pbar.update(1)
    print('[INFO] pdf -> text')
    outfp.close()
    infp.close()
    return self.output_path
def pdf(self, fp, csv_row):
    """Collect metadata/accessibility metrics for one PDF and append a row to the report CSV.

    Fills ``csv_row`` (a list of ``[header, value]`` pairs, indexed in
    lock-step with ``self.csv_header``) with: encryption state (4-5),
    tagged-PDF flag and page count (6-7), TOC presence (8), form/field
    counts (9-10), table placeholder (11), word/char metrics (12-15),
    image ratio (17), OCR-risk score (18), document-info fields (19-23)
    and a text sample (24); then writes the assembled row to the report
    file, closes ``fp`` and deletes the downloaded PDF at ``self.pdf_path``.

    Args:
        fp: open binary file object for the PDF; closed before returning.
        csv_row: partially pre-filled list that results are inserted into.
    """
    password = ''
    extracted_text = ''  # NOTE(review): assigned but never used in this method
    self.parser = PDFParser(fp)
    self.document_t = PDFDocument
    pf = PdfFileReader
    # isEncrypted
    try:
        i = 0
        try:
            # Load the pdfminer document on a worker thread so a
            # pathological PDF cannot hang the scan; give up after 90 s.
            thread = Thread(target=self.load_pdf, args=(PDFDocument, password))
            thread.start()
            thread.join(timeout=90)
        except Exception as e:
            print('PDF I/O error: ' + e.__str__())
            # Failure row: line number, error message, then blanks for the
            # remaining report columns.
            row = [
                self.line_count,
                'PDF DOCUMENT OBJECT FAILED TO LOAD - ' + e.__str__() + ': ' + self.url,
                '', '', '', '', '', '', '', '', '', '',
                '', '', '', '', '', '', '', '', '', '', '',
            ]
            # self.line_count += 1
            report_path = self.report_folder + self.report_name
            # 90 SECONDS or LOAD FAIL
            with open(report_path, 'a', encoding='utf8', newline='') as csv_file:
                writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
                # NOTE(review): no-op — str.replace returns a new string
                # that is discarded; presumably meant to strip newlines.
                writer.dialect.lineterminator.replace('\n', '')
                writer.writerow(row)
            stop_event.set()
        document = PDFDocument
        # presumably load_pdf stored the parsed document here — TODO confirm
        document = self.document_t
        pf = PdfFileReader(BytesIO(open(self.pdf_path, 'rb').read()))
        # ENCRYPTION
        if self.parser.doc.encryption is not None:
            csv_row.insert(4, [self.csv_header[4], 'ENCRYPTED'])
            csv_row.insert(5, [self.csv_header[5], 'ENCRYPTED'])
        else:
            csv_row.insert(4, [self.csv_header[4], 'FALSE'])
            csv_row.insert(5, [self.csv_header[5], 'NA'])
    except Exception as e:
        csv_row.insert(4, [self.csv_header[4], 'FAILED: ' + e.__str__()])
        csv_row.insert(5, [self.csv_header[5], 'NA'])
        exit_call = e.__str__() + ' document failed!!'
        print(exit_call)
        pass
    page_count = 0
    # istagged
    try:
        # NOTE(review): get_pages is being passed the document object, not a
        # file handle — verify this is what pdfminer expects here.
        pages = PDFPage.get_pages(document)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        page_no = 0
        istagged = 'FALSE'
        try:
            # document.catalog
            # A MarkInfo entry in the catalog indicates a tagged PDF.
            if document.catalog['MarkInfo']:
                istagged = 'TRUE'
        except Exception as e:
            exit_call = e.__str__() + ' tagged info failed!!'
            print(exit_call)
        page_count = resolve1(document.catalog['Pages'])['Count']
        csv_row.insert(6, [self.csv_header[6], istagged])
        csv_row.insert(7, [self.csv_header[7], page_count])
    except Exception as e:
        csv_row.insert(6, [self.csv_header[6], 'IsTagged: ' + e.__str__()])
        csv_row.insert(7, [self.csv_header[7], 'Page Count: ' + e.__str__()])
        exit_call = e.__str__() + ' tagged info failed!!'
        print(exit_call)
    # TOC — bookmarks/outline presence only; dumping them is disabled below.
    try:
        if pf.outlines:
            csv_row.insert(8, [self.csv_header[8], 'TRUE'])
            '''pdf_path_toc = self.document_folder + pdf_name + '_toc.txt'
            places_list = pf.outlines
            with open(pdf_path_toc, 'w') as filehandle:
                filehandle.writelines("%s\n" % place for place in places_list)
                filehandle.close()'''
        else:
            csv_row.insert(8, [self.csv_header[8], 'FALSE'])
    except Exception as e:
        csv_row.insert(8, [self.csv_header[8], 'TOC FAILED: ' + e.__str__()])
        exit_call = e.__str__() + ' toc info failed!!'
        print(exit_call)
    # isForm, fields,
    try:
        if pf.getFields():
            csv_row.insert(9, [self.csv_header[9], 'TRUE'])
            csv_row.insert(10, [self.csv_header[10], pf.getFields().__len__()])
        else:
            csv_row.insert(9, [self.csv_header[9], 'FALSE'])
            csv_row.insert(10, [self.csv_header[10], 0])
    except Exception as e:
        csv_row.insert(9, [self.csv_header[9], 'FORMS: ' + e.__str__()])
        csv_row.insert(10, [self.csv_header[10], 'FIELDS: ' + e.__str__()])
        exit_call = e.__str__() + ' forms failed!!'
        print(exit_call)
    # tables — extraction not implemented; placeholder value only.
    csv_row.insert(11, [self.csv_header[11], 'NOT RUN'])
    write_clip = ''
    word_count = 0
    words_per_page = 0
    char_count = 0
    chars_per_word = 0
    image_count = 0
    # TODO: write 3 page sample and word count
    try:
        # Skip the sample for very long documents to bound runtime.
        if pf.getNumPages() < 50:
            for page in range(pf.getNumPages()):
                p = pf.getPage(page)
                text_clip = p.extractText().encode('UTF-8')
                # str() of bytes, minus the leading "b'" — yields an escaped
                # ASCII representation of the page text.
                text_clip = BytesIO(text_clip).read().__str__()[2:]
                count_clip = re.findall(r"[^\W_]+", text_clip, re.MULTILINE)
                word_count += len(count_clip)
                char_count += len(text_clip)
                # Only the first four pages go into the sample clip.
                if page <= 3:
                    write_clip += '[ PAGE ' + (page + 1).__str__() + ' START ] '
                    write_clip += text_clip.replace('\n', '').replace(
                        ',', ' ').replace('"', '')
                    write_clip += '[ PAGE ' + (page + 1).__str__() + ' END ]'
        else:
            write_clip = 'OVER 50 PAGES - SAMPLE SKIPPED'
    except Exception as e:
        exit_call = e.__str__() + ' :: TEXT sample failed!!'
        # On failure the error string replaces the numeric counters.
        write_clip = exit_call
        word_count = exit_call
        char_count = exit_call
        print(exit_call)
    # TODO: Words/chars per page
    try:
        if not word_count == 0:
            chars_per_word = char_count / word_count
        else:
            chars_per_word = 0
        if not page_count == 0:
            words_per_page = word_count / page_count
        else:
            words_per_page = 0
    except Exception as e:
        exit_call = e.__str__() + ' :: WORD METRICS failed!!'
        chars_per_word = exit_call
        words_per_page = exit_call
        print(exit_call)
    # TODO: Add to row
    i = 12
    try:
        csv_row.insert(i, [self.csv_header[i], word_count.__str__()])
    except Exception as e:
        csv_row.insert(i, [self.csv_header[i], 'WORD_COUNT: ' + e.__str__()])
    i = 13
    try:
        csv_row.insert(i, [self.csv_header[i], char_count.__str__()])
    except Exception as e:
        csv_row.insert(i, [self.csv_header[i], 'CHAR_COUNT: ' + e.__str__()])
    i = 14
    try:
        csv_row.insert(i, [self.csv_header[i], words_per_page.__str__()])
    except Exception as e:
        csv_row.insert(i, [self.csv_header[i], 'WPP: ' + e.__str__()])
    i = 15
    try:
        csv_row.insert(i, [self.csv_header[i], chars_per_word.__str__()])
    except Exception as e:
        csv_row.insert(i, [self.csv_header[i], 'CPP: ' + e.__str__()])
    # TODO: IMAGES — external pdfimages.exe extraction, currently disabled;
    # image_count therefore stays 0.
    i = 16
    '''try:
        pdfImages = Globals.base_folder + 'cli-tools\\pdfimages.exe'
        img_folder = self.document_folder + 'images\\'  # + pdf_name[:-4] + '\\'
        if not os.path.exists(img_folder):
            os.makedirs(img_folder)
        # cmd = pdfImages + ' -list ' + '\"' + pdf_path + '\"'
        # output = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0].split(b'\n')
        # save images to disk
        cmd = pdfImages + ' -list \"' + self.pdf_path + '\" \"' + ' ' + '\"'
        # subprocess.Popen(cmd, stdout=subprocess.PIPE)
        os.chdir(img_folder)
        image_list = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0].split(b'\r\n')
        # os.remove(img_folder)
        # image_count = output.count('\n')
        image_count = image_list.__len__()
        if image_count > 2:
            # target = open(pdf_path_image, 'w')
            # target.write(image_list)
            # target.close()
            csv_row.insert(i, [self.csv_header[i], (image_count - 2).__str__()])
        elif image_count == 0:
            csv_row.insert(i, [self.csv_header[i], 0])
        else:
            csv_row.insert(i, [self.csv_header[i], 0])
    except Exception as e:
        csv_row.insert(i, [self.csv_header[i], e.__str__() + ' image info failed!!'])
        exit_call = e.__str__() + ' image info failed!!'
        print(exit_call)'''
    # TODO: IMAGES per page
    i = 17
    percent_img_per_page = float  # NOTE(review): binds the *type* float, not a value
    try:
        # NOTE(review): when page_count == 0 this branch divides by zero and
        # the except records the error — verify the condition is intended
        # (looks like it should be `and page_count != 0`).
        if not image_count == 0 or page_count == 0:
            percent_img_per_page = (float(image_count) / float(page_count)) * 100
        else:
            percent_img_per_page = 0
        csv_row.insert(i, [self.csv_header[i], percent_img_per_page])
    except Exception as e:
        csv_row.insert(i, [self.csv_header[i], 'IMG: ' + e.__str__()])
    # TODO: OCR risk — heuristic 0 (low) .. 5 (high) from words/page and
    # image density; thresholds are empirical.
    i = 18
    try:
        if words_per_page == 0 or percent_img_per_page > 3000:
            ocr_risk = 5
        elif words_per_page < 15 or percent_img_per_page > 2000:
            ocr_risk = 4
        elif words_per_page < 40 or percent_img_per_page > 1000:
            ocr_risk = 3
        elif words_per_page < 70 or percent_img_per_page > 425:
            ocr_risk = 2
        elif words_per_page < 80 or percent_img_per_page > 200:
            ocr_risk = 1
        else:
            ocr_risk = 0
        csv_row.insert(i, [self.csv_header[i], ocr_risk])
    except Exception as e:
        csv_row.insert(i, [self.csv_header[i], 'OCR: ' + e.__str__()])
    # author, creator, producer, subject, title,
    di = pf  # fallback value so `if di:` below is defined even on failure
    try:
        di = pf.documentInfo
    except Exception as e:
        exit_call = e.__str__() + ' :: DOCUMENT INFO LOAD failed!!'
        print(exit_call)
    # Document info
    if di:
        # Author
        try:
            i = 19
            if di.author:
                csv_row.insert(
                    i, [self.csv_header[i], di.author.encode('UTF-8')])
            else:
                csv_row.insert(i, [self.csv_header[i], 'NULL'])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'AUTHOR: ' + e.__str__()])
            exit_call = e.__str__() + ' doc info failed!!'
            print(exit_call)
        # Creator
        try:
            i = 20
            if di.creator:
                csv_row.insert(
                    i, [self.csv_header[i], di.creator.encode('UTF-8')])
            else:
                csv_row.insert(i, [self.csv_header[i], 'NULL'])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'CREATOR: ' + e.__str__()])
            # NOTE(review): prints the stale exit_call from an earlier
            # failure, not this exception — verify intent.
            print(exit_call)
        print('#5.1')
        # Producer
        try:
            i = 21
            if di.producer:
                csv_row.insert(
                    i, [self.csv_header[i], di.producer.encode('UTF-8')])
            else:
                csv_row.insert(i, [self.csv_header[i], 'NULL'])
        except Exception as e:
            csv_row.insert(
                i, [self.csv_header[i], 'PRODUCER: ' + e.__str__()])
            print(exit_call)
        # Subject
        try:
            i = 22
            if di.subject:
                csv_row.insert(
                    i, [self.csv_header[i], di.subject.encode('UTF-8')])
            else:
                csv_row.insert(i, [self.csv_header[i], 'NULL'])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'SUBJECT: ' + e.__str__()])
            print(exit_call)
        # Title
        try:
            i = 23
            if di.title:
                csv_row.insert(
                    i, [self.csv_header[i], di.title.encode('UTF-8')])
            else:
                csv_row.insert(i, [self.csv_header[i], 'NULL'])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'TITLE: ' + e.__str__()])
            print(exit_call)
    # Document clip
    i = 24
    try:
        csv_row.insert(i, [self.csv_header[i], write_clip])
    except Exception as e:
        csv_row.insert(i, [self.csv_header[i], e.__str__()])
    # Write results — flatten [header, value] pairs to the value column only.
    row = []
    for i in range(csv_row.__len__()):
        row.append(csv_row[i][1])
    report_path = self.report_folder + self.report_name
    # COMPLETE WRITE
    with open(report_path, 'a', encoding='utf8', newline='') as csv_file:
        writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
        # NOTE(review): no-op string operation; see note above in load-fail path.
        writer.dialect.lineterminator.replace('\n', '')
        writer.writerow(row)
    # csv_file.close()
    fp.close()
    os.remove(self.pdf_path)
    # Log close
    msg = (' >>>> PDF complete:[' + self.url + '] ' +
           self.line_count.__str__() + ' ' +
           (datetime.datetime.now().__str__()[:-7]))
    print(msg)
    utils.logline(self.log, msg)