def extract_text(self):
    """Render the PDF to page images, OCR them and store the sentences.

    The PDF at ``self.filename`` is rendered at 200 dpi in batches of ten
    pages; each page is saved as ``<index>.jpg`` in
    ``<self.image_out_path>/<basename of self.filename>``. Every image is
    then passed to ``pytesseract.image_to_string``, words hyphenated across
    a line break are re-joined, and the combined text is split into
    sentences with ``TextBlob``. Each sentence (newlines replaced by
    spaces) is appended to ``self.english``.

    Side effects: prints progress messages and finally removes the whole
    ``self.image_out_path`` directory tree.
    """
    pdf_path = self.filename
    out_folder_name = os.path.basename(self.filename)
    page_dir = os.path.abspath(
        os.path.join(self.image_out_path, out_folder_name))
    # Creates image_out_path and the per-document folder in one call and
    # is a no-op when they already exist.
    os.makedirs(page_dir, exist_ok=True)

    batch_size = 10
    index = 0
    max_pages = pdf2image._page_count(pdf_path)
    # pdf2image page numbers are 1-based and last_page is inclusive, so the
    # batches are 1-10, 11-20, ... Starting the range at 0 would leave the
    # final page unrendered whenever max_pages is a multiple of batch_size.
    for first in range(1, max_pages + 1, batch_size):
        last = min(first + batch_size - 1, max_pages)
        pages = pdf2image.convert_from_path(PDF_file := pdf_path,
                                            dpi=200,
                                            first_page=first,
                                            last_page=last) \
            if False else pdf2image.convert_from_path(pdf_path,
                                                      dpi=200,
                                                      first_page=first,
                                                      last_page=last)
        for tpage in pages:
            tpage.save(os.path.join(page_dir, str(index) + ".jpg"), 'JPEG')
            index += 1
    print("Successfully saved images for each page for {}".format(
        self.image_out_path))

    # Filter before sorting so that a stray non-image entry never reaches
    # the int() conversion used as the sort key.
    image_names = [name for name in os.listdir(page_dir)
                   if name.endswith("jpg")]
    image_names.sort(key=lambda name: int(os.path.splitext(name)[0]))

    english_text = []
    for name in image_names:
        # The context manager releases the image file handle after OCR.
        with Image.open(os.path.join(page_dir, name)) as page_image:
            text = str(pytesseract.image_to_string(page_image))
        # Re-join words that were hyphenated across a line break.
        english_text.append(text.replace('-\n', ''))

    corpus = " ".join(english_text)
    corpus = re.sub(r'\n+', '\n', corpus).strip()
    for sentence in TextBlob(corpus).sentences:
        self.english.append(sentence.string.replace("\n", " "))
    print("English Text Extracted is : {}".format(self.english))
    shutil.rmtree(self.image_out_path)
def read_pdf_as_image_to_list(path_file, dpi):
    """OCR a PDF one page at a time and return the recognised text.

    Each page is rendered on its own with ``pdf2image.convert_from_path``
    at the given ``dpi`` and passed to ``pytesseract.image_to_string`` with
    ``lang="spa"``.

    Args:
        path_file: path of the PDF to read.
        dpi: resolution forwarded to ``convert_from_path``.

    Returns:
        A list of strings in page order. When handling a page raises
        ``Image.DecompressionBombError`` a message is printed and an empty
        string is stored for it.
    """
    ocr_texts = []
    total_pages = _page_count(path_file)
    for page_number in range(1, total_pages + 1):
        try:
            rendered = pdf2image.convert_from_path(path_file,
                                                   dpi,
                                                   first_page=page_number,
                                                   last_page=page_number)
            for image in rendered:
                recognised = pytesseract.image_to_string(image, lang="spa")
                ocr_texts.append(str(recognised))
        except Image.DecompressionBombError:
            # Keep a placeholder so the list still has an entry for the page.
            print('image size error')
            ocr_texts.append('')
    return ocr_texts
def convertFlat(filename,
                conditionLower,
                conditionUpper,
                batchSize,
                progressBar,
                outName="test",
                pages=0,
                pageOffset=0,
                color=np.array([255, 255, 255]),
                boundingBox=0):
    '''
    Iterate over selected pages of a pdf and change the color of all pixels
    within a given range, then write the pages to result\\<outName>.pdf.

    A pixel is recoloured when every channel satisfies
    ``conditionUpper <= value <= conditionLower`` (the comparison is kept
    exactly as the masks below express it).

    Parameters:
        filename: path of the source pdf.
        conditionLower, conditionUpper: per-channel bounds compared against
            the pixel values (scalars or arrays that broadcast against an
            (N, 3) pixel array).
        batchSize: maximum number of pages rendered per convert_from_path
            call; must be at least 1.
        progressBar: object whose ``countChanged.emit(int)`` receives the
            percentage of pages finished.
        outName: base name of the output pdf.
        pages: number of pages to process; 0 means every page after
            pageOffset.
        pageOffset: number of leading pages to skip.
        color: value assigned to the matching pixels.
        boundingBox: when it is a sequence of length 2 it is handed to
            ConverCoordinates and only that region is edited; any other
            value (including the default 0) edits the whole page.
            NOTE(review): presumably two corner points - confirm against
            ConverCoordinates.

    Raises:
        ValueError: if batchSize is smaller than 1 (the loop below could
            never terminate).
    '''
    if batchSize < 1:
        raise ValueError("batchSize must be at least 1")

    # save info of where to start
    currentPageCounter = pageOffset
    # get merger instance for outpdf creation
    pdfMerger = PdfFileMerger()
    try:
        # if pages weren't set we iterate over everything after the offset;
        # using the full page count here would request pages past the end
        if pages == 0:
            pages = max(_page_count(filename) - pageOffset, 0)
        # variable for progress bar
        totalPages = pages
        pagesDone = 0

        # the default boundingBox is the integer 0, which has no len()
        useBox = hasattr(boundingBox, '__len__') and len(boundingBox) == 2
        if useBox:
            X_Start, Y_Start, X_Len, Y_Len = ConverCoordinates(boundingBox)
            rows = slice(Y_Start, Y_Start + Y_Len)
            cols = slice(X_Start, X_Start + X_Len)

        # work until nothing's left
        while pages > 0:
            # use either batch size or w/e is left
            pagesToConvert = min(pages, batchSize)
            # first_page/last_page are 1-based and inclusive, so this range
            # holds exactly pagesToConvert pages
            pageBuffer = convert_from_path(
                filename,
                fmt='jpeg',
                first_page=currentPageCounter + 1,
                last_page=currentPageCounter + pagesToConvert)
            for page in pageBuffer:
                fullPage = np.array(page)
                region = fullPage[rows, cols] if useBox else fullPage
                # flatten to one row per pixel for the condition check
                pixels = region.reshape(
                    (region.shape[0] * region.shape[1], 3))
                maskUpper = np.all(conditionUpper <= pixels, axis=1)
                maskLower = np.all(conditionLower >= pixels, axis=1)
                # update all pixels that met both conditions
                pixels[np.logical_and(maskUpper, maskLower)] = color
                recoloured = pixels.reshape(region.shape)
                # reshape may have returned a copy, so write the result
                # back explicitly instead of relying on view semantics
                if useBox:
                    fullPage[rows, cols] = recoloured
                else:
                    fullPage = recoloured

                # round-trip through temp files: jpeg -> one-page pdf ->
                # merger
                im = Image.fromarray(fullPage)
                im.save('temp\\temp.jpeg')
                with open("temp\\tmp.pdf", "wb+") as f:
                    f.write(img2pdf.convert('temp\\temp.jpeg'))
                with open("temp\\tmp.pdf", "rb") as f:
                    pdfMerger.append(f)

                # progress is relative to the pages requested, not to the
                # absolute page index, so it stays within 0-100 with an
                # offset
                pagesDone += 1
                progressBar.countChanged.emit(
                    int((pagesDone / totalPages) * 100))
                currentPageCounter += 1
            # update loop termination var
            pages -= pagesToConvert

        # merge and save finished file
        with open('result\\' + outName + ".pdf", 'wb') as fout:
            pdfMerger.write(fout)
    finally:
        pdfMerger.close()
def convertAverage(filename,
                   filter,
                   batchSize,
                   progressBar,
                   outName="test",
                   pages=0,
                   pageOffset=0,
                   color=np.array([255, 255, 255]),
                   boundingBox=0,
                   threshold=150):
    '''
    Recolour every pixel that is close to a previously averaged image and
    write the edited pages to result\\<outName>.pdf.

    For each pixel the per-channel differences between the page and the
    averaged image are summed; the pixel is set to ``color`` when the
    absolute value of that sum is below ``threshold``.

    Parameters:
        filename: path of the source pdf.
        filter: image of the average calculated beforehand, opened with
            Image.open. NOTE(review): it has to match the rendered page
            size in pixels and have 3 channels for the arithmetic below to
            work - confirm with the caller.
        batchSize: maximum number of pages rendered per convert_from_path
            call; must be at least 1.
        progressBar: object whose ``countChanged.emit(int)`` receives the
            percentage of pages finished.
        outName: base name of the output pdf.
        pages: number of pages to process; 0 means every page after
            pageOffset.
        pageOffset: number of leading pages to skip.
        color: value assigned to the matching pixels.
        boundingBox: when it is a sequence of length 2 it is handed to
            ConverCoordinates and only that region is compared and edited;
            any other value (including the default 0) uses the whole page.
        threshold: upper limit (exclusive) of the difference measure for a
            pixel to be recoloured.

    Raises:
        ValueError: if batchSize is smaller than 1 (the loop below could
            never terminate).
    '''
    if batchSize < 1:
        raise ValueError("batchSize must be at least 1")

    # save info of where to start
    currentPageCounter = pageOffset
    # get merger instance for outpdf creation
    pdfMerger = PdfFileMerger()
    try:
        # if pages weren't set we iterate over everything after the offset;
        # using the full page count here would request pages past the end
        if pages == 0:
            pages = max(_page_count(filename) - pageOffset, 0)
        # variable for progress bar
        totalPages = pages
        pagesDone = 0

        # read average calculated beforehand
        averaged = np.array(Image.open(filter))

        # the default boundingBox is the integer 0, which has no len()
        useBox = hasattr(boundingBox, '__len__') and len(boundingBox) == 2
        if useBox:
            X_Start, Y_Start, X_Len, Y_Len = ConverCoordinates(boundingBox)
            rows = slice(Y_Start, Y_Start + Y_Len)
            cols = slice(X_Start, X_Start + X_Len)
            averaged = averaged[rows, cols]
        # one row per pixel; signed type so the subtraction below cannot
        # wrap around the way unsigned 8-bit image data would
        averaged = averaged.reshape(
            (averaged.shape[0] * averaged.shape[1], 3)).astype(np.int32)

        # work until nothing's left
        while pages > 0:
            # use either batch size or w/e is left
            pagesToConvert = min(pages, batchSize)
            # first_page/last_page are 1-based and inclusive, so this range
            # holds exactly pagesToConvert pages
            pageBuffer = convert_from_path(
                filename,
                fmt='jpeg',
                first_page=currentPageCounter + 1,
                last_page=currentPageCounter + pagesToConvert)
            for page in pageBuffer:
                fullPage = np.array(page)
                region = fullPage[rows, cols] if useBox else fullPage
                # flatten to one row per pixel for the comparison
                pixels = region.reshape(
                    (region.shape[0] * region.shape[1], 3))
                # difference measure: |sum over channels of (page - avg)|
                delta = pixels.astype(np.int32) - averaged
                diff = np.abs(delta.sum(axis=1))
                pixels[diff < threshold] = color
                recoloured = pixels.reshape(region.shape)
                # reshape may have returned a copy, so write the result
                # back explicitly instead of relying on view semantics
                if useBox:
                    fullPage[rows, cols] = recoloured
                else:
                    fullPage = recoloured

                # round-trip through temp files: jpeg -> one-page pdf ->
                # merger
                im = Image.fromarray(fullPage)
                im.save('temp\\temp.jpeg')
                with open("temp\\tmp.pdf", "wb+") as f:
                    f.write(img2pdf.convert('temp\\temp.jpeg'))
                with open("temp\\tmp.pdf", "rb") as f:
                    pdfMerger.append(f)

                # progress is relative to the pages requested, not to the
                # absolute page index, so it stays within 0-100 with an
                # offset
                pagesDone += 1
                progressBar.countChanged.emit(
                    int((pagesDone / totalPages) * 100))
                currentPageCounter += 1
            # update loop termination var
            pages -= pagesToConvert

        # merge and save finished file
        with open('result\\' + outName + ".pdf", 'wb') as fout:
            pdfMerger.write(fout)
    finally:
        pdfMerger.close()
def getAverageEstimate(filename,
                       batchSize,
                       progressBar,
                       outName="test",
                       pages=0,
                       pageOffset=0):
    '''
    Create estimated watermark by averaging over the selected pages of the
    document and save it as estimated_watermarks\\<outName>.jpeg.

    Parameters:
        filename: path of the source pdf.
        batchSize: maximum number of pages rendered per convert_from_path
            call; must be at least 1.
        progressBar: object whose ``countChanged.emit(int)`` receives the
            percentage of pages finished.
        outName: base name of the saved jpeg.
        pages: number of pages to average; 0 means every page after
            pageOffset.
        pageOffset: number of leading pages to skip.

    Returns:
        None.

    Raises:
        ValueError: if batchSize is smaller than 1 (the loop below could
            never terminate).
    '''
    if batchSize < 1:
        raise ValueError("batchSize must be at least 1")

    # save info of where to start
    currentPageCounter = pageOffset
    # if pages weren't set we iterate over everything after the offset;
    # using the full page count here would request pages past the end
    if pages == 0:
        pages = max(_page_count(filename) - pageOffset, 0)
    totalPages = pages

    # the first page is rendered only to learn the array shape; the
    # accumulator is 64-bit so summing many 8-bit pages cannot overflow
    firstPage = convert_from_path(filename,
                                  fmt='jpeg',
                                  first_page=1,
                                  last_page=1)[0]
    averaged = np.zeros_like(np.array(firstPage), dtype=np.uint64)

    pagesAdded = 0
    # address batch size
    while pages > 0:
        pagesToConvert = min(pages, batchSize)
        # first_page/last_page are 1-based and inclusive, so this range
        # holds exactly pagesToConvert pages
        pageBuffer = convert_from_path(
            filename,
            fmt='jpeg',
            first_page=currentPageCounter + 1,
            last_page=currentPageCounter + pagesToConvert)
        for page in pageBuffer:
            # add current page
            averaged += np.array(page)
            pagesAdded += 1
            # progress is relative to the pages requested, not to the
            # absolute page index, so it stays within 0-100 with an offset
            progressBar.countChanged.emit(
                int((pagesAdded / totalPages) * 100))
            currentPageCounter += 1
        # update loop termination var
        pages -= pagesToConvert

    # divide by the number of pages actually summed; max() keeps the
    # all-zero result well defined when no page was processed
    averaged = averaged / max(pagesAdded, 1)
    im = Image.fromarray(averaged.astype('uint8'))
    im.save("estimated_watermarks\\" + outName + '.jpeg')
    return