def get_language(doc_id, path):
    """Guess the dominant language of a document by OCR-ing its first pages.

    The document is opened with ``resolution=300``.  Up to six pages are
    written, one at a time, to ``/tmp/temp.png``, OCR-ed with
    ``image_to_string`` and classified with ``langid.classify``.  A page
    that yields no text is recorded as ``"*"``.  Sampling also stops once
    30 seconds have elapsed, so a slow document cannot exceed the time
    budget by more than one page.

    :param doc_id: identifier used only in the diagnostic messages
    :param path: path of the document to analyse
    :return: the most common per-page result (a language code, ``"*"``, or
        ``"difficult"`` when no page could be sampled); ``None`` when any
        error occurred
    """
    max_pages = 6
    budget_seconds = 30
    imagename = '/tmp/temp.png'
    language_list = []
    try:
        with Img(filename=path, resolution=300) as source:
            source.compression_quality = 99
            images = source.sequence
            deadline = time.time() + budget_seconds
            for index in range(min(len(images), max_pages)):
                # Close every per-page image as soon as it has been written.
                with Img(images[index]) as page:
                    page.save(filename=imagename)
                with Image.open(imagename) as rendered:
                    text = image_to_string(rendered)
                if text.strip():
                    language, _weight = langid.classify(text)
                    language_list.append(language)
                else:
                    language_list.append("*")
                # Enforce the time budget between pages.
                if time.time() > deadline:
                    break
        if not language_list:
            language_list.append("difficult")
            print("Difficult document: " + doc_id + " - " + path)
        return statistics.mode(language_list)
    except Exception:
        # Best effort: report the failure and signal it with None, but let
        # KeyboardInterrupt / SystemExit propagate.
        print("Document Error: " + doc_id + " - " + path)
        return None
def save_pdf_2_png(filename, output_path, resolution=300):
    """Write every page of a document to its own PNG file.

    Each page gets a white background with the alpha channel removed and is
    saved in ``output_path`` as ``<stem>_<index>.png``, where ``<stem>`` is
    the input's base name without extension and ``<index>`` is the
    zero-based page index left-padded with zeros to three characters.

    :param filename: path of the document to convert
    :param output_path: directory in which the PNG files are written
    :param resolution: resolution passed to the image loader
    :return: None
    """
    # The stem does not depend on the page, so compute it once.
    stem = os.path.splitext(os.path.basename(filename))[0]
    # Open the source inside a context manager so it is released even if a
    # page fails to save.
    with Img(filename=filename, resolution=resolution) as all_pages:
        for i, page in enumerate(all_pages.sequence):
            with Img(page) as img:
                img.format = 'png'
                img.background_color = Color('white')
                img.alpha_channel = 'remove'
                image_filename = os.path.join(
                    output_path, '{}_{:0>3}.png'.format(stem, i))
                img.save(filename=image_filename)
    return None
def pdf_to_image(path, resolution=None, **kwargs):
    """Load a document with ``Img`` and return it re-opened via ``Image.open``.

    The document is encoded to an in-memory PNG blob, so no temporary file
    is written.

    :param path: path of the document to load
    :param resolution: resolution passed to ``Img`` (``None`` leaves the
        loader's default in place)
    :param kwargs: extra keyword arguments forwarded to ``Img``
    :return: the image object returned by ``Image.open`` on the PNG bytes
    """
    # Release the source image as soon as the PNG bytes are extracted.
    with Img(filename=path, resolution=resolution, **kwargs) as img:
        png_bytes = img.make_blob(format='png')
    # BytesIO takes the blob directly; no intermediate array is needed.
    return Image.open(io.BytesIO(png_bytes))
def convert_pdf_to_png(file_path, file_name):
    """Convert a document to an image saved next to it for OCR.

    Despite the function name and the printed message, the output is
    written with a ``.jpg`` extension.

    :param file_path: directory in which the output image is written
    :param file_name: name of the source file; its last four characters
        are dropped to build the output name
    :return: None
    """
    print("Converting pdf to png image for OCR")
    stem = file_name[:len(file_name) - 4]
    # NOTE(review): `full_path` is neither a parameter nor a local of this
    # function; presumably a module-level variable - confirm it is not meant
    # to be built from file_path and file_name.
    with Img(filename=full_path, resolution=300) as document:
        document.compression_quality = 99
        target = "/".join([file_path, stem + ".jpg"])
        document.save(filename=target)
def save_img_with_wand(self, pdf_name, output):
    """
    Save a PDF with one or more pages into a JPG file.
    Called by the self.pdf_to_jpg function.

    :param pdf_name: path to pdf
    :param output: Filename of temporary jpeg after pdf conversion
    :return: True when the conversion completed, False when one of the
        handled wand errors was logged
    """
    try:
        with Img(filename=pdf_name, resolution=self.resolution) as pic:
            pic.compression_quality = self.compressionQuality
            pic.background_color = Color("white")
            pic.alpha_channel = 'remove'
            pic.save(filename=output)
    except (wand_except.WandRuntimeError, wand_except.CacheError) as e:
        # Both errors are reported identically, so they share one handler.
        self.Log.error(e)
        self.Log.error('Exiting program...Fix the issue and restart the service')
        return False
    except wand_except.PolicyError as e:
        self.Log.error(e)
        self.Log.error('Maybe you have to check the PDF rights in ImageMagick policy.xml file')
        self.Log.error('Exiting programm...Fix the issue and restart the service')
        return False
    # Honour the documented Boolean contract on the success path as well.
    return True
def pdf2jpg(self, file):
    """Convert ``file`` to a JPG beside it and return the text read from it.

    The JPG path is the input path with its extension replaced by ``.jpg``.

    :param file: path of the document to convert
    :return: whatever ``self.jpg2text`` returns for the generated JPG path
    """
    jpg_path = os.path.splitext(file)[0] + '.jpg'
    with Img(filename=file, resolution=300) as document:
        document.compression_quality = 99
        document.save(filename=jpg_path)
    return self.jpg2text(jpg_path)
def convert(self, filename, temp_folder):
    """Convert ``filename`` to ``<temp_folder>/sample_scan.jpg``.

    A "<filename> Created" message is logged at INFO level before the
    conversion starts.

    :param filename: path of the document to convert
    :param temp_folder: directory that receives ``sample_scan.jpg``
    :return: None
    """
    logging.info('{0} Created'.format(filename))
    # wand is imported only when a conversion is actually requested.
    from wand.image import Image as Img
    with Img(filename=filename, resolution=300) as scan:
        scan.compression_quality = 99
        target = temp_folder + '/sample_scan.jpg'
        scan.save(filename=target)
    del Img
def testwand():
    """Open ``data/poetry.png`` and print its attribute dict and its width."""
    # Create the image object for the picture (the original note refers to
    # QingYi.jpg); opening it with `with` means it need not be closed by hand.
    with Img(filename='data/poetry.png') as picture:
        attributes = picture.__dict__
        print(attributes)
        print()
        print('width =', picture.width)
def crop_image_footer(self, img, isTiff=False, lastImage=False, outputName=None):
    """Crop an image with ``gravity='south'`` and save the footer part.

    ``self.heightRatio`` is set to ``int(height / 3 + height * 0.1)`` and
    the image is cropped to ``height - self.heightRatio`` at full width.
    ``PolicyError`` and ``CacheError`` are logged, not raised.

    :param img: path of the source image (used only when ``isTiff`` is false)
    :param isTiff: when true, the source is ``outputName`` if given,
        otherwise ``self.tiffName_last`` or ``self.tiffName``
    :param lastImage: selects the "last" variant of the source/target names
    :param outputName: explicit output path
    :return: None
    """
    try:
        if isTiff:
            if outputName:
                source = outputName
            elif lastImage:
                source = self.tiffName_last
            else:
                source = self.tiffName
        else:
            source = img

        with Img(filename=source, resolution=self.resolution) as pic:
            pic.compression_quality = self.compressionQuality
            pic.background_color = Color("white")
            pic.alpha_channel = 'remove'
            self.heightRatio = int(pic.height / 3 + pic.height * 0.1)
            kept_height = int(pic.height - self.heightRatio)
            pic.crop(width=pic.width, height=kept_height, gravity='south')

            if isTiff:
                if outputName:
                    pic.save(filename=outputName)
                elif lastImage:
                    pic.save(filename=self.tiffName_last_footer)
                else:
                    pic.save(filename=self.tiffName_footer)
            else:
                # NOTE(review): unlike the TIFF branch, an explicit outputName
                # here does not suppress the default footer file - both are
                # written.  Confirm whether that is intended.
                if outputName:
                    pic.save(filename=outputName)
                if lastImage:
                    pic.save(filename=self.jpgName_last_footer)
                else:
                    pic.save(filename=self.jpgName_footer)
    except (PolicyError, CacheError) as err:
        self.Log.error('Error during WAND conversion : ' + str(err))
def _show_message_window(text, font_size):
    """Open a small themed Tk window that displays ``text``."""
    message_open = tk.Tk()
    message_open.geometry("500x100+650+450")
    message_open.title('Message')
    style = ThemedStyle(message_open)
    style.set_theme("arc")
    msg = tk.Frame(message_open)
    msg.pack()
    content = ttk.Label(msg, text=text, font=("Helvetica", font_size))
    content.pack()


def method_three():
    """Ask for a PDF, OCR one of its pages and save the text as a CSV file.

    The chosen PDF is split into one single-page PDF per page inside
    ``home/<user_folder>/page_<page_number>``.  The page selected in
    ``user_page`` is rendered to a JPG, read with pytesseract, and the text
    (spaces replaced by commas) is written to ``page_<page_number>.csv``.
    The intermediate PDF and JPG files are then removed and a "Done!"
    window is shown; on any error a failure window is shown instead.

    :return: None
    """
    file = askopenfile(mode='r')
    if file is None:
        return
    try:
        # Only the path is needed; release the handle opened by the dialog.
        document = file.name
        file.close()

        # Create one PDF file per page.
        page_number = user_page.get()
        folder = home + '/' + user_folder.get() + '/page_' + page_number
        os.makedirs(folder, exist_ok=True)  # fine if the folder already exists
        with open(document, "rb") as pdf_stream:
            inputpdf = PdfFileReader(pdf_stream)
            for i in range(1, inputpdf.numPages + 1):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i - 1))
                pdf_file = folder + '/' + "document-page%s.pdf" % i
                with open(pdf_file, "wb") as outputStream:
                    output.write(outputStream)

        # Turn the selected page into an image and read it as a string.
        page_file = folder + '/' + "document-page%s.pdf" % page_number
        # Swap only the extension: a plain str.replace would also rewrite
        # any "pdf" occurring in the folder part of the path.
        img_file = os.path.splitext(page_file)[0] + '.jpg'
        with Img(filename=page_file, resolution=400) as img:
            img.compression_quality = 99
            img.save(filename=img_file)
        with Image.open(img_file) as im:
            text = pytesseract.image_to_string(im, lang='eng')

        # Save the CSV file.
        text = text.replace(' ', ',')
        csv_file = folder + '/' + 'page_{}.csv'.format(page_number)
        with open(csv_file, 'w') as h:
            h.write(text)

        # Drop the intermediate PDF/JPG files without going through a shell,
        # so folder names with spaces or shell metacharacters are safe.
        for name in os.listdir(folder):
            if name.startswith('.') or not name.endswith(('pdf', 'jpg')):
                continue
            leftover = os.path.join(folder, name)
            if os.path.isfile(leftover):
                try:
                    os.remove(leftover)
                except OSError:
                    # Cleanup is best effort, as it was with `rm`.
                    pass

        _show_message_window("Done!", "50")
    except Exception:
        _show_message_window(
            "The tabulation fails. \nPlease, check the log file", "16")
def pdf_to_jpg(path, pdf_name, image_name):
    """Convert ``path + pdf_name`` to a JPG at ``path + image_name``.

    :param path: directory prefix, concatenated as-is with both file names
    :param pdf_name: file name of the source PDF
    :param image_name: file name of the image to write
    :return: the page count reported by ``get_num_of_pages``
    :raises SystemExit: with a failure message when any step raises
    """
    try:
        pdf_path = path + pdf_name
        noPages = get_num_of_pages(pdf_path)
        with Img(filename=pdf_path, resolution=300) as img:
            img.compression_quality = 99
            img.save(filename=path + image_name)
        return noPages
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C is not turned
        # into the generic failure exit.
        sys.exit("Failure in processing your file")
def pdfToImage(input_path, output):
    """Convert a document to PNG on a white background.

    :param input_path: path of the document to convert
    :param output: path of the PNG file to write
    :return: ``output``, unchanged
    """
    with Img(filename=input_path, resolution=300) as document:
        # White background with the alpha channel removed.
        document.background_color = Color('white')
        document.alpha_channel = 'remove'
        with document.convert('png') as png:
            png.save(filename=output)
    return output
def __init__(self, infile, outfile, threshold):
    """Render ``infile`` to ``temp.png`` and load it as an RGBA image.

    :param infile: path of the document to load
    :param outfile: stored unchanged on ``self.outfile``
    :param threshold: stored unchanged on ``self.threshold``
    """
    # Render through a PNG file in the current working directory.
    with Img(filename=infile, resolution=100) as source:
        source.compression_quality = 100
        source.format = 'png'
        source.save(filename="temp.png")
    rgba = Image.open("temp.png").convert("RGBA")
    self.img = rgba
    self.outfile = outfile
    self.threshold = threshold
    self.black = (0, 0, 0)
    self.data = self.img.getdata()
def pdf_to_img(filename, index):
    """Convert ``filename`` to ``./new_data/5/<index>.jpg``.

    :param filename: path of the document to convert
    :param index: value used (via ``str``) as the output file's base name
    :return: None
    """
    # The output folder is fixed here.  An earlier, disabled variant named
    # the image after the PDF's own file name instead of the index.
    target = './new_data/5/{}.jpg'.format(str(index))
    with Img(filename=filename, resolution=300) as document:
        document.compression_quality = 50
        document.save(filename=target)
def ocr_pdf_file(pdf_path):
    """OCR every page of a PDF and return the cleaned-up text.

    The PDF's directory is passed to ``clear_dir``, every page is written
    there as ``<pdf stem>-<NNNN>.jpg`` (1-based, zero-padded to four
    digits), each JPG in that directory is converted to greyscale in place
    and read with pytesseract (``lang='por'``), and the concatenated text
    is passed through ``text_preprocessing.clear_pdf_rtf``.

    :param pdf_path: path of the PDF, using ``/`` as separator
    :return: the processed text of all pages
    """
    directory, separator, pdf_name = pdf_path.rpartition("/")
    # A bare file name has no directory part; use the current directory
    # rather than ending up with "/" (the filesystem root).
    img_path = directory + "/" if separator else "./"
    base = img_path + pdf_name.replace(".pdf", "")
    clear_dir(img_path)

    with Img(filename=pdf_path, resolution=300) as img:
        for i, image in enumerate(img.sequence):
            file_name = base + "-" + str(i + 1).zfill(4) + '.jpg'
            with Img(image) as page:
                page.save(filename=file_name)

    # glob returns names in arbitrary order; the zero-padded page suffix
    # makes lexicographic order equal to page order.
    files = sorted(glob.glob(img_path + "*.jpg"))

    pages_text = []
    for file in files:
        with Image.open(file) as colour_page:
            grey_page = colour_page.convert('L')
        grey_page.save(file)
        # To use the Portuguese language it was necessary to download the
        # trained model from:
        # https://github.com/tesseract-ocr/tessdata_best/blob/master/por.traineddata
        # and move it to /usr/share/tesseract-ocr/4.00/tessdata/
        with Image.open(file) as ocr_input:
            read = pytesseract.image_to_string(
                ocr_input, lang='por', config=tessdata_dir_config)
        pages_text.append(read + "\n")

    text = "".join(pages_text)
    text = text_preprocessing.clear_pdf_rtf(text)
    return text
def converter_pdf(path_source, pdffilename, path_destination, imgfilename, dpi):
    """Convert a PDF to an image when the file name contains ``"pdf"``.

    :param path_source: prefix concatenated as-is with ``pdffilename``
    :param pdffilename: name of the source file; nothing is done unless it
        contains the substring ``"pdf"``
    :param path_destination: prefix concatenated as-is with ``imgfilename``
    :param imgfilename: name of the image file to write
    :param dpi: resolution passed to the image loader
    :return: the output image path, or ``None`` when the name was rejected
    """
    if "pdf" not in pdffilename:
        return None
    source = path_source + pdffilename
    target = path_destination + imgfilename
    with Img(filename=source, resolution=dpi) as document:
        document.compression_quality = 99
        document.save(filename=target)
    return target
def save_img_with_wand(self, pdfName, output):
    """Convert ``pdfName`` to an image at ``output`` on a white background.

    ``PolicyError`` and ``CacheError`` are logged through ``self.Log``
    instead of being raised.

    :param pdfName: path of the PDF to convert
    :param output: file name of the image to write
    :return: None
    """
    try:
        with Img(filename=pdfName, resolution=self.resolution) as document:
            library.MagickResetIterator(document.wand)
            # Start the file-name counter at 1 instead of 0.
            document.scene = 1
            document.compression_quality = self.compressionQuality
            document.background_color = Color("white")
            document.alpha_channel = 'remove'
            document.save(filename=output)
    except (PolicyError, CacheError) as err:
        self.Log.error('Error during WAND conversion : ' + str(err))
def convertirPDFenJPG(self, ruta):
    """
    Turn a PDF into a succession of images, one for each page of the PDF.

    :param ruta: path where the PDF is stored (appended to MEDIA_URL)
    :return: None; the output is saved as IMAGENES_PATH + 'image_name.jpg'
    """
    print('*******************************************')
    source = MEDIA_URL + ruta
    # The compression quality is left at the library default here.
    with Img(filename=source, resolution=150) as document:
        document.save(filename=IMAGENES_PATH + 'image_name.jpg')
def convert_pdf(fichier):
    """
    Convert a PDF file into an image file.

    :param fichier: <str> name of the PDF file
    :return: <str> name of the image file
    """
    # Fixed output name; this is also used when several files are processed
    # one after another.
    image = 'image_name.jpg'
    with Img(filename=fichier, resolution=300) as document:
        document.compression_quality = 99
        document.save(filename=image)
    return image
def processaPDF():
    """OCR ``processar.pdf`` from the working directory and return its text.

    The PDF is rendered to ``processar.jpg``, which is read with
    pytesseract.  When everything succeeds both files are deleted.

    :return: the recognised text, or ``''`` when any step failed
    """
    try:
        with Img(filename='processar.pdf', resolution=300) as img:
            img.compression_quality = 99
            with open('processar.jpg', 'wb') as f:
                img.save(file=f)
        text = pytesseract.image_to_string('processar.jpg')
        os.remove('processar.jpg')
        os.remove('processar.pdf')
    except Exception:
        # Best effort by design, but do not swallow KeyboardInterrupt or
        # SystemExit.
        text = ''
    return text
def pyTesseract(self):
    """Convert the document to JPG page images and OCR them.

    :return: the OCR text of all pages, joined with single spaces
    """
    # ImageMagick has to be installed for converting PDFs to images.
    from wand.image import Image as Img

    # NOTE(review): `filename` is neither a parameter nor a local of this
    # method; presumably a module-level variable - confirm.
    with Img(filename=filename, resolution=300) as img:
        img.compression_quality = 99
        img.save(filename='image_name.jpg')
        numPages = len(img.sequence)

    # A single-page document is written as 'image_name.jpg'; only
    # multi-page documents get the '-<index>' suffix.
    if numPages == 1:
        page_files = ['image_name.jpg']
    else:
        page_files = ['image_name-{i}.jpg'.format(i=i) for i in range(numPages)]

    data = [pt.image_to_string(page_file) for page_file in page_files]
    return ' '.join(map(str, data))
def image_to_text(file_name):
    """Convert a document to JPG page images, OCR them and delete them.

    The output name is ``file_name`` with its last four characters replaced
    by ``.jpg``.  Multi-page documents produce ``<stem>-<index>.jpg`` files
    (zero-based).  Only the files produced by this call are read and
    removed, in page order.  The first 100 characters of the result are
    printed.

    :param file_name: path of the document to read
    :return: the concatenated OCR text of all pages
    """
    stem = file_name[:-4]
    with Img(filename=file_name, resolution=300) as img:
        img.compression_quality = 99
        img.save(filename=stem + '.jpg')
        page_count = len(img.sequence)

    # Use the known page count rather than listing the directory: that keeps
    # the page order and leaves unrelated .jpg files untouched.
    if page_count == 1:
        page_files = [stem + '.jpg']
    else:
        page_files = ['{}-{}.jpg'.format(stem, i) for i in range(page_count)]

    chunks = []
    for page_file in page_files:
        with Image.open(page_file) as page:
            chunks.append(pytesseract.image_to_string(page))
        os.remove(page_file)
    text = ''.join(chunks)
    print(text[0:100])
    return text
def converter_pdf(path_source, pdffilename, path_destination, imgfilename, dpi):
    """Convert a PDF to an image, with both paths relative to the cwd.

    :param path_source: prefix (relative to the current working directory)
        concatenated as-is with ``pdffilename``
    :param pdffilename: name of the source PDF
    :param path_destination: prefix (relative to the current working
        directory) concatenated as-is with ``imgfilename``
    :param imgfilename: name of the image file to write
    :param dpi: resolution passed to the image loader
    :return: the absolute path of the written image
    """
    source = "/".join((os.getcwd(), path_source + pdffilename))
    target = "/".join((os.getcwd(), path_destination + imgfilename))
    with Img(filename=source, resolution=dpi) as document:
        # Remove the alpha channel, then set a white background.
        document.alpha_channel = 'remove'
        document.background_color = wandimg.Color('white')
        document.compression_quality = 99
        document.save(filename=target)
    return target
def cvt_pdf2img(fileName):
    """Create the output folders for ``fileName`` and render the input PDF.

    ``<outputPath><fileName>/images`` and ``<outputPath><fileName>/OCRtext``
    are created when missing; the document at ``inputPath + inputFile`` is
    then saved as ``page.jpg`` inside the images folder.

    :param fileName: name used for the per-document output folder
    :return: None
    """
    base = outputPath + fileName
    # Create the image and OCR-text output directories.
    for subdir in ("/images", "/OCRtext"):
        target_dir = base + subdir
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    # NOTE(review): the source is the module-level inputPath + inputFile,
    # not fileName - confirm this is intended.
    with Img(filename=inputPath + inputFile, resolution=300) as document:
        document.compression_quality = 99
        document.save(filename=base + "/images" + "/page.jpg")
def pdfToJpegMultipleFiles(inputFile, outputFile):
    """Convert a PDF to JPEG output at ``outputFile``.

    :param inputFile: path of the PDF to convert
    :param outputFile: file name passed to the image's ``save``
    :raises RuntimeError: when the input is not a PDF, or when the loader
        raises ``DelegateError`` (reported as a corrupted PDF)
    """
    try:
        with Img(filename=inputFile, resolution=IMAGE_RESOLUTION) as document:
            is_pdf = document.format == 'PDF'
            if not is_pdf:
                raise RuntimeError('Input file is not a PDF.')
            document.compression_quality = COMPRESSION_QUALITY
            document.save(filename=outputFile)
    # Known limitation noted by the original author: for a corrupt or empty
    # PDF the image object is freed again when the program exits, and that
    # fails because the object is empty.  It cannot be caught (it happens
    # while the runtime shuts down) and deleting the object earlier fails as
    # well, so an ugly stack trace appears on quit.
    except DelegateError:
        raise RuntimeError('PDF file appears corrupted.')
def convertPDFToOCR(path):
    """Render a PDF to JPG page images and return their OCR text.

    Existing ``sample_scan*`` files in the working directory are removed
    first.  A one-page PDF is read back from ``sample_scan.jpg``; otherwise
    page ``i`` (zero-based) is read from ``sample_scan-<i>.jpg``.

    :param path: path of the PDF to read
    :return: the concatenated OCR text of all pages
    """
    os.system("rm sample_scan*")

    # Close the PDF as soon as the page count is known.
    with open(path, 'rb') as pdfObject:
        pages = PyPDF2.PdfFileReader(pdfObject).numPages

    with Img(filename=path, resolution=300) as img:
        img.compression_quality = 0
        img.save(filename="sample_scan.jpg")

    if pages == 1:
        page_files = ['sample_scan.jpg']
    else:
        page_files = ['sample_scan-' + str(i) + '.jpg' for i in range(pages)]

    chunks = []
    for page_file in page_files:
        with Image.open(page_file) as page_image:
            chunks.append(pytesseract.image_to_string(page_image))
    return "".join(chunks)
def apply_ocr(pdf_path_file):
    """Render a PDF to temporary JPG page images and return their OCR text.

    Pages are written to ``../assets/img/TEMPS-IMG.jpg`` (one page) or
    ``../assets/img/TEMPS-IMG-<index>.jpg`` (several pages, zero-based).
    Only the files produced by this call are read, in page order, and each
    is deleted once it has been read.

    :param pdf_path_file: path of the PDF to read
    :return: the concatenated OCR text of all pages
    """
    path_img = '../assets/img/TEMPS-IMG.jpg'

    # Convert pdf to jpg
    with Img(filename=pdf_path_file, resolution=300) as img:
        img.compression_quality = 99
        img.save(filename=path_img)
        page_count = len(img.sequence)

    # Use the known page count instead of walking the directory: walking
    # gives no page order, joined nested files to the wrong directory, and
    # also read and deleted unrelated .jpg files.
    if page_count == 1:
        page_files = [path_img]
    else:
        stem, extension = os.path.splitext(path_img)
        page_files = ['{}-{}{}'.format(stem, i, extension)
                      for i in range(page_count)]

    # Apply OCR on it
    chunks = []
    for page_file in page_files:
        with Image.open(page_file) as page_image:
            chunks.append(pytesseract.image_to_string(page_image))
        os.remove(page_file)
    return ''.join(chunks)
def convert_scanned_pdf_to_png(self):
    """
    Convert the scanned PDF in ``self.file`` to a PNG in the temp folder.

    The file is opened with resolution 300 and compression quality 99.
    ``self.img_file`` is set to the PDF's base name with ``.pdf`` replaced
    by ``.png``, inside ``ParseScannedPdf.temp_folder``, and
    ``self.type_pdf`` is set to True.

    :return: None
    """
    # The context manager releases the image even when saving fails.
    with Img(filename=self.file, resolution=300) as img:  # Opens scanned pdf file
        img.compression_quality = 99  # Sets compression quality to 99

        # Reassign self.img_file according to self.file
        try:
            self.img_file = self.file.split('/')[-1]
            self.img_file = self.img_file.replace('.pdf', '.png')
            self.img_file = os.path.join(ParseScannedPdf.temp_folder, self.img_file)
        except Exception as e:
            # Single-argument print calls behave the same on Python 2 and 3.
            print('Be sure to use forward slash when assigning the path of the pdf or img file.')
            print(e.args)
            sys.exit()

        self.type_pdf = True
        img.save(filename=self.img_file)  # Converts the file to png
    # NOTE(review): presumably gives the written file time to settle before
    # the next step - confirm this delay is still needed.
    time.sleep(2)
def PDFtoJPG(PDFpath, saveAs="[inputfilename].jpg"):
    """Save a JPG file from the specified PDF and return the JPG's path.

    Multipage PDFs are split into multiple JPGs with the page number
    appended in the format "-#" (starting at zero, no leading zeros).

    :param PDFpath: path of the PDF (converted with ``str``)
    :param saveAs: output path; ``.jpg`` is appended when missing.  The
        default placeholder means "the PDF path with its extension
        replaced by ``.jpg``"
    :return: the JPG path as a string, or ``None`` when the conversion
        failed
    """
    import os
    # Requires ImageMagick and Ghostscript to be installed and on the path.
    from wand.image import Image as Img

    # Convert potential path objects into strings.
    saveAs = str(saveAs)
    PDFpath = str(PDFpath)

    if saveAs == "[inputfilename].jpg":
        # Replace only the extension, so a path such as "scan.PDF" never
        # resolves to the source file itself.
        target = os.path.splitext(PDFpath)[0] + ".jpg"
    elif saveAs.endswith(".jpg"):
        target = saveAs
    else:
        target = saveAs + ".jpg"

    try:
        with Img(filename=PDFpath, resolution=300) as pdf:
            pdf.compression_quality = 99
            # convert() returns a new image that has to be closed as well.
            with pdf.convert("jpg") as converted:
                converted.save(filename=target)
        return target
    except Exception:
        print("PDF file for jpg conversion not found at specified location:", PDFpath)
        return None
def parse_contents(contents, filename, date):
    """Build the layout block that shows an uploaded file and its OCR text.

    The file at ``filename`` is rendered to ``uploaded_file.jpg`` and read
    with pytesseract (``lang='eng'``).

    :param contents: content string of the upload; used as the image source
        and previewed (first 200 characters)
    :param filename: name of the uploaded file, also used as the path to open
    :param date: timestamp passed to ``datetime.datetime.fromtimestamp``
    :return: an ``html.Div`` holding the heading, image, OCR text and raw
        preview
    """
    with Img(filename=filename, resolution=300) as upload:
        upload.compression_quality = 99
        upload.save(filename='uploaded_file.jpg')

    demo = Image.open("uploaded_file.jpg")
    text = pytesseract.image_to_string(demo, lang='eng')

    children = [
        html.H5(filename),
        html.H6(datetime.datetime.fromtimestamp(date)),
        # HTML images accept base64 encoded strings in the same format
        # that is supplied by the upload.
        html.Img(src=contents),
        html.Hr(),
        html.H5("The OCR read words are below"),
        html.Div(text),
        html.Div('Raw Content'),
        html.Pre(contents[0:200] + '...', style={
            'whiteSpace': 'pre-wrap',
            'wordBreak': 'break-all'
        }),
    ]
    return html.Div(children)