def find_word_in_image():
    """Interactively look for a word in an image and highlight each hit.

    Reads an image path and a search word from stdin, runs Tesseract's
    ``image_to_data`` on the image, and for every recognised word accepted by
    the external ``srch`` helper (called with both strings lower-cased) draws
    the word and its bounding box onto the image. Prints whether anything was
    found and shows the annotated image until a key is pressed.
    """
    print('Enter the image path : ')
    img_path = input()
    print('Enter the word to search : ')
    key = input()
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None;
        # stop here instead of failing inside cvtColor.
        print('Could not read image : ' + img_path)
        return
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    print('Checking...')
    # Only the word table is needed; rendering a PDF as well would just add a
    # second, unused OCR pass.
    boxes = pytesseract.image_to_data(img)

    needle = key.lower()
    found = False
    for row_no, row in enumerate(boxes.splitlines()):
        if row_no == 0:
            continue  # first line is the column header
        fields = row.split()
        # Rows that carry recognised text have exactly 12 columns; the text
        # is the last one.
        if len(fields) != 12 or not srch(fields[11].lower(), needle):
            continue
        x, y, w, h = (int(value) for value in fields[6:10])
        cv2.putText(img, fields[11], (x, y - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (50, 50, 255), 2)
        cv2.rectangle(img, (x, y), (x + w, y + h), (50, 50, 255), 2)
        found = True

    if found:
        print('Word found !')
    else:
        print('Word not found')

    size = (1000, 800)
    cv2.imshow('img', cv2.resize(img, size))
    cv2.waitKey(0)
def tesseract():
    """Print each kind of pytesseract output for the module-level ``imgFile``.

    Prints, in order: the recognised text (UTF-8 encoded), character bounding
    boxes, the verbose word table, and orientation/script detection. It then
    repeats text recognition by passing the path directly, and finally runs
    the searchable-PDF and hOCR renderers, whose results are not used.
    """
    # If the tesseract executable is not on PATH, point pytesseract at it
    # first, e.g.:
    #   pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

    # Open the image once and close it deterministically instead of leaking
    # one file handle per call.
    with Image.open(imgFile) as image:
        # Simple image to string; encoded so non-ASCII text can be printed.
        print(pytesseract.image_to_string(image).encode('utf8'))

        # Bounding box estimates.
        print(pytesseract.image_to_boxes(image))

        # Verbose data including boxes, confidences, line and page numbers.
        print(pytesseract.image_to_data(image))

        # Orientation and script detection.
        print(pytesseract.image_to_osd(image))

    # Passing a path bypasses pytesseract's internal image conversion.
    # NOTE: tesseract returns an error for unsupported image formats.
    print(pytesseract.image_to_string(imgFile))

    # Searchable PDF and hOCR output; run for demonstration only.
    pytesseract.image_to_pdf_or_hocr(imgFile, extension='pdf')
    pytesseract.image_to_pdf_or_hocr(imgFile, extension='hocr')
def tesseract_pdf(self, ops):
    """Render one of two fixed sample images with Tesseract.

    Args:
        ops: ``0`` produces a searchable PDF of ``Placa.jpg``; any other
            value produces hOCR output for ``test.png``.

    Returns:
        Whatever ``pytesseract.image_to_pdf_or_hocr`` returns for the chosen
        image (previously the result was computed and then dropped).
    """
    # Compare by value: ``is`` tests object identity and only works for small
    # ints by accident of the interpreter.
    if ops == 0:
        return pytesseract.image_to_pdf_or_hocr('Placa.jpg', extension='pdf')
    return pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')
def extract_text(file: UploadFile = File(...), lang: str = "eng", text_only: bool = False, custom_config: str = None):
    """
    Run Tesseract on an uploaded image.

    The upload is written to ``temp/`` under its own file name, processed and
    then deleted again, also when OCR or parsing fails.

    :param file: uploaded image
    :param lang: Tesseract language; available: deu, eng
    :param text_only: return the plain recognised text instead of the
        simplified hOCR structure
    :param custom_config: extra Tesseract command-line options
        (default: ``--oem 3``)
    :return: a ``PlainTextResponse`` when ``text_only`` is set, otherwise the
        result of ``hocr_to_simple_json``
    """
    # The client controls file.filename; keep only its last path component so
    # a name such as "../../app.py" cannot escape the temp directory.
    safe_name = os.path.basename(file.filename.replace("\\", "/"))
    filepath = "temp/" + safe_name

    with file.file:
        with open(filepath, "wb") as temp_file:
            temp_file.write(file.file.read())

    try:
        if custom_config is None:
            custom_config = '--oem 3'

        if text_only:
            text = pytesseract.image_to_string(filepath, lang=lang, config=custom_config)
            response = PlainTextResponse(content=bytes(text, encoding="utf-8"))
        else:
            output = pytesseract.image_to_pdf_or_hocr(
                filepath, lang=lang, extension='hocr', config=custom_config)
            extracted = xmltodict.parse(output)
            response = hocr_to_simple_json(extracted, lang)
    finally:
        # Always clean up the temporary copy, even if OCR raised.
        os.remove(filepath)

    return response
def to_text(filename, lang="eng", format_="txt", ignore_error=False):
    """
    Extract text from an image using Tesseract OCR.

    The result is written next to the image, using the image's base name and
    ``format_`` as the extension.

    Arguments:
        filename {str} -- path to an image

    Keyword Arguments:
        lang {str} -- Tesseract OCR language option (default: "eng")
        format_ {str} -- Tesseract output format ("hocr" or "txt", default: "txt")
        ignore_error {bool} -- catch and ignore all exceptions (default: False)

    Raises:
        ValueError -- ``format_`` is neither "txt" nor "hocr" (only when
            ``ignore_error`` is False, like every other error)
    """
    try:
        if format_ not in ("txt", "hocr"):
            raise ValueError(
                "Unsupported format: {!r} (expected 'txt' or 'hocr')".format(format_))

        target = os.path.splitext(filename)[0] + "." + format_

        with Image.open(filename) as im:
            if format_ == "txt":
                tess_output = pytesseract.image_to_string(
                    im, lang=lang, config="--psm 3 --oem 1")
            else:
                # hOCR comes back as bytes; decode it rather than writing the
                # ``b'...'`` repr that str() would produce.
                tess_output = pytesseract.image_to_pdf_or_hocr(
                    im, lang=lang, config="--psm 3 --oem 1", extension=format_
                ).decode("utf-8")

        with open(target, "w", encoding="utf-8") as fp:
            fp.write(tess_output)
    except Exception as e:
        if not ignore_error:
            raise
        print("Error: {}".format(e))
def _extract(self):
    """Run every selected field model on the current document and show the result.

    JPG/PNG inputs are first converted to a searchable PDF with Tesseract and
    stored in a temporary file, which is what the models then read. The
    predictions are labelled in the viewer, logged as JSON, and the start
    button is re-enabled.
    """
    path = self.paths[self.pathidx]
    self.logger.clear()
    self.logger.log("Extracting information from '{}'...\n".format(path))

    temp = None
    try:
        if path.split('.')[-1].lower() in ['jpg', 'png']:
            with Image.open(path) as image:
                pdf = pytesseract.image_to_pdf_or_hocr(image, extension='pdf')
            temp = tempfile.NamedTemporaryFile(suffix='.pdf')
            temp.write(pdf)
            # The models open the file by name, so the buffered bytes must be
            # on disk before they read it.
            temp.flush()
            path = temp.name

        predictions = {}
        for key in FIELDS:
            if self.checkboxes[key].get():
                model = AttendCopyParse(field=key, restore=True)
                predictions[key] = model.predict(paths=[path])[0]
    finally:
        # Closing removes the temporary PDF; do it even if prediction fails.
        if temp is not None:
            temp.close()

    self.viewer.label(labels=predictions)
    self.logger.log(simplejson.dumps(predictions, indent=2, sort_keys=True))
    self.start_button.configure(state='normal')
    self.running = False
def _load_file(self):
    """Load the current path into the viewer.

    JPG/PNG files are converted to an in-memory searchable PDF with Tesseract
    first; everything else is opened with pdfplumber directly. On a
    ``WandException`` the user is offered an automatic ImageMagick policy
    fix. Index, I/O and type errors are reported in the log panel and the
    viewer is left cleared.
    """
    self.viewer.clear()
    path = self.paths[self.pathidx]
    filename = os.path.basename(path)
    try:
        if filename.split('.')[-1].lower() in ['jpg', 'png']:
            with Image.open(path) as image:
                ocr_pdf = pytesseract.image_to_pdf_or_hocr(image, extension='pdf')
            self.pdf = pdfplumber.load(io.BytesIO(ocr_pdf))
        else:
            self.pdf = pdfplumber.open(path)
        self.viewer.display_pdf(self.pdf)
        self.doc_label.configure(
            text="{} of {}".format(self.pathidx + 1, len(self.paths)))
        self.logger.clear()
        self.logger.log("Showing invoice '{}'".format(path))
    except WandException:
        result = messagebox.askokcancel(
            "Error",
            "ImageMagick Policy Error! Should InvoiceNet try to fix the error?"
        )
        if result:
            result = self._fix_policy_error()
        if result:
            messagebox.showinfo(
                "Policy Fixed!",
                "ImageMagick Policy Error fixed! Restart InvoiceNet.")
        else:
            messagebox.showerror(
                "ImageMagick Policy Error",
                "Coud not fix ImageMagick policy. Rejecting the current pdf file!"
            )
    except (IndexError, IOError, TypeError) as err:
        # Loading stays best-effort, but tell the user why the viewer is
        # empty instead of failing silently.
        self.logger.log("Could not load '{}': {}".format(path, err))
def run_ocr(f):
    """OCR one file from ``img_path`` into text and/or a searchable PDF.

    Files without an extension are skipped. Depending on the module-level
    ``GEN_TXT`` / ``GEN_PDF`` switches, the recognised text is written to
    ``txt_path`` and a searchable PDF to ``pdf_path``, both named after the
    input file. Any failure is printed and swallowed so that a batch run can
    continue with the next file.

    Args:
        f: file name relative to ``img_path``.
    """
    filename, extension = splitext(f)

    # Skips non-image files
    if extension == '':
        return

    print('Starts an OCR on', f)

    try:
        # Close the image when done; this function runs once per file in a
        # batch, so leaked handles would pile up.
        with Image.open(join(img_path, f)) as img:
            # Simple image to string
            if GEN_TXT:
                content = pytesseract.image_to_string(img, lang=language)
                txt = join(txt_path, filename + '.txt')
                # Explicit UTF-8 so recognised non-ASCII text does not depend
                # on the platform's default encoding.
                with open(txt, 'w', encoding='utf-8') as tf:
                    tf.write(content)
                print('Successfully wrote the extracted text in', txt)

            # Get a searchable PDF
            if GEN_PDF:
                content = pytesseract.image_to_pdf_or_hocr(
                    img, lang=language, extension='pdf')
                pdf = join(pdf_path, filename + '.pdf')
                with open(pdf, 'w+b') as pf:
                    pf.write(content)  # pdf type is bytes by default
                print('Successfully wrote the converted pdf in', pdf)
    except Exception as e:
        print("Fail!", e.__class__, "occurred:")
        print(e)
        print()
def createPdfFromImages(self, base_source_folder, parent_folder, output_folder):
    """Write one searchable PDF per image found in a source sub-folder.

    Args:
        base_source_folder: root directory of the image folders.
        parent_folder: sub-folder of ``base_source_folder`` to process.
        output_folder: directory receiving ``<image base name>.pdf`` files.
    """
    absolute_parent_folder_path = os.path.join(base_source_folder, parent_folder)

    for image in FileHelper.getAllFilesInFolder(absolute_parent_folder_path):
        # Read image from disk
        absolute_image_path = os.path.join(absolute_parent_folder_path, image)
        im = cv2.imread(absolute_image_path, cv2.IMREAD_COLOR)
        if im is None:
            # cv2.imread returns None for files it cannot decode.
            print("Skipping unreadable image: " + absolute_image_path)
            continue

        # OpenCV loads pixels as BGR while pytesseract assumes RGB for
        # arrays; without this the page image embedded in the PDF has
        # swapped colours.
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

        # Create PDF from tesseract OCR results
        pdf = pytesseract.image_to_pdf_or_hocr(im, extension='pdf')

        pdf_name = os.path.splitext(image)[0] + ".pdf"
        with open(os.path.join(output_folder, pdf_name), "w+b") as f:
            f.write(pdf)
def ocr(inPath, copyPath):
    """Pre-process a receipt image, analyse its text and save a searchable PDF.

    ``process.transform`` writes the prepared image to
    ``ocr/output/receipt.jpg``; the recognised text is handed to
    ``textAnalysis`` together with ``copyPath``, and a searchable PDF of the
    prepared image is written to ``ocr/output/output.pdf``. An ``IOError``
    while writing the PDF is printed, not raised.
    """
    outPath = 'ocr/output/receipt.jpg'
    pdfPath = 'ocr/output/output.pdf'

    process.transform(inPath, outPath)

    # Load the pre-processed image for OCR
    img = cv2.imread(outPath)

    # psm 4: Assume a single column of text of variable sizes.
    text = pytesseract.image_to_string(img, lang='eng', config='--psm 4 --oem 1')
    textAnalysis(text, copyPath)

    # Generate a PDF
    pdf = pytesseract.image_to_pdf_or_hocr(img, extension='pdf')

    try:
        # The PDF is binary data: text mode corrupts it on Windows and
        # rejects bytes outright on Python 3.
        with open(pdfPath, 'wb') as f:
            f.write(pdf)
    except IOError as e:
        # Same output as the former print statement, valid on Python 2 and 3.
        print('IOError:  %s' % (e,))
def parse_PDF_by_Tesseract(self, pdf_file, page_index=5, output_path='tesseract_test.html'):
    """
    Render one page of a PDF as hOCR with Tesseract and save it to disk.

    To use Tesseract, PDF first needs to be converted into image files.
    'pdf2image' is based on 'Poppler' package.

    list of languages: https://github.com/tesseract-ocr/langdata

    - Adding a parameter <lang='eng'> doesn't improve the performance (speed). It's because default lang is English.
    - dpi=600 takes more processing time than dpi=300, but no performance improvement was found.

    E.g., pytesseract.image_to_string(image, lang='eng', boxes=False, config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789')

    Ref
    - https://www.pyimagesearch.com/2018/09/17/opencv-ocr-and-text-recognition-with-tesseract/

    :param pdf_file: path of the PDF to convert
    :param page_index: zero-based index of the page to OCR (default 5, the
        page that used to be hard-coded)
    :param output_path: file the hOCR bytes are written to
    :raises IndexError: the PDF has no page at ``page_index``
    """
    images = convert_from_path(pdf_file, dpi=300, transparent=True, fmt='tiff', thread_count=3)

    # TODO: joining pytesseract.image_to_string() over all pages, with or
    # without .encode('utf-8'), gave the same text so far; test with further
    # examples (write with 'wb' instead of 'w' when encoding).

    if page_index >= len(images) or page_index < -len(images):
        raise IndexError(
            "page_index {} is out of range for a document with {} page(s)".format(
                page_index, len(images)))

    hocr = pytesseract.image_to_pdf_or_hocr(images[page_index], extension='hocr')
    with open(output_path, 'wb') as file:
        file.write(hocr)
def convert_To_PDF(input_files):
    """OCR a sequence of image files into one merged, searchable PDF.

    Args:
        input_files: iterable of image paths, in page order.

    Returns:
        Absolute path of the written file (``output/my_output.pdf``).

    Raises:
        IOError: one of the images could not be read.
    """
    import io  # local import: the page PDFs are merged from memory

    output_file_location = "output/my_output.pdf"

    # Creating pdffilemerger object
    pdfMerger = PyPDF2.PdfFileMerger()
    try:
        for image_path in input_files:
            image = cv2.imread(image_path)
            if image is None:
                raise IOError("Could not read image: {}".format(image_path))

            # OpenCV delivers BGR, pytesseract assumes RGB for arrays.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Converting the image to ocr pdf
            result = pytesseract.image_to_pdf_or_hocr(image, lang='eng')

            # Append straight from memory. The previous detour through a
            # shared "temp.pdf" also passed a stray 'rb' as the bookmark
            # title of every appended page.
            pdfMerger.append(io.BytesIO(result))

        # Saving merged pdf object as pdf file
        with open(output_file_location, 'wb') as output_file:
            pdfMerger.write(output_file)
    finally:
        pdfMerger.close()

    return os.path.abspath(output_file_location)
def get_text_from_image_pdf(folder, file):
    """Return the text of the first page of an image-only PDF via OCR.

    The first page of ``folder/file`` is rendered to ``folder/tmp.png``,
    converted by Tesseract into a searchable PDF at
    ``folder/text_output.pdf``, and that PDF's first page is read back with
    pdftotext.

    Raises:
        RuntimeError: the converted PDF contains no text.
    """
    output_pdf_file = os.path.join(folder, "text_output.pdf")

    # Convert the pdf to a png image. The image must land in ``folder``
    # because that is where it is read from below.
    pdf2image.convert_from_path(os.path.join(folder, file),
                                fmt="png",
                                dpi=400,
                                single_file=True,
                                output_folder=folder,
                                output_file="tmp")

    # Generate a pdf with selectable text based on the image
    pdf = pytesseract.image_to_pdf_or_hocr(os.path.join(folder, "tmp.png"),
                                           extension="pdf")
    with open(output_pdf_file, "w+b") as f:
        f.write(pdf)

    # Extract the text from the pdf
    with open(output_pdf_file, "rb") as in_pdf:
        text = pdftotext.PDF(in_pdf)[0]

    # Check that we've actually got some text out from the converted file.
    # The exception used to be constructed but never raised.
    if len(text) == 0:
        raise RuntimeError("Could not extract text from converted pdf file")

    return text
def extract_data_ocr(image):
    """Run Tesseract on *image* twice and return ``(text, hocr)``.

    ``text`` is the result of ``image_to_string``; ``hocr`` is the result of
    ``image_to_pdf_or_hocr`` with the ``hocr`` extension.
    """
    return (
        pytesseract.image_to_string(image),
        pytesseract.image_to_pdf_or_hocr(image, extension='hocr'),
    )
async def handler(event):
    """Forward an incoming message, prefixed by a symbol read from its image.

    If the message carries an image, it is downloaded, OCR'd to hOCR, and the
    first ``AAA/BBB``-shaped token found in the recognised words becomes a
    ``"Symbol: AAABBB"`` line. That line and the parsed message text
    (``parese_message``) are combined and sent to ``output_channel_entity``.
    Nothing is sent when neither part is available.
    """
    import asyncio  # local import: only needed for the non-blocking pause

    text_in_image = None
    media = event.message.media
    if media is not None and is_image(media):
        image_path = config["temp_path"] + 'temp.jpg'
        await client.download_media(media, image_path)
        # time.sleep() would stall the whole event loop; yield to it instead.
        await asyncio.sleep(2)

        # Get HOCR output
        hocr = pytesseract.image_to_pdf_or_hocr(image_path, extension='hocr')
        soup = BeautifulSoup(hocr.decode('utf-8'), 'html.parser')
        text = ''.join(
            elm.text for elm in soup.find_all("span", class_="ocrx_word"))

        symbols = re.findall(r'[A-Z]{3}\s*/\s*[A-Z]{3}', text)
        # Keep text_in_image at None when nothing matched; an empty list used
        # to slip past the None checks below and fail on ``list + str``.
        if symbols:
            text_in_image = "Symbol: " + symbols[0].replace(
                '/', '').replace(" ", "")

    message_from_sender = parese_message(event.message.message)

    if message_from_sender is not None and text_in_image is not None:
        message_text = text_in_image + "\n" + message_from_sender
    elif text_in_image is None:
        message_text = message_from_sender
    else:
        message_text = text_in_image

    if message_text is None:
        return  # nothing worth forwarding

    await client.send_message(output_channel_entity, message_text)
def result_coords(im):
    """Locate the first result/value heading in *im* and return its box.

    The image is OCR'd to hOCR (a copy is still dumped to ``tanayhocr.txt``).
    The first of ``Result``, ``RESULT``, ``Value``, ``VALUE`` that occurs
    directly after a ``>`` is looked up, and the four integers following the
    second ``x`` before that position are returned. Presumably that ``x`` is
    the one ending ``bbox`` in the word's ``title`` attribute; this relies on
    the hOCR attribute layout and should be verified against real output.

    Returns:
        Tuple of four ints, taken in the order they appear in the hOCR.

    Raises:
        ValueError: none of the keywords, or no preceding box, was found.
    """
    hocr = pytesseract.image_to_pdf_or_hocr(im, extension='hocr')

    # Kept for backward compatibility: other tooling may inspect this dump.
    with open('tanayhocr.txt', "w+b") as f:
        f.write(hocr)

    # Decode in memory instead of re-reading the dump with the platform's
    # default text encoding.
    string = hocr.decode('utf-8')

    result = -1
    for word in ('Result', 'RESULT', 'Value', 'VALUE'):
        result = string.find('>' + word)
        if result != -1:
            break
    if result == -1:
        raise ValueError("No result/value keyword found in the OCR output")

    # Walk backwards over two 'x' characters; the coordinates start two
    # characters after the second one.
    nearest_x = string.rfind('x', 0, result)
    box_x = string.rfind('x', 0, nearest_x) if nearest_x != -1 else -1
    if box_x == -1:
        raise ValueError("No bounding box precedes the keyword in the OCR output")

    ans = string[box_x + 2:result].split(';')[0].split(' ')
    return int(ans[0]), int(ans[1]), int(ans[2]), int(ans[3])
def _run_ocr(self): if self.pdf is None: return pdf_pages = list() for page in self.pdf.pages: image = page.to_image(resolution=100) pdf = pytesseract.image_to_pdf_or_hocr(image.original, extension='pdf') pdf_pages.append(pdf) pdf_writer = PyPDF2.PdfFileWriter() for page in pdf_pages: pdf = PyPDF2.PdfFileReader(io.BytesIO(page)) pdf_writer.addPage(pdf.getPage(0)) dirname = os.path.dirname(self.paths[self.pathidx]) filename = os.path.basename(self.paths[self.pathidx]) path = filedialog.asksaveasfilename(title='Save OCR As', defaultextension='.pdf', initialdir=dirname, initialfile=filename, filetypes=[('PDF files', '*.pdf'), ('all files', '.*')]) if path == '' or path is None: return with open(path, 'wb') as out: pdf_writer.write(out) self.paths[self.pathidx] = path self._load_file()
def _append_pdf_page(self, image):
    """OCR *image* into a PDF and append its pages to ``self.pdf_writer``.

    *image* is converted from BGR to RGB before being handed to Tesseract.
    """
    ocr_pdf = pytesseract.image_to_pdf_or_hocr(
        cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
        lang="eng",
        extension="pdf",
        nice=0,
    )
    reader = PyPDF2.PdfFileReader(io.BytesIO(ocr_pdf))
    self.pdf_writer.appendPagesFromReader(reader)
def pdfGenerator(self, imagePath, name, extension, pdfPath):
    """OCR ``imagePath + name + extension`` into ``pdfPath + name + ".pdf"``.

    The path pieces are concatenated as given, so ``imagePath`` and
    ``pdfPath`` must already end with a separator. Any error is printed, not
    raised.
    """
    try:
        pdf = pytesseract.image_to_pdf_or_hocr(imagePath + name + extension,
                                               extension='pdf')
        # The context manager closes the file even when the write fails.
        with open(pdfPath + name + ".pdf", "w+b") as f:
            f.write(pdf)
    except Exception as e:
        print(e)
def get_searchable_pdf(pdf_file_path):
    """OCR every page of a PDF and return one merged, searchable PDF.

    The pages from ``convert_pdf_to_imgs`` are each OCR'd into a one-page
    PDF and merged with PyPDF2. The merged document is written to
    ``test.pdf`` and also returned.

    Returns:
        bytes of the merged PDF.

    Raises:
        ValueError: the input produced no page images.
    """
    import io  # local imports keep this block self-contained
    import PyPDF2

    images = convert_pdf_to_imgs(pdf_file_path)

    merger = PyPDF2.PdfFileMerger()
    try:
        page_count = 0
        for img in images:
            page_pdf = pytesseract.image_to_pdf_or_hocr(img, extension='pdf')  # bytes
            # Each result is a complete PDF; writing them back to back into
            # one file does not form a valid multi-page document, so merge
            # them properly.
            merger.append(io.BytesIO(page_pdf))
            page_count += 1

        if page_count == 0:
            raise ValueError("No pages to OCR in {!r}".format(pdf_file_path))

        merged = io.BytesIO()
        merger.write(merged)
    finally:
        merger.close()

    pdf = merged.getvalue()
    with open('test.pdf', 'w+b') as f:
        f.write(pdf)
    return pdf
def save_pdf_from_image(path: str, dst_folder: str):
    """
    Save a searchable PDF of the image at *path* into *dst_folder*.

    The PDF is named after the image: the part of the file name before its
    first dot, plus ``.pdf``. Both ``\\`` and ``/`` are accepted as path
    separators in *path*.
    """
    import os  # local import keeps this block self-contained

    # Normalise separators so the file name is found for either path style.
    file_name = os.path.basename(path.replace("\\", "/"))
    image_name = file_name.split(".")[0]

    with Image.open(path) as image:
        binary_pdf = pytesseract.image_to_pdf_or_hocr(image)

    # "wb" already truncates, so no explicit truncate is needed.
    with open(os.path.join(dst_folder, image_name + ".pdf"), "wb") as bin_file:
        bin_file.write(binary_pdf)
def make_searchable_pdf(input_path, output_file_name=None):
    """
    This function takes in the file path to a document and returns a searchable pdf using
    Tesseract through the pytesseract module

    *WARNING* Once a file is passed through this method while output_file_name=None, it will be
    overwritten with the Tesseract searchable PDF.

    Example to avoid overwriting the input:
        # External PDF called test.pdf
        make_searchable_pdf('path/to/test.pdf', 'output_name')

    :param input_path: the filepath to be converted
    :param output_file_name: the file name that you wish to save the document under (without extensions)
    :return: output_file path
    """
    if isinstance(input_path, str):
        input_path = Path(input_path)

    if output_file_name is None:
        output_file = input_path
        output_file_name = output_file.stem
    else:
        output_file = Path(input_path.parent, output_file_name + '.pdf')

    # Converts pdf into a list of PIL images
    images = pdf2image.convert_from_path(input_path, fmt='jpg')

    file_paths = []
    merger = PyPDF2.PdfFileMerger()
    try:
        # OCR each page and store it as a temporary one-page pdf
        for i, image in enumerate(images):
            single_page = pytesseract.image_to_pdf_or_hocr(image, extension='pdf')
            page_path = Path(output_file.parent,
                             output_file_name + '_' + str(i) + '.pdf')
            file_paths.append(page_path)
            with open(page_path, 'wb') as f:
                f.write(single_page)

        # Merges the temporary pdf files into one document
        for path in file_paths:
            merger.append(str(path))
        merger.write(str(output_file))
    finally:
        # Close the merger first: it keeps the page files open, which blocks
        # deleting them on Windows. Then remove the temporary pages even if
        # OCR or merging failed.
        merger.close()
        for path in file_paths:
            if path.exists():
                path.unlink()

    return output_file
def convert_images_to_pdf(self, images, output_dir):
    """OCR each image into ``<output_dir>/page_<n>.pdf``.

    Args:
        images: image paths (or file objects accepted by ``Image.open``), in
            page order.
        output_dir: directory the page PDFs are written to.

    Returns:
        List of the written PDF paths, in page order.
    """
    pdfs = []
    for page_number, image in enumerate(images):
        # Close every image once it has been processed so a long batch does
        # not exhaust file handles.
        with Image.open(image) as page_image:
            pdf = pytesseract.image_to_pdf_or_hocr(page_image)
        filename = output_dir + "/page_" + str(page_number) + ".pdf"
        with open(filename, "wb") as file:
            file.write(pdf)
        pdfs.append(filename)
    return pdfs
def _getpageobj(fp):
    """Build the page record for *fp*.

    *fp* is passed through ``toBinaryImage``; the result is OCR'd to hOCR
    with the Tesseract language code ``fas`` and also converted with
    ``img_to_data``.

    Returns:
        dict with ``data_url`` (from ``img_to_data``) and ``hocr`` (the hOCR
        output decoded as UTF-8).
    """
    binary_image = toBinaryImage(fp)
    hocr_bytes = pytesseract.image_to_pdf_or_hocr(
        Image.open(binary_image), extension="hocr", lang="fas")

    page = {}
    page["data_url"] = img_to_data(binary_image)
    page["hocr"] = str(hocr_bytes, encoding="utf-8")
    return page
def imgtoPDF(self, path):
    """Let the user pick an image and save it as searchable ``test.pdf``.

    The *path* argument is ignored: the path always comes from
    ``self.clickSelectfile()``. Nothing happens when no file is chosen or
    the chosen file cannot be read as an image.
    """
    path = self.clickSelectfile()
    if not path:
        # No image selected; this used to crash further down.
        return

    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None for unreadable or unsupported files.
        return

    # OpenCV loads BGR; convert to RGB before handing the array to Tesseract.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    pdf = pytesseract.image_to_pdf_or_hocr(img, extension='pdf')
    with open('test.pdf', 'w+b') as f:
        f.write(pdf)
def ocr(i):
    """OCR one page image from ``tempfolder`` into a PDF and report progress.

    The PDF is written to ``tempfolder`` under characters ``[-9:-4]`` of *i*
    plus ``.pdf``. The shared ``count`` is incremented, and a progress line
    is printed whenever the new count is one of the milestones listed in
    ``progress``.

    Args:
        i: image file name relative to ``tempfolder``.
    """
    pdf = pytesseract.image_to_pdf_or_hocr(tempfolder + i, extension='pdf')
    count.inc()
    with open(tempfolder + i[-9:-4] + '.pdf', 'w+b') as f:
        f.write(pdf)

    done = count.val
    # The percentage used to divide the counter object itself; the resulting
    # error vanished in a bare ``except``, so no progress was ever shown.
    if done in progress:
        print(f"Finished {int(done / total * 100)}%: {done} pages")
def generate_simple_pdf(self, image, filename):
    """OCR *image* (English) into ``"processeddata" + filename + "_simple.pdf"``.

    The prefix is concatenated as-is, so *filename* has to supply any path
    separator it needs.
    """
    PDF = pytesseract.image_to_pdf_or_hocr(image,
                                           lang='eng',
                                           config='',
                                           nice=0,
                                           extension='pdf')
    # The context manager closes the file even when the write fails.
    with open("processeddata" + filename + "_simple.pdf", "w+b") as f:
        f.write(PDF)
def generate_html(self, image, filename):
    """OCR *image* (English) to hOCR and save it as ``"processeddata" + filename + ".html"``.

    The prefix is concatenated as-is, so *filename* has to supply any path
    separator it needs. An existing file is overwritten.
    """
    content = pytesseract.image_to_pdf_or_hocr(image,
                                               lang='eng',
                                               nice=0,
                                               extension='hocr')
    # Binary mode: the hOCR output is written as it was received. The context
    # manager closes the file even when the write fails.
    with open("processeddata" + filename + ".html", 'w+b') as f:
        f.write(content)
def get_ocr_pdf():
    """
    OCR image to pdf
    ---
    tags:
      - ocr
    parameters:
      - in: formData
        name: image
        type: file
        required: true
        description: Image file
      - in: formData
        name: lang
        type: string
        required: true
        description: Language
      - in: formData
        name: timeout
        type: int
        required: false
        description: Timeout
    responses:
      500:
        description: Error message
      200:
        description: Pdf file
    """
    if 'image' not in request.files:
        return "No file uploaded"

    image_file = request.files['image']
    if image_file.filename == '':
        return "No file name"

    # .get() so a missing field yields the same message as an empty one.
    lang = request.form.get('lang', '')
    if lang == '':
        return "No language defined"

    # Fall back to the default for a missing, non-numeric or non-positive
    # timeout instead of failing the request.
    timeout = 300
    try:
        requested_timeout = int(request.form.get('timeout', 0))
    except ValueError:
        requested_timeout = 0
    if requested_timeout > 0:
        timeout = requested_timeout

    # The context manager deletes the temporary upload once OCR is done.
    with NamedTemporaryFile() as file:
        image_file.save(file.name)
        pdf = pytesseract.image_to_pdf_or_hocr(file.name,
                                               lang=lang,
                                               config='',
                                               nice=0,
                                               extension='pdf',
                                               timeout=timeout)
    return pdf
def __init__(self, invoiceData, destinationPath, templateFolerPath, pdfDummyPath):
    """Extract data from every valid invoice and append it to the Excel file.

    For each row of *invoiceData* whose ``valid`` is not ``-1``: non-PDF
    invoices are first OCR'd (German) into *pdfDummyPath*, the PDF text is
    extracted, matched against the templates, and the resulting fields are
    appended to the workbook at *destinationPath*. Rows with ``valid == -1``
    are reported as unsupported.

    Args:
        invoiceData: table iterated with ``iterrows()``; the code reads the
            ``valid``, ``typ`` and ``path`` entries of each row.
        destinationPath: Excel file that is read, extended and rewritten.
        templateFolerPath: folder passed to ``get_invoicedata``.
        pdfDummyPath: scratch PDF path used for non-PDF invoices.
    """
    self.__writeExcelFlagg = 1  # reserved for user interaction in upcoming versions
    self.__destinationfile = destinationPath
    self.__invoiceData = invoiceData

    # default excel dictionary
    field_dic = {'Lieferdatum': 'date_service',
                 'Bestelldatum': 'date_order',
                 'Zulieferer': 'issuer',
                 'Rechnungsnummer': 'invoice_number',
                 'Betrag': 'amount_sum',
                 'Netto': 'amount_net',
                 'Template': 'template'}

    templatefolder = templateFolerPath
    pdfDummy = pdfDummyPath

    for _, info in self.__invoiceData.iterrows():
        if info['valid'] != -1:
            # we have a valid datatype
            if info['typ'] != 'pdf':
                # invoice is not a pdf: create one via OCR
                logging.debug('egine_main: invoice is not pdf')
                x = image_to_pdf_or_hocr(info.path, lang='deu')
                with open(pdfDummy, "w+b") as f:
                    f.write(bytearray(x))
                pdf = pdfDummy
            else:
                # invoice is pdf
                logging.debug('egine_main: invoice is pdf')
                pdf = info.path

            logging.debug('egine_main: starting extract_mydata_pdf with pdf: '+pdf)
            text = extract_mydata_pdf(pdf)  # get data from created pdf
            logging.debug('egine_main: extract_mydata_pdf completed')
            if text is None:
                logging.debug('egine_main: extracted text is empty')

            logging.debug('egine_main: starting get_invoicedata')
            data = get_invoicedata(text, templatefolder, field_dic)  # get data from extracted text
            # Identity checks: ``!= None`` can be overloaded by the compared
            # object and then has no plain truth value.
            if data is not None:
                logging.debug('egine_main: data != None')
            else:
                logging.debug('egine_main: data == None')
            logging.debug('egine_main: get_invoicedata completed')

            # do the xlsx writing
            if self.__writeExcelFlagg == 1 and data is not None:
                logging.debug('egine_main: starting xlsWriting')
                df = read_excel(self.__destinationfile)
                logging.debug('egine_main: reading completed')
                df = add_todataframe(df, field_dic, data)
                logging.debug('egine_main: adding data completed')
                df.to_excel(self.__destinationfile, index=False)
                logging.debug('egine_main: xlsWriting completed')
        else:
            print(info['typ']+' is not supported')