def __langoptions__(self):
    """Return the sorted keys of ``lang_options`` that Tesseract can serve.

    A key is kept when its mapped value appears in the list returned by
    ``pytesseract.get_languages``.
    """
    installed = pytesseract.get_languages(config='')
    return sorted(
        [name for name, code in lang_options.items() if code in installed]
    )
def main(inputfile, args, lang='eng', width=2000):
    """Run Tesseract OCR on a single image and report the recognised text.

    The image is resized to the target width (aspect ratio preserved),
    converted from BGR to RGB and passed to ``pytesseract.image_to_string``.
    Depending on the options the text is printed and/or written to
    ``<inputfile>.txt``.

    Args:
        inputfile: Path of the image file to read.
        args: Parsed command-line options. The attributes read here are
            ``scale``, ``verbose``, ``get_lang``, ``lang``,
            ``remove_special``, ``result`` and ``out``.
        lang: Fallback OCR language, used when ``args.lang`` is empty.
            ``None`` is treated as ``'eng'``.
        width: Target image width in pixels; replaced by ``args.scale``
            when that option is set.

    Raises:
        OSError: If OpenCV cannot load ``inputfile``.
    """
    if lang is None:
        lang = 'eng'  # Default language is 'eng'

    # Resolve one effective language so the verbose message and the OCR call
    # agree. ``args.lang`` keeps precedence (it is what the OCR call always
    # used); ``lang`` only fills in when it is unset.
    ocr_lang = args.lang or lang

    if args.scale:
        width = int(args.scale)  # --scale overrides the target width

    if args.verbose:
        print('File:', inputfile)
        print('Starting ocr with lang:', ocr_lang)

    if args.get_lang:
        print('Languages:')
        print(', '.join(pytesseract.get_languages()))

    # Load image; cv2.imread signals failure by returning None, not raising.
    image = cv2.imread(inputfile)
    if image is None:
        raise OSError(f"Could not read image file: {inputfile}")

    # Resize to the target width, keeping the aspect ratio
    (h, w) = image.shape[:2]
    r = width / float(w)
    dim = (int(width), int(h * r))
    image = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    if args.verbose:
        print("Downscaled to ", dim)

    # Perform OCR
    result = pytesseract.image_to_string(image, ocr_lang).strip()

    # Remove all special chars (--remove-special)
    if args.remove_special:
        if args.verbose:
            print("Removing special chars")
        for c in SPECIAL_CHARS:
            result = result.replace(c, '')

    # Output result (--result), with a label when --verbose is also set
    if args.result:
        if args.verbose:
            print('OCR Result:', result)
        else:
            print(result)

    # Write result to file (--out)
    if args.out:
        out_file_name = inputfile + '.txt'
        if args.verbose:
            print("Outputting to", out_file_name)
        # Explicit UTF-8: OCR text for non-English languages may not fit the
        # platform default encoding.
        with open(out_file_name, 'w', encoding='utf-8') as fo:
            fo.write(result)
def fetch_text_in_image(self, img):
    """Return the text Tesseract recognises in the image file ``img``.

    The image is loaded with OpenCV, converted to RGB and passed to
    ``pytesseract.image_to_string`` with English as the language. The list
    of installed Tesseract languages is printed as a side effect.
    """
    # Show which languages are available
    print(pytesseract.get_languages(config=''))

    # OpenCV loads pixels in BGR order while pytesseract needs RGB, so the
    # freshly loaded image is converted before OCR.
    rgb_pixels = cv2.cvtColor(cv2.imread(img), cv2.COLOR_BGR2RGB)

    # '--psm 11' tells the OCR engine that the "page" is not a document of
    # text. Further tessdata sources are listed at
    # https://github.com/tesseract-ocr/tessdata
    return pytesseract.image_to_string(
        rgb_pixels,
        lang='eng',
        config=r'--psm 11',
    )
def get_tesseract_languages(tessdata_path) -> list[str]:
    """Return the sorted list of languages reported by Tesseract.

    Raises:
        RuntimeError: If querying Tesseract raises ``RuntimeError``; the
            traceback of the original error is printed first.
        ValueError: If Tesseract reports no languages at all.
    """
    configure_tesseract_binary()

    try:
        found = pytesseract.get_languages(
            config=get_tesseract_config(tessdata_path)
        )
        languages = sorted(found)
    except RuntimeError as err:
        traceback.print_tb(err.__traceback__)
        message = (
            "Couldn't determine Tesseract information. If you pip installed NormCap "
            "make sure Tesseract is installed and configured correctly."
        )
        raise RuntimeError(message) from err

    if languages:
        return languages

    raise ValueError(
        "Could not load any languages for tesseract. "
        "On Windows, make sure that TESSDATA_PREFIX environment variable is set. "
        "On Linux/MacOS see if 'tesseract --list-langs' work is the command line."
    )
try: from PIL import Image except ImportError: import Image import pytesseract # If you don't have tesseract executable in your PATH, include the following: pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>' # Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract' # Simple image to string print(pytesseract.image_to_string(Image.open('test.png'))) # List of available languages print(pytesseract.get_languages(config='')) # French text image to string print(pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra')) # In order to bypass the image conversions of pytesseract, just use relative or absolute image path # NOTE: In this case you should provide tesseract supported images or tesseract will return error print(pytesseract.image_to_string('test.png')) # Batch processing with a single file containing the list of multiple image file paths print(pytesseract.image_to_string('images.txt')) # Timeout/terminate the tesseract job after a period of time try: print(pytesseract.image_to_string('test.jpg', timeout=2)) # Timeout after 2 seconds print(pytesseract.image_to_string(
################ # SETUP ################ # create config file if not already present if not configPath.exists(): print("No config found") writeConfig() readConfig() # pytesseract version print("[INFO] currently using tesseract: " + str(pytesseract.get_tesseract_version())) # other languages can be installed by # sudo apt install tesseract-ocr-[language code] langs = pytesseract.get_languages(config="") print(f"[INFO] following languages are availible:\n {langs}") # get a numerical sorted list of all files and the total count numbers = re.compile(r"(\d+)") # matches numerical token with multiple digits files = sorted(listdir(path=src_image_path), key=numericalSort) totalNumOfImages = len(files) if enable_debug: print(f'[INFO] {totalNumOfImages} image files are present in "{src_image_path}"') print(f"[INFO] Following files are present:\n {files}") ################ # LOOP ################ # CHECK IF OCR CODE IS INCLUDED IN INSTALLED LANG's. NEEDS TO BE PRESENT FOR OCR!
def print_lang_codes():
    """Print every language Tesseract reports, one per line as ``'<code>': ,``."""
    for code in pytesseract.get_languages(config=''):
        print(f"'{code}': ,")
# Text recognition from images using Pytesseract lib
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'D:\installations\pytesseract\inst\Tesseract-OCR\tesseract.exe'
import cv2
from PIL import Image
import tensorflow as tf

# Read the image; cv2.imread returns None instead of raising when the file
# cannot be loaded, so fail here with a clear message.
image = cv2.imread('sample3.jpg')
if image is None:
    raise FileNotFoundError("Could not read image 'sample3.jpg'")
# OpenCV loads pixels in BGR order; convert to RGB before OCR.
img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Some functions of tesseract
print(pytesseract.image_to_string(img_rgb))  # Prints out the text
print(pytesseract.get_languages(config=''))  # List of available languages

# --------- Prints out the letters and corresponding coordinates and create BB around them ------
# Run the box detection once and reuse the result for both printing and
# drawing (it was previously run twice on the same image).
bounding_box_coordinates = pytesseract.image_to_boxes(img_rgb)
print(bounding_box_coordinates)  # Prints out the letters and corresponding coordinates

image_h = img_rgb.shape[0]
for bounding_box in bounding_box_coordinates.splitlines():
    print(type(bounding_box))  # string class
    print(bounding_box)
    # Each line is space-separated; fields 1-4 hold the box coordinates.
    bounding_box = bounding_box.split(' ')
    x1, y1, x2, y2 = (
        int(bounding_box[1]),
        int(bounding_box[2]),
        int(bounding_box[3]),
        int(bounding_box[4]),
    )
    # The y values are flipped against the image height before drawing
    # (presumably image_to_boxes measures y from the bottom edge - confirm).
    cv2.rectangle(img_rgb, (x1, image_h - y1), (x2, image_h - y2),
                  (0, 0, 255), thickness=3)
def get_available_languages() -> set[str]:
    """Return the languages Tesseract reports, or the static fallback set.

    ``AVAILABLE_LANGUAGES`` is returned when querying Tesseract raises an
    ``OSError`` (for example when ``tesseract_cmd`` is invalid).
    """
    try:
        languages = set(tess.get_languages())
    except OSError:  # EnvironmentError is an alias of OSError
        languages = AVAILABLE_LANGUAGES
    return languages
} </style> """, unsafe_allow_html=True) st.markdown(f""" <div class="container"> <img class="logo-img" src="data:image/png;base64,{base64.b64encode(open(LOGO_IMAGE, "rb").read()).decode()}"> <p class="logo-text">GrumpyOCR</p> </div> """, unsafe_allow_html=True) tesseract_version = pytesseract.get_tesseract_version() tesseract_languages = pytesseract.get_languages(config='') st.markdown(f"Version: {tesseract_version}, Langs: {tesseract_languages}") st.sidebar.markdown('### Настройки') languages = ['eng+rus', 'rus', 'eng'] selected_languages = st.sidebar.selectbox('Язык', languages) # Выбор файла с признаками обязателен для дальнейшего выполнения. st.markdown("### 1 Выберите или перенесите файл с текстом") upload_file_object = st.file_uploader('', ['pdf', 'png', 'jpg', 'jpeg']) if not upload_file_object: st.stop() # Небольшой трюк с кэшированием в файл. filename = upload_file_object.name if not upload_file_object.closed: