def can_run_ocr():
    """Report whether pytesseract can reach a Tesseract executable.

    Returns:
        bool: True when ``pytesseract.get_tesseract_version()`` succeeds,
        False when it raises ``TesseractNotFoundError``.
    """
    try:
        pytesseract.get_tesseract_version()
    except pytesseract.TesseractNotFoundError:
        return False
    else:
        return True
def ensure_ocr_enabled():
    """Abort the request with HTTP 400 when Tesseract is unavailable.

    Raises:
        HTTPException: status 400 if pytesseract cannot find the Tesseract
            executable.
    """
    try:
        pytesseract.get_tesseract_version()
    # The clause must name the exception *class*. Writing
    # ``except SomeError():`` builds an instance, and matching against an
    # instance raises TypeError instead of reaching the handler.
    except pytesseract.TesseractNotFoundError as exc:
        raise HTTPException(
            status_code=400,
            detail="OCR(with Tesseract) is not enabled/installed on the server",
        ) from exc
def fullWindow(self):
    """Show the capture overlay as a maximized, frameless, always-on-top window.

    After the window is shown, pytesseract is probed once; if the probe fails
    ``self.showTesseractError()`` is called.
    """
    self.setWindowIcon(QIcon('icon.png'))
    self.setWindowTitle('Capture')
    flags = Qt.WindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint | Qt.Dialog)
    self.setWindowFlags(flags)
    self.showMaximized()
    self.setStyleSheet(
        "background-color: rgba(255,255,255,0.0); border: 3px solid rgb(16, 229, 125);"
    )
    self.show()

    try:
        pytesseract.get_tesseract_version()
    # A bare ``except:`` would also swallow KeyboardInterrupt. SystemExit is
    # kept in the list on purpose: some pytesseract releases appear to raise it
    # for an unparsable version string (NOTE(review): confirm against the
    # installed pytesseract), and that case should still show the error dialog.
    except (Exception, SystemExit):
        self.showTesseractError()
def __init__(self, editor, software_version):
    """Initialise GUI state, configure pytesseract and build the window.

    Args:
        editor: object exposing ``get_profile()`` and a ``settings`` mapping
            with a ``PREFERENCES`` section.
        software_version: value stored unchanged on ``self.software_version``.
    """
    self.logger = get_logger("gui")
    self.editor = editor
    self.software_version = software_version

    # Capture-related state
    self.captured_map_coords = None
    self.values = None
    self.capturing = False
    self.exit_quick_capture = False

    self.profile = self.editor.get_profile('')
    self.profile.aircraft = "hornet"

    settings = self.editor.settings
    self.capture_key = settings.get("PREFERENCES", "capture_key")

    # Point pytesseract at the configured binary ("tesseract" on PATH by default)
    tesseract_path = settings['PREFERENCES'].get('tesseract_path', "tesseract")
    self.logger.info(f"Tesseract path is: {tesseract_path}")
    pytesseract.pytesseract.tesseract_cmd = tesseract_path

    # The capture button is only usable when the binary can be found
    try:
        self.tesseract_version = pytesseract.get_tesseract_version()
    except pytesseract.pytesseract.TesseractNotFoundError:
        self.tesseract_version = None
        status, disabled = "Status: Tesseract not found", True
    else:
        status, disabled = "Status: Not capturing", False
    self.capture_status = status
    self.capture_button_disabled = disabled

    self.logger.info(f"Tesseract version is: {self.tesseract_version}")
    self.window = self.create_gui()
def __init__(self) -> None:
    """Set up the instance and verify that Tesseract can be launched.

    Points pytesseract at the hard-coded Windows install path and prints the
    detected version. If the executable cannot be found, an info dialog is
    shown and the process terminates with exit status 0.

    Raises:
        SystemExit: when pytesseract reports that Tesseract was not found.
    """
    super().__init__()
    self.flagSem = Semaphore(1)
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
    try:
        print(pytesseract.get_tesseract_version())
    except pytesseract.pytesseract.TesseractNotFoundError:
        # Dialog text (Korean): title "tesseract error",
        # message "The tesseract OCR program could not be detected."
        messagebox.showinfo(title="tesseract 에러", message="tesseract OCR 프로그램을 감지 할 수 없습니다.")
        # ``exit()`` is injected by the ``site`` module for interactive use and
        # is missing under ``python -S`` and in frozen executables; raising
        # SystemExit directly has the same effect and always works.
        raise SystemExit(0)
def __init__(self, editor, software_version):
    """Initialise GUI state, read DCS/Tesseract configuration and register hotkeys.

    Args:
        editor: object exposing a ``settings`` mapping with a ``PREFERENCES``
            section.
        software_version: value stored unchanged on ``self.software_version``.
    """
    self.logger = get_logger("gui")
    self.editor = editor
    self.captured_map_coords = None
    self.profile = Profile('')
    self.profile.aircraft = "hornet"
    self.exit_quick_capture = False
    self.values = None
    self.capturing = False
    self.capture_key = try_get_setting(self.editor.settings, "capture_key", "ctrl+t")
    self.quick_capture_hotkey = try_get_setting(self.editor.settings, "quick_capture_hotkey", "ctrl+alt+t")
    self.enter_aircraft_hotkey = try_get_setting(self.editor.settings, "enter_aircraft_hotkey", "ctrl+shift+t")
    self.software_version = software_version
    self.is_focused = True
    self.scaled_dcs_gui = False
    self.selected_wp_type = "WP"

    # Best-effort read of the DCS GUI scaling flag; on any failure the default
    # (False) set above is kept and the problem is only logged.
    try:
        with open(
                f"{self.editor.settings.get('PREFERENCES', 'dcs_path')}\\Config\\options.lua",
                "r") as f:
            dcs_settings = lua.decode(f.read().replace("options = ", ""))
            self.scaled_dcs_gui = dcs_settings["graphics"]["scaleGui"]
    # OSError covers FileNotFoundError plus unreadable/invalid paths; KeyError
    # covers an options file without the graphics/scaleGui entries. Neither
    # should prevent the GUI from starting.
    except (OSError, ValueError, TypeError, KeyError):
        self.logger.error("Failed to decode DCS settings", exc_info=True)

    # Point pytesseract at the configured binary ("tesseract" on PATH by default)
    tesseract_path = self.editor.settings['PREFERENCES'].get(
        'tesseract_path', "tesseract")
    self.logger.info(f"Tesseract path is: {tesseract_path}")
    pytesseract.pytesseract.tesseract_cmd = tesseract_path
    try:
        self.tesseract_version = pytesseract.get_tesseract_version()
        self.capture_status = "Status: Not capturing"
        self.capture_button_disabled = False
    except pytesseract.pytesseract.TesseractNotFoundError:
        self.tesseract_version = None
        self.capture_status = "Status: Tesseract not found"
        self.capture_button_disabled = True

    self.logger.info(f"Tesseract version is: {self.tesseract_version}")
    self.window = self.create_gui()

    keyboard.add_hotkey(self.quick_capture_hotkey, self.toggle_quick_capture)
    # An empty string disables the "enter coordinates" hotkey
    if self.enter_aircraft_hotkey != '':
        keyboard.add_hotkey(self.enter_aircraft_hotkey, self.enter_coords_to_aircraft)
def ocr(filename, genre):
    """OCR every image under ``target_cleaned/<filename>`` into one text file.

    Each image is rotated by the angle Tesseract's orientation detection
    reports, recognised as English text, and appended to
    ``target_cleaned/<filename>/<filename>.txt``.

    Args:
        filename: name of the sub-directory of ``target_cleaned`` holding the
            page images; also the stem of the output text file.
        genre: passed through unchanged to the template.

    Returns:
        The rendered ``spellcheck.html`` template.
    """
    # NOTE(review): the doubled backslashes inside a raw string are kept
    # literally; the path is left exactly as it was.
    pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
    print(pytesseract.get_tesseract_version())

    page_dir = os.path.join(target_cleaned, filename)
    output_name = filename + ".txt"
    # The transcript lives in the directory being listed; skip it so a second
    # run does not try to open its own output as an image.
    image_files = [name for name in os.listdir(page_dir) if name != output_name]
    image_files.sort(key=natural_keys)
    print(image_files)

    # Context managers close the transcript and each PIL image even if OCR fails
    with open(os.path.join(page_dir, output_name), "a") as file_ptr:
        for image_name in image_files:
            print(image_name)
            image_path = os.path.join(page_dir, image_name)
            img = cv2.imread(image_path)
            with Image.open(image_path) as pil_image:
                newdata = pytesseract.image_to_osd(pil_image, output_type=Output.DICT)
            print(newdata, newdata['rotate'], type(newdata), newdata['orientation'])
            # Apply the rotation reported by orientation detection before OCR
            img = imutils.rotate_bound(img, newdata['rotate'])
            text = pytesseract.image_to_string(img, lang='eng')
            print(len(text))
            file_ptr.write(text)

    return render_template("spellcheck.html", filename = filename, genre = genre)
def get_text(img_path):
    """Run Tesseract on the image at ``img_path`` and return the recognised text.

    The image is read with OpenCV, passed through ``remove_noise`` and then
    recognised with Indonesian+English language data while digits are
    blacklisted.

    Args:
        img_path: path of the image file to scan.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    print('Tesseract version:', pytesseract.get_tesseract_version())
    print('Processing...')

    # Read and preprocess. Only noise removal is active; grayscale,
    # thresholding and dilation are intentionally left out.
    denoised = remove_noise(cv2.imread(img_path))

    # Tesseract option reference
    #
    # --oem (OCR Engine Mode):
    #   0  Legacy engine only.
    #   1  Neural nets LSTM engine only.
    #   2  Legacy + LSTM engines.
    #   3  Default, based on what is available.
    #
    # --psm (Page Segmentation Mode):
    #   0  Orientation and script detection (OSD) only.
    #   1  Automatic page segmentation with OSD.
    #   2  Automatic page segmentation, but no OSD, or OCR.
    #   3  Fully automatic page segmentation, but no OSD. (Default)
    #   4  Assume a single column of text of variable sizes.
    #   5  Assume a single uniform block of vertically aligned text.
    #   6  Assume a single uniform block of text.
    #   7  Treat the image as a single text line.
    #   8  Treat the image as a single word.
    #   9  Treat the image as a single word in a circle.
    #   10 Treat the image as a single character.
    #   11 Sparse text. Find as much text as possible in no particular order.
    #   12 Sparse text with OSD.
    #   13 Raw line. Treat the image as a single text line,
    #      bypassing hacks that are Tesseract-specific.
    #
    # tessedit_char_blacklist: characters Tesseract should ignore.
    custom_config = r'-l ind+eng -c tessedit_char_blacklist=0123456789 --oem 3 --psm 6'
    recognised = pytesseract.image_to_string(denoised, config=custom_config)

    print('Done.')
    print('Scan results:', recognised)
    return recognised
def __init__(
    self,
    custom_config=r"--oem 3 --psm 6",
    tesseract_path="",
    verbose=False,
):
    """Store the OCR configuration and optionally override the Tesseract path.

    Args:
        custom_config: Tesseract command-line options kept on the instance.
        tesseract_path: executable path; the empty string leaves the current
            pytesseract configuration untouched.
        verbose: when true, print the detected Tesseract version.
    """
    self.custom_config, self.tesseract_path = custom_config, tesseract_path

    # NOTE(review): this sets ``tesseract_cmd`` on whatever object the name
    # ``pytesseract`` is bound to in this file. It only takes effect if that
    # is the inner ``pytesseract.pytesseract`` module; confirm the import.
    override_binary = tesseract_path != ""
    if override_binary:
        pytesseract.tesseract_cmd = tesseract_path

    if verbose:
        print("Tesseract version:", get_tesseract_version())
def ocr(pic_name, output_path, ocr_lang='chi_sim'):
    """Recognise words in an image and draw the confident ones.

    The image is shrunk to a quarter of its size and passed to Tesseract. The
    words with a confidence above 60 and non-blank text are sorted top-to-bottom
    then left-to-right, printed, and handed to ``draw_boxes``.

    Args:
        pic_name: path of the image to read.
        output_path: forwarded to ``draw_boxes``.
        ocr_lang: Tesseract language code(s) to use.

    Raises:
        SystemExit: with status -1 when the image cannot be read.
    """
    im = cv2.imread(pic_name)
    if im is None:
        print('Image file not exists!')
        # ``exit()`` comes from ``site`` and is not guaranteed to exist;
        # raising SystemExit is the equivalent that always works.
        raise SystemExit(-1)

    im = cv2.resize(im, None, fx=1 / 4, fy=1 / 4, interpolation=cv2.INTER_LINEAR)
    print('image size:', im.shape)
    print(ts.get_tesseract_version())

    # Only the word-level table is used below, so the separate string and
    # character-box passes (two extra full OCR runs) are not performed.
    data = ts.image_to_data(im, lang=ocr_lang, output_type=ts.Output.DICT)

    df = pd.DataFrame(data)
    df = df[['left', 'width', 'top', 'height', 'text', 'conf']]

    # Depending on the pytesseract/Tesseract version, ``conf`` may arrive as
    # int, float or str; compare on a numeric view and leave the column as is.
    confidence = pd.to_numeric(df['conf'], errors='coerce')
    df = df[confidence > 60]

    # Drop rows whose text is empty or made only of spaces/tabs. Both steps
    # must go through the ``.str`` accessor: a plain ``Series.replace`` only
    # matches whole values and would let tab-only strings through.
    df = df[df['text'].str.strip(' \t').str.len() > 0]

    df = df.sort_values(['top', 'left'])
    print(df)
    draw_boxes(im, df, pic_name, output_path)
def main():
    """Call ``runningGame()``, print the Tesseract version, then call ``gameStart()``."""
    runningGame()
    tesseract_version = pytesseract.get_tesseract_version()
    print(tesseract_version)
    gameStart()
'''
Created on 02-Oct-2020

@author: somsh

Convert a PDF to an image with pdf2image, then OCR it as Bengali text with
Tesseract and write the result to pp5.txt.
'''
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import pytesseract as tess
import pdf2image
from PIL import Image,ImageEnhance,ImageFilter

# Raw string: the Windows path contains backslash sequences (\S, \e, \j, \p)
# that are invalid escapes in a normal literal and trigger syntax warnings.
pages=pdf2image.convert_from_path(r'D:\Software\eclipse\jee-2019-12\eclipse-workspace\pdf_word_convert\pdf_word_convert\pp5.pdf',1000)

# NOTE(review): every page is saved to the same file name, so for a
# multi-page PDF only the last page survives and gets recognised.
for page in pages:
    page.save('pp5.jpg','JPEG')

im=Image.open("pp5.jpg")
im=im.convert('RGB')
# Optional enhancement steps, currently disabled:
# im=im.filter(ImageFilter.MedianFilter())
# enhancer=ImageEnhance.Contrast(im)
# im=enhancer.enhance(2)
# im=im.convert('1')
im.save("enh_pp5.jpg")

tess.pytesseract.tesseract_cmd='D:/Software/Tesseract-OCR/tesseract.exe'
print(tess.get_tesseract_version())
text=tess.pytesseract.image_to_string('enh_pp5.jpg',lang='ben')

# The context manager closes the file even if the write fails
with open('pp5.txt','w',encoding="utf-8") as f:
    f.write(text)
import cv2 import numpy as np import pytesseract import argparse import os from imutils import resize, grab_contours from skimage.filters import threshold_local from pyimagesearch.transform import four_point_transform try: from PIL import Image except ImportError: import Image print("Using tesseract version:", pytesseract.get_tesseract_version()) # Process command line arguments: ap = argparse.ArgumentParser() ap.add_argument("-i", "--image", required=True, help="Path to image to be parsed") ap.add_argument("-p", "--preprocess", type=str, default="thresh", help="Type of preprocessing used (Default: thresh)") ap.add_argument("-e", "--engine", type=str,
import cv2
import pytesseract
import tensorflow as tf
import sys
import keras

# Print the interpreter version and the version of each OCR/ML dependency,
# one "[INFO] name=version" line per component.
print(f"[INFO] python={sys.version}")
print(f"[INFO] cv2={cv2.__version__!s}")
print(f"[INFO] tensorflow={tf.__version__!s}")
print(f"[INFO] keras={keras.__version__!s}")
print(f"[INFO] tesseract={pytesseract.get_tesseract_version()!s}")
def __init__(
        self,
        input_dir=None,
        out_file=None,
        *,
        files_list=None,
        task_class=PdfExtractTask,
        # Config params
        small=False,
        check_input=True,
        chunksize=None,
        saving_interval=5000,
        max_files_memory=3000,
        files_pattern='*.pdf',
        # Task_params
        ocr=False,
        ocr_image_size=None,
        ocr_lang='por',
        features='all',
        image_format='jpeg',
        image_size=None,
        **ray_params):
    """Validate inputs and prepare the extraction configuration.

    ``input_dir`` and ``out_file`` are resolved to absolute paths when given,
    and ``files_list`` entries are converted to ``Path``. Input checks run when
    ``check_input`` is true; the output-file check is skipped when ``small``
    is true. With ``ocr`` enabled, the Tesseract version is queried so a
    missing installation raises here, not later during processing.

    Remaining keyword arguments are kept as ``self.ray_params``; its
    ``num_cpus`` entry, when truthy, overrides ``os.cpu_count()``.
    """
    # Normalise the user-supplied locations
    self.input_dir = None
    if input_dir:
        self.input_dir = Path(input_dir).resolve()

    self.files_list = None
    if files_list:
        self.files_list = list(map(Path, files_list))

    self.out_file = None
    if out_file:
        self.out_file = Path(out_file).resolve()

    # Early validation, before any further state is created
    if check_input:
        self._check_input()

    if not small:
        self._check_outfile()

    if ocr:
        # Will raise exception if tesseract was not found
        get_tesseract_version()

    requested_cpus = ray_params.get('num_cpus')
    self.num_cpus = requested_cpus if requested_cpus else os.cpu_count()
    self.ray_params = ray_params

    self.chunksize = chunksize
    self.small = small
    self.max_files_memory = max_files_memory
    self.files_pattern = files_pattern
    self.num_skipped = None

    self.task_class = task_class
    self.task_params = dict(
        sel_features=features,
        ocr=ocr,
        ocr_lang=ocr_lang,
        ocr_image_size=ocr_image_size,
        image_format=image_format,
        image_size=image_size,
    )

    columns = self.list_columns()
    schema = self.task_class.get_schema(columns)

    # In "small" mode the results container gets no size limit
    max_results_size = None if small else saving_interval

    self.results = Results(self.input_dir, self.out_file, schema,
                           max_size=max_results_size)
    self.results_queue = Queue(max_files_memory)
print(f"[DEBUG] mapped parts: {parts}") return parts ################ # SETUP ################ # create config file if not already present if not configPath.exists(): print("No config found") writeConfig() readConfig() # pytesseract version print("[INFO] currently using tesseract: " + str(pytesseract.get_tesseract_version())) # other languages can be installed by # sudo apt install tesseract-ocr-[language code] langs = pytesseract.get_languages(config="") print(f"[INFO] following languages are availible:\n {langs}") # get a numerical sorted list of all files and the total count numbers = re.compile(r"(\d+)") # matches numerical token with multiple digits files = sorted(listdir(path=src_image_path), key=numericalSort) totalNumOfImages = len(files) if enable_debug: print(f'[INFO] {totalNumOfImages} image files are present in "{src_image_path}"') print(f"[INFO] Following files are present:\n {files}") ################
def get_tesseract_version() -> version.Version:
    """Return the Tesseract version reported by pytesseract as a parsed ``Version``.

    Only the first line of the reported version text is parsed.
    """
    reported = str(pytesseract.get_tesseract_version())
    first_line = reported.splitlines()[0]
    return version.parse(first_line)
def read_root():
    """Return the Tesseract version reported by pytesseract."""
    tesseract_version = pytesseract.get_tesseract_version()
    return tesseract_version
import cv2 # 3.4.2 import pytesseract # 5.0.0-alpha.20200328 from PIL import Image, ImageEnhance print(cv2.__version__) pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' print(pytesseract.get_tesseract_version()) orig_name = "imgs/test_marker.jpg" rotated = "imgs/rotated.jpg" resize = False params = '-c tessedit_char_whitelist=0123456789. --psm 11 --dpi 72' img = cv2.imread(rotated) height, width, channels = img.shape imgResized = img if resize: imgResized = cv2.resize(img, (width * 3, height * 3)) im = Image.fromarray(imgResized) cv2.imshow("img", imgResized) cv2.waitKey() data = pytesseract.image_to_string(img, config=params) print(data) h, w, c = img.shape boxes = pytesseract.image_to_boxes(img, lang='rus', config=params)
except (SystemError, NameError): trayicon = QtWidgets.QSystemTrayIcon( QtGui.QIcon( QtGui.QPixmap.fromImage(QtGui.QImage(1, 1, QtGui.QImage.Format_Mono)) ) ) trayicon.show() trayicon.showMessage("TextShot", msg, QtWidgets.QSystemTrayIcon.NoIcon) trayicon.hide() if __name__ == "__main__": QtCore.QCoreApplication.setAttribute(Qt.AA_DisableHighDpiScaling) app = QtWidgets.QApplication(sys.argv) try: pytesseract.get_tesseract_version() except EnvironmentError: notify( "Tesseract is either not installed or cannot be reached.\n" "Have you installed it and added the install directory to your system path?" ) print( "ERROR: Tesseract is either not installed or cannot be reached.\n" "Have you installed it and added the install directory to your system path?" ) sys.exit() window = QtWidgets.QMainWindow() snipper = Snipper(window) snipper.show() sys.exit(app.exec_())
#save the dataframe(pdf) data into csv save_to_csv(df,PARSE_DATA_CSVS+pdf_file_name_without_ext+".csv") print("CSV saved") except Exception as e: print('ERROR:', e, pdf_file_name_without_ext) traceback.print_exc() finally: print("Clean up working files...") shutil.rmtree(input_pdf_images_path, ignore_errors=True) shutil.rmtree(input_images_blocks_path, ignore_errors=True) end_time = time.time() return pdf_file_name_without_ext, end_time - begin_time if __name__ == '__main__': print('Tesseract Version:', pytesseract.get_tesseract_version()) print('multiprocessing cpu_count:', multiprocessing.cpu_count()) print('os cpu_count:', os.cpu_count()) print('sched_getaffinity:', len(os.sched_getaffinity(0))) #a_pool = multiprocessing.Pool(multiprocessing.cpu_count()) #results = a_pool.map(pdf_process, state_pdfs_files) with MPIPoolExecutor() as executor: results = executor.map(pdf_process, state_pdfs_files) for res in results: print(res)
print(filename) print("Removing the MSS screenshot.") import os os.remove('mss_fullscreen.png') except (IOError, OSError) as e: print bcolors.FAIL + "ERROR" + bcolors.ENDC print e except: print("Unexpected error:", sys.exc_info()[0]) raise else: print "" print "Version: " + mss.__version__ print "" print bcolors.BOLD + bcolors.OKGREEN + "MSS working correctly." + bcolors.ENDC print "_____________________________________________________________________________________" print "" print bcolors.OKBLUE + "Testing Pytesseract" + bcolors.ENDC print "" try: print "pytesseract.get_tesseract_version()" print pytesseract.get_tesseract_version() except (IOError, OSError) as e: print bcolors.FAIL + "ERROR" + bcolors.ENDC print e else: print("") print bcolors.BOLD + bcolors.OKGREEN + "Pytesseract working correctly." + bcolors.ENDC
def hasOCR():
    """Report whether pytesseract can query a working Tesseract installation.

    Returns:
        bool: True when ``pytesseract.get_tesseract_version()`` succeeds,
        False when it fails.
    """
    try:
        pytesseract.get_tesseract_version()
    # A bare ``except:`` would also turn Ctrl+C (KeyboardInterrupt) into
    # "no OCR". SystemExit stays in the list on purpose: some pytesseract
    # releases appear to raise it for an unparsable version string
    # (NOTE(review): confirm against the installed pytesseract), which still
    # means OCR is unusable.
    except (Exception, SystemExit):
        return False
    return True
def handler_tesseract():
    """Serve the Tesseract endpoint.

    GET returns a small HTML page showing the Tesseract version.

    POST queues OCR tasks and returns a JSON list with one entry per image:
    every uploaded file if any are present, otherwise the base64 image in the
    ``b64_img`` form field. Each entry holds ``result`` (1 when the task
    already reports SUCCESS, else 0) plus either ``txt`` or the Celery task id
    and status. A request with neither input, or with malformed base64,
    yields an entry with ``result=0`` and an ``err`` message.

    Any other method returns an empty JSON list.
    """
    if request.method == "GET":
        html = "<title>Tesseract</title>" \
               "<h2>Hello, Tesseract Server!</h2>" \
               "<i>Current Ver: %s </i>" % pytesseract.get_tesseract_version()
        return html

    result_bucket = []
    if request.method == "POST":
        b64_img = request.form.get("b64_img")
        files = request.files
        if not b64_img and not files:
            result_bucket.append(
                dict(result=0, err="Missing params: b64_img or files ."))
        elif files:
            # Uploaded files take precedence over the base64 field
            for fk in files:
                response = dict(origin_name=fk)
                response.update(_enqueue_ocr(BytesIO(files[fk].read())))
                result_bucket.append(response)
        else:
            try:
                decode_data = base64.b64decode(b64_img)
            # binascii.Error (bad padding) is a ValueError subclass; report
            # it in the response so the request does not fail.
            except ValueError:
                result_bucket.append(
                    dict(result=0, err="Invalid base64 data in b64_img ."))
            else:
                result_bucket.append(_enqueue_ocr(BytesIO(decode_data)))

    return jsonify(result_bucket)


def _enqueue_ocr(im_buff):
    """Queue one image buffer for OCR and describe the task state as a dict."""
    # NOTE(review): the task payload is pickled; the Celery broker must be
    # trusted, since workers unpickle whatever they receive from it.
    async_result = ocr_tesseract.apply_async(
        kwargs=dict(im_buff=im_buff),
        serializer="pickle",
    )
    if async_result.status == "SUCCESS":
        return dict(result=1, txt=async_result.result)
    return dict(
        result=0,
        celery_id=async_result.task_id,
        celery_st=async_result.status,
    )
def main():
    """Capture or load a word-search image, solve it, and optionally draw the result.

    Requires Tesseract 4 or newer. With ``args.everything`` a camera and the
    UART drawer are set up and a reference frame is calibrated first. The
    main loop then repeats capture/load -> undistort -> shadow removal ->
    segmentation -> OCR -> solve -> coordinate conversion (and drawing when
    ``args.everything`` is set) until interrupted with Ctrl+C.

    NOTE(review): ``cam``, ``xyz``, ``xyz_params`` and ``drawer`` are only
    assigned inside the ``args.everything`` branch but are used in the loop
    regardless. Without that flag the loop body raises NameError on every
    pass, which the generic handler prints. Confirm the supported flag
    combinations.
    """
    args = parse_args()

    # Compare the whole major component, not just the first character, so
    # that multi-digit major versions are handled correctly.
    major_version = int(str(pytesseract.get_tesseract_version()).split('.')[0])
    if major_version < 4:
        sys.exit('Tesseract 4.0.0 or greater required!')

    if args.everything:
        jetson_UART = "/dev/ttyTHS1"
        drawer = drw.Drawer(jetson_UART)
        cam = cv.VideoCapture(0, cv.CAP_V4L2)
        cam.set(3, 1280)  # property 3: frame width
        cam.set(4, 720)  # property 4: frame height

        # Reference frame used later to map image points to drawing coordinates
        xyz = capture_image(cam)
        xyz_params = chessboard_calibrate('calibration_dummy', 6, 8, debug=False)
        ret, mtx, dist, rvecs, tvecs = xyz_params
        h, w = xyz.shape[:2]
        newcameramtx, roi = cv.getOptimalNewCameraMatrix(mtx, dist, (w, h), 1, (w, h))
        xyz = cv.undistort(xyz, mtx, dist, None, newcameramtx)
        x, y, w, h = roi
        xyz = xyz[y:y + h, x:x + w]
        xy_check = i2wt.uv_to_xy(xyz, xyz_params, [], True)
        display(xy_check[0])
        print(xy_check[1])

    while True:
        try:
            if args.image:
                img = cv.imread(args.image)
            else:
                img = capture_image(cam)

            # Camera calibration (only for live captures)
            # param order: ret, mtx, dist, rvecs, tvecs
            if not args.image:
                params = chessboard_calibrate('calibration', 6, 8, debug=False)
                ret, mtx, dist, rvecs, tvecs = params
                h, w = img.shape[:2]
                newcameramtx, roi = cv.getOptimalNewCameraMatrix(
                    mtx, dist, (w, h), 1, (w, h))
                img = cv.undistort(img, mtx, dist, None, newcameramtx)
                x, y, w, h = roi
                img = img[y:y + h, x:x + w]

            img = remove_shadow(img)
            puzzle, bank, x_offset, y_offset = segment(img, True)
            detected_puzzle, detected_bank, char_coords = tesseract(
                puzzle, bank, x_offset, y_offset, debug=True, img=img)
            solved_word_points = permutative_solve(detected_bank, detected_puzzle)
            print(solved_word_points)
            solved_uv_points = i2wt.wordsearch_to_uv(char_coords, solved_word_points)
            print(solved_uv_points)
            to_MSP_points = i2wt.uv_to_xy(xyz, xyz_params, solved_uv_points, False)
            display(to_MSP_points[0])

            if args.everything:
                # Convert each solved segment into drawer steps
                scaling_factor_x = 0.22
                scaling_factor_y = 0.22
                start_offset_x = 3.75
                start_offset_y = 6.6
                drawer.read(1)
                for point_pair in to_MSP_points[1]:
                    x1 = int(
                        round((point_pair[0][0] + start_offset_x) / scaling_factor_x))
                    y1 = int(
                        round((point_pair[0][1] + start_offset_y) / scaling_factor_y))
                    x2 = int(
                        round((point_pair[1][0] + start_offset_x) / scaling_factor_x))
                    y2 = int(
                        round((point_pair[1][1] + start_offset_y) / scaling_factor_y))
                    to_draw = [(x1, y1), (x2, y2)]
                    drawer.draw(to_draw)
                    drawer.read(1)
                drawer.send(255)

            cv.destroyAllWindows()
        except Exception as e:
            # Any failure in one pass is printed and the loop tries again
            print(e)
        except (KeyboardInterrupt):
            print('See ya later!')
            if args.everything:
                drawer.cleanup()
            break
def extract_time(video, log):
    """Determine the recording date and time of a video.

    Two sources are tried: the date/time parsed from the file name
    (``extract_date``) and the timestamp read by OCR from the first frame.
    Each decision and failure is written to ``log``.

    Selection rules:
        * both sources failed: the built-in default date/time is returned as is;
        * only one source worked: that source is used;
        * both worked: the file-name date is used with the OCR time if the two
          times differ by less than one hour, otherwise with the file-name time.
    Every result except the default is passed through ``convert_to_UTC``.

    Args:
        video: path of the video file.
        log: writable text stream.

    Returns:
        A ``(date, time)`` tuple.
    """
    print(pytesseract.get_tesseract_version())
    threshold_error = timedelta(hours=1, minutes=0)
    ocr_time_failed = False
    file_time_failed = False
    file_name_time = None
    file_name_date = None
    default_time = timedelta(hours=9, minutes=0)
    default_date = "2020-05-28"

    try:
        file_name_date, file_name_time = extract_date(video)
        # Called only to validate the date format; the result is not needed
        datetime.strptime(file_name_date, "%Y-%m-%d")
    except Exception as e:
        log.write("ERROR in extracting the date-time from the file_name\n")
        log.write(str(e) + "\n")
        file_time_failed = True

    video_object = None
    try:
        video_object = cv2.VideoCapture(video)
        print(video)
        ret, frame = video_object.read()
        print(ret)
        ocr_time_stamp = get_timestamp(frame)
        ocr_date, ocr_time = clean_OCR_Time(ocr_time_stamp)
    except Exception as e:
        log.write(video + "ERROR in extracting the date-time from the OCR\n")
        log.write(str(e) + "\n")
        ocr_time_failed = True
    finally:
        # Free the capture handle whether or not a frame could be read
        if video_object is not None:
            video_object.release()

    if (file_time_failed and ocr_time_failed):
        log.write("Using a default time_stamp " + default_date + 'T' +
                  str(default_time) + "\n")
        return (default_date, default_time)
    elif ocr_time_failed:
        log.write("Using file extracted time_stamp " + file_name_date + "T" +
                  str(file_name_time) + "\n")
        file_name_date, file_name_time = convert_to_UTC(
            file_name_date, file_name_time)
        return (file_name_date, file_name_time)
    elif file_time_failed:
        log.write("Using OCR extracted time_stamp and OCR date " + ocr_date +
                  "T" + str(ocr_time) + "\n")
        ocr_date, ocr_time = convert_to_UTC(ocr_date, ocr_time)
        return (ocr_date, ocr_time)
    else:
        if abs(ocr_time - file_name_time) < threshold_error:
            log.write("Using OCR timestamp " + file_name_date + "T" +
                      str(ocr_time) + "\n")
            file_name_date, ocr_time = convert_to_UTC(file_name_date, ocr_time)
            return (file_name_date, ocr_time)
        else:
            log.write("Using file_name timestamp " + file_name_date + "T" +
                      str(file_name_time) + "\n")
            file_name_date, file_name_time = convert_to_UTC(
                file_name_date, file_name_time)
            return (file_name_date, file_name_time)
#!/usr/bin/env python
# Installation smoke test: import each dependency in turn and print its version.
import platform
import os

print("Platform {}".format(platform.platform()))
print("Python v {}".format(platform.python_version()))

from PyQt5 import QtCore
print("Successfully installed PyQt v. {}".format(QtCore.PYQT_VERSION_STR))

import vtk
print("Successfully installed vtk v. {}".format(vtk.vtkVersion.GetVTKSourceVersion()))

import pytesseract
# NOTE(review): the value printed is what get_tesseract_version() returns,
# i.e. the Tesseract engine version, although the message says "pytesseract".
print("Successfully installed pytesseract v. {}".format(pytesseract.get_tesseract_version()))

# pycaffe is optional. ``except Exception`` keeps the check tolerant of any
# import-time failure without also swallowing KeyboardInterrupt/SystemExit
# the way a bare ``except:`` does.
try:
    import caffe
    print("Successfully installed pycaffe")
except Exception:
    print("Error: pycaffe not installed ! (python3?)")
def get_pixelsize(self, debug=False):
    """
    Reads the scalebar from images of the Tecnai TEM microscopes using
    text recognition via pytesseract, or asks for manual input when
    pytesseract is not installed, the tesseract engine is unusable, or the
    text cannot be read.

    The result is also stored on the instance as `pixelsize`, `unit`,
    `scalebarlength` and `scalebarlength_px` (the latter two only when a
    scale bar was found).

    Parameters
    ----------
    debug : bool, optional
        enable debug mode which prints extra information and figures to
        troubleshoot any issues with calibration. The default is False.

    Returns
    -------
    pixelsize : float
        the pixelsize in calibrated (physical) units
    unit : string
        the physical unit of the pixelsize
    """
    import re

    #without a detected scale bar, fall back to asking for the pixel size
    if len(self.scalebar) == 0:
        print('[WARNING] tecnai.get_pixelsize: original scale bar not found!')
        pixelsize = float(input('Please give pixelsize in nm: '))
        self.unit = 'nm'
        self.pixelsize = pixelsize
        return pixelsize,'nm'
    else:
        #findContours returns 2 values from OpenCV 4 on and 3 values before
        if int(cv2.__version__[0]) >= 4:
            corners,_ = cv2.findContours(self.scalebar,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)
        else:
            _,corners,_ = cv2.findContours(self.scalebar,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)

    #find contour corners sorted left to right
    corners = sorted(corners, key=lambda c: cv2.boundingRect(c)[0])

    #length in pixels between bottom left corners of vertical bars
    #(assumes points 1 and 7 of the leftmost contour are those corners
    #- TODO confirm)
    barlength = corners[0][7,0,0]-corners[0][1,0,0]
    if debug:
        import matplotlib.pyplot as plt
        print('\n------- DEBUGGING IMAGE CALIBRATION -------')
        print('- length:',barlength,'pixels')
        plt.figure('[DEBUG MODE] scale bar corners')
        plt.imshow(self.scalebar)
        plt.scatter(corners[0][:,0,0],corners[0][:,0,1],color='r',label='corners')
        plt.scatter(corners[0][[1,7],0,0],corners[0][[1,7],0,1],color='green',label='used for calibration')
        plt.legend()
        plt.show(block=False)

    #take the text of the databar: the columns spanned by the remaining
    #contours, with a margin that scales with the image width
    bartext = self.scalebar[:,
        min(corners[1][:,0,0])-int(6*self.shape[1]/1024):\
        max(corners[-1][:,0,0])+int(6*self.shape[1]/1024+1)
    ]
    #invert the intensities
    bartext = bartext.max() - bartext

    #upscale if needed for OCR
    if self.shape[1] < 4096:
        if self.shape[1] < 2048:
            factor = 4
        else:
            factor = 2
        bartextshape = np.shape(bartext)
        bartext = cv2.resize(
            bartext,
            (factor*bartextshape[1],factor*bartextshape[0]),
            interpolation = cv2.INTER_CUBIC
        )
        #binarize with Otsu's threshold, then erode with a 5x5 kernel
        bartext = cv2.erode(
            cv2.threshold(bartext,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)[1],
            np.ones((5,5),np.uint8)
        )
        if debug:
            print('- preprocessing text, resizing text image from',bartextshape,'to',np.shape(bartext))

    try:
        #load tesseract-OCR for reading the text
        import pytesseract

        #switch error handling from a ValueError (we may also raise later
        #in case of text recognition problems) to one we can only raise
        #here, so we can give the correct warning
        #NOTE(review): pytesseract's TesseractNotFoundError appears to derive
        #from EnvironmentError and not FileNotFoundError - confirm; if so a
        #missing engine ends up in the generic handler below, not the
        #FileNotFoundError one
        try:
            tesseract_version = float(str(pytesseract.get_tesseract_version())[:3])
        except ValueError:
            raise FileNotFoundError

        #settings vary per version, so use tesseract_version to pick the
        #correct ones
        if tesseract_version == 4.0:
            text = pytesseract.image_to_string(
                bartext,
                config="--oem 0 -c tessedit_char_whitelist=0123456789pnuµm --psm 7"
            )
            #oem 0 selects older version of tesseract which still takes the char_whitelist param
            #tessedit_char_whitelist takes list of characters it searches for (to reduce reading errors)
            #psm 7 is a mode that tells tesseract to assume a single line of text in the image
        else:
            text = pytesseract.image_to_string(
                bartext,
                config="-c tessedit_char_whitelist=0123456789pnuµm --psm 7"
            )
            #since version 4.1 char whitelist is added back

        #strip form feed characters from the OCR output
        text = text.replace('\x0c','')
        if debug:
            #plt was imported in the first debug block above
            plt.figure('[DEBUG MODE] scale bar text')
            plt.imshow(bartext)
            plt.show(block=False)
            print('- text:',text)

        #split value and unit: first run of digits, first run of lowercase
        #ASCII letters
        value = float(re.findall(r'\d+',text)[0])
        unit = re.findall(r'[a-z]+',text)[0]

    #give different warnings for missing installation or reading problems
    except ImportError:
        print('pytesseract not found, defaulting to manual mode')
        unit = input('give scale bar unit: ')
        value = float(input('give scale bar size in '+unit+': '))
    except FileNotFoundError:
        print('[WARNING] tecnai.get_pixelsize(): tesseract OCR engine was'+
              ' not found by pytesseract. Switching to manual mode.')
        unit = input('give scale bar unit: ')
        value = float(input('give scale bar size in '+unit+': '))
    #NOTE(review): the bare except also catches KeyboardInterrupt/SystemExit
    except:
        print('[WARNING] tecnai.get_pixelsize(): could not read scale bar'+
              ' text, perhaps try debug=True. Switching to manual mode.')
        unit = input('give scale bar unit: ')
        value = float(input('give scale bar size in '+unit+': '))

    #normalize the ASCII spelling of micrometer
    if unit == 'um':
        unit = 'µm'

    #determine pixelsize
    pixelsize = value/barlength

    if debug:
        print('- value:',value)
        print('- unit:',unit)
        print('- 2 figures created')
        print('-------------------------------------------\n')

    print('Original scale bar: {:.3g}'.format(value),unit)
    print('Pixel size: {:.5g}'.format(pixelsize),unit)

    self.pixelsize = pixelsize
    self.unit = unit
    self.scalebarlength = value
    self.scalebarlength_px = barlength

    return pixelsize,unit
if numpy_installed: import numpy as np if pandas_installed: import pandas try: from PIL import Image except ImportError: import Image IS_PYTHON_2 = version_info[:1] < (3, ) IS_PYTHON_3 = not IS_PYTHON_2 TESSERACT_VERSION = tuple(get_tesseract_version().version) # to skip tests DATA_DIR = path.join(path.dirname(path.abspath(__file__)), 'data') TEST_JPEG = path.join(DATA_DIR, 'test.jpg') pytestmark = pytest.mark.pytesseract # used marker for the module string_type = unicode if IS_PYTHON_2 else str # noqa: 821 @pytest.fixture(scope='session') def test_file(): return TEST_JPEG @pytest.fixture(scope='session') def test_invalid_file():
# Environment report: prints the version of every library the project uses.
#
# Install commands (version originally used in parentheses):
# OpenCV(4.1.0) pip3 install opencv-python
# Python Image Library -> Pillow(6.0.0) pip3 install pillow
# Numpy(1.16.2) pip3 install numpy
# Scipy(1.2.1) pip3 install scipy
# Matplotlib(3.0.3) pip3 install matplotlib
# Imutils(0.5.2) pip3 install imutils
# Tesseract(4.0.0) pip3 install pytesseract
# Sklearn(0.21.3) pip3 install -U scikit-learn
# Utils(0.21.3) pip3 install utils
import cv2
import platform
import PIL
import PIL.Image as Image
import numpy as np
import scipy
import matplotlib as mpl
import imutils
import pytesseract
import sklearn

# ``Image.PILLOW_VERSION`` no longer exists in newer Pillow releases; prefer
# ``PIL.__version__`` and fall back to the old constant on old installations.
pillow_version = getattr(PIL, "__version__", None) or getattr(
    Image, "PILLOW_VERSION", "unknown")

print("You are running python", platform.python_version())
print("You are running opencv", cv2.__version__)
print("You are running pillow", pillow_version)
print("You are running numpy", np.version.version)
print("You are running scipy", scipy.version.version)
print("You are running matplotlib", mpl.__version__)
print("You are running imutils", imutils.__version__)
print("You are running tesseract", pytesseract.get_tesseract_version())
print("You are running sklearn", sklearn.__version__)