def __init__(self, empty_char=properties.empty_char, is_eval=False):
    self.empty_char = empty_char
    self.is_eval = is_eval
    self.api_single_line = tesserocr.PyTessBaseAPI(
        lang='eng',
        psm=tesserocr.PSM.SINGLE_LINE,
        path=properties.tesseract_path,
        oem=tesserocr.OEM.LSTM_ONLY)
    self.api_single_block = tesserocr.PyTessBaseAPI(
        lang='eng',
        psm=tesserocr.PSM.SINGLE_BLOCK,
        path=properties.tesseract_path)
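A possible companion sketch (hypothetical, not from the original source): the two raw PyTessBaseAPI instances above are created outside a `with` block, so a teardown method along these lines would release their native resources:

def close(self):
    # End() frees the underlying tesseract handles held by each API instance.
    self.api_single_line.End()
    self.api_single_block.End()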
def name_reader():
    """Read all of the images as they get introduced to the generator."""
    # First, build the character list.
    # The only characters that should be in this are 0-9 and any character in
    # the NAME_FORMATS.
    ## char_list = set(''.join(NAME_FORMATS))
    ## char_list.remove("#")  # It is in the NAME_FORMATS but has special meaning.
    ## # Because the list needs to be a string, convert it.
    ## # Don't forget the numbers. We need those.
    ## char_list = char_list.union("0123456789")
    ## char_list = ''.join(sorted(char_list))
    # For testing, try a static list that intentionally does not have all
    # letters.
    char_list = " 0123456789MPQTacefhilmnopqrstuy"
    image = yield None
    # Call the tesseract library and build the processing object ("ocr").
    # Specify that all text should be in a single line (SINGLE_LINE).
    with tesserocr.PyTessBaseAPI(psm=tesserocr.PSM.SINGLE_LINE) as ocr:
        # Set the character list.
        ocr.SetVariable("tessedit_char_whitelist", char_list)
        while True:
            # Process all of the images as they are passed by .send().
            ocr.SetImage(image)
            match_name = ocr.GetUTF8Text()
            # Previously, confidence was also returned. However, it is not useful.
            image = yield match_name
            if ADAPTIVE_CLASSIFIER:
                ocr.ClearAdaptiveClassifier()
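A minimal usage sketch (not from the original source; `frame` is a placeholder PIL image) showing how this coroutine is driven:

reader = name_reader()
next(reader)               # prime the generator; runs up to the first yield
name = reader.send(frame)  # send a PIL image, receive the OCR'd text
reader.close()             # finalize the generator, closing the tesseract context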
def parse(url):
    '''Take a URL or path to a file and return the OCR'd text.
    Flag failure with a non-200 status code.
    '''
    if _is_url(url):
        try:
            buffer = cStringIO.StringIO(urllib.urlopen(url).read())
            img = Image.open(buffer)
            res = ocr(img)
            res['url'] = url
            return res
        except Exception:
            return {
                'status': 500,
                'caption': "Couldn't retrieve the image at %s" % url,
                'url': url
            }
    else:  # is a path to a file
        with tesserocr.PyTessBaseAPI() as api:
            api.SetImageFile(url)
            text = api.GetUTF8Text().strip()
            return {
                'status': 200 if text else 415,
                'caption': text or 'Text not found.',
                'url': url
            }
def imge(img_url):
    cv_img = cv2.imread(img_url, cv2.IMREAD_UNCHANGED)
    # Since tesserocr accepts PIL images, convert the OpenCV image to PIL.
    pil_img = Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB))
    # Initialize the API.
    api = tr.PyTessBaseAPI()
    try:
        # Set the PIL image for OCR.
        api.SetImage(pil_img)
        # Google tesseract-ocr has a page segmentation mode (psm) option for
        # specifying OCR types. psm values can be: block of text, single text
        # line, single word, single character, etc.
        # The api.GetComponentImages method exposes this functionality.
        # It returns, per component:
        #   image (PIL.Image): Image object.
        #   bounding box (dict): dict with x, y, w, h keys.
        #   block id (int): textline block id (if blockids is True). None otherwise.
        #   paragraph id (int): textline paragraph id within its block
        #     (if paraids is True). None otherwise.
        boxes = api.GetComponentImages(tr.RIL.TEXTLINE, True)
        # Get the text.
        text = api.GetUTF8Text()
        # Iterate over the returned list and draw rectangles.
        for (im, box, _, _) in boxes:
            x, y, w, h = box['x'], box['y'], box['w'], box['h']
            cv2.rectangle(cv_img, (x, y), (x + w, y + h), color=(0, 255, 0))
            cv2.putText(cv_img, 'text', (x, y - 5),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 255), 1)
    finally:
        api.End()
    cv2.imwrite(img_url, cv_img)
def ocr_whole(self):
    with tr.PyTessBaseAPI(
            path='C:\\Program Files\\Tesseract-OCR\\tessdata\\',
            lang='eng') as api:
        api.SetPageSegMode(tr.PSM.AUTO_OSD)
        item = self.img
        pil_img = Image.fromarray(cv2.cvtColor(item, cv2.COLOR_BGR2RGB))
        api.SetImage(pil_img)
        item_sub_marked = np.copy(item)
        boxes = api.GetComponentImages(tr.RIL.BLOCK, True)
        # get text
        # text = api.GetUTF8Text()
        # print(text)
        # Iterate over the returned list and draw rectangles.
        for (im, box, _, _) in boxes:
            x, y, w, h = box['x'], box['y'], box['w'], box['h']
            cv2.rectangle(item_sub_marked, (x, y), (x + w, y + h),
                          color=(0, 0, 255))
        # out = pytesseract.image_to_string(~resized_afterd, lang='eng', config='--psm 3')
        # out_t = out.strip()
        # if len(out_t) != 0:
        #     item_text_list.append(out_t)
        #     # s = s + f"\nitem {i}:\n" + out_t
        #     s = s + f"\n" + out_t
        path = os.path.join(self.img_ocr_marked_dir,
                            f"{self.filename_without_ext}.png")
        cv2.imwrite(path, item_sub_marked)
        plt.show()
def image_accuracy_test(images, labels):
    correctForms, predictedForms = 0, 0
    times, accuracyPercentages = [], []
    with tesserocr.PyTessBaseAPI() as api:
        for i in range(len(images)):
            startTime = int(round(time.time() * 1000))
            api.SetImageFile(images[i])
            # Convert the output of OCR to lowercase.
            imageText = api.GetUTF8Text().lower()
            # print(imageText)
            predictionAmount, correctPredictions = 0, 0
            labelStringArray = (labels[i]).split()
            for labelStr in labelStringArray:
                # Compare the lowercase label with the output of OCR.
                if labelStr.lower() in imageText:
                    correctPredictions += 1
                # else:
                #     print('File: ', images[i], '|||Not recognized: ', labelStr)
                predictionAmount += 1
            predictedForms += 1
            if correctPredictions / predictionAmount > .999999:
                correctForms += 1
            accuracyPercentages.append(correctPredictions / predictionAmount)
            times.append(int(round(time.time() * 1000)) - startTime)
    print('Forms Processed: ', predictedForms)
    print('Forms Perfectly Recognized: ', correctForms)
    print('Average Processing Time: ', sum(times) / len(times), ' milliseconds')
    print('Average Cell Accuracy: ',
          100 * sum(accuracyPercentages) / len(accuracyPercentages),
          '% of cells correctly recognized\n')
def get_apis(model_names):
    apis = {}
    for model_name in model_names:
        apis[model_name] = tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX,
                                                   lang=model_name,
                                                   psm=tesserocr.PSM.RAW_LINE)
    return apis
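A minimal usage sketch (assumed, not in the source; the model names and file path are placeholders). Each returned instance holds native resources and is not context-managed, so End() should be called when the dict is no longer needed:

apis = get_apis(['eng', 'deu'])
try:
    apis['eng'].SetImageFile('line.png')
    print(apis['eng'].GetUTF8Text())
finally:
    for api in apis.values():
        api.End()  # free the underlying tesseract handle explicitly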
def image_to_string(img, lang):
    with tesserocr.PyTessBaseAPI(lang=lang, psm=3) as api:
        api.SetVariable("tessedit_char_whitelist", " \n" + CHARSET)
        api.SetImage(img)
        api.Recognize()
        words = []
        level = tesserocr.RIL.WORD
        for r in tesserocr.iterate_level(api.GetIterator(), level):
            try:
                word = r.GetUTF8Text(level)
            except RuntimeError:
                continue
            conf = r.Confidence(level)
            # print(f"{word} ({conf})")
            if words:
                previous = words[-1]
                if regex.match(r"[\p{Lu}\p{Ll}]+\-$", previous):
                    # Trailing dash => combine with the previous word.
                    words[-1] = words[-1][:-1] + word
                    continue
            # if conf > 0.95 or (all(c in LETTERS for c in word) and conf > 0.9):
            #     words.append(word)
            #     continue
            # # print(f"LOWCONF! {word} ({conf})")
            words.append(word)
    return filter(bool, words)
def get_word_bounding_boxes(self, page):
    results = []
    boxes = pytesseract.image_to_data(page)
    api = tr.PyTessBaseAPI()
    api.SetImage(page)
    boxes2 = api.GetComponentImages(tr.RIL.WORD, True)
    print(len(boxes2))
    for i, (im, box, _, _) in enumerate(boxes2):
        # im is a PIL image object.
        # box is a dict with x, y, w and h keys.
        api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
    words = 0
    # Parse the TSV output of pytesseract.image_to_data (skip the header row).
    for x, b in enumerate(boxes.splitlines()):
        if x != 0:
            b = b.split()
            if len(b) == 12:
                words += 1
                x, y, w, h, word = (int(b[6]), int(b[7]), int(b[8]),
                                    int(b[9]), b[11])
                top_left_corner = (x, y)
                bottom_right_corner = (w + x, h + y)
                # page = np.array(page)
                results.append([(top_left_corner, bottom_right_corner), word])
    return results
def ocr_regions(self):
    def pre_process_region(region):
        # Get rid of the color.
        pre = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)
        # Otsu threshold
        # pre = cv2.threshold(pre, 200, 255, cv2.THRESH_BINARY)[1]
        return pre

    with tr.PyTessBaseAPI(
            path='C:\\Program Files\\Tesseract-OCR\\tessdata\\',
            lang='eng',
            psm=tr.PSM.AUTO,
            oem=tr.OEM.LSTM_ONLY) as api:
        api.SetPageSegMode(tr.PSM.SINGLE_COLUMN)
        for i in range(len(self.img_region_list)):
            item = pre_process_region(self.img_region_list[i])
            pil_img = Image.fromarray(item)
            api.SetImage(pil_img)
            item_sub_marked = np.copy(item)
            boxes = api.GetComponentImages(tr.RIL.BLOCK, True)
            # Get the text.
            text = api.GetUTF8Text()
            print(text)
            # Iterate over the returned list and draw rectangles.
            for (im, box, _, _) in boxes:
                x, y, w, h = box['x'], box['y'], box['w'], box['h']
                cv2.rectangle(item_sub_marked, (x, y), (x + w, y + h),
                              color=(0, 0, 255))
            path = os.path.join(self.img_regions_ocr_marked_dir,
                                f"{self.filename_without_ext}_{i}.png")
            cv2.imwrite(path, item_sub_marked)
def __init__(self):
    self._lock = threading.Lock()
    self._condition = threading.Condition(self._lock)
    self._queued = []
    self._done = []
    self._stopped = False
    self._tesseract = tesserocr.PyTessBaseAPI()
    self.start()
def setUp(self):
    self._test_dir = os.path.abspath(os.path.dirname(__file__))
    self._image_file = os.path.join(self._test_dir, 'eurotext.tif')
    if pil_installed:
        with open(self._image_file, 'rb') as f:
            self._image = Image.open(f)
            self._image.load()
    self._api = tesserocr.PyTessBaseAPI(init=True)
def process(self):
    """
    Performs the cropping.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # print(self.input_file_grp)
        for (n, input_file) in enumerate(self.input_files):
            # print(input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)
            log.debug("Cropping with tesseract")
            tessapi.SetImage(image)
            #
            # helper variables for saving the box coordinates
            #
            min_x = image.width
            min_y = image.height
            max_x = 0
            max_y = 0
            # iterate over all boxes and compare their extent
            # to the min and max values
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                points, index = points_from_xywh(component[1]), component[2]
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                for pair in points.split(' '):
                    x, y = (int(pair.split(',')[0]), int(pair.split(',')[1]))
                    # Each bound must be tracked independently; chaining these
                    # as if/elif would skip the max updates whenever a min
                    # comparison matched first.
                    if x < min_x:
                        min_x = x
                    if y < min_y:
                        min_y = y
                    if x > max_x:
                        max_x = x
                    if y > max_y:
                        max_y = y
                log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" %
                          (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))
            #
            # set the identified page border
            #
            brd = BorderType(Coords=CoordsType(
                "%i,%i %i,%i %i,%i %i,%i" %
                (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)))
            pcgts.get_Page().set_Border(brd)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def _line_height(self, polygon):
    key = tuple(polygon.centroid.coords[0])
    if key not in self._ocr:
        from .utils import polygons_to_mask
        mask = polygons_to_mask(self._unbinarized.shape, [polygon])
        minx, miny, maxx, maxy = polygon.bounds
        minx, miny = numpy.floor(numpy.array([minx, miny])).astype(numpy.int32)
        maxx, maxy = numpy.ceil(numpy.array([maxx, maxy])).astype(numpy.int32)
        pixels = self._unbinarized[miny:maxy, minx:maxx]
        mask = mask[miny:maxy, minx:maxx]
        pixels[numpy.logical_not(mask)] = 255
        with tesserocr.PyTessBaseAPI(psm=tesserocr.PSM.SINGLE_BLOCK) as api:
            api.SetImage(PIL.Image.fromarray(pixels, "L"))
            heights = []
            for i, data in enumerate(api.GetTextlines()):
                bbox = data[1]
                heights.append(bbox["h"])
            if heights:
                n_lines = len(heights)
                lh = numpy.min(heights)
            else:
                lh = maxy - miny
                n_lines = 1
            if self._debug:
                api.Recognize()
                ri = api.GetIterator()
                level = tesserocr.RIL.TEXTLINE
                text = ""
                # lines = []
                for r in tesserocr.iterate_level(ri, level):
                    # baseline = r.Baseline(level)
                    # if baseline:
                    #     p1, p2 = baseline
                    #     lines.append(shapely.geometry.LineString([p1, p2]))
                    try:
                        text += r.GetUTF8Text(level) + " "
                    except RuntimeError:
                        pass
                # print("txt", text.strip(), "lh", lh, "#", n_lines)
            else:
                text = ""
        self._ocr[key] = (n_lines, lh, text)
    return self._ocr[key]
def preprocess(observation):
    if (observation[0] != None):
        timeStop = datetime.datetime.now()
        print("before frame adjustment: ", timeStop - timeStart)
        buttonList = []
        # Convert list to 3D-array.
        observation = np.array(observation[0]['vision'])
        # Crop to a 210-50x160 input (yellow already filtered out); otherwise x = 75.
        img = observation[125:387, 9:270]
        cv2.imwrite('SequenceBefore.png', img)
        image = Image.open('SequenceBefore.png')
        # Convert to grayscale.
        img = image.convert('L')
        img_np = np.array(img)
        img_np = (img_np > 100) * 255
        img = PIL.Image.fromarray(img_np.astype(np.uint8))
        img = img.resize((int(img.size[0] * 3.5), int(img.size[1] * 3.5)),
                         PIL.Image.BILINEAR)
        timeStop = datetime.datetime.now()
        print("time until frame adjustment: ", timeStop - timeStart)
        with tesserocr.PyTessBaseAPI() as api:
            api.SetImage(img)
            boxes = api.GetComponentImages(tesserocr.RIL.TEXTLINE, True)
            # print('Found {} textline image components.'.format(len(boxes)))
            if (len(boxes) == 5):
                for i, (im, box, _, _) in enumerate(boxes):
                    # im is a PIL image object.
                    # box is a dict with x, y, w and h keys.
                    api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
                    ocrResult = api.GetUTF8Text().lower()
                    conf = api.MeanTextConf()
                    ocrResult = re.sub('[^a-zA-Z]', '', ocrResult)
                    print("the word is: ", ocrResult)
                    # TODO: get the correct vector.
                    try:
                        # vector = np.random.random(300)
                        vector = modelNLP.wv[ocrResult]
                        # print("Found vector")
                        x__ = box['x'] / 3.3 + 20
                        y__ = box['y'] / 3.3 + 140
                        # print("x: ", x__)
                        # print("y: ", y__)
                        buttonList.append(Button(x__, y__, vector))
                    except KeyError as err:
                        print("Word not found in google...")
                        # return None
        if len(buttonList) == 5:
            timeStop = datetime.datetime.now()
            print("time to run all the OCR and the Word2Vec: ",
                  timeStop - timeStart)
            return buttonList
        else:
            return None
    return None
def do_vanish(self, dark=False):
    """
    Detect text in the image and remove it by applying a mask.

    :param dark: default param for different operations based on image brightness
    :return:
    """
    org_img = cv2.imread(self.impath, cv2.IMREAD_UNCHANGED)
    # Rename the original image.
    cv_img = org_img
    # If the parameter is not True, try text detection with additional thresholding.
    if not dark:
        cv_img = cv2.threshold(org_img, 209, 255, cv2.THRESH_BINARY)[1]
    # Convert the image to a single channel.
    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
    # Convert the OpenCV image to PIL for processing with tesserocr.
    pil_img = Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY))
    # Initialize the tesserocr API.
    api = tr.PyTessBaseAPI()
    try:
        # Set the PIL image for the OCR class.
        api.SetImage(pil_img)
        # Get the bounding boxes of the text.
        boxes = api.GetComponentImages(tr.RIL.TEXTLINE, True)
        # Make a copy of the image for processing.
        rec_img = cv_img.copy()
        # Try text detection with the tesserocr API.
        text = api.GetUTF8Text()
        # Retry with the other param if no text was found.
        # (Note: the result of the recursive call is not returned here.)
        if not text.strip() and dark == False:
            self.do_vanish(dark=True)
        # Iterate over the returned list and draw rectangles.
        for (im, box, _, _) in boxes:
            x, y, w, h = box['x'], box['y'], box['w'], box['h']
            cv2.rectangle(rec_img, (x, y), (x + w, y + h), color=(0, 0, 255))
        # Convert the bounding boxes to coordinates.
        coordinates = self.boxes_to_coordinates(boxes)
        # Create a mask of the detected characters in the image.
        mask = self.create_mask(gray, coordinates)
        # Convert to 8-bit for the inpainting function.
        gray_8 = (mask).astype('uint8')
        # Paint over the masked null values.
        dst_TELEA = cv2.inpaint(org_img, gray_8, 3, cv2.INPAINT_TELEA)
    finally:
        api.End()
    return dst_TELEA
def ocr_tesserocr(im, alphabet=None, single_line=False):
    with tesserocr.PyTessBaseAPI() as tess:
        if single_line:
            tess.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
        if alphabet:
            tess.SetVariable("tessedit_char_whitelist", alphabet)
        tess.SetImage(im)
        result = tess.GetUTF8Text()
    return result
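An illustrative call (the file name is a placeholder, not from the source), restricting recognition to digits on a single line:

from PIL import Image

im = Image.open('digits.png')
print(ocr_tesserocr(im, alphabet='0123456789', single_line=True))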
def font_details(req_image):
    with tesserocr.PyTessBaseAPI() as api:
        # image = Image.open(io.BytesIO(req_image))
        # image = req_image
        api.SetImageFile(req_image)
        api.Recognize()  # required to get a result from the next line
        iterator = api.GetIterator()
        info = iterator.WordFontAttributes()
        print(info)
        return info["bold"], info["font_name"], info["pointsize"]
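A hedged note (not from the source): in Tesseract 4+, WordFontAttributes() is generally only populated by the legacy engine; the LSTM engine does not report font attributes. If the snippet above returns None, forcing the legacy engine may help, assuming legacy traineddata files are installed:

# Sketch: request the legacy engine so font attributes are available.
api = tesserocr.PyTessBaseAPI(oem=tesserocr.OEM.TESSERACT_ONLY)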
def _run(self):
    self._check_tessdir()
    self._prime_frame_queue()
    with tesserocr.PyTessBaseAPI(lang=self._config.language) as api:
        api.SetVariable("tessedit_write_images", "T")
        for key, value in self._config.tesseract_variables.items():
            _logger.info('Setting tesseract variable %s=%s', key, value)
            api.SetVariable(key, value)
        if self._config.clear_adaptive_classifier:
            _logger.info('Clearing adaptive classifier after each run')
        for counter in itertools.count():
            frame_data = self._frame_queue.get()
            if not frame_data:
                break
            region_info = self._compute_regions()
            image = self._preprocess_image(frame_data, region_info)
            assert image.mode == 'L'
            api.SetImageBytes(bytes(image.getdata(0)), image.width,
                              image.height, 1, image.width)
            text = api.GetUTF8Text().strip()
            ocr_image = api.GetThresholdedImage()
            if text:
                confidence = api.MeanTextConf()
                white_confidence = self._compute_white_region_confidence(
                    ocr_image, region_info)
                confidence -= 100 - white_confidence
                confidence = max(0, confidence)
                self._text_filter.feed_text(
                    text, confidence=confidence,
                    section=self._config.section_name)
            else:
                self._text_filter.flush_text()
            if counter % (self._config.fps * 2) == 0:
                debug_image = self._render_debug_image(
                    image, ocr_image, api, region_info)
                self._text_filter.feed_image(
                    debug_image, section=self._config.section_name)
            if self._config.clear_adaptive_classifier:
                api.ClearAdaptiveClassifier()
    self._text_filter.flush_text(0)
    _logger.info('Tesseract quit')
def process(self):
    """
    Performs the region segmentation.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        print(self.input_file_grp)
        for (n, input_file) in enumerate(self.input_files):
            pcgts = from_file(self.workspace.download_file(input_file))
            image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)
            log.debug("Detecting regions with tesseract")
            tessapi.SetImage(image)
            for component in tessapi.GetComponentImages(
                    tesserocr.RIL.BLOCK, True):
                points, index = points_from_xywh(component[1]), component[2]
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                # <pg:ReadingOrder>
                ro = pcgts.get_Page().get_ReadingOrder()
                if ro is None:
                    ro = ReadingOrderType()
                    pcgts.get_Page().set_ReadingOrder(ro)
                # <pg:OrderedGroup>
                og = ro.get_OrderedGroup()
                if og is None:
                    og = OrderedGroupType(id="reading-order")
                    ro.set_OrderedGroup(og)
                # <pg:RegionRefIndexed>
                og.add_RegionRefIndexed(
                    RegionRefIndexedType(regionRef=ID, index=index))
                #
                # text region
                #
                pcgts.get_Page().add_TextRegion(
                    TextRegionType(id=ID, Coords=CoordsType(points=points)))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def ocr(name):
    path = Path.cwd() / "tmp" / name / "regions"
    regions = get_data(name)
    for image in path.glob("*.tiff"):
        img = Image.open(image)
        with tr.PyTessBaseAPI(psm=tr.PSM.SINGLE_LINE) as api:
            api.SetImage(img)
            text = api.GetUTF8Text().replace("\n", "").split()
            conf = api.AllWordConfidences()
            update_field(name, str(image), "text", text)
            update_field(name, str(image), "word_conf", conf)
    docManager.delete_regions(name)
def preprocess(observation):
    if (observation[0] != None):
        buttonList = []
        # Convert list to 3D-array.
        observation = np.array(observation[0]['vision'])
        # Crop to a 210-50x160 input (yellow already filtered out); otherwise x = 75.
        img = observation[125:387, 9:270]
        cv2.imwrite('SequenceBefore.png', img)
        image = Image.open('SequenceBefore.png')
        # Convert to grayscale.
        img = image.convert('L')
        img_np = np.array(img)
        img_np = (img_np > 100) * 255
        img = PIL.Image.fromarray(img_np.astype(np.uint8))
        img = img.resize((int(img.size[0] * 3.5), int(img.size[1] * 3.5)),
                         PIL.Image.BILINEAR)
        with tesserocr.PyTessBaseAPI() as api:
            api.SetImage(img)
            boxes = api.GetComponentImages(tesserocr.RIL.TEXTLINE, True)
            # print('Found {} textline image components.'.format(len(boxes)))
            if (len(boxes) == 5):
                for i, (im, box, _, _) in enumerate(boxes):
                    # im is a PIL image object.
                    # box is a dict with x, y, w and h keys.
                    api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
                    ocrResult = api.GetUTF8Text().lower()
                    conf = api.MeanTextConf()
                    ocrResult = re.sub('[\s+]', '', ocrResult)
                    # print("the word is: ", ocrResult)
                    # TODO: make the vector depend on the OCR result.
                    try:
                        vector = modelNLP.wv[ocrResult]
                        vector = np.random.normal(vector, vectorDeviation)
                    except KeyError as err:
                        vector = modelNLP.wv["house"]
                        print("word not found in google...")
                        # return None
                    x__ = box['x'] / 3.3 + 15
                    y__ = box['y'] / 3.3 + 140
                    # print("x: ", x__)
                    # print("y: ", y__)
                    buttonList.append(Button(x__, y__, vector))
        if len(buttonList) == 5:
            return buttonList
        else:
            return None
    return None
def get_tesserocr_api():
    """Get a tesserocr api depending on the platform."""
    import subprocess
    import sys
    if sys.platform == 'linux':
        api = tesserocr.PyTessBaseAPI()
    elif sys.platform == 'win32':
        try:
            p = subprocess.Popen('where tesseract', stdout=subprocess.PIPE,
                                 shell=True)
            s = p.communicate()[0].decode('utf-8').split('\\')
            path = s[:-1] + ['tessdata']
            tessdata_path = '/'.join(path)
            api = tesserocr.PyTessBaseAPI(path=tessdata_path)
        except RuntimeError:
            raise RuntimeError(
                'Please install tesseract first.\n Check out the'
                ' installation guide at'
                ' https://github.com/UB-Mannheim/tesseract/wiki')
    else:
        raise NotImplementedError
    return api
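A hedged usage note (not from the source; the file name is a placeholder): the returned api is a raw instance rather than a context manager, so it should be released with End() after use:

api = get_tesserocr_api()
try:
    api.SetImageFile('scan.png')
    print(api.GetUTF8Text())
finally:
    api.End()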
def time_reader():
    """Read all of the images as they get introduced to the generator."""
    image = yield None
    # Call the tesseract library and build the processing object ("ocr").
    # Specify that all text should be in a single chunk (SINGLE_WORD).
    with tesserocr.PyTessBaseAPI(psm=tesserocr.PSM.SINGLE_WORD) as ocr:
        # We are looking for time. This means we are looking for numbers.
        ocr.SetVariable("tessedit_char_whitelist", "0123456789")
        while True:
            ocr.SetImage(image)  # Set the image.
            match_time = ocr.GetUTF8Text()  # Get the result (takes a bit).
            image = yield match_time  # Return and get a new image.
            if ADAPTIVE_CLASSIFIER:
                ocr.ClearAdaptiveClassifier()
def get_textlines(pil_image):
    with tesserocr.PyTessBaseAPI() as api:
        api.SetImage(pil_image)
        boxes = api.GetComponentImages(tesserocr.RIL.TEXTLINE, True)
        for i, (im, box, _, _) in enumerate(boxes):
            api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
            text = api.GetUTF8Text().strip()
            if not text:
                continue
            yield {
                'box': box,
                'confidence': api.MeanTextConf(),
                'text': text,
                'image': im,
            }
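An illustrative driver (assumed, not from the source; the path is a placeholder) that consumes this generator lazily:

from PIL import Image

for line in get_textlines(Image.open('page.png')):
    print(line['confidence'], line['text'])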
def frame(self):
    import tesserocr
    with tesserocr.PyTessBaseAPI(psm=tesserocr.PSM.AUTO_ONLY) as api:
        api.SetImage(self.foreground)
        pts = []
        for _, bbox in api.GetConnectedComponents():
            x = bbox["x"]
            y = bbox["y"]
            w = bbox["w"]
            h = bbox["h"]
            pts.append((x, y))
            pts.append((x + w, y + h))
    pts = numpy.array(pts, dtype=numpy.int32)
    return cv2.boundingRect(pts)
def tessocr(impath, mnm):
    namelist = []
    # impath = key
    namelist.append(mnm)
    outdict = {}
    impath = 'thresimages/' + impath
    for i in (7, 8, 9, 10, 13):
        with tesserocr.PyTessBaseAPI(path='tessdata', lang='eng', psm=i) as api:
            api.SetImageFile(impath)
            a = api.GetUTF8Text()
            namelist.append(re.sub('\W+', '', a))
            print(namelist)
            outdict[i] = namelist
            namelist = []
    with open(impath[:-4] + '.txt', 'w') as json_file:
        json.dump(outdict, json_file, indent=4)
def block_detail(path, left, top, right, bottom):
    with tesserocr.PyTessBaseAPI(lang='chi_sim+eng') as api:
        api.SetImageFile(path)
        api.SetVariable("save_blob_choices", "T")
        # api.SetRectangle(left, top, right, bottom)
        api.SetRectangle(left, top, (right - left) * 0.9, (bottom - top) * 0.9)
        # print("reached 1")
        api.Recognize()
        # print("reached 2")
        iterator = api.GetIterator()
        # print("reached 3")
        # print(api.GetUTF8Text())  # The recognized text, but it is not precise.
        # print(iterator.RowAttributes())  # Per-row details; not useful for the result.
        dic = iterator.WordFontAttributes()
        print(dic)
        if dic is None:
            return '/* detection failed */'
        else:
            if not dic['serif']:
                new_dict = "font-family: " + dic["font_name"]
            else:
                new_dict = "font-family: " + dic["font_name"] + ',' + 'serif'
            if not dic['monospace']:
                new_dict = new_dict + ";\n"
            else:
                new_dict = new_dict + "," + "monospace;\n"
            if not dic['bold']:
                new_dict = new_dict + "font-weight: " + "normal;\n"
            else:
                new_dict = new_dict + "font-weight: " + "bold;\n"
            if not dic['italic']:
                new_dict = new_dict + "font-style: " + "normal;\n"
            else:
                new_dict = new_dict + "font-style: " + "italic;\n"
            if not dic['smallcaps']:
                new_dict = new_dict + "font-variant: " + "normal;\n"
            else:
                new_dict = new_dict + "font-variant: " + "small-caps;\n"
            if not dic['underlined']:
                new_dict = new_dict + "text-decoration: " + 'none;\n'
            else:
                new_dict = new_dict + "text-decoration: " + "underline;\n"
            new_dict = new_dict + "font-size: " + str(dic['pointsize'] / 5) + ';'
            return new_dict
def ocr(file, lang='eng'):
    """Extract the text in an image.

    :param file: path to the image
    :param lang: language, defaults to English
    """
    img = Image.open(file).convert('L')
    with tesserocr.PyTessBaseAPI(lang=lang) as api:
        api.SetImage(denoise(img))
        text = api.GetUTF8Text().strip()
        if text == '':
            api.SetImage(ImageOps.invert(denoise(img, 100)))
            text = api.GetUTF8Text().strip()
            if text == '':
                text = 'No Text Found!'
    showtext(text)
def ocr(self):
    s = ""
    # item_text_list = []
    # for i in range(len(self.img_item_list)):
    #     item = self.img_item_list[i]
    #     out = pytesseract.image_to_string(item)
    #     out_t = out.strip()
    #     if len(out_t) != 0:
    #         item_text_list.append(out_t)
    #         # s = s + f"\nitem {i}:\n" + out_t
    #         s = s + f"\n" + out_t
    item_text_list = []
    sub_list = []
    with tr.PyTessBaseAPI(
            path='C:\\Program Files\\Tesseract-OCR\\tessdata\\',
            lang='eng') as api:
        api.SetPageSegMode(tr.PSM.SPARSE_TEXT)
        for i in range(len(self.img_item_list)):
            item = self.img_item_list[i]
            pil_img = Image.fromarray(cv2.cvtColor(item, cv2.COLOR_BGR2RGB))
            api.SetImage(pil_img)
            item_sub_marked = np.copy(item)
            boxes = api.GetComponentImages(tr.RIL.PARA, True)
            # get text
            # text = api.GetUTF8Text()
            # print(text)
            # Iterate over the returned list and draw rectangles.
            for (im, box, _, _) in boxes:
                x, y, w, h = box['x'], box['y'], box['w'], box['h']
                cv2.rectangle(item_sub_marked, (x, y), (x + w, y + h),
                              color=(0, 0, 255))
            path = os.path.join(self.img_item_sub_marked_dir,
                                f"{self.filename_without_ext}_{i}.png")
            cv2.imwrite(path, item_sub_marked)
    for i in range(len(sub_list)):
        path = os.path.join(self.img_item_sub_dir,
                            f"{self.filename_without_ext}_{i}.png")
        cv2.imwrite(path, sub_list[i])
    print(f"{self.filename_without_ext}============")
    print(s)
    print("============")