# Imports assumed by the snippets below. The kraken-internal helpers
# (lstm, pil2array, extract_boxes, ocr_record, bidi_record, dewarp,
# CenterNormalizer, get_im_str, is_bitonal, and the kraken.rpred module
# referenced in add()) are left out, as their exact import paths differ
# between kraken versions.
import os
import logging
import unicodedata

import bidi.algorithm as bd
import click
import numpy as np

from PIL import Image

logger = logging.getLogger(__name__)


def add(self, image, split=lambda x: os.path.splitext(x)[0], suffix='.gt.txt',
        normalization=None, reorder=True, pad=16):
    """
    Adds a single image to the training set.
    """
    with click.open_file(split(image) + suffix, 'r', encoding='utf-8') as fp:
        gt = fp.read()
        if normalization:
            gt = unicodedata.normalize(normalization, gt)
        if reorder:
            gt = bd.get_display(gt)

        im = Image.open(image)
        im = rpred.dewarp(self.lnorm, im)
        im = pil2array(im)
        im = lstm.prepare_line(im, pad)
        self.training_set.append((im, gt))

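# Hedged usage sketch for add() above: it is a method (it relies on self.lnorm
# and self.training_set supplied by the enclosing ground truth container, whose
# class is not shown here). It expects a transcription file next to each line
# image, found by stripping the image extension and appending suffix:
#
#   container.add('lines/0001.png')                  # reads lines/0001.gt.txt
#   container.add('lines/0002.png', normalization='NFD')
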
def rpred(network, im, bounds, pad=16, line_normalization=True,
          bidi_reordering=True):
    """
    Uses an RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the
                       image and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you
                                   may have to scale lines manually to the
                                   target line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for
                                correct display.

    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    lnorm = getattr(network, 'lnorm', CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []
        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                pos.append((coords[0] + int((start - pad) * scale),
                            coords[1],
                            coords[0] + int((end - pad / 2) * scale),
                            coords[3]))
            else:
                pos.append((coords[0],
                            coords[1] + int((start - pad) * scale),
                            coords[2],
                            coords[1] + int((end - pad / 2) * scale)))
            conf.append(c)
        if bidi_reordering:
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)

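# Minimal usage sketch for rpred() above, with placeholder bounding boxes.
# Loading of `network` (a kraken SegRecognizer) is omitted since the loader
# API varies between kraken versions:
#
#   im = Image.open('page.png')
#   bounds = {'text_direction': 'horizontal-lr',
#             'boxes': [(100, 50, 900, 90), (100, 100, 900, 140)]}
#   for record in rpred(network, im, bounds):
#       print(record.prediction, record.cuts, record.confidences)
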
def mm_rpred(nets, im, bounds, pad=16, line_normalization=True,
             bidi_reordering=True):
    """
    Multi-model version of kraken.rpred.rpred.

    Takes a dictionary of ISO15924 script identifiers->models and a
    script-annotated segmentation to dynamically select appropriate models for
    these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to SegRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       lists of coordinates (script, (x0, y0, x1, y1)) of a
                       text line in the image and an entry 'text_direction'
                       containing 'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you
                                   may have to scale lines manually to the
                                   target line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for
                                correct display.

    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    for line in bounds['boxes']:
        rec = ocr_record('', [], [])
        for script, (box, coords) in zip(
                [x[0] for x in line],
                extract_boxes(im, {'text_direction': bounds['text_direction'],
                                   'boxes': [x[1] for x in line]})):
            # check if boxes are non-zero in any dimension
            if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
                continue
            raw_line = pil2array(box)
            # check if line is non-zero
            if np.amax(raw_line) == np.amin(raw_line):
                continue
            if line_normalization:
                # fail gracefully and add no recognition result in case the
                # input line can not be normalized.
                try:
                    lnorm = getattr(nets[script], 'lnorm', CenterNormalizer())
                    box = dewarp(lnorm, box)
                except Exception:
                    continue
            # named line_im to avoid shadowing the outer loop variable `line`
            line_im = pil2array(box)
            line_im = lstm.prepare_line(line_im, pad)
            pred = nets[script].predictString(line_im)

            # calculate recognized LSTM locations of characters
            scale = len(raw_line.T) / (len(nets[script].outputs) - 2 * pad)
            result = lstm.translate_back_locations(nets[script].outputs)
            pos = []
            conf = []
            for _, start, end, c in result:
                if bounds['text_direction'].startswith('horizontal'):
                    pos.append((coords[0] + int((start - pad) * scale),
                                coords[1],
                                coords[0] + int((end - pad / 2) * scale),
                                coords[3]))
                else:
                    pos.append((coords[0],
                                coords[1] + int((start - pad) * scale),
                                coords[2],
                                coords[1] + int((end - pad / 2) * scale)))
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if bidi_reordering:
            yield bidi_record(rec)
        else:
            yield rec

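# Minimal usage sketch for mm_rpred() above. Models are keyed by ISO15924
# script identifiers; a defaultdict provides a fallback model for scripts
# without an explicit mapping (latin_model and arabic_model are hypothetical):
#
#   from collections import defaultdict
#
#   nets = defaultdict(lambda: latin_model)
#   nets['Arab'] = arabic_model
#   bounds = {'text_direction': 'horizontal-lr',
#             'boxes': [[('Latn', (100, 50, 400, 90)),
#                        ('Arab', (420, 50, 900, 90))]]}
#   for record in mm_rpred(nets, im, bounds):
#       print(record.prediction)
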
def rpred(network, im, bounds, pad=16, line_normalization=True):
    """
    Uses an RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (iterable): An iterable returning a tuple defining the absolute
                           coordinates (x0, y0, x1, y1) of a text line in the
                           Image.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you
                                   may have to scale lines manually to the
                                   target line height if disabled.

    Yields:
        A tuple containing the recognized text (0), absolute character
        positions in the image (1), and confidence values for each
        character (2).
    """
    lnorm = getattr(network, "lnorm", CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            yield ocr_record("", [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record("", [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record("", [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back(network.outputs, pos=1)
        conf = [network.outputs[r, c] for r, c in result if c != 0]
        cuts = [(int((r - pad) * scale), c) for (r, c) in result]
        # append last offset to end of line
        cuts.append((coords[2] - coords[0], 0))
        pos = []
        lx = 0
        for d in cuts:
            if d[1] == 0:
                lx = d[0]
                continue
            try:
                pos.append((coords[0] + lx, coords[1],
                            coords[0] + d[0], coords[3]))
            except Exception:
                break
            lx = d[0]
        yield ocr_record(pred, pos, conf)

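# Note on the older decoder used in the variant above:
# lstm.translate_back(outputs, pos=1) yields (frame, class) pairs that still
# include the CTC null class 0. The cuts list converts frames to pixel
# offsets; each non-null entry then becomes one bounding box stretching from
# the previous offset lx to its own offset, so null frames only advance the
# left edge without emitting a box.
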
def rpred(network, im, bounds, pad=16, line_normalization=True,
          bidi_reordering=True):
    """
    Uses an RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the
                       image and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you
                                   may have to scale lines manually to the
                                   target line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for
                                correct display.

    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    im_str = get_im_str(im)
    logger.info(u'Running recognizer on {} with {} lines'.format(
        im_str, len(bounds['boxes'])))
    logger.debug(u'Loading line normalizer')
    lnorm = getattr(network, 'lnorm', CenterNormalizer())
    if not is_bitonal(im):
        logger.info(u'Image is grayscale. Adjusting normalizer parameters')
        lnorm.range = 2

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            logger.warning(u'bbox {} with zero dimension. Emitting empty '
                           u'record.'.format(coords))
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            logger.warning(u'Empty line {}. Emitting empty record.'.format(
                coords))
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                logger.warning(u'Dewarping for bbox {} failed. Emitting '
                               u'empty record.'.format(coords))
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        logger.debug(u'Preparing line.')
        line = lstm.prepare_line(line, pad)
        logger.debug(u'Performing forward pass.')
        pred = network.predictString(line)
        logger.info(u'Prediction: {}'.format(pred))

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        logger.debug(u'Extracting labels.')
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []
        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                xmin = coords[0] + int(max((start - pad) * scale, 0))
                xmax = coords[0] + max(int(min((end - pad) * scale,
                                               coords[2] - coords[0])), 1)
                pos.append((xmin, coords[1], xmax, coords[3]))
            else:
                ymin = coords[1] + int(max((start - pad) * scale, 0))
                ymax = coords[1] + max(int(min((end - pad) * scale,
                                               coords[3] - coords[1])), 1)
                pos.append((coords[0], ymin, coords[2], ymax))
            conf.append(c)
        if bidi_reordering:
            logger.debug(u'BiDi reordering record.')
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)

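# Worked example of the frame-to-pixel mapping used above. The network emits
# one output frame per column of the padded, height-normalized line, so
# scale = line_width_px / (n_output_frames - 2 * pad) maps frame indices back
# to source pixels. All numbers below are made up for illustration:
#
#   line_width = 800    # len(raw_line.T)
#   n_frames = 416      # len(network.outputs)
#   pad = 16
#   scale = line_width / (n_frames - 2 * pad)            # 800 / 384 ~ 2.083
#   start, end = 100, 110                                # frames of one char
#   xmin = int(max((start - pad) * scale, 0))            # 175
#   xmax = max(int(min((end - pad) * scale, line_width)), 1)  # 195
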
def rpred(network, im, bounds, pad=16, line_normalization=True,
          bidi_reordering=True):
    """
    Uses an RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (iterable): An iterable returning a tuple defining the absolute
                           coordinates (x0, y0, x1, y1) of a text line in the
                           Image.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you
                                   may have to scale lines manually to the
                                   target line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for
                                correct display.

    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    lnorm = getattr(network, 'lnorm', CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []
        for _, start, end, c in result:
            pos.append((coords[0] + int((start - pad) * scale), coords[1],
                        coords[0] + int((end - pad / 2) * scale), coords[3]))
            conf.append(c)
        if bidi_reordering:
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)