def generate_input_transforms(batch: int, height: int, width: int, channels: int, pad: int) -> transforms.Compose:
    """
    Builds the torchvision preprocessing pipeline that turns a PIL.Image into
    a tensor usable in a network forward pass.

    Args:
        batch (int): mini-batch size (not referenced by this function)
        height (int): height of input image in pixels
        width (int): width of input image in pixels
        channels (int): color channels of input
        pad (int): Amount of padding on horizontal ends of image

    Returns:
        A torchvision transformation composition converting the input image
        to the appropriate tensor.

    Raises:
        KrakenInputException: if the (height, width, channels) combination is
            not one of the supported input specs, or if line dewarping would
            be required for a mode other than '1' or 'L'.
    """
    plain_channels = channels in (1, 3)
    # every accepted spec except 3-channel input is processed as grayscale
    color_mode = 'RGB' if channels == 3 else 'L'
    axis_order = (0, 1, 2)
    # 0 => no rescaling, int => dewarp to that line height,
    # tuple => resize to exactly (height, width)
    target = 0  # type: Union[Tuple[int, int], int]

    if height == 1 and width == 0 and channels > 3:
        # the line height is carried in the channel dimension: normalize to
        # `channels` pixels and swap the first two tensor axes at the end
        axis_order = (1, 0, 2)
        target = channels
    elif plain_channels and width == 0 and height > 1:
        # height given, width left open => dewarp to the requested height
        target = height
    elif plain_channels and height > 0 and width > 0:
        # fixed height and width => resize to that exact size, no padding
        pad = 0
        target = (height, width)
    elif plain_channels and height == 0 and width == 0:
        # neither dimension given => no scaling and no padding
        pad = 0
    else:
        raise KrakenInputException('Invalid input spec (variable height and fixed width not supported)')

    pipeline = [transforms.Lambda(lambda img: img.convert(color_mode))]

    if target:
        if isinstance(target, int):
            if color_mode not in ['1', 'L']:
                raise KrakenInputException('Invalid mode {} for line dewarping'.format(color_mode))
            normalizer = CenterNormalizer(target)
            pipeline.extend([
                transforms.Lambda(lambda img: dewarp(normalizer, img)),
                # convert again after dewarping so the mode is as requested
                transforms.Lambda(lambda img: img.convert(color_mode)),
            ])
        elif isinstance(target, tuple):
            pipeline.append(transforms.Resize(target, Image.LANCZOS))

    if pad:
        # white (255) padding on the left and right only
        pipeline.append(transforms.Pad((pad, 0), fill=255))

    pipeline.extend([
        transforms.ToTensor(),
        # invert intensities relative to the tensor's maximum
        transforms.Lambda(lambda tensor: tensor.max() - tensor),
        transforms.Lambda(lambda tensor: tensor.permute(*axis_order)),
    ])
    return transforms.Compose(pipeline)
def __init__(self, images=None, split=lambda x: os.path.splitext(x)[0],
             suffix='.gt.txt', normalization=None, reorder=True,
             partition=0.9, pad=16):
    """
    Creates a ground truth set from a list of line image paths.

    Args:
        images (list): List of file paths of line images
        split (func): Function for generating the base name without
                      extensions from paths
        suffix (str): Suffix to attach to image base name for text
                      retrieval
        normalization (str): Unicode normalization for gt
        reorder (bool): Whether to rearrange code points in "display"/LTR
                        order
        partition (float): Ground truth data partition ratio between
                           train/test set.
        pad (int): Padding to add to images left and right
    """
    self.lnorm = CenterNormalizer()
    self.training_set = []
    self.test_set = []
    self.alphabet = set()

    if images:
        for image_path in images:
            self.add(image_path, split, suffix, normalization, reorder, pad)
        self.repartition(partition)
        # the alphabet is the sorted set of characters occurring in the
        # second element of every training set entry
        transcriptions = [text for _, text in self.training_set]
        self.alphabet = sorted(set(''.join(transcriptions)))
def generate_input_transforms(batch: int, height: int, width: int, channels: int, pad: int, valid_norm: bool = True, force_binarization=False) -> transforms.Compose:
    """
    Generates a torchvision transformation converting a PIL.Image into a
    tensor usable in a network forward pass.

    Args:
        batch (int): mini-batch size (only used in error messages)
        height (int): height of input image in pixels
        width (int): width of input image in pixels
        channels (int): color channels of input
        pad (int): Amount of padding on horizontal ends of image
        valid_norm (bool): Enables/disables baseline normalization as a valid
                           preprocessing step. If disabled we will fall back to
                           standard scaling.
        force_binarization (bool): Forces binarization of input images using
                                   the nlbin algorithm.

    Returns:
        A torchvision transformation composition converting the input image
        to the appropriate tensor.

    Raises:
        KrakenInputException: if the input spec matches none of the supported
            combinations, or if forced binarization is requested for a
            non-grayscale ('L') mode.
    """
    scale = (height, width)  # type: Tuple[int, int]
    center_norm = False
    mode = 'RGB' if channels == 3 else 'L'
    if height == 1 and width == 0 and channels > 3:
        # line height is carried in the channel dimension: scale to
        # `channels` pixels and swap the first two tensor axes at the end
        perm = (1, 0, 2)
        scale = (channels, 0)
        if valid_norm:
            center_norm = True
        mode = 'L'
    elif height > 1 and width == 0 and channels in (1, 3):
        perm = (0, 1, 2)
        # center normalization is only enabled for single channel input
        if valid_norm and channels == 1:
            center_norm = True
    elif height == 0 and width > 1 and channels in (1, 3):
        perm = (0, 1, 2)
    # fixed height and width image => rescale to exactly that size,
    # disable padding
    elif height > 0 and width > 0 and channels in (1, 3):
        perm = (0, 1, 2)
        pad = 0
    elif height == 0 and width == 0 and channels in (1, 3):
        perm = (0, 1, 2)
        pad = 0
    else:
        raise KrakenInputException(
            'Invalid input spec {}, {}, {}, {}, {}'.format(
                batch, height, width, channels, pad))
    if mode != 'L' and force_binarization:
        # five placeholders so that `pad` is reported like in the message
        # above (the previous format string silently dropped it)
        raise KrakenInputException(
            'Invalid input spec {}, {}, {}, {}, {} in'
            ' combination with forced binarization.'.format(
                batch, height, width, channels, pad))

    out_transforms = []
    out_transforms.append(transforms.Lambda(lambda x: x.convert(mode)))

    if force_binarization:
        # binarize the image handed to the transform; the lambda argument is
        # `x` (the earlier `nlbin(im)` referenced an undefined name)
        out_transforms.append(transforms.Lambda(lambda x: nlbin(x)))
    # dummy transforms to ensure we can determine color mode of input material
    # from first two transforms. It's stupid but it works.
    out_transforms.append(transforms.Lambda(lambda x: x))
    if scale != (0, 0):
        if center_norm:
            lnorm = CenterNormalizer(scale[0])
            out_transforms.append(
                transforms.Lambda(lambda x: dewarp(lnorm, x)))
            # convert again after dewarping so the mode is as requested
            out_transforms.append(transforms.Lambda(lambda x: x.convert(mode)))
        else:
            out_transforms.append(
                transforms.Lambda(
                    lambda x: _fixed_resize(x, scale, Image.LANCZOS)))
    if pad:
        # white (255) padding on the left and right only
        out_transforms.append(transforms.Pad((pad, 0), fill=255))
    out_transforms.append(transforms.ToTensor())
    # invert
    out_transforms.append(transforms.Lambda(lambda x: x.max() - x))
    out_transforms.append(transforms.Lambda(lambda x: x.permute(*perm)))
    return transforms.Compose(out_transforms)
def rpred(network, im, bounds, pad=16, line_normalization=True, bidi_reordering=True):
    """
    Uses an RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.

    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character. An empty record
        is yielded for lines that are degenerate, blank, or fail to dewarp.
    """
    lnorm = getattr(network, 'lnorm', CenterNormalizer())
    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension; both extents are
        # tested so zero-width boxes away from x == 0 are caught as well
        if coords[2] - coords[0] == 0 or coords[3] - coords[1] == 0:
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                # only ordinary errors are treated as a failed line;
                # KeyboardInterrupt/SystemExit still propagate
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters
        # ratio between the extent of the unnormalized line and the length of
        # the network output with the padding on both ends removed
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            # NOTE(review): start is shifted by `pad` but end only by
            # `pad / 2` -- confirm this asymmetry is intended
            if bounds['text_direction'].startswith('horizontal'):
                pos.append((coords[0] + int((start - pad) * scale),
                            coords[1],
                            coords[0] + int((end - pad / 2) * scale),
                            coords[3]))
            else:
                pos.append((coords[0],
                            coords[1] + int((start - pad) * scale),
                            coords[2],
                            coords[1] + int((end - pad / 2) * scale)))
            conf.append(c)
        if bidi_reordering:
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
def mm_rpred(nets, im, bounds, pad=16, line_normalization=True, bidi_reordering=True):
    """
    Multi-model version of kraken.rpred.rpred. Takes a dictionary of ISO15924
    script identifiers->models and a script-annotated segmentation to
    dynamically select appropriate models for these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to SegRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       lists of coordinates (script, (x0, y0, x1, y1)) of a
                       text line in the image and an entry 'text_direction'
                       containing 'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.

    Yields:
        One ocr_record per entry of bounds['boxes'], holding the concatenated
        text, absolute character positions, and confidence values of all
        script runs of that line. Runs that are degenerate, blank, or fail to
        normalize contribute nothing to the record.
    """
    for line in bounds['boxes']:
        rec = ocr_record('', [], [])
        # materialize the line once so scripts and boxes are read from the
        # same snapshot and extract_boxes receives a real list
        runs = list(line)
        scripts = [run[0] for run in runs]
        run_bounds = {'text_direction': bounds['text_direction'],
                      'boxes': [run[1] for run in runs]}
        for script, (box, coords) in zip(scripts, extract_boxes(im, run_bounds)):
            # check if boxes are non-zero in any dimension; both extents are
            # tested so zero-width boxes away from x == 0 are caught as well
            if coords[2] - coords[0] == 0 or coords[3] - coords[1] == 0:
                continue
            raw_line = pil2array(box)
            # check if line is non-zero
            if np.amax(raw_line) == np.amin(raw_line):
                continue
            if line_normalization:
                # fail gracefully and return no recognition result in case the
                # input line can not be normalized.
                try:
                    lnorm = getattr(nets[script], 'lnorm', CenterNormalizer())
                    box = dewarp(lnorm, box)
                except Exception:
                    continue
            net = nets[script]
            # separate name: `line` is the outer loop variable
            line_array = pil2array(box)
            line_array = lstm.prepare_line(line_array, pad)
            pred = net.predictString(line_array)

            # calculate recognized LSTM locations of characters
            # ratio between the extent of the unnormalized line and the length
            # of the network output with the padding on both ends removed
            scale = len(raw_line.T) / (len(net.outputs) - 2 * pad)
            result = lstm.translate_back_locations(net.outputs)
            pos = []
            conf = []

            for _, start, end, c in result:
                # NOTE(review): start is shifted by `pad` but end only by
                # `pad / 2` -- confirm this asymmetry is intended
                if bounds['text_direction'].startswith('horizontal'):
                    pos.append((coords[0] + int((start - pad) * scale),
                                coords[1],
                                coords[0] + int((end - pad / 2) * scale),
                                coords[3]))
                else:
                    pos.append((coords[0],
                                coords[1] + int((start - pad) * scale),
                                coords[2],
                                coords[1] + int((end - pad / 2) * scale)))
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if bidi_reordering:
            yield bidi_record(rec)
        else:
            yield rec
def rpred(network, im, bounds, pad=16, line_normalization=True, bidi_reordering=True):
    """
    Uses an RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.

    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character. An empty record
        is yielded (and a warning logged) for lines that are degenerate,
        blank, or fail to dewarp.
    """
    im_str = get_im_str(im)
    logger.info(u'Running recognizer on {} with {} lines'.format(
        im_str, len(bounds['boxes'])))
    logger.debug(u'Loading line normalizer')
    lnorm = getattr(network, 'lnorm', CenterNormalizer())
    if not is_bitonal(im):
        logger.info(u'Image is grayscale. Adjusting normalizer parameters')
        # note: this modifies the normalizer object itself, which is the
        # network's own `lnorm` attribute when one is present
        lnorm.range = 2

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension; both extents are
        # tested so zero-width boxes away from x == 0 are caught as well
        if coords[2] - coords[0] == 0 or coords[3] - coords[1] == 0:
            logger.warning(
                u'bbox {} with zero dimension. Emitting empty record.'.format(
                    coords))
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            logger.warning(
                u'Empty line {}. Emitting empty record.'.format(coords))
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                # only ordinary errors are treated as a failed line;
                # KeyboardInterrupt/SystemExit still propagate
                logger.warning(
                    u'Dewarping for bbox {} failed. Emitting empty record.'.
                    format(coords))
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        logger.debug(u'Preparing line.')
        line = lstm.prepare_line(line, pad)
        logger.debug(u'Performing forward pass.')
        pred = network.predictString(line)
        logger.info(u'Prediction: {}'.format(pred))

        # calculate recognized LSTM locations of characters
        # ratio between the extent of the unnormalized line and the length of
        # the network output with the padding on both ends removed
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        logger.debug(u'Extracting labels.')
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            # cuts are clamped: the start offset is never negative, the end
            # offset never exceeds the box extent and is at least 1
            if bounds['text_direction'].startswith('horizontal'):
                xmin = coords[0] + int(max((start - pad) * scale, 0))
                xmax = coords[0] + max(
                    int(min((end - pad) * scale, coords[2] - coords[0])), 1)
                pos.append((xmin, coords[1], xmax, coords[3]))
            else:
                ymin = coords[1] + int(max((start - pad) * scale, 0))
                ymax = coords[1] + max(
                    int(min((end - pad) * scale, coords[3] - coords[1])), 1)
                pos.append((coords[0], ymin, coords[2], ymax))
            conf.append(c)
        if bidi_reordering:
            logger.debug(u'BiDi reordering record.')
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)