def compute_segmentation_map(im, mask: Optional[np.array] = None, model=None, device: str = 'cpu'):
    """
    Computes the raw segmentation heatmap for an image with a VGSL segmentation model.
    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')
    if model.input[1] == 1 and model.one_channel_mode == '1' and not is_bitonal(im):
        logger.warning('Running binary model on non-binary input image '
                       '(mode {}). This will result in severely degraded '
                       'performance'.format(im.mode))
    model.eval()
    model.to(device)
    if mask:
        if mask.mode != '1' and not is_bitonal(mask):
            logger.error('Mask is not bitonal')
            raise KrakenInputException('Mask is not bitonal')
        mask = mask.convert('1')
        if mask.size != im.size:
            logger.error(f'Mask size {mask.size} doesn\'t match image size {im.size}')
            raise KrakenInputException(f'Mask size {mask.size} doesn\'t match image size {im.size}')
        logger.info('Masking enabled in segmenter.')
        mask = pil2array(mask)
    batch, channels, height, width = model.input
    transforms = dataset.generate_input_transforms(batch, height, width, channels, 0, valid_norm=False)
    res_tf = tf.Compose(transforms.transforms[:3])
    scal_im = res_tf(im).convert('L')
    with torch.no_grad():
        logger.debug('Running network forward pass')
        o, _ = model.nn(transforms(im).unsqueeze(0).to(device))
    logger.debug('Upsampling network output')
    o = F.interpolate(o, size=scal_im.size[::-1])
    o = o.squeeze().cpu().numpy()
    scale = np.divide(im.size, o.shape[:0:-1])
    bounding_regions = model.user_metadata['bounding_regions'] if 'bounding_regions' in model.user_metadata else None
    return {'heatmap': o,
            'cls_map': model.user_metadata['class_mapping'],
            'bounding_regions': bounding_regions,
            'scale': scale,
            'scal_im': scal_im}
def add(self, image: Union[str, Image.Image], *args, **kwargs) -> None: """ Adds a line-image-text pair to the dataset. Args: image (str): Input image path """ with open(self.split(image), 'r', encoding='utf-8') as fp: gt = fp.read().strip('\n\r') for func in self.text_transforms: gt = func(gt) if not gt: raise KrakenInputException(f'Text line is empty ({fp.name})') if self.preload: try: im = Image.open(image) im = self.head_transforms(im) if not is_bitonal(im): self.im_mode = im.mode im = self.tail_transforms(im) except ValueError: raise KrakenInputException( f'Image transforms failed on {image}') self._images.append(im) else: self._images.append(image) self._gt.append(gt) self.alphabet.update(gt)
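# Hedged usage sketch for the add() method above. It assumes the method belongs to
# kraken's GroundTruthDataset and that a paired transcription can be found via
# self.split() (typically an adjacent *.gt.txt file); the file name and constructor
# arguments below are illustrative, not taken from the original text.
from kraken.lib.dataset import GroundTruthDataset

ds = GroundTruthDataset(preload=True)
ds.add('line_0001.png')      # reads the paired transcription through self.split()
print(len(ds.alphabet), 'distinct characters seen so far')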
def transform(self, image, target):
    orig_size = image.size
    image = self.head_transforms(image)
    if not is_bitonal(image):
        self.im_mode = image.mode
    image = self.tail_transforms(image)
    scale = image.shape[2] / orig_size[0]
    t = torch.zeros((self.num_classes,) + image.shape[1:])
    start_sep_cls = self.class_mapping['aux']['_start_separator']
    end_sep_cls = self.class_mapping['aux']['_end_separator']
    for key, lines in target['baselines'].items():
        cls_idx = self.class_mapping['baselines'][key]
        for line in lines:
            # buffer out line to desired width
            line = [k for k, g in groupby(line)]
            line = np.array(line) * scale
            shp_line = geom.LineString(line)
            split_offset = min(5, shp_line.length / 2)
            # the np.int alias was removed from NumPy; the builtin int is equivalent here
            line_pol = np.array(shp_line.buffer(self.line_width / 2, cap_style=2).boundary, dtype=int)
            rr, cc = polygon(line_pol[:, 1], line_pol[:, 0], shape=image.shape[1:])
            t[cls_idx, rr, cc] = 1
            split_pt = shp_line.interpolate(split_offset).buffer(0.001)
            start_sep = np.array((split(shp_line, split_pt)[0].buffer(self.line_width, cap_style=3).boundary), dtype=int)
            rr_s, cc_s = polygon(start_sep[:, 1], start_sep[:, 0], shape=image.shape[1:])
            t[start_sep_cls, rr_s, cc_s] = 1
            t[start_sep_cls, rr, cc] = 0
            split_pt = shp_line.interpolate(-split_offset).buffer(0.001)
            end_sep = np.array((split(shp_line, split_pt)[-1].buffer(self.line_width, cap_style=3).boundary), dtype=int)
            rr_s, cc_s = polygon(end_sep[:, 1], end_sep[:, 0], shape=image.shape[1:])
            t[end_sep_cls, rr_s, cc_s] = 1
            t[end_sep_cls, rr, cc] = 0
    for key, regions in target['regions'].items():
        cls_idx = self.class_mapping['regions'][key]
        for region in regions:
            region = np.array(region) * scale
            rr, cc = polygon(region[:, 1], region[:, 0], shape=image.shape[1:])
            t[cls_idx, rr, cc] = 1
    target = t
    if self.aug:
        image = image.permute(1, 2, 0).numpy()
        target = target.permute(1, 2, 0).numpy()
        o = self.aug(image=image, mask=target)
        image = torch.tensor(o['image']).permute(2, 0, 1)
        target = torch.tensor(o['mask']).permute(2, 0, 1)
    return image, target
def transform(self, image, target):
    orig_size = image.size
    image = self.head_transforms(image)
    if not is_bitonal(image):
        self.im_mode = image.mode
    image = self.tail_transforms(image)
    scale = image.shape[2] / orig_size[0]
    t = torch.zeros((self.num_classes,) + image.shape[1:])
    start_sep_cls = self.class_mapping['aux']['_start_separator']
    end_sep_cls = self.class_mapping['aux']['_end_separator']
    for key, lines in target['baselines'].items():
        cls_idx = self.class_mapping['baselines'][key]
        for line in lines:
            # buffer out line to desired width
            line = [k for k, g in groupby(line)]
            line = np.array(line) * scale
            # the np.int alias was removed from NumPy; the builtin int is equivalent here
            line_pol = np.array(geom.LineString(line).buffer(self.line_width / 2, cap_style=2).boundary, dtype=int)
            rr, cc = polygon(line_pol[:, 1], line_pol[:, 0], shape=image.shape[1:])
            t[cls_idx, rr, cc] = 1
            start_sep = np.array(geom.LineString(self._get_ortho_line(line[:2], line[0], self.line_width / 2, 'l')).buffer(self.line_width / 2, cap_style=2).boundary, dtype=int)
            rr, cc = polygon(start_sep[:, 1], start_sep[:, 0], shape=image.shape[1:])
            t[start_sep_cls, rr, cc] = 1
            end_sep = np.array(geom.LineString(self._get_ortho_line(line[-2:], line[-1], self.line_width / 2, 'r')).buffer(self.line_width / 2, cap_style=2).boundary, dtype=int)
            rr, cc = polygon(end_sep[:, 1], end_sep[:, 0], shape=image.shape[1:])
            t[end_sep_cls, rr, cc] = 1
    for key, regions in target['regions'].items():
        cls_idx = self.class_mapping['regions'][key]
        for region in regions:
            region = np.array(region) * scale
            rr, cc = polygon(region[:, 1], region[:, 0], shape=image.shape[1:])
            t[cls_idx, rr, cc] = 1
    target = t
    if self.aug:
        image = image.permute(1, 2, 0).numpy()
        target = target.permute(1, 2, 0).numpy()
        o = self.aug(image=image, mask=target)
        image = torch.tensor(o['image']).permute(2, 0, 1)
        target = torch.tensor(o['mask']).permute(2, 0, 1)
    return image, target
def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    if self.preload:
        x, y = self.training_set[index]
        if self.aug:
            im = x.permute((1, 2, 0)).numpy()
            o = self.aug(image=im)
            im = torch.tensor(o['image'].transpose(2, 0, 1))
            return {'image': im, 'target': y}
        return {'image': x, 'target': y}
    else:
        item = self.training_set[index]
        try:
            logger.debug(f'Attempting to load {item[0]}')
            im = item[0]
            if not isinstance(im, Image.Image):
                im = Image.open(im)
            im = self.head_transforms(im)
            if not is_bitonal(im):
                self.im_mode = im.mode
            im = self.tail_transforms(im)
            if self.aug:
                im = im.permute((1, 2, 0)).numpy()
                o = self.aug(image=im)
                im = torch.tensor(o['image'].transpose(2, 0, 1))
            return {'image': im, 'target': item[1]}
        except Exception:
            idx = np.random.randint(0, len(self.training_set))
            logger.debug(traceback.format_exc())
            logger.info(f'Failed. Replacing with sample {idx}')
            # use the index that was actually logged for the replacement sample
            return self[idx]
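# Minimal sketch of consuming samples from the dataset whose __getitem__ is shown
# above: each item is a dict with 'image' and 'target' tensors, so a stock PyTorch
# DataLoader can batch them. `ds` stands in for an instance of that dataset class
# and is an assumption, not part of the original text.
from torch.utils.data import DataLoader

loader = DataLoader(ds, batch_size=1, shuffle=True, num_workers=0)
for batch in loader:
    image, target = batch['image'], batch['target']
    break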
def _extract_line(xml_record):
    lines = []
    try:
        im = Image.open(xml_record['image'])
    except FileNotFoundError:
        return lines, None, None
    if is_bitonal(im):
        im = im.convert('1')
    line_counts = Counter({'all': 0, 'train': 0, 'validation': 0, 'test': 0})
    seg_key = 'lines' if 'lines' in xml_record else 'boxes'
    recs = xml_record.pop(seg_key)
    for idx, rec in enumerate(recs):
        try:
            line_im, line = next(extract_polygons(im, {**xml_record, seg_key: [rec]}))
        except KrakenInputException:
            logger.warning(f'Invalid line {idx} in {im.filename}')
            continue
        if not line['text']:
            continue
        fp = io.BytesIO()
        line_im.save(fp, format='png')
        if line['split']:
            line_counts[line['split']] += 1
        else:
            line_counts['all'] += 1
        lines.append({'text': line['text'], 'im': fp.getvalue()})
    # return the extracted lines, the page image mode, and the per-split counts,
    # matching the three-tuple returned in the error path above
    return lines, im.mode, line_counts
def check_output(self, config, im, output_tensor): if config['height'] != 0: self.assertEqual(config['height'], output_tensor.shape[1]) if config['width'] != 0: self.assertEqual(config['width'], output_tensor.shape[2]) if config['force_binarization'] or is_bitonal(im): self.assertEqual(len(output_tensor.int().unique()), 2) if config['channels'] == 3: self.assertEqual(output_tensor.shape[0], 3)
def _extract_path_line(xml_record): try: im = Image.open(xml_record['image']) except FileNotFoundError: return [], None, None if not xml_record['lines'][0]['text']: return [], None, None if is_bitonal(im): im = im.convert('1') fp = io.BytesIO() im.save(fp, format='png') line = {'text': xml_record['lines'][0]['text'], 'im': fp.getvalue()} return [line], im.mode, {'all': 1, 'train': 0, 'validation': 0, 'test': 0}
def add(self, image: Union[str, Image.Image], text: str, baseline: List[Tuple[int, int]], boundary: List[Tuple[int, int]], *args, **kwargs):
    """
    Adds a line to the dataset.

    Args:
        image (str or PIL.Image.Image): Path to the whole page image or the image itself
        text (str): Transcription of the line.
        baseline (list): A list of coordinates [[x0, y0], ..., [xn, yn]].
        boundary (list): A polygon mask for the line.
    """
    for func in self.text_transforms:
        text = func(text)
    if not text:
        raise KrakenInputException('Text line is empty after transformations')
    if not baseline:
        raise KrakenInputException('No baseline given for line')
    if not boundary:
        raise KrakenInputException('No boundary given for line')
    if self.preload:
        if not isinstance(image, Image.Image):
            im = Image.open(image)
        else:
            # keep already loaded images usable; previously `im` was unbound in this case
            im = image
        try:
            im, _ = next(extract_polygons(im, {'type': 'baselines',
                                               'lines': [{'baseline': baseline, 'boundary': boundary}]}))
        except IndexError:
            raise KrakenInputException('Patch extraction failed for baseline')
        try:
            im = self.head_transforms(im)
            if not is_bitonal(im):
                self.im_mode = im.mode
            im = self.tail_transforms(im)
        except ValueError:
            raise KrakenInputException(f'Image transforms failed on {image}')
        self._images.append(im)
    else:
        self._images.append((image, baseline, boundary))
    self._gt.append(text)
    self.alphabet.update(text)
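# Hypothetical call to the add() method above, assuming it belongs to kraken's
# PolygonGTDataset; the file name, text, and coordinates are made up for illustration.
from kraken.lib.dataset import PolygonGTDataset

ds = PolygonGTDataset(preload=False)
ds.add(image='page_0001.png',
       text='a transcribed line',
       baseline=[[100, 140], [420, 138]],
       boundary=[[95, 110], [425, 110], [425, 160], [95, 160]])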
def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    if self.preload:
        x, y = self.training_set[index]
        if self.aug:
            x = x.permute((1, 2, 0)).numpy()
            o = self.aug(image=x)
            x = torch.tensor(o['image'].transpose(2, 0, 1))
        return {'image': x, 'target': y}
    else:
        item = self.training_set[index]
        try:
            logger.debug(f'Attempting to load {item[0]}')
            im = item[0][0]
            if not isinstance(im, Image.Image):
                im = Image.open(im)
            im, _ = next(extract_polygons(im, {'type': 'baselines',
                                               'lines': [{'baseline': item[0][1], 'boundary': item[0][2]}]}))
            im = self.head_transforms(im)
            if not is_bitonal(im):
                self.im_mode = im.mode
            im = self.tail_transforms(im)
            if self.aug:
                im = im.permute((1, 2, 0)).numpy()
                o = self.aug(image=im)
                im = torch.tensor(o['image'].transpose(2, 0, 1))
            return {'image': im, 'target': item[1]}
        except Exception:
            idx = np.random.randint(0, len(self.training_set))
            logger.debug(traceback.format_exc())
            logger.info(f'Failed. Replacing with sample {idx}')
            # use the index that was actually logged for the replacement sample
            return self[idx]
def transcription(ctx, text_direction, scale, bw, maxcolseps, black_colseps, font, font_style, prefill, output, images): ti = transcribe.TranscriptionInterface(font, font_style) if prefill: logger.info(u'Loading model {}'.format(prefill)) spin('Loading RNN') prefill = models.load_any(prefill) message(u'\b\u2713', fg='green', nl=False) message('\033[?25h\n', nl=False) for fp in images: logger.info('Reading {}'.format(fp.name)) spin('Reading images') im = Image.open(fp) im_bin = im if not is_bitonal(im): logger.info(u'Binarizing page') im_bin = binarization.nlbin(im) logger.info(u'Segmenting page') if bw: im = im_bin res = pageseg.segment(im_bin, text_direction, scale, maxcolseps, black_colseps) if prefill: it = rpred.rpred(prefill, im_bin, res) preds = [] for pred in it: logger.info(u'{}'.format(pred.prediction)) spin('Recognizing') preds.append(pred) message(u'\b\u2713', fg='green', nl=False) message('\033[?25h\n', nl=False) ti.add_page(im, res, records=preds) else: ti.add_page(im, res) fp.close() message(u'\b\u2713', fg='green', nl=False) message('\033[?25h\n', nl=False) logger.info('Writing transcription to {}'.format(output.name)) spin('Writing output') ti.write(output) message(u'\b\u2713', fg='green', nl=False) message('\033[?25h\n', nl=False)
def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    if self.preload:
        x, y = self.training_set[index]
        if self.aug:
            x = x.permute((1, 2, 0)).numpy()
            o = self.aug(image=x)
            x = torch.tensor(o['image'].transpose(2, 0, 1))
        return x, y
    else:
        item = self.training_set[index]
        try:
            logger.debug('Attempting to load {}'.format(item[0]))
            im = item[0][0]
            if not isinstance(im, Image.Image):
                im = Image.open(im)
            im, _ = next(extract_polygons(im, {'type': 'baselines',
                                               'lines': [{'baseline': item[0][1], 'boundary': item[0][2]}]}))
            im = self.head_transforms(im)
            if not is_bitonal(im):
                self.im_mode = im.mode
            im = self.tail_transforms(im)
            if self.aug:
                im = im.permute((1, 2, 0)).numpy()
                o = self.aug(image=im)
                im = torch.tensor(o['image'].transpose(2, 0, 1))
            return (im, item[1])
        except Exception:
            idx = np.random.randint(0, len(self.training_set))
            logger.debug('Failed. Replacing with sample {}'.format(idx))
            # use the index that was actually logged for the replacement sample
            return self[idx]
def add_loaded(self, image: Image.Image, gt: str) -> None:
    """
    Adds an already loaded line-image-text pair to the dataset.

    Args:
        image (PIL.Image.Image): Line image
        gt (str): Text contained in the line image
    """
    if self.preload:
        try:
            # transform the passed-in image; `im` was previously referenced before assignment
            im = self.head_transforms(image)
            if not is_bitonal(im):
                self.im_mode = im.mode
            im = self.tail_transforms(im)
        except ValueError:
            raise KrakenInputException(f'Image transforms failed on {image}')
        self._images.append(im)
    else:
        self._images.append(image)
    for func in self.text_transforms:
        gt = func(gt)
    self._gt.append(gt)
    self.alphabet.update(gt)
def segment(im, text_direction='horizontal-lr', scale=None, maxcolseps=2,
            black_colseps=False, no_hlines=True, pad=0, mask=None):
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not
        no_hlines (bool): Switch for horizontal line removal
        pad (int or tuple): Padding to add to line bounding boxes. If int the
                            same padding is used both left and right. If a
                            2-tuple, uses (padding_left, padding_right).
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A
        dictionary containing the text direction and a list of reading order
        sorted bounding boxes under the key 'boxes'.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info('Segmenting {}'.format(im_str))
    if im.mode != '1' and not is_bitonal(im):
        logger.error('Image {} is not bi-level'.format(im_str))
        raise KrakenInputException('Image {} is not bi-level'.format(im_str))

    # rotate input image for vertical lines
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        logger.error('Invalid text direction \'{}\''.format(text_direction))
        raise KrakenInputException('Invalid text direction {}'.format(text_direction))

    logger.debug('Rotating input image by {} degrees'.format(angle))
    im = im.rotate(angle, expand=True)

    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    a = pil2array(im)
    binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    if no_hlines:
        binary = remove_hlines(binary, scale)
    # emptyish images will cause exceptions here.
    try:
        if mask:
            if mask.mode != '1' and not is_bitonal(mask):
                logger.error('Mask is not bitonal')
                raise KrakenInputException('Mask is not bitonal')
            mask = mask.convert('1')
            if mask.size != im.size:
                logger.error('Mask size {} doesn\'t match image size {}'.format(mask.size, im.size))
                raise KrakenInputException('Mask size {} doesn\'t match image size {}'.format(mask.size, im.size))
            logger.info('Masking enabled in segmenter. Disabling column detection.')
            mask = mask.rotate(angle, expand=True)
            colseps = pil2array(mask)
        elif black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        logger.warning('Exception in column finder (probably empty image) for {}.'.format(im_str))
        return {'text_direction': text_direction, 'boxes': []}

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    segmentation = llabels * binary

    lines = compute_lines(segmentation, scale)
    order = reading_order([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]

    if isinstance(pad, int):
        pad = (pad, pad)
    lines = [(max(x[0] - pad[0], 0), x[1], min(x[2] + pad[1], im.size[0]), x[3]) for x in lines]

    return {'text_direction': text_direction,
            'boxes': rotate_lines(lines, 360 - angle, offset).tolist(),
            'script_detection': False}
def rpred(network, im, bounds, pad=16, line_normalization=True, bidi_reordering=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.

    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    im_str = get_im_str(im)
    logger.info(u'Running recognizer on {} with {} lines'.format(im_str, len(bounds['boxes'])))
    logger.debug(u'Loading line normalizer')
    lnorm = getattr(network, 'lnorm', CenterNormalizer())
    if not is_bitonal(im):
        logger.info(u'Image is grayscale. Adjusting normalizer parameters')
        lnorm.range = 2

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            logger.warning(u'bbox {} with zero dimension. Emitting empty record.'.format(coords))
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            logger.warning(u'Empty line {}. Emitting empty record.'.format(coords))
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                logger.warning(u'Dewarping for bbox {} failed. Emitting empty record.'.format(coords))
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        logger.debug(u'Preparing line.')
        line = lstm.prepare_line(line, pad)
        logger.debug(u'Performing forward pass.')
        pred = network.predictString(line)
        logger.info(u'Prediction: {}'.format(pred))

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        logger.debug(u'Extracting labels.')
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []
        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                xmin = coords[0] + int(max((start - pad) * scale, 0))
                xmax = coords[0] + max(int(min((end - pad) * scale, coords[2] - coords[0])), 1)
                pos.append((xmin, coords[1], xmax, coords[3]))
            else:
                ymin = coords[1] + int(max((start - pad) * scale, 0))
                ymax = coords[1] + max(int(min((end - pad) * scale, coords[3] - coords[1])), 1)
                pos.append((coords[0], ymin, coords[2], ymax))
            conf.append(c)
        if bidi_reordering:
            logger.debug(u'BiDi reordering record.')
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
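# Sketch of the legacy box-based recognition flow this generator implements, mirroring
# how it is driven from the transcription CLI elsewhere in this section. The model and
# page file names are placeholders, not taken from the original text.
from PIL import Image
from kraken import binarization, pageseg, rpred
from kraken.lib import models

net = models.load_any('model.mlmodel')
im = Image.open('page.png')
im_bin = binarization.nlbin(im)          # rpred expects a bi-level (or grayscale) line source
bounds = pageseg.segment(im_bin)
for record in rpred.rpred(net, im_bin, bounds):
    print(record.prediction)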
def segment(im, text_direction='horizontal-lr', scale=None, maxcolseps=2, black_colseps=False, no_hlines=True, pad=0): """ Segments a page into text lines. Segments a page into text lines and returns the absolute coordinates of each line in reading order. Args: im (PIL.Image): A bi-level page of mode '1' or 'L' text_direction (str): Principal direction of the text (horizontal-lr/rl/vertical-lr/rl) scale (float): Scale of the image maxcolseps (int): Maximum number of whitespace column separators black_colseps (bool): Whether column separators are assumed to be vertical black lines or not no_hlines (bool): Switch for horizontal line removal pad (int or tuple): Padding to add to line bounding boxes. If int the same padding is used both left and right. If a 2-tuple, uses (padding_left, padding_right). Returns: {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A dictionary containing the text direction and a list of reading order sorted bounding boxes under the key 'boxes'. Raises: KrakenInputException if the input image is not binarized or the text direction is invalid. """ im_str = get_im_str(im) logger.info(u'Segmenting {}'.format(im_str)) if im.mode != '1' and not is_bitonal(im): logger.error(u'Image {} is not bi-level'.format(im_str)) raise KrakenInputException('Image {} is not bi-level'.format(im_str)) # rotate input image for vertical lines if text_direction.startswith('horizontal'): angle = 0 offset = (0, 0) elif text_direction == 'vertical-lr': angle = 270 offset = (0, im.size[1]) elif text_direction == 'vertical-rl': angle = 90 offset = (im.size[0], 0) else: logger.error(u'Invalid text direction \'{}\''.format(text_direction)) raise KrakenInputException( 'Invalid text direction {}'.format(text_direction)) logger.debug(u'Rotating input image by {} degrees'.format(angle)) im = im.rotate(angle, expand=True) # honestly I've got no idea what's going on here. In theory a simple # np.array(im, 'i') should suffice here but for some reason the # tostring/fromstring magic in pil2array alters the array in a way that is # needed for the algorithm to work correctly. a = pil2array(im) binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i') binary = 1 - binary if not scale: scale = estimate_scale(binary) if no_hlines: binary = remove_hlines(binary, scale) # emptyish images wll cause exceptions here. try: if black_colseps: colseps, binary = compute_black_colseps(binary, scale, maxcolseps) else: colseps = compute_white_colseps(binary, scale, maxcolseps) except ValueError: logger.warning( u'Exception in column finder (probably empty image) for {}.'. format(im_str)) return {'text_direction': text_direction, 'boxes': []} bottom, top, boxmap = compute_gradmaps(binary, scale) seeds = compute_line_seeds(binary, bottom, top, colseps, scale) llabels = morph.propagate_labels(boxmap, seeds, conflict=0) spread = morph.spread_labels(seeds, maxdist=scale) llabels = np.where(llabels > 0, llabels, spread * binary) segmentation = llabels * binary lines = compute_lines(segmentation, scale) order = reading_order([l.bounds for l in lines], text_direction[-2:]) lsort = topsort(order) lines = [lines[i].bounds for i in lsort] lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines] if isinstance(pad, int): pad = (pad, pad) lines = [(max(x[0] - pad[0], 0), x[1], min(x[2] + pad[1], im.size[0]), x[3]) for x in lines] return { 'text_direction': text_direction, 'boxes': rotate_lines(lines, 360 - angle, offset).tolist(), 'script_detection': False }
def compute_segmentation_map(im: PIL.Image.Image,
                             mask: Optional[np.ndarray] = None,
                             model: vgsl.TorchVGSLModel = None,
                             device: str = 'cpu') -> Dict[str, Any]:
    """
    Computes the raw segmentation map for an image.

    Args:
        im: Input image
        mask: A bi-level mask array of the same size as `im` where 0-valued
              regions are ignored for segmentation purposes. Disables column
              detection.
        model: A TorchVGSLModel containing a segmentation model.
        device: The target device to run the neural network on.

    Returns:
        A dictionary containing the heatmaps ('heatmap', torch.Tensor), class
        map ('cls_map', Dict[str, Dict[str, int]]), the bounding regions for
        polygonization purposes ('bounding_regions', List[str]), the scale
        between the input image and the network output ('scale', float), and
        the scaled input image to the network ('scal_im', PIL.Image.Image).

    Raises:
        KrakenInputException: When given an invalid mask.
    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if model.input[1] == 1 and model.one_channel_mode == '1' and not is_bitonal(im):
        logger.warning('Running binary model on non-binary input image '
                       '(mode {}). This will result in severely degraded '
                       'performance'.format(im.mode))

    model.eval()
    model.to(device)

    batch, channels, height, width = model.input
    transforms = dataset.ImageInputTransforms(batch, height, width, channels, 0, valid_norm=False)
    tf_idx, _ = next(filter(lambda x: isinstance(x[1], tf.ToTensor), enumerate(transforms.transforms)))
    res_tf = tf.Compose(transforms.transforms[:tf_idx])
    scal_im = np.array(res_tf(im).convert('L'))

    tensor_im = transforms(im)
    if mask:
        if mask.mode != '1' and not is_bitonal(mask):
            logger.error('Mask is not bitonal')
            raise KrakenInputException('Mask is not bitonal')
        mask = mask.convert('1')
        if mask.size != im.size:
            logger.error(f'Mask size {mask.size} doesn\'t match image size {im.size}')
            raise KrakenInputException(f'Mask size {mask.size} doesn\'t match image size {im.size}')
        logger.info('Masking enabled in segmenter.')
        tensor_im[~transforms(mask).bool()] = 0

    with torch.no_grad():
        logger.debug('Running network forward pass')
        o, _ = model.nn(tensor_im.unsqueeze(0).to(device))
    logger.debug('Upsampling network output')
    o = F.interpolate(o, size=scal_im.shape)
    o = o.squeeze().cpu().numpy()
    scale = np.divide(im.size, o.shape[:0:-1])
    bounding_regions = model.user_metadata['bounding_regions'] if 'bounding_regions' in model.user_metadata else None
    return {'heatmap': o,
            'cls_map': model.user_metadata['class_mapping'],
            'bounding_regions': bounding_regions,
            'scale': scale,
            'scal_im': scal_im}
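# Rough usage sketch following the docstring above; the model path is a placeholder
# and the model is loaded through TorchVGSLModel, as in the baseline segmenter below.
from PIL import Image
from kraken.lib import vgsl

seg_model = vgsl.TorchVGSLModel.load_model('blla.mlmodel')
page = Image.open('page.png')
seg = compute_segmentation_map(page, model=seg_model, device='cpu')
heatmap = seg['heatmap']        # per-class probability maps
cls_map = seg['cls_map']        # class mapping stored in the model metadata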
def segment(im,
            text_direction: str = 'horizontal-lr',
            mask: Optional[np.array] = None,
            reading_order_fn: Callable = polygonal_reading_order,
            model=None,
            device: str = 'cpu'):
    """
    Segments a page into text lines using the baseline segmenter.

    Segments a page into text lines and returns the polyline formed by each
    baseline and their estimated environment.

    Args:
        im (PIL.Image): An RGB image.
        text_direction (str): Ignored by the segmenter but kept for
                              serialization.
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.
        reading_order_fn (function): Function to determine the reading order.
                                     Has to accept a list of tuples (baselines,
                                     polygon) and a text direction (`lr` or
                                     `rl`).
        model (vgsl.TorchVGSLModel): A TorchVGSLModel containing a segmentation
                                     model. If none is given a default model
                                     will be loaded.
        device (str or torch.Device): The target device to run the neural
                                      network on.

    Returns:
        {'text_direction': '$dir',
         'type': 'baseline',
         'lines': [
            {'baseline': [[x0, y0], [x1, y1], ..., [x_n, y_n]], 'boundary': [[x0, y0, x1, y1], ... [x_m, y_m]]},
            {'baseline': [[x0, ...]], 'boundary': [[x0, ...]]}
          ],
         'regions': [
            {'region': [[x0, y0], [x1, y1], ..., [x_n, y_n]], 'type': 'image'},
            {'region': [[x0, ...]], 'type': 'text'}
          ]
        }: A dictionary containing the text direction and under the key 'lines'
        a list of reading order sorted baselines (polylines) and their
        respective polygonal boundaries. The last and first point of each
        boundary polygon are connected.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if model is None:
        logger.info('No segmentation model given. Loading default model.')
        model = vgsl.TorchVGSLModel.load_model(pkg_resources.resource_filename(__name__, 'blla.mlmodel'))

    if model.one_channel_mode == '1' and not is_bitonal(im):
        logger.warning('Running binary model on non-binary input image '
                       '(mode {}). This will result in severely degraded '
                       'performance'.format(im.mode))

    model.eval()
    model.to(device)

    if mask:
        if mask.mode != '1' and not is_bitonal(mask):
            logger.error('Mask is not bitonal')
            raise KrakenInputException('Mask is not bitonal')
        mask = mask.convert('1')
        if mask.size != im.size:
            logger.error(f'Mask size {mask.size} doesn\'t match image size {im.size}')
            raise KrakenInputException(f'Mask size {mask.size} doesn\'t match image size {im.size}')
        logger.info('Masking enabled in segmenter.')
        mask = pil2array(mask)

    batch, channels, height, width = model.input
    transforms = dataset.generate_input_transforms(batch, height, width, channels, 0, valid_norm=False)
    res_tf = tf.Compose(transforms.transforms[:3])
    scal_im = res_tf(im).convert('L')

    with torch.no_grad():
        logger.debug('Running network forward pass')
        o = model.nn(transforms(im).unsqueeze(0).to(device))
    logger.debug('Upsampling network output')
    o = F.interpolate(o, size=scal_im.size[::-1])
    o = o.squeeze().cpu().numpy()
    scale = np.divide(im.size, o.shape[:0:-1])
    # postprocessing
    cls_map = model.user_metadata['class_mapping']
    st_sep = cls_map['aux']['_start_separator']
    end_sep = cls_map['aux']['_end_separator']

    logger.info('Vectorizing baselines')
    baselines = []
    regions = {}
    for bl_type, idx in cls_map['baselines'].items():
        logger.debug(f'Vectorizing lines of type {bl_type}')
        baselines.extend([(bl_type, x) for x in vectorize_lines(o[(st_sep, end_sep, idx), :, :])])
    logger.info('Vectorizing regions')
    for region_type, idx in cls_map['regions'].items():
        logger.debug(f'Vectorizing regions of type {region_type}')
        regions[region_type] = vectorize_regions(o[idx])
    logger.debug('Polygonizing lines')
    lines = list(filter(lambda x: x[2] is not None,
                        zip([x[0] for x in baselines],
                            [x[1] for x in baselines],
                            calculate_polygonal_environment(scal_im, [x[1] for x in baselines]))))
    logger.debug('Scaling vectorized lines')
    sc = scale_polygonal_lines([x[1:] for x in lines], scale)
    lines = list(zip([x[0] for x in lines], [x[0] for x in sc], [x[1] for x in sc]))
    logger.debug('Scaling vectorized regions')
    for reg_id, regs in regions.items():
        regions[reg_id] = scale_regions(regs, scale)
    logger.debug('Reordering baselines')
    order_regs = []
    for regs in regions.values():
        order_regs.extend(regs)
    lines = reading_order_fn(lines=lines, regions=order_regs, text_direction=text_direction[-2:])

    if 'class_mapping' in model.user_metadata and len(model.user_metadata['class_mapping']['baselines']) > 1:
        script_detection = True
    else:
        script_detection = False

    return {'text_direction': text_direction,
            'type': 'baselines',
            'lines': [{'script': bl_type, 'baseline': bl, 'boundary': pl} for bl_type, bl, pl in lines],
            'regions': regions,
            'script_detection': script_detection}
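# Sketch of consuming the baseline segmenter output described in the docstring above;
# only keys the function itself returns are accessed, and the input path is a placeholder.
from PIL import Image

page = Image.open('page.png').convert('RGB')
res = segment(page, device='cpu')        # loads the default model when none is given
for line in res['lines']:
    baseline, boundary = line['baseline'], line['boundary']
for reg_type, polygons in res['regions'].items():
    pass                                 # region polygons grouped by type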
def nlbin(im: Image.Image,
          threshold: float = 0.5,
          zoom: float = 0.5,
          escale: float = 1.0,
          border: float = 0.1,
          perc: int = 80,
          range: int = 20,
          low: int = 5,
          high: int = 90) -> Image.Image:
    """
    Performs binarization using non-linear processing.

    Args:
        im (PIL.Image.Image):
        threshold (float):
        zoom (float): Zoom for background page estimation
        escale (float): Scale for estimating a mask over the text region
        border (float): Ignore this much of the border
        perc (int): Percentage for filters
        range (int): Range for filters
        low (int): Percentile for black estimation
        high (int): Percentile for white estimation

    Returns:
        PIL.Image containing the binarized image

    Raises:
        KrakenInputException when trying to binarize an empty image.
    """
    im_str = get_im_str(im)
    logger.info(f'Binarizing {im_str}')
    if is_bitonal(im):
        logger.info(f'Skipping binarization because {im_str} is bitonal.')
        return im
    # convert to grayscale first
    logger.debug(f'Converting {im_str} to grayscale')
    im = im.convert('L')
    raw = pil2array(im)
    logger.debug('Scaling and normalizing')
    # rescale image to between -1 or 0 and 1
    # (np.float was removed from NumPy; the builtin float is equivalent here)
    raw = raw / float(np.iinfo(raw.dtype).max)
    # perform image normalization
    if np.amax(raw) == np.amin(raw):
        logger.warning(f'Trying to binarize empty image {im_str}')
        raise KrakenInputException('Image is empty')
    image = raw - np.amin(raw)
    image /= np.amax(image)

    logger.debug('Interpolation and percentile filtering')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        m = interpolation.zoom(image, zoom)
        m = filters.percentile_filter(m, perc, size=(range, 2))
        m = filters.percentile_filter(m, perc, size=(2, range))
    mh, mw = m.shape
    oh, ow = image.shape
    scale = np.diag([mh * 1.0 / oh, mw * 1.0 / ow])
    m = affine_transform(m, scale, output_shape=image.shape)
    w, h = np.minimum(np.array(image.shape), np.array(m.shape))
    flat = np.clip(image[:w, :h] - m[:w, :h] + 1, 0, 1)

    # estimate low and high thresholds
    d0, d1 = flat.shape
    o0, o1 = int(border * d0), int(border * d1)
    est = flat[o0:d0 - o0, o1:d1 - o1]
    logger.debug('Threshold estimates {}'.format(est))
    # by default, we use only regions that contain
    # significant variance; this makes the percentile
    # based low and high estimates more reliable
    logger.debug('Refine estimates')
    v = est - filters.gaussian_filter(est, escale * 20.0)
    v = filters.gaussian_filter(v ** 2, escale * 20.0) ** 0.5
    v = (v > 0.3 * np.amax(v))
    v = morphology.binary_dilation(v, structure=np.ones((int(escale * 50), 1)))
    v = morphology.binary_dilation(v, structure=np.ones((1, int(escale * 50))))
    est = est[v]
    lo = np.percentile(est.ravel(), low)
    hi = np.percentile(est.ravel(), high)
    flat -= lo
    flat /= (hi - lo)
    flat = np.clip(flat, 0, 1)
    logger.debug(f'Thresholding at {threshold}')
    bin = np.array(255 * (flat > threshold), 'B')
    return array2pil(bin)
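# Minimal binarization example matching how nlbin is invoked from the transcription
# CLI in this section; the file names are placeholders.
from PIL import Image
from kraken import binarization

im = Image.open('page.png')
bw = binarization.nlbin(im)      # returns the input unchanged if it is already bitonal
bw.save('page.bin.png')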
def __init__(self,
             nets: Dict[str, TorchSeqRecognizer],
             im: Image.Image,
             bounds: dict,
             pad: int = 16,
             bidi_reordering: bool = True,
             script_ignore: Optional[List[str]] = None) -> Generator[ocr_record, None, None]:
    """
    Multi-model version of kraken.rpred.rpred.

    Takes a dictionary of ISO15924 script identifiers->models and a
    script-annotated segmentation to dynamically select appropriate models for
    these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to TorchSeqRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       lists of coordinates (script, (x0, y0, x1, y1)) of a
                       text line in the image and an entry 'text_direction'
                       containing 'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
        script_ignore (list): List of scripts to ignore during recognition

    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.

    Raises:
        KrakenInputException if the mapping between segmentation scripts and
        networks is incomplete.
    """
    seg_types = set(recognizer.seg_type for recognizer in nets.values())
    if ('type' in bounds and bounds['type'] not in seg_types) or len(seg_types) > 1:
        logger.warning('Recognizers with segmentation types {} will be '
                       'applied to segmentation of type {}. This will likely result '
                       'in severely degraded performance'.format(seg_types,
                                                                  bounds['type'] if 'type' in bounds else None))
    one_channel_modes = set(recognizer.nn.one_channel_mode for recognizer in nets.values())
    if '1' in one_channel_modes and len(one_channel_modes) > 1:
        raise KrakenInputException('Mixing binary and non-binary recognition models is not supported.')
    elif '1' in one_channel_modes and not is_bitonal(im):
        logger.warning('Running binary models on non-binary input image '
                       '(mode {}). This will result in severely degraded '
                       'performance'.format(im.mode))
    if 'type' in bounds and bounds['type'] == 'baselines':
        valid_norm = False
        self.len = len(bounds['lines'])
        self.seg_key = 'lines'
        self.next_iter = self._recognize_baseline_line
        self.line_iter = iter(bounds['lines'])
        scripts = [x['script'] for x in bounds['lines']]
    else:
        valid_norm = True
        self.len = len(bounds['boxes'])
        self.seg_key = 'boxes'
        self.next_iter = self._recognize_box_line
        self.line_iter = iter(bounds['boxes'])
        scripts = [x[0] for line in bounds['boxes'] for x in line]

    im_str = get_im_str(im)
    logger.info('Running {} multi-script recognizers on {} with {} lines'.format(len(nets), im_str, self.len))

    miss = [script for script in scripts if not nets.get(script)]
    if miss and not isinstance(nets, defaultdict):
        raise KrakenInputException('Missing models for scripts {}'.format(set(miss)))

    # build dictionary for line preprocessing
    self.ts = {}
    for script in scripts:
        logger.debug('Loading line transforms for {}'.format(script))
        network = nets[script]
        batch, channels, height, width = network.nn.input
        self.ts[script] = generate_input_transforms(batch, height, width, channels, pad, valid_norm)

    self.im = im
    self.nets = nets
    self.bidi_reordering = bidi_reordering
    self.pad = pad
    self.bounds = bounds
    self.script_ignore = script_ignore
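# Hedged sketch of driving the multi-model recognizer whose constructor is shown above,
# assuming the __init__ belongs to kraken's mm_rpred class and that iterating the
# instance yields ocr_records as the docstring states. Model paths and script tags are
# illustrative; `im` and `bounds` are assumed to come from a segmenter.
from collections import defaultdict
from kraken.lib import models

default_net = models.load_any('default.mlmodel')
nets = defaultdict(lambda: default_net)          # recommended: fall back to one model
nets['Latn'] = models.load_any('latin.mlmodel')
nets['Arab'] = models.load_any('arabic.mlmodel')
for record in mm_rpred(nets, im, bounds):
    print(record.prediction)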
def mm_rpred(nets, im, bounds, pad=16, line_normalization=True,
             bidi_reordering=True, script_ignore=None):
    """
    Multi-model version of kraken.rpred.rpred.

    Takes a dictionary of ISO15924 script identifiers->models and a
    script-annotated segmentation to dynamically select appropriate models for
    these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to SegRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       lists of coordinates (script, (x0, y0, x1, y1)) of a
                       text line in the image and an entry 'text_direction'
                       containing 'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
        script_ignore (list): List of scripts to ignore during recognition

    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    im_str = get_im_str(im)
    logger.info(u'Running {} multi-script recognizers on {} with {} lines'.format(len(nets), im_str, len(bounds['boxes'])))

    for line in bounds['boxes']:
        rec = ocr_record('', [], [])
        for script, (box, coords) in zip(map(lambda x: x[0], line),
                                         extract_boxes(im, {'text_direction': bounds['text_direction'],
                                                            'boxes': map(lambda x: x[1], line)})):
            # skip if script is set to ignore
            if script_ignore is not None and script in script_ignore:
                logger.info(u'Ignoring {} line segment.'.format(script))
                continue
            # check if boxes are non-zero in any dimension
            if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
                logger.warning(u'Run with zero dimension. Skipping.')
                continue
            raw_line = pil2array(box)
            # check if line is non-zero
            if np.amax(raw_line) == np.amin(raw_line):
                logger.warning(u'Empty run. Skipping.')
                continue
            if line_normalization:
                # fail gracefully and return no recognition result in case the
                # input line can not be normalized.
                try:
                    lnorm = getattr(nets[script], 'lnorm', CenterNormalizer())
                    if not is_bitonal(im):
                        logger.info(u'Image is grayscale. Adjusting normalizer parameters')
                        lnorm.range = 2
                    box = dewarp(lnorm, box)
                except Exception as e:
                    logger.warning(u'Dewarping for bbox {} failed. Skipping.'.format(coords))
                    continue
            line = pil2array(box)
            logger.debug(u'Preparing run.')
            line = lstm.prepare_line(line, pad)
            logger.debug(u'Forward pass with model {}'.format(script))
            pred = nets[script].predictString(line)
            logger.info(u'Prediction: {}'.format(pred))

            # calculate recognized LSTM locations of characters
            scale = len(raw_line.T) / (len(nets[script].outputs) - 2 * pad)
            logger.debug(u'Extracting labels.')
            result = lstm.translate_back_locations(nets[script].outputs)
            pos = []
            conf = []
            for _, start, end, c in result:
                if bounds['text_direction'].startswith('horizontal'):
                    pos.append((coords[0] + int(max(start - pad, 0) * scale),
                                coords[1],
                                coords[0] + int(min(end - pad, coords[2]) * scale),
                                coords[3]))
                else:
                    pos.append((coords[0],
                                coords[1] + int(max(start - pad, 0) * scale),
                                coords[2],
                                coords[1] + int(min(end - pad, coords[3]) * scale)))
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if bidi_reordering:
            logger.debug(u'BiDi reordering record.')
            yield bidi_record(rec)
        else:
            yield rec
def transcription(ctx, text_direction, scale, bw, maxcolseps, black_colseps, font, font_style, prefill, pad, lines, output, images): from PIL import Image from kraken import rpred from kraken import pageseg from kraken import transcribe from kraken import binarization from kraken.lib import models from kraken.lib.util import is_bitonal ti = transcribe.TranscriptionInterface(font, font_style) if len(images) > 1 and lines: raise click.UsageError( '--lines option is incompatible with multiple image files') if prefill: logger.info('Loading model {}'.format(prefill)) message('Loading RNN', nl=False) prefill = models.load_any(prefill) message('\u2713', fg='green') with log.progressbar(images, label='Reading images') as bar: for fp in bar: logger.info('Reading {}'.format(fp.name)) im = Image.open(fp) im_bin = im if not is_bitonal(im): logger.info('Binarizing page') im_bin = binarization.nlbin(im) logger.info('Segmenting page') if bw: im = im_bin if not lines: res = pageseg.segment(im_bin, text_direction, scale, maxcolseps, black_colseps, pad=pad) else: with open_file(lines, 'r') as fp: try: fp = cast(IO[Any], fp) res = json.load(fp) except ValueError as e: raise click.UsageError( '{} invalid segmentation: {}'.format( lines, str(e))) if prefill: it = rpred.rpred(prefill, im_bin, res) preds = [] logger.info('Recognizing') for pred in it: logger.debug('{}'.format(pred.prediction)) preds.append(pred) ti.add_page(im, res, records=preds) else: ti.add_page(im, res) fp.close() logger.info('Writing transcription to {}'.format(output.name)) message('Writing output', nl=False) ti.write(output) message('\u2713', fg='green')
def segment(im,
            text_direction: str = 'horizontal-lr',
            scale: Optional[float] = None,
            maxcolseps: float = 2,
            black_colseps: bool = False,
            no_hlines: bool = True,
            pad: int = 0,
            mask: Optional[np.array] = None,
            reading_order_fn: Callable = reading_order) -> Dict[str, Any]:
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not
        no_hlines (bool): Switch for horizontal line removal
        pad (int or tuple): Padding to add to line bounding boxes. If int the
                            same padding is used both left and right. If a
                            2-tuple, uses (padding_left, padding_right).
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.
        reading_order_fn (Callable): Function to call to order line output.
                                     Callable accepting a list of slices (y, x)
                                     and a text direction in (`rl`, `lr`).

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A
        dictionary containing the text direction and a list of reading order
        sorted bounding boxes under the key 'boxes'.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if im.mode != '1' and not is_bitonal(im):
        logger.error(f'Image {im_str} is not bi-level')
        raise KrakenInputException(f'Image {im_str} is not bi-level')

    # rotate input image for vertical lines
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        logger.error(f'Invalid text direction \'{text_direction}\'')
        raise KrakenInputException(f'Invalid text direction {text_direction}')

    logger.debug(f'Rotating input image by {angle} degrees')
    im = im.rotate(angle, expand=True)

    a = pil2array(im)
    binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    if no_hlines:
        binary = remove_hlines(binary, scale)
    # emptyish images will cause exceptions here.
    try:
        if mask:
            if mask.mode != '1' and not is_bitonal(mask):
                logger.error('Mask is not bitonal')
                raise KrakenInputException('Mask is not bitonal')
            mask = mask.convert('1')
            if mask.size != im.size:
                logger.error(f'Mask size {mask.size} doesn\'t match image size {im.size}')
                raise KrakenInputException(f'Mask size {mask.size} doesn\'t match image size {im.size}')
            logger.info('Masking enabled in segmenter. Disabling column detection.')
            mask = mask.rotate(angle, expand=True)
            colseps = pil2array(mask)
        elif black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        logger.warning(f'Exception in column finder (probably empty image) for {im_str}')
        return {'text_direction': text_direction, 'boxes': []}

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    segmentation = llabels * binary

    lines = compute_lines(segmentation, scale)
    order = reading_order_fn([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]

    if isinstance(pad, int):
        pad = (pad, pad)
    lines = [(max(x[0] - pad[0], 0), x[1], min(x[2] + pad[1], im.size[0]), x[3]) for x in lines]

    return {'text_direction': text_direction,
            'boxes': rotate_lines(lines, 360 - angle, offset).tolist(),
            'script_detection': False}
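# Usage sketch for the box segmenter above; it requires a bitonal page, so the image
# is binarized first. File names and the padding value are placeholders.
from PIL import Image
from kraken import binarization

im_bin = binarization.nlbin(Image.open('page.png'))
seg = segment(im_bin, text_direction='horizontal-lr', pad=5)
for x0, y0, x1, y1 in seg['boxes']:
    pass    # line bounding boxes in reading order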
def nlbin(im: Image.Image, threshold: float = 0.5, zoom: float = 0.5, escale: float = 1.0, border: float = 0.1, perc: int = 80, range: int = 20, low: int = 5, high: int = 90) -> Image: """ Performs binarization using non-linear processing. Args: im (PIL.Image.Image): threshold (float): zoom (float): Zoom for background page estimation escale (float): Scale for estimating a mask over the text region border (float): Ignore this much of the border perc (int): Percentage for filters range (int): Range for filters low (int): Percentile for black estimation high (int): Percentile for white estimation Returns: PIL.Image containing the binarized image Raises: KrakenInputException when trying to binarize an empty image. """ im_str = get_im_str(im) logger.info('Binarizing {}'.format(im_str)) if is_bitonal(im): logger.info('Skipping binarization because {} is bitonal.'.format(im_str)) return im # convert to grayscale first logger.debug('Converting {} to grayscale'.format(im_str)) im = im.convert('L') raw = pil2array(im) logger.debug('Scaling and normalizing') # rescale image to between -1 or 0 and 1 raw = raw/np.float(np.iinfo(raw.dtype).max) # perform image normalization if np.amax(raw) == np.amin(raw): logger.warning('Trying to binarize empty image {}'.format(im_str)) raise KrakenInputException('Image is empty') image = raw-np.amin(raw) image /= np.amax(image) logger.debug('Interpolation and percentile filtering') with warnings.catch_warnings(): warnings.simplefilter('ignore', UserWarning) m = interpolation.zoom(image, zoom) m = filters.percentile_filter(m, perc, size=(range, 2)) m = filters.percentile_filter(m, perc, size=(2, range)) mh, mw = m.shape oh, ow = image.shape scale = np.diag([mh * 1.0/oh, mw * 1.0/ow]) m = affine_transform(m, scale, output_shape=image.shape) w, h = np.minimum(np.array(image.shape), np.array(m.shape)) flat = np.clip(image[:w, :h]-m[:w, :h]+1, 0, 1) # estimate low and high thresholds d0, d1 = flat.shape o0, o1 = int(border*d0), int(border*d1) est = flat[o0:d0-o0, o1:d1-o1] logger.debug('Threshold estimates {}'.format(est)) # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable logger.debug('Refine estimates') v = est-filters.gaussian_filter(est, escale*20.0) v = filters.gaussian_filter(v**2, escale*20.0)**0.5 v = (v > 0.3*np.amax(v)) v = morphology.binary_dilation(v, structure=np.ones((int(escale * 50), 1))) v = morphology.binary_dilation(v, structure=np.ones((1, int(escale * 50)))) est = est[v] lo = np.percentile(est.ravel(), low) hi = np.percentile(est.ravel(), high) flat -= lo flat /= (hi-lo) flat = np.clip(flat, 0, 1) logger.debug('Thresholding at {}'.format(threshold)) bin = np.array(255*(flat > threshold), 'B') return array2pil(bin)