def cli(format_type, model, repolygonize, files):
    """
    A small script extracting rectified line polygons as defined in either ALTO or
    PageXML files or run a model to do the same.
    """
    # NOTE: the docstring above is kept verbatim; it is presumably rendered as
    # the command's help text (the click decorators are outside this view).
    #
    # Arguments as used below:
    #   format_type  -- forwarded to xml.preparse_xml_data
    #   model        -- None to read line polygons from the XML files,
    #                   otherwise a model path loaded with TorchVGSLModel
    #   repolygonize -- forwarded to xml.preparse_xml_data
    #   files        -- XML documents (no model) or page images (model)
    if not files:
        ctx = click.get_current_context()
        click.echo(ctx.get_help())
        ctx.exit()

    # Imports are deferred so that printing the help text stays cheap.
    from PIL import Image
    from os.path import splitext

    from kraken import blla
    from kraken.lib import dataset, segmentation, vgsl, xml

    if model is None:
        for doc in files:
            click.echo(f'Processing {doc} ', nl=False)
            data = xml.preparse_xml_data([doc], format_type, repolygonize=repolygonize)
            if len(data) == 0:
                continue
            image_path = data[0]['image']
            stem = splitext(image_path)[0]
            bounds = {'type': 'baselines',
                      'lines': [{'boundary': t['boundary'],
                                 'baseline': t['baseline'],
                                 'text': t['text']} for t in data]}
            # Keep the page image open only while its lines are extracted so
            # the file handle is released before the next document.
            with Image.open(image_path) as page_im:
                for idx, (im, box) in enumerate(segmentation.extract_polygons(page_im, bounds)):
                    click.echo('.', nl=False)
                    im.save(f'{stem}.{idx}.jpg')
                    # Transcriptions may contain arbitrary Unicode; do not
                    # depend on the platform's default encoding.
                    with open(f'{stem}.{idx}.gt.txt', 'w', encoding='utf-8') as fp:
                        fp.write(box['text'])
    else:
        net = vgsl.TorchVGSLModel.load_model(model)
        for doc in files:
            click.echo(f'Processing {doc} ', nl=False)
            stem = splitext(doc)[0]
            with Image.open(doc) as full_im:
                bounds = blla.segment(full_im, model=net)
                for idx, (im, box) in enumerate(segmentation.extract_polygons(full_im, bounds)):
                    click.echo('.', nl=False)
                    im.save(f'{stem}.{idx}.jpg')
def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Returns the sample at position `index`.

    In preload mode the stored `(image, target)` pair is returned directly.
    Otherwise the line is cut out of the page image on the fly with
    `extract_polygons` and passed through the head and tail transforms.
    When `self.aug` is set it is applied to the image in both modes.

    If on-the-fly loading fails for any reason, a randomly chosen sample is
    returned in its place.

    Args:
        index: Position in `self.training_set`.

    Returns:
        A dict with the keys `image` and `target` (note: the return
        annotation says tuple, the value actually returned is a dict).
    """
    def _augment(tensor):
        # Move the first axis to the end for the augmenter and back again
        # afterwards.
        arr = tensor.permute((1, 2, 0)).numpy()
        out = self.aug(image=arr)
        return torch.tensor(out['image'].transpose(2, 0, 1))

    if self.preload:
        x, y = self.training_set[index]
        if self.aug:
            x = _augment(x)
        return {'image': x, 'target': y}

    item = self.training_set[index]
    try:
        logger.debug(f'Attempting to load {item[0]}')
        # item[0] is (image or path, baseline, boundary)
        im = item[0][0]
        if not isinstance(im, Image.Image):
            im = Image.open(im)
        im, _ = next(extract_polygons(im, {'type': 'baselines',
                                           'lines': [{'baseline': item[0][1],
                                                      'boundary': item[0][2]}]}))
        im = self.head_transforms(im)
        if not is_bitonal(im):
            self.im_mode = im.mode
        im = self.tail_transforms(im)
        if self.aug:
            im = _augment(im)
        return {'image': im, 'target': item[1]}
    except Exception:
        idx = np.random.randint(0, len(self.training_set))
        logger.debug(traceback.format_exc())
        logger.info(f'Failed. Replacing with sample {idx}')
        # Return the sample that was just logged rather than drawing a
        # second, unrelated random index.
        return self[idx]
def _extract_line(xml_record):
    """
    Extracts all transcribed lines of one parsed page as PNG byte strings.

    The segmentation list (`lines` or, if absent, `boxes`) is popped from
    `xml_record`, i.e. the caller's dict is modified. Each entry is then
    extracted on its own with `extract_polygons`; entries that raise
    `KrakenInputException` or have no text are skipped.

    Args:
        xml_record: Dict with at least an `image` entry and a `lines` or
                    `boxes` list.

    Returns:
        A tuple `(lines, mode)`. `lines` is a list of dicts with the keys
        `text` and `im` (PNG-encoded line image). `mode` is the mode of the
        page image, or None when the image file does not exist (in which
        case `lines` is empty).
    """
    lines = []
    try:
        im = Image.open(xml_record['image'])
    except FileNotFoundError:
        # Same arity as the regular return so callers can always unpack two
        # values.
        return lines, None
    # Remember the name now: the image returned by convert() below does not
    # carry the `filename` attribute of the opened file.
    im_name = im.filename
    if is_bitonal(im):
        im = im.convert('1')
    # NOTE(review): these per-split counts are collected but not returned or
    # otherwise used in this function -- confirm whether they should be.
    line_counts = Counter({'all': 0, 'train': 0, 'validation': 0, 'test': 0})
    seg_key = 'lines' if 'lines' in xml_record else 'boxes'
    recs = xml_record.pop(seg_key)
    for idx, rec in enumerate(recs):
        try:
            # Extract one line at a time so a single invalid line does not
            # abort the whole page.
            line_im, line = next(extract_polygons(im, {**xml_record, seg_key: [rec]}))
        except KrakenInputException:
            logger.warning(f'Invalid line {idx} in {im_name}')
            continue
        if not line['text']:
            continue
        fp = io.BytesIO()
        line_im.save(fp, format='png')
        if line['split']:
            line_counts[line['split']] += 1
        else:
            line_counts['all'] += 1
        lines.append({'text': line['text'], 'im': fp.getvalue()})
    return lines, im.mode
def _recognize_baseline_line(self, line):
    """
    Recognizes a single baseline-type line and returns its record.

    An empty `ocr_record` is returned when one of the line's tags is in
    `self.tags_ignore`, when polygon extraction fails, when the extracted
    image has a zero dimension, when the input transform raises, or when
    the transformed line is constant-valued.

    Side effects: sets `self.box`, `self.net_scale` and `self.in_scale`,
    which `self._scale_val` is expected to read.

    Args:
        line: Segmentation dict whose `lines` list holds the line to
              process as its first element.

    Returns:
        The record for the line, passed through `bidi_record` when
        `self.bidi_reordering` is truthy.
    """
    first_line = line['lines'][0]

    if self.tags_ignore is not None:
        line_tags = first_line['tags']
        for tag in line_tags.values():
            if tag not in self.tags_ignore:
                continue
            logger.info(f'Ignoring line segment with tags {line_tags} based on {tag}.')
            return ocr_record('', [], [], first_line)

    try:
        box, coords = next(extract_polygons(self.im, line))
    except KrakenInputException as e:
        logger.warning(f'Extracting line failed: {e}')
        return ocr_record('', [], [], line['lines'][0])

    self.box = box
    tag, net = self._resolve_tags_to_model(coords['tags'], self.nets)

    # an image with a zero-sized side cannot be fed to the network
    if 0 in box.size:
        logger.warning(f'bbox {coords} with zero dimension. Emitting empty record.')
        return ocr_record('', [], [], coords)

    # convert the line image into the network's input tensor
    try:
        tensor = self.ts[tag](box)
    except Exception:
        return ocr_record('', [], [], coords)

    # a constant-valued input carries no content
    if tensor.max() == tensor.min():
        return ocr_record('', [], [], coords)

    preds = net.predict(tensor.unsqueeze(0))[0]

    # ratio of network input width to network output width
    self.net_scale = tensor.shape[2] / net.outputs.shape[2]
    # ratio of original line width to unpadded network input width
    self.in_scale = box.size[0] / (tensor.shape[2] - 2 * self.pad)

    # XXX: fix bounding box calculation ocr_record for multi-codepoint labels.
    pred = ''.join(p[0] for p in preds)
    pos = []
    conf = []
    for _, start, end, c in preds:
        cut_start = self._scale_val(start, 0, self.box.size[0])
        cut_end = self._scale_val(end, 0, self.box.size[0])
        pos.append(compute_polygon_section(coords['baseline'],
                                           coords['boundary'],
                                           cut_start,
                                           cut_end))
        conf.append(c)

    if not self.bidi_reordering:
        logger.debug('Emitting raw record')
        return ocr_record(pred, pos, conf, coords)

    logger.debug('BiDi reordering record.')
    # only explicit 'L'/'R' values are forwarded as base direction
    base_dir = self.bidi_reordering if self.bidi_reordering in ('L', 'R') else None
    return bidi_record(ocr_record(pred, pos, conf, coords), base_dir=base_dir)
def _recognize_baseline_line(self, line):
    """
    Recognizes a single baseline-type line with the model selected by the
    line's `script` entry.

    An empty `ocr_record` is returned when polygon extraction fails, when
    the extracted image has a zero dimension, when the input transform
    raises, or when the transformed line is constant-valued.

    Args:
        line: Segmentation dict whose `lines` list holds the line to
              process as its first element.

    Returns:
        The record for the line, passed through `bidi_record` when
        `self.bidi_reordering` is truthy.
    """
    try:
        box, coords = next(extract_polygons(self.im, line))
    except KrakenInputException as e:
        logger.warning(f'Extracting line failed: {e}')
        return ocr_record('', [], [], line['lines'][0])

    script = coords['script']

    # an image with a zero-sized side cannot be fed to the network
    if 0 in box.size:
        logger.warning(
            'bbox {} with zero dimension. Emitting empty record.'.format(coords))
        return ocr_record('', [], [], coords)

    # convert the line image into the network's input tensor
    try:
        tensor = self.ts[script](box)
    except Exception:
        return ocr_record('', [], [], coords)

    # a constant-valued input carries no content
    if tensor.max() == tensor.min():
        return ocr_record('', [], [], coords)

    preds = self.nets[script].predict(tensor.unsqueeze(0))[0]

    # ratio of network input width to network output width
    net_scale = tensor.shape[2] / self.nets[script].outputs.shape[2]
    # ratio of original line width to unpadded network input width
    in_scale = box.size[0] / (tensor.shape[2] - 2 * self.pad)

    def _scale_val(val, min_val, max_val):
        # map a network output position to the original line and clamp it
        # to [min_val, max_val - 1]
        scaled = ((val * net_scale) - self.pad) * in_scale
        clamped = min(max(scaled, min_val), max_val - 1)
        return int(round(clamped))

    # XXX: fix bounding box calculation ocr_record for multi-codepoint labels.
    pred = ''.join(p[0] for p in preds)
    width = box.size[0]
    pos = [compute_polygon_section(coords['baseline'],
                                   coords['boundary'],
                                   _scale_val(start, 0, width),
                                   _scale_val(end, 0, width))
           for _, start, end, _c in preds]
    conf = [p[3] for p in preds]

    if not self.bidi_reordering:
        logger.debug('Emitting raw record')
        return ocr_record(pred, pos, conf, coords)

    logger.debug('BiDi reordering record.')
    return bidi_record(ocr_record(pred, pos, conf, coords))
def add(self,
        image: Union[str, Image.Image],
        text: str,
        baseline: List[Tuple[int, int]],
        boundary: List[Tuple[int, int]],
        *args,
        **kwargs):
    """
    Adds a line to the dataset.

    In preload mode the line is extracted from the page image and run
    through the head and tail transforms immediately; otherwise only the
    `(image, baseline, boundary)` triple is stored for later loading.

    Args:
        image (path or PIL.Image.Image): The whole page image or a path to
                                         it.
        text (str): Transcription of the line.
        baseline (list): A list of coordinates [[x0, y0], ..., [xn, yn]].
        boundary (list): A polygon mask for the line.

    Raises:
        KrakenInputException: If the text is empty after the text
                              transforms, if baseline or boundary are
                              missing, or (preload mode) if patch extraction
                              raises IndexError or the image transforms
                              raise ValueError.
    """
    for func in self.text_transforms:
        text = func(text)
    if not text:
        raise KrakenInputException('Text line is empty after transformations')
    if not baseline:
        raise KrakenInputException('No baseline given for line')
    if not boundary:
        raise KrakenInputException('No boundary given for line')
    if self.preload:
        # Accept both an already opened image and a path; previously `im`
        # was left unbound when an Image object was passed in.
        im = image if isinstance(image, Image.Image) else Image.open(image)
        try:
            im, _ = next(extract_polygons(im, {'type': 'baselines',
                                               'lines': [{'baseline': baseline,
                                                          'boundary': boundary}]}))
        except IndexError as exc:
            raise KrakenInputException('Patch extraction failed for baseline') from exc
        try:
            im = self.head_transforms(im)
            # remember the mode of the last non-bitonal line seen
            if not is_bitonal(im):
                self.im_mode = im.mode
            im = self.tail_transforms(im)
        except ValueError as exc:
            raise KrakenInputException(f'Image transforms failed on {image}') from exc
        self._images.append(im)
    else:
        self._images.append((image, baseline, boundary))
    self._gt.append(text)
    self.alphabet.update(text)
def parse(self,
          image: Union[str, Image.Image],
          text: str,
          baseline: List[Tuple[int, int]],
          boundary: List[Tuple[int, int]],
          *args,
          **kwargs):
    """
    Parses a sample for the dataset and returns it.

    This function is mainly used for parallelized loading of training data.

    Args:
        image (path or PIL.Image.Image): The whole page image or a path to
                                         it.
        text (str): Transcription of the line.
        baseline (list): A list of coordinates [[x0, y0], ..., [xn, yn]].
        boundary (list): A polygon mask for the line.

    Returns:
        A dict with the keys `text`, `image`, `baseline`, `boundary`,
        `preload` and `preparse`; in preload mode `image` is the transformed
        line and an additional `im_mode` entry is present.

    Raises:
        KrakenInputException: If the text is empty after the text
                              transforms, if baseline or boundary are
                              missing, or (preload mode) if patch extraction
                              raises IndexError or the image transforms
                              raise ValueError.
    """
    for func in self.text_transforms:
        text = func(text)
    if not text:
        raise KrakenInputException('Text line is empty after transformations')
    if not baseline:
        raise KrakenInputException('No baseline given for line')
    if not boundary:
        raise KrakenInputException('No boundary given for line')
    if not self.preload:
        return {'text': text,
                'image': image,
                'baseline': baseline,
                'boundary': boundary,
                'preload': False,
                'preparse': True}

    # Accept both an already opened image and a path; previously `im` was
    # left unbound when an Image object was passed in.
    im = image if isinstance(image, Image.Image) else Image.open(image)
    try:
        im, _ = next(extract_polygons(im, {'type': 'baselines',
                                           'lines': [{'baseline': baseline,
                                                      'boundary': boundary}]}))
    except IndexError as exc:
        raise KrakenInputException('Patch extraction failed for baseline') from exc
    try:
        im = self.head_transforms(im)
        im = self.tail_transforms(im)
    except ValueError as exc:
        raise KrakenInputException(f'Image transforms failed on {image}') from exc
    # Besides being returned, the transformed line is also stored on the
    # dataset.
    self._images.append(im)
    # NOTE(review): `im.mode` is read from the output of tail_transforms; if
    # that is a tensor rather than a PIL image this is not the image mode --
    # confirm against the transform pipeline.
    return {'text': text,
            'image': im,
            'baseline': baseline,
            'boundary': boundary,
            'im_mode': im.mode,
            'preload': True,
            'preparse': True}
def _recognize_box_line(self, line):
    """
    Recognizes a bounding-box line made up of one or more script runs.

    `line['boxes'][0]` is iterated as `(script, box)` pairs. Every run is
    extracted with `extract_polygons`, recognized with the model registered
    for its script and its text, cuts and confidences are appended to a
    single record covering the whole line. Runs are skipped when their
    script is in `self.script_ignore`, when the extracted image has a zero
    dimension, when the input transform raises, or when the transformed
    run is constant-valued.

    Args:
        line: Dict with a `text_direction` entry and a `boxes` list whose
              first element holds the `(script, box)` runs of the line.

    Returns:
        The accumulated record, passed through `bidi_record` when
        `self.bidi_reordering` is truthy.
    """
    # box[1] is a flat coordinate list: even indices x, odd indices y
    flat_box = [point for box in line['boxes'][0] for point in box[1]]
    xmin, xmax = min(flat_box[::2]), max(flat_box[::2])
    ymin, ymax = min(flat_box[1::2]), max(flat_box[1::2])
    rec = ocr_record('', [], [], [[xmin, ymin], [xmin, ymax], [xmax, ymax], [xmax, ymin]])
    for script, (box, coords) in zip(map(lambda x: x[0], line['boxes'][0]),
                                     extract_polygons(self.im, {'text_direction': line['text_direction'],
                                                                'boxes': map(lambda x: x[1], line['boxes'][0])})):
        # skip if script is set to ignore
        if self.script_ignore is not None and script in self.script_ignore:
            logger.info('Ignoring {} line segment.'.format(script))
            continue
        # check if boxes are non-zero in any dimension
        if 0 in box.size:
            logger.warning('bbox {} with zero dimension. Emitting empty record.'.format(coords))
            continue
        # try conversion into tensor
        try:
            logger.debug('Preparing run.')
            tensor = self.ts[script](box)
        except Exception:
            logger.warning('Conversion of line {} failed. Skipping.'.format(coords))
            continue
        # check if line is non-zero
        if tensor.max() == tensor.min():
            logger.warning('Empty run. Skipping.')
            continue

        logger.debug('Forward pass with model {}'.format(script))
        preds = self.nets[script].predict(tensor.unsqueeze(0))[0]

        # calculate recognized LSTM locations of characters
        logger.debug('Convert to absolute coordinates')
        # scale between network output and network input
        net_scale = tensor.shape[2] / self.nets[script].outputs.shape[2]
        # scale between network input and original line
        in_scale = box.size[0] / (tensor.shape[2] - 2 * self.pad)

        def _scale_val(val, min_val, max_val):
            # map a network output position to the original run and clamp
            # it to [min_val, max_val]
            return int(round(min(max(((val * net_scale) - self.pad) * in_scale, min_val), max_val)))

        pred = ''.join(x[0] for x in preds)
        pos = []
        conf = []

        for _, start, end, c in preds:
            if self.bounds['text_direction'].startswith('horizontal'):
                xmin = coords[0] + _scale_val(start, 0, box.size[0])
                xmax = coords[0] + _scale_val(end, 0, box.size[0])
                pos.append([[xmin, coords[1]], [xmin, coords[3]], [xmax, coords[3]], [xmax, coords[1]]])
            else:
                ymin = coords[1] + _scale_val(start, 0, box.size[1])
                # the lower edge is derived from `end`; using `start` here
                # collapsed every vertical cut to zero height
                ymax = coords[1] + _scale_val(end, 0, box.size[1])
                pos.append([[coords[0], ymin], [coords[2], ymin], [coords[2], ymax], [coords[0], ymax]])
            conf.append(c)
        rec.prediction += pred
        rec.cuts.extend(pos)
        rec.confidences.extend(conf)
    if self.bidi_reordering:
        logger.debug('BiDi reordering record.')
        return bidi_record(rec)
    else:
        logger.debug('Emitting raw record')
        return rec
def _recognize_box_line(self, line):
    """
    Recognizes a bounding-box line made up of one or more tagged runs.

    `line['boxes'][0]` is iterated as `(tag, box)` pairs. Every run is
    extracted with `extract_polygons`, recognized with the model resolved
    for its tag and its text, cuts and confidences are appended to a single
    record covering the whole line. Runs are skipped when their tag is in
    `self.tags_ignore`, when the extracted image has a zero dimension, when
    the input transform raises, or when the transformed run is
    constant-valued.

    Side effects: sets `self.box`, `self.net_scale` and `self.in_scale`,
    which `self._scale_val` is expected to read.

    Args:
        line: Dict with a `text_direction` entry and a `boxes` list whose
              first element holds the `(tag, box)` runs of the line.

    Returns:
        The accumulated record, passed through `bidi_record` when
        `self.bidi_reordering` is truthy.
    """
    # box[1] is a flat coordinate list: even indices x, odd indices y
    flat_box = [point for box in line['boxes'][0] for point in box[1]]
    xmin, xmax = min(flat_box[::2]), max(flat_box[::2])
    ymin, ymax = min(flat_box[1::2]), max(flat_box[1::2])
    rec = ocr_record('', [], [], [[xmin, ymin], [xmin, ymax], [xmax, ymax], [xmax, ymin]])
    for tag, (box, coords) in zip(map(lambda x: x[0], line['boxes'][0]),
                                  extract_polygons(self.im, {'text_direction': line['text_direction'],
                                                             'boxes': map(lambda x: x[1], line['boxes'][0])})):
        self.box = box
        # skip if tag is set to ignore
        if self.tags_ignore is not None and tag in self.tags_ignore:
            logger.warning(f'Ignoring {tag} line segment.')
            continue
        # check if boxes are non-zero in any dimension
        if 0 in box.size:
            logger.warning(f'bbox {coords} with zero dimension. Emitting empty record.')
            continue
        # Resolve the model first and use the tag it was resolved to for the
        # input transform as well, so that transform and network always
        # belong together (the raw tag need not have its own transform).
        model_tag, net = self._resolve_tags_to_model({'type': tag}, self.nets)
        # try conversion into tensor
        try:
            logger.debug('Preparing run.')
            tensor = self.ts[model_tag](box)
        except Exception:
            logger.warning(f'Conversion of line {coords} failed. Skipping.')
            continue
        # check if line is non-zero
        if tensor.max() == tensor.min():
            logger.warning('Empty run. Skipping.')
            continue

        logger.debug(f'Forward pass with model {tag}.')
        preds = net.predict(tensor.unsqueeze(0))[0]

        # calculate recognized LSTM locations of characters
        logger.debug('Convert to absolute coordinates')
        # scale between network output and network input
        self.net_scale = tensor.shape[2] / net.outputs.shape[2]
        # scale between network input and original line
        self.in_scale = box.size[0] / (tensor.shape[2] - 2 * self.pad)

        pred = ''.join(x[0] for x in preds)
        pos = []
        conf = []

        for _, start, end, c in preds:
            if self.bounds['text_direction'].startswith('horizontal'):
                xmin = coords[0] + self._scale_val(start, 0, self.box.size[0])
                xmax = coords[0] + self._scale_val(end, 0, self.box.size[0])
                pos.append([[xmin, coords[1]], [xmin, coords[3]], [xmax, coords[3]], [xmax, coords[1]]])
            else:
                ymin = coords[1] + self._scale_val(start, 0, self.box.size[1])
                ymax = coords[1] + self._scale_val(end, 0, self.box.size[1])
                pos.append([[coords[0], ymin], [coords[2], ymin], [coords[2], ymax], [coords[0], ymax]])
            conf.append(c)
        rec.prediction += pred
        rec.cuts.extend(pos)
        rec.confidences.extend(conf)
    if self.bidi_reordering:
        logger.debug('BiDi reordering record.')
        # only explicit 'L'/'R' values are forwarded as base direction
        return bidi_record(rec, base_dir=self.bidi_reordering if self.bidi_reordering in ('L', 'R') else None)
    else:
        logger.debug('Emitting raw record')
        return rec