def test_hocr_serialization(self):
    """
    Test hOCR serialization
    """
    fp = StringIO()
    # serialize the fixture records as hOCR, then reparse to confirm
    # the output is well-formed XML
    serialized = serialization.serialize(self.records,
                                         image_name='foo.png',
                                         template='hocr')
    fp.write(serialized)
    doc = etree.fromstring(fp.getvalue())
def test_alto_serialization_validation(self):
    """
    Validates output against ALTO schema
    """
    fp = StringIO()
    fp.write(serialization.serialize(self.records,
                                     image_name='foo.png',
                                     template='alto'))
    # lxml needs bytes when the document carries an XML declaration
    doc = etree.fromstring(fp.getvalue().encode('utf-8'))
    with open(os.path.join(resources, 'alto-4-0.xsd')) as schema_fp:
        alto_schema = etree.XMLSchema(etree.parse(schema_fp))
        alto_schema.assertValid(doc)
def test_box_pagexml_serialization_validation(self):
    """
    Validates output against PageXML schema
    """
    # fix: docstring previously claimed abbyyXML although the template and
    # validator are PageXML
    fp = StringIO()
    fp.write(serialization.serialize(self.box_records,
                                     image_name='foo.png',
                                     template='pagexml'))
    validate_page(self, fp)
def test_box_alto_serialization_validation(self):
    """
    Validates output against ALTO schema
    """
    fp = StringIO()
    serialized = serialization.serialize(self.box_records,
                                         image_name='foo.png',
                                         template='alto')
    fp.write(serialized)
    validate_alto(self, fp)
def test_box_hocr_serialization(self):
    """
    Test hOCR serialization
    """
    fp = StringIO()
    serialized = serialization.serialize(self.box_records,
                                         image_name='foo.png',
                                         template='hocr')
    fp.write(serialized)
    validate_hocr(self, fp)
def test_abbyyxml_serialization_validation(self):
    """
    Validates output against abbyyXML schema
    """
    fp = StringIO()
    fp.write(serialization.serialize(self.records,
                                     image_name='foo.png',
                                     template='abbyyxml'))
    # parse as bytes so lxml accepts a possible encoding declaration
    doc = etree.fromstring(fp.getvalue().encode('utf-8'))
    schema_path = os.path.join(resources, 'FineReader10-schema-v1.xml')
    with open(schema_path) as schema_fp:
        abbyy_schema = etree.XMLSchema(etree.parse(schema_fp))
        abbyy_schema.assertValid(doc)
def test_hocr_serialization(self):
    """
    Test hOCR serialization
    """
    fp = StringIO()
    fp.write(serialization.serialize(self.records,
                                     image_name='foo.png',
                                     template='hocr'))
    # rewind so the validator reads from the beginning of the buffer
    fp.seek(0)
    validation_report = self.validator.validate(fp, parse_strict=True)
    self.assertTrue(validation_report.is_valid())
def test_region_only_pagexml_serialization_validation(self):
    """
    Validates output without baselines (but regions) against PageXML schema
    """
    fp = StringIO()
    # empty record list: only the regions are serialized
    serialized = serialization.serialize([],
                                         image_name='foo.png',
                                         template='pagexml',
                                         regions=self.bl_regions)
    fp.write(serialized)
    validate_page(self, fp)
def test_box_vertical_hocr_serialization(self):
    """
    Test vertical line hOCR serialization
    """
    fp = StringIO()
    serialized = serialization.serialize(self.box_records,
                                         image_name='foo.png',
                                         writing_mode='vertical-lr',
                                         template='hocr')
    fp.write(serialized)
    validate_hocr(self, fp)
def test_hocr_serialization(self):
    """
    Test hOCR serialization
    """
    fp = StringIO()
    # round-trip through etree to ensure the hOCR output is parseable
    fp.write(serialization.serialize(self.records,
                                     image_name='foo.png',
                                     template='hocr'))
    doc = etree.fromstring(fp.getvalue())
def test_alto_serialization_validation(self):
    """
    Validates output against ALTO schema
    """
    fp = StringIO()
    fp.write(serialization.serialize(self.records,
                                     image_name='foo.png',
                                     template='alto'))
    # fix: removed leftover debug `print(fp.getvalue()[:2000])` that spammed
    # the test output
    doc = etree.fromstring(fp.getvalue())
    with open(os.path.join(resources, 'alto-3-1.xsd')) as schema_fp:
        alto_schema = etree.XMLSchema(etree.parse(schema_fp))
        alto_schema.assertValid(doc)
def recognizer(model, pad, bidi_reordering, base_image, input, output, lines):
    """
    Runs text recognition over pre-segmented lines of ``base_image`` and
    serializes the predictions into ``output``.

    Args:
        model: mapping of script identifiers to recognition models (the
               ``'default'`` key is used for mono-script recognition)
        pad: horizontal padding applied around each line
        bidi_reordering: whether to reorder predictions per the Unicode
                         BiDi algorithm
        base_image: path of the image to recognize
        input: fallback segmentation source when ``lines`` is not given
        output: destination path for the serialized results
        lines: path of a JSON segmentation file
    """
    try:
        im = Image.open(base_image)
    except IOError as e:
        raise click.BadParameter(str(e))
    ctx = click.get_current_context()
    scripts = None
    # fix: st_time must exist before the verbose echoes below; previously it
    # was assigned only after the first `time.time() - st_time` use, raising
    # NameError in verbose mode
    st_time = time.time()
    if not lines:
        lines = input
    with open_file(lines, 'r') as fp:
        bounds = json.load(fp)
    # script detection: collect the script tags attached to each box and
    # dispatch to the multi-model predictor
    if bounds['script_detection']:
        scripts = set()
        for l in bounds['boxes']:
            for t in l:
                scripts.add(t[0])
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] Executing multi-script recognition'.format(time.time() - st_time))
        it = rpred.mm_rpred(model, im, bounds, pad,
                            bidi_reordering=bidi_reordering)
    else:
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] Executing mono-script recognition'.format(time.time() - st_time))
        it = rpred.rpred(model['default'], im, bounds, pad,
                         bidi_reordering=bidi_reordering)
    preds = []
    for pred in it:
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] {}'.format(time.time() - st_time, pred.prediction))
        else:
            spin('Processing')
        preds.append(pred)
    if ctx.meta['verbose'] > 0:
        click.echo(u'Execution time: {}s'.format(time.time() - st_time))
    else:
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)
    ctx = click.get_current_context()
    with open_file(output, 'w', encoding='utf-8') as fp:
        click.echo('Writing recognition results for {}\t'.format(base_image), nl=False)
        if ctx.meta['mode'] != 'text':
            fp.write(serialization.serialize(preds, base_image,
                                             Image.open(base_image).size,
                                             ctx.meta['text_direction'],
                                             scripts,
                                             ctx.meta['mode']))
        else:
            fp.write(u'\n'.join(s.prediction for s in preds))
    if not ctx.meta['verbose']:
        click.secho(u'\u2713', fg='green')
def test_bl_region_alto_serialization_validation(self):
    """
    Validates output against ALTO schema
    """
    fp = StringIO()
    serialized = serialization.serialize(self.bl_records,
                                         image_name='foo.png',
                                         template='alto',
                                         regions=self.bl_regions)
    fp.write(serialized)
    validate_alto(self, fp)
    # parse the serialized document back and compare against the records
    roundtrip(self, self.bl_records, fp)
def test_bl_region_pagexml_serialization_validation(self):
    """
    Validates output against PageXML schema
    """
    fp = StringIO()
    serialized = serialization.serialize(self.bl_records,
                                         image_name='foo.png',
                                         template='pagexml',
                                         regions=self.bl_regions)
    fp.write(serialized)
    validate_page(self, fp)
    # parse the serialized document back and compare against the records
    roundtrip(self, self.bl_records, fp)
def test_bl_abbyyxml_serialization_validation(self):
    """
    Validates output against abbyyXML schema
    """
    fp = StringIO()
    fp.write(serialization.serialize(self.bl_records,
                                     image_name='foo.png',
                                     template='abbyyxml'))
    doc = etree.fromstring(fp.getvalue().encode('utf-8'))
    # `resources` is a pathlib.Path here, hence the `/` join
    with open(resources / 'FineReader10-schema-v1.xml') as schema_fp:
        abbyy_schema = etree.XMLSchema(etree.parse(schema_fp))
        abbyy_schema.assertValid(doc)
def test_pagexml_serialization_validation(self):
    """
    Validates output against PageXML schema
    """
    # fix: docstring previously claimed abbyyXML and the schema variable was
    # misleadingly named `abbyy_schema` although pagecontent.xsd is the
    # PageXML schema
    fp = StringIO()
    fp.write(serialization.serialize(self.records,
                                     image_name='foo.png',
                                     template='pagexml'))
    doc = etree.fromstring(fp.getvalue().encode('utf-8'))
    with open(os.path.join(resources, 'pagecontent.xsd')) as schema_fp:
        page_schema = etree.XMLSchema(etree.parse(schema_fp))
        page_schema.assertValid(doc)
def binarizer(threshold, zoom, escale, border, perc, range, low, high, input, output) -> None:
    """
    Binarizes an input image with kraken's nlbin algorithm and writes the
    result (or a serialized placeholder document) to `output`.

    Must be the first step of the processing pipeline; raises
    click.UsageError otherwise.  Pipeline state is communicated through
    click's context `meta` dict (`first_process`, `last_process`,
    `input_format_type`, `output_mode`, `raise_failed`, `base_image`).
    """
    from kraken import binarization
    ctx = click.get_current_context()
    if ctx.meta['first_process']:
        # non-image inputs (ALTO/PageXML) carry an image reference; resolve
        # it to the actual image path before opening
        if ctx.meta['input_format_type'] != 'image':
            input = get_input_parser(ctx.meta['input_format_type'])(input)['image']
        ctx.meta['first_process'] = False
    else:
        raise click.UsageError('Binarization has to be the initial process.')
    try:
        im = Image.open(input)
    except IOError as e:
        raise click.BadParameter(str(e))
    message('Binarizing\t', nl=False)
    try:
        res = binarization.nlbin(im, threshold, zoom, escale, border, perc, range, low, high)
        if ctx.meta['last_process'] and ctx.meta['output_mode'] != 'native':
            # final pipeline step with a markup output mode: save the image
            # next to the output and serialize an (empty) document that
            # references it
            with click.open_file(output, 'w', encoding='utf-8') as fp:
                fp = cast(IO[Any], fp)
                logger.info('Serializing as {} into {}'.format(ctx.meta['output_mode'], output))
                res.save(f'{output}.png')
                from kraken import serialization
                fp.write(serialization.serialize([], image_name=f'{output}.png', image_size=res.size, template=ctx.meta['output_mode']))
        else:
            form = None
            ext = os.path.splitext(output)[1]
            # jpeg cannot store 1bpp images, so force png in that case (and
            # when there is no extension at all)
            if ext in ['.jpg', '.jpeg', '.JPG', '.JPEG', '']:
                form = 'png'
                if ext:
                    logger.warning('jpeg does not support 1bpp images. Forcing to png.')
            res.save(output, format=form)
            # downstream steps read the binarized image from here
            ctx.meta['base_image'] = output
    except Exception:
        if ctx.meta['raise_failed']:
            raise
        message('\u2717', fg='red')
        ctx.exit(1)
    message('\u2713', fg='green')
def cli(format_type, model, output, files):
    """
    A script producing overlays of lines and regions from either ALTO or
    PageXML files or run a model to do the same.
    """
    if len(files) == 0:
        # no input documents: print help and bail out
        ctx = click.get_current_context()
        click.echo(ctx.get_help())
        ctx.exit()
    from PIL import Image, ImageDraw
    from kraken.lib import models, xml
    from kraken import align, serialization
    # pick the parser matching the requested input format
    if format_type == 'xml':
        fn = xml.parse_xml
    elif format_type == 'alto':
        # NOTE(review): `parse_palto` looks like a typo for `parse_alto` —
        # confirm against kraken.lib.xml before relying on the alto path
        fn = xml.parse_palto
    else:
        fn = xml.parse_page
    click.echo(f'Loading model {model}')
    net = models.load_any(model)
    for doc in files:
        click.echo(f'Processing {doc} ', nl=False)
        data = fn(doc)
        im = Image.open(data['image']).convert('RGBA')
        records = align.forced_align(data, net)
        if output == 'overlay':
            # draw each character cut as a colored polygon on a transparent
            # layer and composite it over the page image
            tmp = Image.new('RGBA', im.size, (0, 0, 0, 0))
            draw = ImageDraw.Draw(tmp)
            for record in records:
                for pol in record.cuts:
                    # `cmap` is a color iterator defined at module level
                    c = next(cmap)
                    draw.polygon([tuple(x) for x in pol], fill=c, outline=c[:3])
            base_image = Image.alpha_composite(im, tmp)
            base_image.save(f'high_{os.path.basename(doc)}_algn.png')
        else:
            # any other output mode is treated as a serialization template
            with open(f'{os.path.basename(doc)}_algn.xml', 'w') as fp:
                fp.write(serialization.serialize(records, image_name=data['image'], regions=data['regions'], template=output))
        click.secho('\u2713', fg='green')
def recognizer(model, pad, base_image, input, output, lines):
    """
    Runs recognition with a single model over box-segmented lines read from
    a CSV file and writes the serialized predictions to `output`.

    Output format and verbosity are taken from the click context `meta`
    dict (`mode`, `verbose`).
    """
    try:
        im = Image.open(base_image)
    except IOError as e:
        raise click.BadParameter(str(e))
    ctx = click.get_current_context()
    if not lines:
        # fall back to the primary input when no explicit segmentation
        # file was given
        lines = input
    with open_file(lines, 'r') as fp:
        # each CSV row is a line bounding box: x1, y1, x2, y2
        bounds = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in csv.reader(fp)]
    it = rpred.rpred(model, im, bounds, pad)
    preds = []
    st_time = time.time()
    for pred in it:
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] {}'.format(time.time() - st_time, pred.prediction))
        else:
            spin('Processing')
        preds.append(pred)
    if ctx.meta['verbose'] > 0:
        click.echo(u'Execution time: {}s'.format(time.time() - st_time))
    else:
        # finish the spinner: checkmark, re-show the cursor
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)
    ctx = click.get_current_context()
    with open_file(output, 'w', encoding='utf-8') as fp:
        click.echo('Writing recognition results for {}\t'.format(base_image), nl=False)
        if ctx.meta['mode'] != 'text':
            fp.write(
                serialization.serialize(preds, base_image, Image.open(base_image).size, ctx.meta['mode']))
        else:
            # plain text mode: one prediction per line
            fp.write(u'\n'.join(s.prediction for s in preds))
    if not ctx.meta['verbose']:
        click.secho(u'\u2713', fg='green')
def recognizer(input_image, model, pad, no_segmentation, bidi_reordering,
               script_ignore, mode, text_direction, segments) -> None:
    """
    Runs text recognition over pre-computed segments of ``input_image`` and
    prints the predictions.

    Args:
        input_image: PIL image to recognize
        model: mapping of script identifiers to recognition models (the
               ``'default'`` key is used for mono-script recognition)
        pad: horizontal padding applied around each line
        no_segmentation: unused here (segmentation is supplied via
                         ``segments``)
        bidi_reordering: whether to reorder predictions per the Unicode
                         BiDi algorithm
        script_ignore: scripts to skip during multi-script recognition
        mode: unused in the current code path
        text_direction: unused in the current code path
        segments: segmentation dict with ``script_detection`` and ``boxes``
    """
    bounds = segments
    # fix: `scripts` was referenced without ever being initialised, raising
    # NameError whenever script detection was enabled
    scripts = set()
    # Script detection.
    if bounds['script_detection']:
        for l in bounds['boxes']:
            for t in l:
                scripts.add(t[0])
        it = rpred.mm_rpred(model, input_image, bounds, pad,
                            bidi_reordering=bidi_reordering,
                            script_ignore=script_ignore)
    else:
        it = rpred.rpred(model['default'], input_image, bounds, pad,
                         bidi_reordering=bidi_reordering)
    preds = []
    with log.progressbar(it, label='Processing', length=len(bounds['boxes'])) as bar:
        for pred in bar:
            preds.append(pred)
    # fix: removed an unreachable `if False:` serialization branch that
    # referenced undefined names (`output`, `base_image`)
    print('Recognition results = {}.'.format('\n'.join(s.prediction for s in preds)))
def recognizer(model, pad, base_image, input, output, lines):
    """
    Runs recognition with a single model over box-segmented lines read from
    a CSV file and writes the serialized predictions to `output`.

    Near-duplicate of the other CSV-based recognizer in this file; output
    format and verbosity come from the click context `meta` dict.
    """
    try:
        im = Image.open(base_image)
    except IOError as e:
        raise click.BadParameter(str(e))
    ctx = click.get_current_context()
    if not lines:
        # fall back to the primary input when no explicit segmentation
        # file was given
        lines = input
    with open_file(lines, 'r') as fp:
        # each CSV row is a line bounding box: x1, y1, x2, y2
        bounds = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in csv.reader(fp)]
    it = rpred.rpred(model, im, bounds, pad)
    preds = []
    st_time = time.time()
    for pred in it:
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] {}'.format(time.time() - st_time, pred.prediction))
        else:
            spin('Processing')
        preds.append(pred)
    if ctx.meta['verbose'] > 0:
        click.echo(u'Execution time: {}s'.format(time.time() - st_time))
    else:
        # finish the spinner: checkmark, re-show the cursor
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)
    ctx = click.get_current_context()
    with open_file(output, 'w', encoding='utf-8') as fp:
        click.echo('Writing recognition results for {}\t'.format(base_image), nl=False)
        if ctx.meta['mode'] != 'text':
            fp.write(serialization.serialize(preds, base_image, Image.open(base_image).size, ctx.meta['mode']))
        else:
            # plain text mode: one prediction per line
            fp.write(u'\n'.join(s.prediction for s in preds))
    if not ctx.meta['verbose']:
        click.secho(u'\u2713', fg='green')
def recognizer(model, pad, no_segmentation, bidi_reordering, script_ignore, input, output) -> None:
    """
    Pipeline recognition step: resolves the segmentation (from a parsed
    document, a JSON file produced by `segment`, or a whole-page fallback
    in no_segmentation mode), runs recognition, and serializes the
    predictions into `output`.

    Pipeline state (`base_image`, `first_process`, `input_format_type`,
    `output_mode`, `text_direction`, `orig_file`) travels in the click
    context `meta` dict.
    """
    import json
    from kraken import rpred
    ctx = click.get_current_context()
    bounds = None
    if 'base_image' not in ctx.meta:
        ctx.meta['base_image'] = input
    if ctx.meta['first_process']:
        # ALTO/PageXML input already contains both the image reference and
        # the segmentation
        # NOTE(review): `first_process` is not reset to False here, unlike
        # the binarizer step — confirm this is intentional
        if ctx.meta['input_format_type'] != 'image':
            doc = get_input_parser(ctx.meta['input_format_type'])(input)
            ctx.meta['base_image'] = doc['image']
            doc['text_direction'] = 'horizontal-lr'
            bounds = doc
    try:
        im = Image.open(ctx.meta['base_image'])
    except IOError as e:
        raise click.BadParameter(str(e))
    if not bounds and ctx.meta['base_image'] != input:
        # a previous pipeline step wrote a JSON segmentation into `input`
        with open_file(input, 'r') as fp:
            try:
                fp = cast(IO[Any], fp)
                bounds = json.load(fp)
            except ValueError as e:
                raise click.UsageError(f'{input} invalid segmentation: {str(e)}')
    elif not bounds:
        if no_segmentation:
            # treat the whole page as a single line
            bounds = {'script_detection': False, 'text_direction': 'horizontal-lr', 'boxes': [(0, 0) + im.size]}
        else:
            raise click.UsageError('No line segmentation given. Add one with the input or run `segment` first.')
    elif no_segmentation:
        logger.warning('no_segmentation mode enabled but segmentation defined. Ignoring --no-segmentation option.')
    # NOTE(review): `scripts` is never populated before serialization here —
    # unlike older revisions that collected script tags from the boxes
    scripts = set()
    # script detection
    if 'script_detection' in bounds and bounds['script_detection']:
        it = rpred.mm_rpred(model, im, bounds, pad, bidi_reordering=bidi_reordering, script_ignore=script_ignore)
    else:
        it = rpred.rpred(model['default'], im, bounds, pad, bidi_reordering=bidi_reordering)
    preds = []
    with log.progressbar(it, label='Processing') as bar:
        for pred in bar:
            preds.append(pred)
    ctx = click.get_current_context()
    with open_file(output, 'w', encoding='utf-8') as fp:
        fp = cast(IO[Any], fp)
        message(f'Writing recognition results for {ctx.meta["orig_file"]}\t', nl=False)
        logger.info('Serializing as {} into {}'.format(ctx.meta['output_mode'], output))
        if ctx.meta['output_mode'] != 'native':
            from kraken import serialization
            fp.write(serialization.serialize(preds, ctx.meta['base_image'], Image.open(ctx.meta['base_image']).size, ctx.meta['text_direction'], scripts, bounds['regions'] if 'regions' in bounds else None, ctx.meta['output_mode']))
        else:
            # native mode: plain text, one prediction per line
            fp.write('\n'.join(s.prediction for s in preds))
        message('\u2713', fg='green')
def segmenter(legacy, model, text_direction, scale, maxcolseps, black_colseps, remove_hlines, pad, mask, device, input, output) -> None:
    """
    Pipeline segmentation step: segments `input` either with the legacy
    box segmenter (pageseg) or the trainable baseline segmenter (blla) and
    writes the result to `output` — serialized markup when this is the last
    step and a markup output mode is selected, raw JSON otherwise.
    """
    import json
    from kraken import pageseg
    from kraken import blla
    ctx = click.get_current_context()
    if ctx.meta['first_process']:
        # non-image inputs carry an image reference; resolve it first
        if ctx.meta['input_format_type'] != 'image':
            input = get_input_parser(ctx.meta['input_format_type'])(input)['image']
        ctx.meta['first_process'] = False
    if 'base_image' not in ctx.meta:
        ctx.meta['base_image'] = input
    try:
        im = Image.open(input)
    except IOError as e:
        raise click.BadParameter(str(e))
    if mask:
        try:
            mask = Image.open(mask)
        except IOError as e:
            raise click.BadParameter(str(e))
    message('Segmenting\t', nl=False)
    try:
        if legacy:
            res = pageseg.segment(im, text_direction, scale, maxcolseps, black_colseps, no_hlines=remove_hlines, pad=pad, mask=mask)
        else:
            res = blla.segment(im, text_direction, mask=mask, model=model, device=device)
    except Exception:
        message('\u2717', fg='red')
        raise
    if ctx.meta['last_process'] and ctx.meta['output_mode'] != 'native':
        with open_file(output, 'w', encoding='utf-8') as fp:
            fp = cast(IO[Any], fp)
            logger.info('Serializing as {} into {}'.format(ctx.meta['output_mode'], output))
            from kraken import serialization
            from kraken.rpred import ocr_record
            if 'type' in res and res['type'] == 'baselines':
                # baseline output: wrap each baseline in an empty record
                # NOTE(review): cuts/confidences are passed as '' here but as
                # [] in the box branch below — confirm which ocr_record
                # signature is intended
                records = [ocr_record('', '', '', bl) for bl in res['lines']]
            else:
                # box output: convert each box to a rectangular polygon record
                records = []
                for line in res['boxes']:
                    xmin, xmax = min(line[::2]), max(line[::2])
                    ymin, ymax = min(line[1::2]), max(line[1::2])
                    records.append(ocr_record('', [], [], [[xmin, ymin], [xmin, ymax], [xmax, ymax], [xmax, ymin]]))
            fp.write(serialization.serialize(records, image_name=ctx.meta['base_image'], image_size=im.size, regions=res['regions'] if 'regions' in res else None, template=ctx.meta['output_mode']))
    else:
        # intermediate step (or native mode): hand the raw segmentation on
        # as JSON
        with open_file(output, 'w') as fp:
            fp = cast(IO[Any], fp)
            json.dump(res, fp)
    message('\u2713', fg='green')
def recognizer(model, pad, no_segmentation, bidi_reordering, script_ignore,
               base_image, input, output, lines) -> None:
    """
    Runs text recognition over a JSON line segmentation of ``base_image``
    and serializes the predictions into ``output``.

    The segmentation is taken from ``lines``, falling back to ``input``
    (segmenter output) and, in ``no_segmentation`` mode, to a synthetic
    whole-page segmentation written to a temporary file.  Output format and
    text direction come from the click context ``meta`` dict.
    """
    import json
    import tempfile
    from kraken import rpred
    try:
        im = Image.open(base_image)
    except IOError as e:
        raise click.BadParameter(str(e))
    ctx = click.get_current_context()
    # input may either be output from the segmenter then it is a JSON file or
    # be an image file when running the OCR subcommand alone. might still come
    # from some other subcommand though.
    scripts = set()
    if not lines and base_image != input:
        lines = input
    # fix: track the temporary segmentation path explicitly. The original
    # checked `not lines and no_segmentation` after rebinding `lines` to the
    # file name (always truthy) and called `.name` on a str, so the temp file
    # was never removed.
    tmp_seg_path = None
    if not lines:
        if no_segmentation:
            tmp_seg = tempfile.NamedTemporaryFile(mode='w', delete=False)
            logger.info(
                'Running in no_segmentation mode. Creating temporary segmentation {}.'
                .format(tmp_seg.name))
            json.dump(
                {
                    'script_detection': False,
                    'text_direction': 'horizontal-lr',
                    'boxes': [(0, 0) + im.size]
                }, tmp_seg)
            tmp_seg.close()
            lines = tmp_seg_path = tmp_seg.name
        else:
            raise click.UsageError(
                'No line segmentation given. Add one with `-l` or run `segment` first.'
            )
    elif no_segmentation:
        logger.warning(
            'no_segmentation mode enabled but segmentation defined. Ignoring --no-segmentation option.'
        )
    with open_file(lines, 'r') as fp:
        try:
            fp = cast(IO[Any], fp)
            bounds = json.load(fp)
        except ValueError as e:
            raise click.UsageError('{} invalid segmentation: {}'.format(
                lines, str(e)))
    # script detection: collect script tags per box for serialization
    if bounds['script_detection']:
        for l in bounds['boxes']:
            for t in l:
                scripts.add(t[0])
        it = rpred.mm_rpred(model, im, bounds, pad,
                            bidi_reordering=bidi_reordering,
                            script_ignore=script_ignore)
    else:
        it = rpred.rpred(model['default'], im, bounds, pad,
                         bidi_reordering=bidi_reordering)
    if tmp_seg_path:
        logger.debug('Removing temporary segmentation file.')
        os.unlink(tmp_seg_path)
    preds = []
    with log.progressbar(it, label='Processing', length=len(bounds['boxes'])) as bar:
        for pred in bar:
            preds.append(pred)
    ctx = click.get_current_context()
    with open_file(output, 'w', encoding='utf-8') as fp:
        fp = cast(IO[Any], fp)
        message('Writing recognition results for {}\t'.format(base_image), nl=False)
        logger.info('Serializing as {} into {}'.format(ctx.meta['mode'], output))
        if ctx.meta['mode'] != 'text':
            from kraken import serialization
            fp.write(serialization.serialize(preds, base_image,
                                             Image.open(base_image).size,
                                             ctx.meta['text_direction'],
                                             scripts, ctx.meta['mode']))
        else:
            fp.write('\n'.join(s.prediction for s in preds))
        message('\u2713', fg='green')
def recognizer(model, pad, no_segmentation, bidi_reordering, script_ignore,
               base_image, input, output, lines) -> None:
    """
    Runs text recognition over a JSON line segmentation of ``base_image``
    and serializes the predictions into ``output``.

    The segmentation is taken from ``lines``, falling back to ``input``
    (segmenter output) and, in ``no_segmentation`` mode, to a synthetic
    whole-page segmentation written to a temporary file.  Output format and
    text direction come from the click context ``meta`` dict.
    """
    import json
    import tempfile
    from kraken import rpred
    try:
        im = Image.open(base_image)
    except IOError as e:
        raise click.BadParameter(str(e))
    ctx = click.get_current_context()
    # input may either be output from the segmenter then it is a JSON file or
    # be an image file when running the OCR subcommand alone. might still come
    # from some other subcommand though.
    scripts = set()
    if not lines and base_image != input:
        lines = input
    # fix: track the temporary segmentation path explicitly. The original
    # checked `not lines and no_segmentation` after rebinding `lines` to the
    # file name (always truthy) and called `.name` on a str, so the temp file
    # was never removed.
    tmp_seg_path = None
    if not lines:
        if no_segmentation:
            tmp_seg = tempfile.NamedTemporaryFile(mode='w', delete=False)
            logger.info('Running in no_segmentation mode. Creating temporary segmentation {}.'.format(tmp_seg.name))
            json.dump({'script_detection': False,
                       'text_direction': 'horizontal-lr',
                       'boxes': [(0, 0) + im.size]}, tmp_seg)
            tmp_seg.close()
            lines = tmp_seg_path = tmp_seg.name
        else:
            raise click.UsageError('No line segmentation given. Add one with `-l` or run `segment` first.')
    elif no_segmentation:
        # fix: this string literal was broken across a line boundary in the
        # original, which would not even parse
        logger.warning('no_segmentation mode enabled but segmentation defined. Ignoring --no-segmentation option.')
    with open_file(lines, 'r') as fp:
        try:
            fp = cast(IO[Any], fp)
            bounds = json.load(fp)
        except ValueError as e:
            raise click.UsageError('{} invalid segmentation: {}'.format(lines, str(e)))
    # script detection: collect script tags per box for serialization
    if bounds['script_detection']:
        for l in bounds['boxes']:
            for t in l:
                scripts.add(t[0])
        it = rpred.mm_rpred(model, im, bounds, pad,
                            bidi_reordering=bidi_reordering,
                            script_ignore=script_ignore)
    else:
        it = rpred.rpred(model['default'], im, bounds, pad,
                         bidi_reordering=bidi_reordering)
    if tmp_seg_path:
        logger.debug('Removing temporary segmentation file.')
        os.unlink(tmp_seg_path)
    preds = []
    with log.progressbar(it, label='Processing', length=len(bounds['boxes'])) as bar:
        for pred in bar:
            preds.append(pred)
    ctx = click.get_current_context()
    with open_file(output, 'w', encoding='utf-8') as fp:
        fp = cast(IO[Any], fp)
        message('Writing recognition results for {}\t'.format(base_image), nl=False)
        logger.info('Serializing as {} into {}'.format(ctx.meta['mode'], output))
        if ctx.meta['mode'] != 'text':
            from kraken import serialization
            fp.write(serialization.serialize(preds, base_image,
                                             Image.open(base_image).size,
                                             ctx.meta['text_direction'],
                                             scripts, ctx.meta['mode']))
        else:
            fp.write('\n'.join(s.prediction for s in preds))
        message('\u2713', fg='green')
def recognizer(model, pad, bidi_reordering, script_ignore, base_image, input, output, lines):
    """
    Runs text recognition over a JSON line segmentation of `base_image` and
    serializes the predictions into `output`.

    Legacy (Python-2 aware) revision: dispatches to multi-script
    recognition when the segmentation carries script detection data, and
    reads output mode/text direction from the click context `meta` dict.
    """
    try:
        im = Image.open(base_image)
    except IOError as e:
        raise click.BadParameter(str(e))
    ctx = click.get_current_context()
    scripts = None
    st_time = time.time()
    if not lines:
        # fall back to the primary input when no explicit segmentation
        # file was given
        lines = input
    with open_file(lines, 'r') as fp:
        bounds = json.load(fp)
        # script detection: collect script tags per box and use the
        # multi-model predictor
        if bounds['script_detection']:
            scripts = set()
            for l in bounds['boxes']:
                for t in l:
                    scripts.add(t[0])
            it = rpred.mm_rpred(model, im, bounds, pad, bidi_reordering=bidi_reordering, script_ignore=script_ignore)
        else:
            it = rpred.rpred(model['default'], im, bounds, pad, bidi_reordering=bidi_reordering)
        preds = []
        for pred in it:
            spin('Processing')
            preds.append(pred)
        # finish the spinner: checkmark, re-show the cursor
        message(u'\b\u2713', fg='green', nl=False)
        message('\033[?25h\n', nl=False)
        ctx = click.get_current_context()
        with open_file(output, 'w', encoding='utf-8') as fp:
            message(u'Writing recognition results for {}\t'.format(base_image), nl=False)
            # Python 2 hands file names in as bytes
            if PY2:
                output = output.decode('utf-8')
            logger.info(u'Serializing as {} into {}'.format(ctx.meta['mode'], output))
            if ctx.meta['mode'] != 'text':
                fp.write(serialization.serialize(preds, base_image, Image.open(base_image).size, ctx.meta['text_direction'], scripts, ctx.meta['mode']))
            else:
                # plain text mode: one prediction per line
                fp.write(u'\n'.join(s.prediction for s in preds))
            if not ctx.meta['verbose']:
                message(u'\u2713', fg='green')