def serialize_segmentation(segresult: Dict[str, Any], image_name: str = None,
                           image_size: Tuple[int, int] = (0, 0),
                           template: str = 'hocr') -> str:
    """
    Serializes a segmentation result into an output document.

    Args:
        segresult: Result of blla.segment
        image_name (str): Name of the source image
        image_size (tuple): Dimensions of the source image
        template (str): Selector for the serialization format. May be
                        'hocr' or 'alto'.

    Returns:
        (str) rendered template.
    """
    if segresult.get('type') == 'baselines':
        # Baseline segmenter output: each entry already carries its line
        # geometry. Empty list placeholders for prediction/cuts/confidences
        # keep the record types consistent with the box branch below
        # (the original passed '' strings here).
        records = [ocr_record('', [], [], bl) for bl in segresult['lines']]
    else:
        # Legacy box segmenter output: convert each flat coordinate list
        # (x0, y0, x1, y1, ...) into an axis-aligned 4-point polygon.
        records = []
        for line in segresult['boxes']:
            xmin, xmax = min(line[::2]), max(line[::2])
            ymin, ymax = min(line[1::2]), max(line[1::2])
            records.append(ocr_record('', [], [],
                                      [[xmin, ymin], [xmin, ymax],
                                       [xmax, ymax], [xmax, ymin]]))
    return serialize(records,
                     image_name=image_name,
                     image_size=image_size,
                     regions=segresult.get('regions'),
                     template=template)
def setUp(self):
    """Load the box and baseline OCR record fixtures used by the tests."""
    with open(resources / 'records.json', 'r') as box_fp:
        self.box_records = [rpred.ocr_record(**rec) for rec in json.load(box_fp)]
    with open(resources / 'bl_records.json', 'r') as bl_fp:
        data = json.load(bl_fp)
    self.bl_records = [rpred.ocr_record(**line) for line in data['lines']]
    self.bl_regions = data['regions']
def get_bounding_boxes_from_transcription(path):
    """
    Parses a kraken transcription HTML document into per-page line records.

    Args:
        path: Path to the transcription HTML file.

    Returns:
        A list of dicts, one per <section> (page), with keys
        'writing_mode', 'image_size', and 'lines'; each line is a dict
        with 'text' and 'bbox' entries.
    """
    doc = html.parse(path)
    etree.strip_tags(doc, etree.Comment)
    # Writing direction is declared in a <meta itemprop="text_direction">
    # tag; default to horizontal left-to-right when absent.
    td = doc.find(".//meta[@itemprop='text_direction']")
    if td is None:
        td = 'horizontal-lr'
    else:
        td = td.attrib['content']
    records = []
    for section in doc.xpath('//section'):
        # The page image is embedded as a base64 PNG data URI.
        img_data = section.find('.//img').attrib['src']
        img_data = img_data[len('data:image/png;base64,'):]
        im = Image.open(BytesIO(base64.b64decode(img_data)))
        records.append({"writing_mode": td,
                        "lines": [],
                        "image_size": im.size})
        for line in section.iter('li'):
            # Hoist the text join out of the guard; the original computed
            # it three times per line.
            content = u''.join(line.itertext())
            if line.get('contenteditable') and content and not content.isspace():
                left, upper, right, lower = [int(x) for x in
                                             line.get('data-bbox').split(',')]
                # add some margin on the edges
                width = right - left
                height = lower - upper
                left = int(left - width * .025)
                right = int(right + width * .025)
                upper = int(upper - height * .025)
                # NOTE(review): bottom margin (.05) is double the others
                # (.025) -- presumably to catch descenders; confirm.
                lower = int(lower + height * .05)
                raw = content.strip()
                # Drop Unicode control characters (category C*) and map the
                # rest through the project's character normalization.
                text = "".join(translate_char(char) for char in raw
                               if unicodedata.category(char)[0] != "C")
                rec = ocr_record(text,
                                 [left, upper, right, lower],
                                 [1.0] * len(text))
                records[-1]["lines"].append({'text': rec.prediction,
                                             'bbox': rec.cuts})
    return records
def forced_align(doc, model):
    """
    Performs a forced character alignment of text with recognition model
    output activations.

    Args:
        doc (dict): Parsed document. Must provide 'image' (path) and
                    'lines' (each with 'text', 'baseline', 'boundary').
        model (kraken.lib.model.TorchSeqRecognizer): Recognition model to
            use for alignment.

    Returns:
        A list of kraken.rpred.ocr_record.
    """
    im = Image.open(doc['image'])
    predictor = rpred.rpred(model, im, doc)
    records = []
    for line in doc['lines']:
        # Reorder the ground truth into visual order before building the FST.
        bidi_text = get_display(line['text'])
        gt_fst = fst_from_text(bidi_text, model.codec)
        # Advancing the predictor runs recognition on the next line as a
        # side effect, populating model.outputs with the activation lattice.
        next(predictor)
        lat_fst = fst_from_lattice(model.outputs)
        # Compose lattice with ground truth and take the best path: this is
        # the forced alignment of text to activations.
        composed_graph = _compose(lat_fst, gt_fst)
        short_graph = _shortest_path(composed_graph)
        pred = []
        pos = []
        conf = []
        for act, label in _generate_line_record(short_graph,
                                                model.outputs.shape[2] - 1):
            # Map activation-frame extents back onto the line polygon in
            # image coordinates.
            pos.append(
                compute_polygon_section(
                    line['baseline'],
                    line['boundary'],
                    predictor._scale_val(act[0], 0, predictor.box.size[0]),
                    predictor._scale_val(act[1], 0, predictor.box.size[0])))
            # Forced alignment produces no meaningful per-character
            # confidence, so every character gets 1.0.
            conf.append(1.0)
            pred.append(model.codec.decode([(label, 0, 0, 0)])[0][0])
        # Re-apply BiDi reordering so the record is in logical order.
        records.append(
            rpred.bidi_record(rpred.ocr_record(pred, pos, conf, line)))
    return records
def segmenter(legacy, model, text_direction, scale, maxcolseps, black_colseps,
              remove_hlines, pad, mask, device, input, output) -> None:
    """
    CLI subcommand: segments a page image into lines (and regions) and
    writes the result either as a serialized document (hocr/alto/...) or
    as raw JSON.
    """
    import json

    from kraken import pageseg
    from kraken import blla

    ctx = click.get_current_context()
    # On the first stage of a pipeline, resolve non-image inputs (e.g. an
    # XML document) to the image path they reference.
    if ctx.meta['first_process']:
        if ctx.meta['input_format_type'] != 'image':
            input = get_input_parser(
                ctx.meta['input_format_type'])(input)['image']
        ctx.meta['first_process'] = False
    if 'base_image' not in ctx.meta:
        ctx.meta['base_image'] = input

    try:
        im = Image.open(input)
    except IOError as e:
        raise click.BadParameter(str(e))
    if mask:
        try:
            mask = Image.open(mask)
        except IOError as e:
            raise click.BadParameter(str(e))
    message('Segmenting\t', nl=False)
    try:
        if legacy:
            # Projection-profile based legacy segmenter (bounding boxes).
            res = pageseg.segment(im, text_direction, scale, maxcolseps,
                                  black_colseps, no_hlines=remove_hlines,
                                  pad=pad, mask=mask)
        else:
            # Neural baseline segmenter.
            res = blla.segment(im, text_direction, mask=mask, model=model,
                               device=device)
    except Exception:
        message('\u2717', fg='red')
        raise
    if ctx.meta['last_process'] and ctx.meta['output_mode'] != 'native':
        with open_file(output, 'w', encoding='utf-8') as fp:
            fp = cast(IO[Any], fp)
            logger.info('Serializing as {} into {}'.format(
                ctx.meta['output_mode'], output))
            from kraken import serialization
            from kraken.rpred import ocr_record
            if res.get('type') == 'baselines':
                # Empty-list placeholders keep prediction/cuts/confidences
                # consistently typed with the box branch below (the
                # original passed '' strings here).
                records = [ocr_record('', [], [], bl)
                           for bl in res['lines']]
            else:
                # Convert each flat box (x0, y0, x1, y1, ...) to a closed
                # 4-point polygon.
                records = []
                for line in res['boxes']:
                    xmin, xmax = min(line[::2]), max(line[::2])
                    ymin, ymax = min(line[1::2]), max(line[1::2])
                    records.append(
                        ocr_record('', [], [],
                                   [[xmin, ymin], [xmin, ymax],
                                    [xmax, ymax], [xmax, ymin]]))
            fp.write(
                serialization.serialize(
                    records,
                    image_name=ctx.meta['base_image'],
                    image_size=im.size,
                    regions=res.get('regions'),
                    template=ctx.meta['output_mode']))
    else:
        # Intermediate pipeline stage or native mode: dump raw JSON.
        with open_file(output, 'w') as fp:
            fp = cast(IO[Any], fp)
            json.dump(res, fp)
    message('\u2713', fg='green')
def setUp(self):
    """Load the OCR record fixture and create a standard hOCR validator."""
    fixture = os.path.join(resources, 'records.json')
    with open(fixture, 'r') as fp:
        raw = json.load(fp)
    self.records = [rpred.ocr_record(**entry) for entry in raw]
    self.validator = HocrValidator('standard')
def setUp(self):
    """Load the OCR record fixture used by the tests."""
    fixture = os.path.join(resources, 'records.json')
    with open(fixture, 'r') as fp:
        raw = json.load(fp)
    self.records = [rpred.ocr_record(**entry) for entry in raw]