def bidi_record(record):
    """
    Reorders a record using the Unicode BiDi algorithm.

    Models trained for RTL or mixed scripts still emit classes in LTR order
    requiring reordering for proper display.

    Args:
        record (kraken.rpred.ocr_record): Record in logical order. Its
            ``prediction`` attribute is the text to analyse, and iterating
            the record must yield one ``(code point, cut, confidence)``
            triple per character of that text.

    Returns:
        kraken.rpred.ocr_record: A new record whose text, cuts and
        confidences are in display order. Code points that the mirroring
        step replaced appear in their mirrored form.
    """
    storage = bd.get_empty_storage()
    base_level = bd.get_base_level(record.prediction)
    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)

    # Attach each (code point, cut, confidence) triple to its character entry
    # *before* reordering so cuts and confidences travel with the character.
    for idx, entry in enumerate(record):
        storage['chars'][idx]['record'] = entry

    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)

    chars = storage['chars']
    # Read the text from ch['ch'], not ch['record'][0]: the mirroring step may
    # have swapped in a mirrored code point, which the attached triple (a copy
    # of the original input) would not reflect.
    prediction = ''.join(ch['ch'] for ch in chars)
    cuts = [ch['record'][1] for ch in chars]
    confidences = [ch['record'][2] for ch in chars]
    return ocr_record(prediction, cuts, confidences)
def bidi_record(record: ocr_record, base_dir=None) -> ocr_record:
    """
    Puts the contents of a record into display order with the Unicode BiDi
    algorithm.

    Recognition models for RTL or mixed-direction scripts still output their
    classes left-to-right, so the result has to be reordered before it can be
    displayed properly.

    Args:
        record (kraken.rpred.ocr_record): Record in logical order. Iterating
            it must yield one ``(code point, cut, confidence)`` triple per
            character of ``record.prediction``.
        base_dir: ``'L'`` or ``'R'`` to force the base direction. Any other
            value (including the default ``None``) lets
            ``bd.get_base_level`` determine it from the text.

    Returns:
        kraken.rpred.ocr_record: New record in display order. ``tags`` is
        copied from the input and ``base_dir`` is set to the *argument* as
        passed in, not to the direction that was resolved.
    """
    storage = bd.get_empty_storage()

    if base_dir in ('L', 'R'):
        base_level = {'L': 0, 'R': 1}[base_dir]
    else:
        base_level = bd.get_base_level(record.prediction)
    directions = ('L', 'R')
    storage['base_level'] = base_level
    storage['base_dir'] = directions[base_level]

    text = record.prediction
    bd.get_embedding_levels(text, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)

    # Tie every input triple to its character entry ahead of the reordering
    # so that cuts and confidences move together with the character.
    logical_chars = storage['chars']
    for position, triple in enumerate(record):
        logical_chars[position]['record'] = triple

    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)

    display_chars = storage['chars']
    # code point may have been mirrored, hence 'ch' and not the stored triple
    prediction = ''.join(entry['ch'] for entry in display_chars)
    cuts = [entry['record'][1] for entry in display_chars]
    confidences = [entry['record'][2] for entry in display_chars]

    # carry over whole line information
    line = ({'boundary': record.line, 'baseline': record.baseline}
            if record.type == 'baselines' else record.line)

    reordered = ocr_record(prediction, cuts, confidences, line)
    reordered.tags = record.tags
    reordered.base_dir = base_dir
    return reordered
def getBiDiInfo(text, *, upper_is_rtl=False, base_dir=None, debug=False):
    """
    Run the BiDi algorithm steps over `text` and expose the working state.

    Keyword-only options:

    `upper_is_rtl` -- when True, upper case chars are treated as strong 'R'
    (a debugging aid; default: False).

    `base_dir` -- 'L' or 'R' to override the calculated base_level; when left
    as None the level is computed from `text`. The value is looked up in
    PARAGRAPH_LEVELS (presumably a 'L'/'R' -> 0/1 mapping; confirm against
    its definition).

    `debug` -- when True, the steps taken with the algorithm are displayed
    (using sys.stderr).

    Returns a `(storage, display)` tuple: the info dict object, in which each
    entry of `storage['chars']` additionally carries an 'index' key holding
    the character's position in `text`, and the display layout string.
    """
    storage = get_empty_storage()

    base_level = (
        get_base_level(text, upper_is_rtl)
        if base_dir is None
        else PARAGRAPH_LEVELS[base_dir]
    )
    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    get_embedding_levels(text, storage, upper_is_rtl, debug)

    # Sanity-check that there is exactly one entry per input character, and
    # record each character's position in `text` before anything is reordered.
    char_infos = storage['chars']
    assert len(text) == len(char_infos)
    for position, (expected, info) in enumerate(zip(text, char_infos)):
        assert expected == info['ch']
        info['index'] = position

    # Remaining steps, applied strictly in this order.
    stages = (
        explicit_embed_and_overrides,
        resolve_weak_types,
        resolve_neutral_types,
        resolve_implicit_levels,
        reorder_resolved_levels,
        reorder_combining_marks,
        apply_mirroring,
    )
    for stage in stages:
        stage(storage, debug)

    display = ''.join(info['ch'] for info in storage['chars'])
    return storage, display