Ejemplo n.º 1
0
def bidi_record(record):
    """
    Reorders a record using the Unicode BiDi algorithm.

    Models trained for RTL or mixed scripts still emit classes in LTR order
    requiring reordering for proper display.

    The code points of the returned prediction are read back from the BiDi
    storage after the mirroring step, so any code point replaced by that
    step is kept. Cuts and confidences are carried over unchanged from the
    input record, in display order.

    Args:
        record (kraken.rpred.ocr_record): Record to reorder. Iterating it is
            expected to yield ``(code point, cut, confidence)`` triples.

    Returns:
        kraken.rpred.ocr_record: A new record in display order.
    """
    storage = bd.get_empty_storage()
    base_level = bd.get_base_level(record.prediction)
    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)
    # Tag every storage entry with its source triple so that cuts and
    # confidences travel with the code point through the reordering.
    for i, j in enumerate(record):
        storage['chars'][i]['record'] = j
    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)

    chars = storage['chars']
    # The code point may have been mirrored: take it from the storage entry
    # rather than from the original triple, which still holds the old value.
    prediction = u''.join(ch['ch'] for ch in chars)
    cuts = [ch['record'][1] for ch in chars]
    confidences = [ch['record'][2] for ch in chars]
    return ocr_record(prediction, cuts, confidences)
Ejemplo n.º 2
0
def getBiDiInfo(text, *, upper_is_rtl=False, base_dir=None, debug=False):
    """
    Run the BiDi resolution and reordering steps on `text` and return the
    working storage.

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).

    Set `base_dir` to 'L' or 'R' to override the calculated base_level.

    Set `debug` to True to display (using sys.stderr) the steps taken with the
    algorithm.

    Returns the storage dict. Each entry of its "chars" list additionally
    carries an "index" key holding the entry's position at the time the
    embedding levels were computed.
    """
    info = get_empty_storage()

    level = (
        PARAGRAPH_LEVELS[base_dir]
        if base_dir is not None
        else get_base_level(text, upper_is_rtl)
    )
    info['base_level'] = level
    info['base_dir'] = ('L', 'R')[level]

    get_embedding_levels(text, info, upper_is_rtl, debug)

    # Sanity-check that the storage mirrors the input one-to-one, and record
    # each entry's position before the later steps get to run.
    entries = info["chars"]
    assert len(text) == len(entries)
    for position, (code_point, entry) in enumerate(zip(text, entries)):
        assert code_point == entry["ch"]
        entry["index"] = position

    remaining_steps = (
        explicit_embed_and_overrides,
        resolve_weak_types,
        resolve_neutral_types,
        resolve_implicit_levels,
        reorder_resolved_levels,
    )
    for step in remaining_steps:
        step(info, debug)

    return info
Ejemplo n.º 3
0
def bidi_record(record):
    """
    Reorders a record using the Unicode BiDi algorithm.

    Models trained for RTL or mixed scripts still emit classes in LTR order
    requiring reordering for proper display.

    The code points of the returned prediction are read back from the BiDi
    storage after the mirroring step, so any code point replaced by that
    step is kept. Cuts and confidences are carried over unchanged from the
    input record, in display order.

    Args:
        record (kraken.rpred.ocr_record): Record to reorder. Iterating it is
            expected to yield ``(code point, cut, confidence)`` triples.

    Returns:
        kraken.rpred.ocr_record: A new record in display order.
    """
    storage = bd.get_empty_storage()
    base_level = bd.get_base_level(record.prediction)
    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)
    # Tag every storage entry with its source triple so that cuts and
    # confidences travel with the code point through the reordering.
    for i, j in enumerate(record):
        storage['chars'][i]['record'] = j
    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)

    chars = storage['chars']
    # The code point may have been mirrored: take it from the storage entry
    # rather than from the original triple, which still holds the old value.
    prediction = u''.join(ch['ch'] for ch in chars)
    cuts = [ch['record'][1] for ch in chars]
    confidences = [ch['record'][2] for ch in chars]
    return ocr_record(prediction, cuts, confidences)
Ejemplo n.º 4
0
def get_display_mod(unicode_or_str,
                    encoding='utf-8',
                    upper_is_rtl=False,
                    base_dir=None,
                    debug=False):
    """Return the display layout of a unicode or encoded string.

    Accepts unicode or string. A non-unicode input is decoded with
    `encoding` (default: "utf-8") before processing, and the result is
    encoded back with the same `encoding`.
    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).
    Set `base_dir` to 'L' or 'R' to override the calculated base_level.
    Set `debug` to True to display (using sys.stderr) the steps taken with the
    algorithm.
    Returns the display layout, either as unicode or `encoding` encoded
    string, matching the kind of input that was given.
    """
    storage = bidi.get_empty_storage()

    # The algorithm works on unicode text, so anything else is decoded first
    # and the result is re-encoded on the way out.
    needs_encoding = not isinstance(unicode_or_str, six.text_type)
    if needs_encoding:
        text = unicode_or_str.decode(encoding)
    else:
        text = unicode_or_str

    level = (
        bidi.PARAGRAPH_LEVELS[base_dir]
        if base_dir is not None
        else bidi.get_base_level(text, upper_is_rtl)
    )
    storage['base_level'] = level
    storage['base_dir'] = ('L', 'R')[level]

    bidi.get_embedding_levels(text, storage, upper_is_rtl, debug)
    stages = (
        bidi.explicit_embed_and_overrides,
        bidi.resolve_weak_types,
        bidi.resolve_neutral_types,
        bidi.resolve_implicit_levels,
        bidi.reorder_resolved_levels,
    )
    for stage in stages:
        stage(storage, debug)

    # NOTE: compared with the code this was adapted from, the
    # bidi.apply_mirroring step is not run, and the display string comes from
    # print_storage_chars instead of joining the 'ch' entries of the storage.
    display = print_storage_chars(storage)

    return display.encode(encoding) if needs_encoding else display
Ejemplo n.º 5
0
def bidi_record(record: ocr_record, base_dir=None) -> ocr_record:
    """
    Produce a display-order copy of a record via the Unicode BiDi algorithm.

    Models trained for RTL or mixed scripts still emit classes in LTR order,
    so their output has to be reordered before it can be displayed properly.

    Args:
        record (kraken.rpred.ocr_record): Record to reorder.
        base_dir: 'L' or 'R' to force the base direction. Any other value
            (including the default None) lets the base level be computed
            from the record's prediction.

    Returns:
        kraken.rpred.ocr_record: New record carrying the reordered
        prediction, cuts and confidences, the line information and tags of
        the input record, and `base_dir` exactly as it was passed in.
    """
    storage = bd.get_empty_storage()

    if base_dir in ('L', 'R'):
        base_level = {'L': 0, 'R': 1}[base_dir]
    else:
        base_level = bd.get_base_level(record.prediction)

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)
    # Attach each element of the record to its storage entry so the cut and
    # confidence follow the code point through the reordering.
    for position, element in enumerate(record):
        storage['chars'][position]['record'] = element
    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)

    reordered = storage['chars']
    # The code point is read from the storage entry because it may have been
    # mirrored; cut and confidence come from the attached element.
    prediction = ''.join(entry['ch'] for entry in reordered)
    cuts = [entry['record'][1] for entry in reordered]
    confidences = [entry['record'][2] for entry in reordered]

    # Carry over the whole-line information in the shape matching the
    # record type.
    line = (
        {'boundary': record.line, 'baseline': record.baseline}
        if record.type == 'baselines'
        else record.line
    )

    reordered_record = ocr_record(prediction, cuts, confidences, line)
    reordered_record.tags = record.tags
    reordered_record.base_dir = base_dir
    return reordered_record
Ejemplo n.º 6
0
    def test_surrogate(self):
        """Check storage contents and display output for text containing a
        code point outside the BMP (a surrogate pair on narrow builds)."""
        text = u'HELLO \U0001d7f612'

        storage = get_empty_storage()
        get_embedding_levels(text, storage, upper_is_rtl=True)
        chars = storage['chars']

        # One entry per code point is expected: 9, not 10, even in
        # --with-unicode=ucs2
        self.assertEqual(len(chars), 9)

        # The entry at index 6 should be the non-BMP code point, typed 'EN'
        astral = chars[6]
        self.assertEqual(astral['ch'], u'\U0001d7f6')
        self.assertEqual(astral['type'], 'EN')

        self.assertEqual(
            get_display(text, upper_is_rtl=True),
            u'\U0001d7f612 OLLEH',
        )