コード例 #1
0
ファイル: _icu.py プロジェクト: zxlzr/bistring
def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr:
    """Normalize ``bs`` with an ICU normalizer, recording the edits in a builder.

    The modified text of ``bs`` is walked as an ``icu.UnicodeString``.
    Stretches that already pass the normalizer's quick check are skipped
    unchanged; each remaining chunk, extended up to the next position the
    normalizer reports a boundary before, is replaced by its normalized form.

    Args:
        bs: The string to normalize; ``bs.modified`` supplies the text.
        normalizer: The ICU ``Normalizer2`` instance to apply.

    Returns:
        Whatever ``BistrBuilder.build()`` produces once the builder reports
        that it is complete.
    """
    builder = BistrBuilder(bs)
    us = icu.UnicodeString(bs.modified)
    while not builder.is_complete:
        # Skip the leading part of the remaining text that needs no changes.
        # ``i`` is an offset into ``us``, while the builder is handed a
        # code point count, hence the countChar32() conversion.
        i = normalizer.spanQuickCheckYes(us)
        builder.skip(us.countChar32(0, i))
        if builder.is_complete:
            break
        us = us[i:]

        # Grow the chunk one code point at a time until the normalizer
        # reports a boundary before the next code point.
        i = 0
        while i < len(us):
            # A UTF-16 lead surrogate (0xD800-0xDBFF) occupies two units
            # together with its trail surrogate, so step over both.
            if us.charAt(i) & 0xFC00 == 0xD800:
                i += 1
            i += 1
            # Stop at the end of the text instead of asking ICU for a code
            # point past the last unit; the chunk ends here either way.
            if i >= len(us) or normalizer.hasBoundaryBefore(chr(us.char32At(i))):
                break

        chunk = us[:i]
        normalized = str(normalizer.normalize(chunk))
        builder.replace(chunk.countChar32(), normalized)
        us = us[i:]

    return builder.build()
コード例 #2
0
ファイル: generate_unicode.py プロジェクト: yazici/bistring
def gen_boundary_regex(normalizer: icu.Normalizer2) -> str:
    """Build a regex literal that matches one normalization chunk.

    Every code point from 0 up to 0x10FFFF is tested with
    ``normalizer.hasBoundaryBefore``.  The code points for which it returns a
    false value are collected into a character class, so the resulting
    pattern matches any single character followed by zero or more characters
    that have no boundary before them.

    Args:
        normalizer: The ICU ``Normalizer2`` whose boundaries are queried.

    Returns:
        The pattern as a string of the form ``/.[...]*/gsu``.
        NOTE(review): the slashes and flags look like a JavaScript regex
        literal -- confirm against the consumer of this string.
    """
    # Inclusive [first, last] runs of consecutive code points that have no
    # normalization boundary before them.
    runs = []
    for cp in range(0x110000):
        if normalizer.hasBoundaryBefore(chr(cp)):
            continue
        if runs and runs[-1][1] + 1 == cp:
            # Directly follows the previous run: extend it.
            runs[-1][1] = cp
        else:
            runs.append([cp, cp])

    # ``escape`` is defined outside this function; presumably it renders a
    # code point in a form that is safe inside a character class.
    pieces = ['/.[']
    for first, last in runs:
        pieces.append(escape(first))
        if last > first:
            pieces.extend(('-', escape(last)))
    pieces.append(']*/gsu')

    return "".join(pieces)
コード例 #3
0
ファイル: _icu.py プロジェクト: yazici/bistring
def _normalize(normalizer: icu.Normalizer2, bs: bistr) -> bistr:
    """Apply an ICU normalizer to ``bs`` chunk by chunk through a builder.

    Starting at the builder's position, each chunk is extended until the
    normalizer reports a boundary before the next element of the text.  A
    chunk that compares equal to its normalized form is skipped; any other
    chunk is replaced by the normalized form.

    Args:
        normalizer: The ICU ``Normalizer2`` instance to apply.
        bs: The string to normalize.

    Returns:
        Whatever ``BistrBuilder.build()`` produces once the builder reports
        that it is complete.
    """
    builder = BistrBuilder(bs)
    text = builder.current

    while not builder.is_complete:
        start = builder.position

        # A chunk always contains at least one element; keep extending it
        # until the normalizer reports a boundary before the next one.
        end = start + 1
        while end < len(text):
            if normalizer.hasBoundaryBefore(text[end]):
                break
            end += 1

        piece = text[start:end]
        width = len(piece)
        normalized = normalizer.normalize(piece)
        unchanged = normalized == piece
        if unchanged:
            builder.skip(width)
        else:
            builder.replace(width, normalized)

    return builder.build()