def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr:
    """Normalize ``bs.modified`` with ``normalizer``, recording edits in a builder.

    The text is processed as an ``icu.UnicodeString``.  Each pass first skips
    the prefix reported by ``spanQuickCheckYes`` and then normalizes one chunk
    that extends up to the next position where ``hasBoundaryBefore`` is true
    (or to the end of the text).

    Args:
        bs: The bistr whose ``modified`` text is normalized.
        normalizer: The ICU normalizer to apply.

    Returns:
        The result of ``BistrBuilder.build()`` after all skips/replacements.
    """
    builder = BistrBuilder(bs)
    us = icu.UnicodeString(bs.modified)

    while not builder.is_complete:
        # Pass over the prefix the quick check accepts.  ``i`` indexes the
        # UnicodeString, while the builder is advanced by the code point count
        # of that prefix.
        i = normalizer.spanQuickCheckYes(us)
        builder.skip(us.countChar32(0, i))
        if builder.is_complete:
            break
        us = us[i:]

        # Find the end of the chunk that needs normalizing: step over one code
        # point at a time until the next one has a boundary before it.
        i = 0
        length = len(us)
        while i < length:
            if us.charAt(i) & 0xFC00 == 0xD800:
                # Lead surrogate: the code point occupies two UTF-16 units.
                i += 1
            i += 1
            if i >= length:
                # End of text ends the chunk; don't query ICU for the code
                # point at an out-of-range index.
                break
            if normalizer.hasBoundaryBefore(chr(us.char32At(i))):
                break

        chunk = us[:i]
        normalized = str(normalizer.normalize(chunk))
        builder.replace(chunk.countChar32(), normalized)
        us = us[i:]

    return builder.build()
def gen_boundary_regex(normalizer: icu.Normalizer2) -> str:
    """Build a ``/.[...]*/gsu`` regex string from the normalizer's boundaries.

    Every code point in ``range(0x110000)`` for which
    ``normalizer.hasBoundaryBefore`` is false is placed in the character class.
    Consecutive code points are merged into ``first-last`` ranges, and each
    endpoint is formatted by the ``escape`` helper.

    Args:
        normalizer: The ICU normalizer whose boundary property is queried.

    Returns:
        The assembled regex string.
    """
    # Inclusive [first, last] runs of code points that have no boundary before.
    spans = []
    for code_point in range(0x110000):
        if normalizer.hasBoundaryBefore(chr(code_point)):
            continue
        if spans and spans[-1][1] + 1 == code_point:
            # Directly follows the previous run: extend it.
            spans[-1][1] = code_point
        else:
            spans.append([code_point, code_point])

    pieces = ['/.[']
    for first, last in spans:
        pieces.append(escape(first))
        if last > first:
            pieces.append('-')
            pieces.append(escape(last))
    pieces.append(']*/gsu')
    return "".join(pieces)
def _normalize(normalizer: icu.Normalizer2, bs: bistr) -> bistr:
    """Normalize ``bs`` chunk by chunk, recording the edits in a builder.

    Starting at the builder's position, each chunk covers at least one element
    of ``builder.current`` and grows until the next element has a normalization
    boundary before it (or the text ends).  Chunks that normalize to themselves
    are skipped; the others are replaced by their normalized form.

    Args:
        normalizer: The ICU normalizer to apply.
        bs: The bistr to normalize.

    Returns:
        The result of ``BistrBuilder.build()`` once the builder is complete.
    """
    builder = BistrBuilder(bs)
    text = builder.current

    while not builder.is_complete:
        start = builder.position

        # Extend the chunk until the next boundary or the end of the text.
        end = start + 1
        while end < len(text):
            if normalizer.hasBoundaryBefore(text[end]):
                break
            end += 1

        segment = text[start:end]
        normalized = normalizer.normalize(segment)
        consumed = len(segment)

        if normalized == segment:
            builder.skip(consumed)
        else:
            builder.replace(consumed, normalized)

    return builder.build()