def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr:
    builder = BistrBuilder(bs)
    us = icu.UnicodeString(bs.modified)

    while not builder.is_complete:
        # Skip the longest prefix that is already normalized.
        i = normalizer.spanQuickCheckYes(us)
        builder.skip(us.countChar32(0, i))
        if builder.is_complete:
            break
        us = us[i:]

        # Scan forward to the next normalization boundary, stepping over
        # UTF-16 surrogate pairs (a lead surrogate is in 0xD800-0xDBFF).
        i = 0
        while i < len(us):
            if us.charAt(i) & 0xFC00 == 0xD800:
                i += 1
            i += 1
            if normalizer.hasBoundaryBefore(chr(us.char32At(i))):
                break

        # Normalize the chunk up to that boundary and record the replacement.
        chunk = us[:i]
        normalized = str(normalizer.normalize(chunk))
        builder.replace(chunk.countChar32(), normalized)
        us = us[i:]

    return builder.build()
def __init__(self):
    # Newer PyICU releases expose shorthand factories such as
    # Normalizer2.getNFCInstance(); since this version does not, fall back
    # to getInstance() with explicit parameters.
    # self.normalizer_nfc = Normalizer2.getNFCInstance()
    # Note: NFD and NFKD deliberately reuse the 'nfc'/'nfkc' data files in
    # DECOMPOSE mode; that is how ICU defines them.
    self.normalizer_nfc = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.COMPOSE)
    self.normalizer_nfd = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.DECOMPOSE)
    self.normalizer_nfkc = Normalizer2.getInstance(None, 'nfkc', UNormalizationMode2.COMPOSE)
    self.normalizer_nfkd = Normalizer2.getInstance(None, 'nfkc', UNormalizationMode2.DECOMPOSE)
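# A quick sanity check that the explicit getInstance() calls above match the
# shorthand factories, on PyICU versions that have both (a sketch; assumes a
# current PyICU install):
from icu import Normalizer2, UNormalizationMode2

nfd = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.DECOMPOSE)
assert nfd.normalize('\u00e9') == 'e\u0301'  # NFD splits é into e + combining acute
assert nfd.normalize('\u00e9') == Normalizer2.getNFDInstance().normalize('\u00e9')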
def createComponents(_self, fieldName):
    source = WhitespaceTokenizer()
    return Analyzer.TokenStreamComponents(
        source,
        ICUNormalizer2Filter(
            source,
            Normalizer2.getInstance(None, "nfc", UNormalizationMode2.DECOMPOSE)))
def _normalize(normalizer: icu.Normalizer2, bs: bistr) -> bistr:
    builder = BistrBuilder(bs)
    current = builder.current

    while not builder.is_complete:
        # Take the chunk from the current position up to the next
        # normalization boundary.
        i = builder.position
        j = i + 1
        while j < len(current) and not normalizer.hasBoundaryBefore(current[j]):
            j += 1
        chunk = current[i:j]

        # Only record a replacement when normalization actually changes
        # the chunk, so unchanged text keeps exact alignment.
        repl = normalizer.normalize(chunk)
        if repl == chunk:
            builder.skip(len(chunk))
        else:
            builder.replace(len(chunk), repl)

    return builder.build()
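# How a helper like the two _normalize() variants above is reached in
# practice through bistring's public API (a sketch, assuming the bistring
# package, where bistr.normalize() delegates to an ICU-backed helper):
from bistring import bistr

bs = bistr('café').normalize('NFD')
print(bs.modified)       # 'cafe\u0301' — the decomposed text
print(bs[3:5].original)  # 'é' — slices of the result map back to the original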
def __init__(self, input, normalizer=None):
    super(ICUNormalizer2Filter, self).__init__(input)
    self.input = input
    self.termAtt = self.addAttribute(CharTermAttribute.class_)

    if normalizer is None:
        # Match Lucene's default: NFKC with case folding.
        normalizer = Normalizer2.getInstance(None, "nfkc_cf", UNormalizationMode2.COMPOSE)
    self.normalizer = normalizer
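# For context, the filter's incrementToken() applies the normalizer to each
# token's term text. A minimal sketch of what that looks like (an assumption
# about the surrounding class; this method is not shown in the snippet above):
def incrementToken(self):
    if self.input.incrementToken():
        text = self.termAtt.toString()
        normalized = self.normalizer.normalize(text)
        if normalized != text:
            # Rewrite the term attribute in place with the normalized text.
            self.termAtt.setEmpty()
            self.termAtt.append(normalized)
        return True
    return False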
def testNormalize(self):
    try:
        from icu import Normalizer2
    except ImportError:
        return

    normalizer = Normalizer2.getInstance(None, "nfkc_cf", UNormalizationMode2.COMPOSE)
    # str in, str out
    self.assertTrue(normalizer.normalize("Hi There") == u'hi there')

    # In-place variant: normalize into an existing UnicodeString
    a = UnicodeString()
    normalizer.normalize("Hi There", a)
    self.assertTrue(a == UnicodeString(u'hi there'))
def gen_boundary_regex(normalizer: icu.Normalizer2) -> str:
    # Collect maximal contiguous runs of code points that do NOT have a
    # normalization boundary before them.
    ranges = []
    for cp in range(0x110000):
        if not normalizer.hasBoundaryBefore(chr(cp)):
            if ranges and cp == ranges[-1].stop:
                ranges[-1] = range(ranges[-1].start, cp + 1)
            else:
                ranges.append(range(cp, cp + 1))

    # Emit a JavaScript regex that matches one code point followed by any
    # run of non-boundary code points.
    chunks = ['/.[']
    for r in ranges:
        chunks.append(escape(r.start))
        if len(r) > 1:
            chunks.append('-')
            chunks.append(escape(r.stop - 1))
    chunks.append(']*/gsu')
    return "".join(chunks)
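# The snippet's escape() helper is not shown; below is a plausible stand-in
# that formats a code point for a JavaScript character class (the /u flag in
# the emitted regex permits \u{...} escapes), plus a usage example. Both are
# hypothetical, not part of the original code:
import icu

def escape(cp: int) -> str:
    return f'\\u{cp:04X}' if cp <= 0xFFFF else f'\\u{{{cp:X}}}'

print(gen_boundary_regex(icu.Normalizer2.getNFCInstance())[:60], '...')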
def testNormalize(self):
    try:
        from icu import Normalizer2
    except ImportError:
        return

    self.assertNorm(
        Normalizer2.getInstance(None, "nfkc_cf", UNormalizationMode2.COMPOSE),
        u'hi there', "Hi There")
    self.assertNorm(Normalizer2.getNFCInstance(), u"äßáW", u"äßa\u0301W")
    self.assertNorm(Normalizer2.getNFDInstance(), u"a\u0308ßa\u0301W", u"äßa\u0301W")
    self.assertNorm(Normalizer2.getNFKCInstance(), u"äßáW", u"äßa\u0301W")
    self.assertNorm(Normalizer2.getNFKDInstance(), u"a\u0308ßa\u0301W", u"äßa\u0301W")
    self.assertNorm(Normalizer2.getNFKCCasefoldInstance(), u"ässáw", u"äßa\u0301W")
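# assertNorm is presumably a helper defined on the test class; a minimal
# sketch consistent with the (normalizer, expected, source) call sites above:
def assertNorm(self, normalizer, expected, source):
    self.assertEqual(expected, normalizer.normalize(source))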
def tokenStream(_self, fieldName, reader):
    return ICUNormalizer2Filter(
        WhitespaceTokenizer(Version.LUCENE_CURRENT, reader),
        Normalizer2.getInstance(None, "nfc", UNormalizationMode2.DECOMPOSE))
def __init__(self, input):
    # "utr30" is the custom normalization data used by Lucene's
    # ICUFoldingFilter: UTR #30 character foldings plus NFKC case folding.
    normalizer = Normalizer2.getInstance("utr30", "utr30", UNormalizationMode2.COMPOSE)
    super(ICUFoldingFilter, self).__init__(input, normalizer)
from icu import Normalizer2

composer = Normalizer2.getNFCInstance()
decomposer = Normalizer2.getNFDInstance()

def compDecomp(orig):
    composed = composer.normalize(orig)
    decomposed = decomposer.normalize(orig)
    print(f"{orig} {composed} {decomposed}")

# 'lội' stays one code point per letter under NFC; NFD splits ộ into
# o + U+0323 (dot below) + U+0302 (circumflex), in canonical order.
compDecomp('lội')
def to_latin(string, locale=locale):
    ustring = UnicodeString(string)
    nfc = Normalizer2.getNFCInstance()
    ustring = nfc.normalize(ustring)
    trans = Transliterator.createFromRules(
        "",
        "$wb = [^[:Letter:]] ;"
        # е
        "$wb { е > ye ;"
        "[ыq] { е } $wb > e ;"
        "[уеёыаоэяиюьъiuoeaq] { е > ye ;"
        "е > e ;"
        # э
        "$wb { э > e ;"
        "[жшцйjwcy] { э > е ;"
        "э > qe ;"
        # ы
        "[жшцйjwcy] { ы > i ;"
        "ы > q ;"
        # ё
        "$wb { ё > yo ;"
        "[жшцйjwcy] { ё > o ;"
        "[уеёыаоэяиюьъiuoeaq] { ё > yo ;"
        "ё > ho ;"
        # ю
        "$wb { ю > yu ;"
        "[жшцйjwcy] { ю > u ;"
        "[уеёыаоэяиюьъiuoeaq] { ю > yu ;"
        "ю > hu ;"
        # я
        "$wb { я > ya ;"
        "[жшцйjwcy] { я > a ;"
        "[уеёыаоэяиюьъiuoeaq] { я > ya ;"
        "я > ha ;"
        # The combination ьо occurs only in loanwords
        "ньо > nyo ;"
        "льо > lyo ;"
        "мьо > myo ;"
        "рьо > ryo ;"
        # Remaining letters
        "а > a ;"
        "б > b ;"
        "в > v ;"
        "г > g ;"
        "д > d ;"
        "ж > j ;"
        "з > z ;"
        "и > i ;"
        "й > y ;"
        "к > k ;"
        "л > l ;"
        "м > m ;"
        "н > n ;"
        "о > o ;"
        "п > p ;"
        "р > r ;"
        "с > s ;"
        "т > t ;"
        "у > u ;"
        "ф > f ;"
        "х > x ;"
        "ц > c ;"
        "ч > ch ;"
        "ш > w ;"
        "щ > wh ;"
        # New pass from the start
        ":: Any-Null ;"
        "[nlmr] { ь } y[aueioq] > ;"
        "ь > h ;"
        "[nlmr] { ъ } y[aueioq] > y;"
        "ъ > ;"
        # New pass from the start
        ":: Any-Null ;"
        "h+ > h ;")
    ustring = trans.transliterate(ustring)
    return ustring
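# Example calls (assuming the module-level `locale` referenced in the
# signature is defined; the transliterated output is not verified here and
# depends on the rule set above):
for word in ['щука', 'ёлка', 'объект']:
    print(word, '->', to_latin(word))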