Code example #1
File: _icu.py  Project: zxlzr/bistring
def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr:
    builder = BistrBuilder(bs)
    us = icu.UnicodeString(bs.modified)
    while not builder.is_complete:
        # Skip the longest prefix that is already in normalized form.
        i = normalizer.spanQuickCheckYes(us)
        builder.skip(us.countChar32(0, i))
        if builder.is_complete:
            break
        us = us[i:]

        # Find the end of the next chunk to normalize: advance one code
        # point at a time (two UTF-16 units for a surrogate pair) until the
        # next character starts a new normalization boundary.
        i = 0
        while i < len(us):
            if us.charAt(i) & 0xFC00 == 0xD800:  # lead surrogate
                i += 1
            i += 1
            if i < len(us) and normalizer.hasBoundaryBefore(chr(us.char32At(i))):
                break

        chunk = us[:i]
        normalized = str(normalizer.normalize(chunk))
        builder.replace(chunk.countChar32(), normalized)
        us = us[i:]

    return builder.build()
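For context, this private helper backs bistring's public normalization API; a minimal usage sketch, assuming the bistring package is installed:

from bistring import bistr

s = bistr('café').normalize('NFD')
print(s.modified)       # 'cafe' + combining acute: the decomposed form
print(s[3:5].original)  # 'é' -- the modified-side slice still maps back to the original text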
Code example #2
    def __init__(self):
        # Newer versions of PyICU provide shorthand factories such as
        # Normalizer2.getNFCInstance(); if they are unavailable, use the
        # generic getInstance() with explicit parameters.
        # Requires: from icu import Normalizer2, UNormalizationMode2
        self.normalizer_nfc = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.COMPOSE)
        # Note: NFD is the 'nfc' data with DECOMPOSE mode, and NFKD is the
        # 'nfkc' data with DECOMPOSE mode -- this is the ICU idiom, not a typo.
        self.normalizer_nfd = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.DECOMPOSE)
        self.normalizer_nfkc = Normalizer2.getInstance(None, 'nfkc', UNormalizationMode2.COMPOSE)
        self.normalizer_nfkd = Normalizer2.getInstance(None, 'nfkc', UNormalizationMode2.DECOMPOSE)
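A quick sanity check of what these instances produce (a minimal sketch, assuming PyICU is installed):

from icu import Normalizer2, UNormalizationMode2

nfc = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.COMPOSE)
nfd = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.DECOMPOSE)

print(nfc.normalize('e\u0301'))        # 'é' -- composed to a single code point
print(len(nfd.normalize('\u00e9')))    # 2 -- decomposed to 'e' + U+0301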
Code example #3
    def createComponents(_self, fieldName):
        source = WhitespaceTokenizer()
        return Analyzer.TokenStreamComponents(
            source,
            ICUNormalizer2Filter(
                source,
                Normalizer2.getInstance(
                    None, "nfc", UNormalizationMode2.DECOMPOSE)))
Code example #4
File: _icu.py  Project: yazici/bistring
def _normalize(normalizer: icu.Normalizer2, bs: bistr) -> bistr:
    builder = BistrBuilder(bs)
    current = builder.current

    while not builder.is_complete:
        # Extend the chunk until the next normalization boundary.
        i = builder.position
        j = i + 1
        while j < len(current) and not normalizer.hasBoundaryBefore(current[j]):
            j += 1

        chunk = current[i:j]
        repl = normalizer.normalize(chunk)
        if repl == chunk:
            builder.skip(len(chunk))  # already normalized; keep the 1:1 alignment
        else:
            builder.replace(len(chunk), repl)

    return builder.build()
Code example #5
    def __init__(self, input, normalizer=None):
        super(ICUNormalizer2Filter, self).__init__(input)

        self.input = input
        self.termAtt = self.addAttribute(CharTermAttribute.class_)

        # Default to NFKC with case folding, matching Lucene's ICUNormalizer2Filter.
        if normalizer is None:
            normalizer = Normalizer2.getInstance(None, "nfkc_cf", UNormalizationMode2.COMPOSE)
        self.normalizer = normalizer
Code example #6
File: test_Normalizer.py  Project: ezmiller/pyicu
    def testNormalize(self):

        try:
            from icu import Normalizer2
        except ImportError:
            return

        normalizer = Normalizer2.getInstance(None, "nfkc_cf",
                                             UNormalizationMode2.COMPOSE)

        self.assertTrue(normalizer.normalize("Hi There") == u'hi there')

        a = UnicodeString()
        normalizer.normalize("Hi There", a)
        self.assertTrue(a == UnicodeString(u'hi there'))
Code example #7
File: generate_unicode.py  Project: yazici/bistring
def gen_boundary_regex(normalizer: icu.Normalizer2) -> str:
    # Collect maximal runs of code points that do NOT start a new
    # normalization boundary.
    ranges = []
    for cp in range(0x110000):
        if not normalizer.hasBoundaryBefore(chr(cp)):
            if ranges and cp == ranges[-1].stop:
                ranges[-1] = range(ranges[-1].start, cp + 1)
            else:
                ranges.append(range(cp, cp + 1))

    # Emit a JavaScript regex that matches one character plus any run of
    # boundary-less followers.  `escape` is a helper defined elsewhere in
    # generate_unicode.py.
    chunks = ['/.[']
    for r in ranges:
        chunks.append(escape(r.start))
        if len(r) > 1:
            chunks.append('-')
            chunks.append(escape(r.stop - 1))
    chunks.append(']*/gsu')

    return "".join(chunks)
Code example #8
File: test_Normalizer.py  Project: sciyoshi/pyicu
    def testNormalize(self):
        # assertNorm(normalizer, expected, src) is a helper on the test
        # class: it asserts that normalizer.normalize(src) == expected.
        try:
            from icu import Normalizer2
        except ImportError:
            return

        self.assertNorm(Normalizer2.getInstance(None, "nfkc_cf",
                                                UNormalizationMode2.COMPOSE),
                        u'hi there', "Hi There")
        self.assertNorm(Normalizer2.getNFCInstance(),
                        u"äßáW", u"äßa\u0301W")
        self.assertNorm(Normalizer2.getNFDInstance(),
                        u"a\u0308ßa\u0301W", u"äßa\u0301W")
        self.assertNorm(Normalizer2.getNFKCInstance(),
                        u"äßáW", u"äßa\u0301W")
        self.assertNorm(Normalizer2.getNFKDInstance(),
                        u"a\u0308ßa\u0301W", u"äßa\u0301W")
        self.assertNorm(Normalizer2.getNFKCCasefoldInstance(),
                        u"ässáw", u"äßa\u0301W")
Code example #9
    def tokenStream(_self, fieldName, reader):
        return ICUNormalizer2Filter(
            WhitespaceTokenizer(Version.LUCENE_CURRENT, reader),
            Normalizer2.getInstance(None, "nfc",
                                    UNormalizationMode2.DECOMPOSE))
Code example #10
    def __init__(self, input):
        # ICU folding uses the custom "utr30" normalization data
        # (UTR #30 character foldings) in COMPOSE mode.
        normalizer = Normalizer2.getInstance("utr30", "utr30",
                                             UNormalizationMode2.COMPOSE)
        super(ICUFoldingFilter, self).__init__(input, normalizer)
Code example #11
File: sample.py  Project: iharh/prj
from icu import Normalizer2

composer = Normalizer2.getNFCInstance()
decomposer = Normalizer2.getNFDInstance()

def compDecomp(orig):
    composed = composer.normalize(orig)
    decomposed = decomposer.normalize(orig)
    print(f"{orig} {composed} {decomposed}")

compDecomp('lội')
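The three printed strings usually render identically; counting code points makes the difference visible (a small sketch building on the example above):

def compDecompLengths(orig):
    composed = composer.normalize(orig)
    decomposed = decomposer.normalize(orig)
    # 'lội' is 3 code points in NFC but 5 in NFD (o + dot below + circumflex)
    print(len(str(composed)), len(str(decomposed)))

compDecompLengths('lội')  # prints: 3 5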
Code example #12
File: translit.py  Project: ayum/ayum.translit
def to_latin(string, locale=locale):  # default `locale` comes from module scope (not shown)
    ustring = UnicodeString(string)
    nfc = Normalizer2.getNFCInstance()
    ustring = nfc.normalize(ustring)

    trans = Transliterator.createFromRules(
        "",
        "$wb = [^[:Letter:]] ;"
        # е
        "$wb { е > ye ;"
        "[ыq] { е } $wb > e ;"
        "[уеёыаоэяиюьъiuoeaq] { е > ye ;"
        "е > e ;"
        # э
        "$wb { э > e ;"
        "[жшцйjwcy] { э > e ;"  # output is Latin "e"
        "э > qe ;"
        # ы
        "[жшцйjwcy] { ы > i ;"
        "ы > q ;"
        # ё
        "$wb { ё > yo ;"
        "[жшцйjwcy] { ё > o ;"
        "[уеёыаоэяиюьъiuoeaq] { ё > yo ;"
        "ё > ho ;"
        # ю
        "$wb { ю > yu ;"
        "[жшцйjwcy] { ю > u ;"
        "[уеёыаоэяиюьъiuoeaq] { ю > yu ;"
        "ю > hu ;"
        # я
        "$wb { я > ya ;"
        "[жшцйjwcy] { я > a ;"
        "[уеёыаоэяиюьъiuoeaq] { я > ya ;"
        "я > ha ;"
        # The combination ьо appears only in loanwords
        "ньо > nyo ;"
        "льо > lyo ;"
        "мьо > myo ;"
        "рьо > ryo ;"
        # The remaining letters
        "а > a ;"
        "б > b ;"
        "в > v ;"
        "г > g ;"
        "д > d ;"
        "ж > j ;"
        "з > z ;"
        "и > i ;"
        "й > y ;"
        "к > k ;"
        "л > l ;"
        "м > m ;"
        "н > n ;"
        "о > o ;"
        "п > p ;"
        "р > r ;"
        "с > s ;"
        "т > t ;"
        "у > u ;"
        "ф > f ;"
        "х > x ;"
        "ц > c ;"
        "ч > ch ;"
        "ш > w ;"
        "щ > wh ;"
        # New pass from the beginning
        ":: Any-Null ;"
        "[nlmr] { ь } y[aueioq] > ;"
        "ь > h ;"
        "[nlmr] { ъ } y[aueioq] > y;"
        "ъ > ;"
        # New pass from the beginning
        ":: Any-Null ;"
        "h+ > h ;")
    ustring = trans.transliterate(ustring)
    return ustring
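A usage sketch (hedged: the exact output depends on the rules above, as traced by hand):

# By the time 'е' is reached, the preceding letter has already become
# Latin 'v', so the plain "е > e" rule fires rather than the ye-variants.
print(to_latin('привет'))  # -> privet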