def computeScriptFirst(someString):
    """Return the script of the first character that is not "Common".

    Each character of ``someString`` is looked up with
    ``unicodedata2.script_cat`` and the first element of the result is
    treated as the character's script name.

    Returns:
        The first script name that differs from "Common", or ``None`` when
        ``someString`` is empty or every character is in the "Common" script.
    """
    # Iterate over the characters directly; the index was never needed.
    for char in someString:
        script = unicodedata2.script_cat(char)[0]
        if script != "Common":
            return script

    return None
Beispiel #2
0
 def tokenize_real(self, text):
     """Split ``text`` into tokens made of consecutive same-class characters.

     Every character is tagged with the result of
     ``unicodedata2.script_cat``; adjacent characters with an equal tag
     form one run. A run is kept only when the second element of its tag
     starts with 'L' and the first element is not in
     ``self.DISCARD_SCRIPTS``. Kept runs whose first tag element is in
     ``self.JP_SCRIPTS`` are handed to ``self.tiny.tokenize`` and its
     results are added to the output; all other kept runs are added as a
     single lower-cased token.

     Returns a list of tokens in the order they occur in ``text``.
     """
     result = []
     tagged = ((unicodedata2.script_cat(ch), ch) for ch in text)
     for info, run in itertools.groupby(tagged, key=operator.itemgetter(0)):
         # Skip runs that are not letters or belong to a discarded script.
         if info[1][0] != 'L' or info[0] in self.DISCARD_SCRIPTS:
             continue
         word = ''.join(pair[1] for pair in run)
         if info[0] not in self.JP_SCRIPTS:
             result.append(word.lower())
         else:
             # Runs in these scripts are split further by the sub-tokenizer.
             result.extend(self.tiny.tokenize(word))
     return result
Beispiel #3
0
 def tokenize_real(self, text):
     """Tokenize ``text`` by grouping adjacent characters with the same tag.

     The tag of a character is whatever ``unicodedata2.script_cat``
     returns for it (indexed here as ``tag[0]`` and ``tag[1]``). Only
     groups whose ``tag[1]`` begins with "L" and whose ``tag[0]`` is not
     listed in ``self.DISCARD_SCRIPTS`` produce tokens:

     * if ``tag[0]`` is in ``self.JP_SCRIPTS`` the joined group is passed
       to ``self.tiny.tokenize`` and every item it yields is appended;
     * otherwise the joined group is appended in lower case.

     Returns the collected tokens as a list.
     """
     pairs = ((unicodedata2.script_cat(ch), ch) for ch in text)
     collected = []
     for tag, members in itertools.groupby(pairs, operator.itemgetter(0)):
         is_letter = tag[1][0] == "L"
         # Non-letter groups and discarded scripts contribute nothing.
         if not is_letter or tag[0] in self.DISCARD_SCRIPTS:
             continue

         word = "".join(ch for _, ch in members)
         if tag[0] in self.JP_SCRIPTS:
             collected.extend(self.tiny.tokenize(word))
         else:
             collected.append(word.lower())
     return collected
def script_category(char):
    """
    Return the script category of a single Unicode character.

    Rules, in order:
        * the colon character ':' is always reported as 'Han'
        * characters whose script is 'Latin' or 'Common' are reported
          as 'default'
        * for every other character the script name given by
          ``unicodedata2.script_cat`` is returned unchanged
          (e.g. 'Cyrillic', 'Greek', 'Han', 'Hiragana')
    """
    # Special case first: its result does not depend on the script lookup.
    # NOTE(review): why ':' maps to 'Han' is not visible here - confirm.
    if char == u':':
        return 'Han'

    script = unicodedata2.script_cat(char)[0]
    if script in ('Latin', 'Common'):
        return 'default'
    return script