def computeScriptFirst(someString):
    """Return the script of the first character that is not in script "Common".

    Each character is looked up with ``unicodedata2.script_cat`` and the first
    element of the result is taken as its script name.

    Args:
        someString: The text to inspect. Any iterable of single characters
            is accepted, not only an indexable string.

    Returns:
        The first script name that differs from ``"Common"``, or ``None`` when
        the input is empty or every character is reported as ``"Common"``.
    """
    for ch in someString:
        script = unicodedata2.script_cat(ch)[0]
        # "Common" characters do not identify a script, so keep scanning
        # until a character with a specific script is found.
        if script != "Common":
            return script
    return None
def tokenize_real(self, text):
    """Split *text* into tokens made of runs of letter characters.

    Every character is tagged with its ``unicodedata2.script_cat`` result and
    consecutive characters with an identical tag are grouped into one run.
    A run is kept only when the second element of its tag starts with ``"L"``
    and the first element is not in ``self.DISCARD_SCRIPTS``.

    Kept runs whose first tag element is in ``self.JP_SCRIPTS`` are handed to
    ``self.tiny.tokenize`` and all resulting pieces are added; every other
    kept run is added as a single lower-cased token.

    NOTE(review): runs are keyed on the whole ``script_cat`` result, so a
    change in its second element (presumably the general category, e.g.
    upper- vs lower-case letter) also starts a new token -- confirm that
    this is intended.

    Returns:
        list: the tokens in order of appearance (empty for empty input).
    """
    tagged = ((unicodedata2.script_cat(ch), ch) for ch in text)
    result = []
    for tag, run in itertools.groupby(tagged, operator.itemgetter(0)):
        # Skip anything that is not a letter run, or whose script is discarded.
        if tag[1][0] != "L" or tag[0] in self.DISCARD_SCRIPTS:
            continue
        word = "".join(ch for _, ch in run)
        if tag[0] in self.JP_SCRIPTS:
            # These scripts are delegated to the dedicated segmenter.
            result.extend(self.tiny.tokenize(word))
            continue
        result.append(word.lower())
    return result
def tokenize_real(self, text):
    """Tokenize *text* into letter runs, grouped by ``script_cat`` result.

    Characters are paired with ``unicodedata2.script_cat(char)``; adjacent
    characters whose pair-key is equal form one group. Only groups whose key
    has a second element beginning with ``"L"`` and a first element outside
    ``self.DISCARD_SCRIPTS`` produce output:

    * first key element in ``self.JP_SCRIPTS`` -> the joined run is passed to
      ``self.tiny.tokenize`` and every returned item is appended;
    * otherwise -> the joined run is appended lower-cased.

    NOTE(review): because the group key is the complete ``script_cat``
    result, a differing second element (presumably the general category)
    splits a run even within one script -- verify this is the desired
    behaviour.

    Returns:
        list: tokens in input order; ``[]`` when *text* is empty.
    """
    annotated = ((unicodedata2.script_cat(symbol), symbol) for symbol in text)
    by_info = operator.itemgetter(0)
    tokens = []
    for info, members in itertools.groupby(annotated, by_info):
        is_letter_run = info[1][0] == "L"
        if is_letter_run and info[0] not in self.DISCARD_SCRIPTS:
            candidate = "".join(member[1] for member in members)
            if info[0] not in self.JP_SCRIPTS:
                tokens.append(candidate.lower())
            else:
                # Runs in these scripts go through the dedicated segmenter.
                tokens.extend(self.tiny.tokenize(candidate))
    return tokens
def script_category(char):
    """Return the script bucket for a single Unicode character.

    Args:
        char: The character to classify.

    Returns:
        ``'Han'`` when *char* is a colon (hard-coded special case);
        ``'default'`` when the first element of
        ``unicodedata2.script_cat(char)`` is ``'Latin'`` or ``'Common'``;
        otherwise that first element unchanged. The values expected in
        practice are presumably Cyrillic, Greek, Han and Hiragana, but any
        other script name is passed through as-is.
    """
    # The colon is forced to 'Han' whatever its real script is, so answer
    # before doing a lookup whose result would be thrown away.
    if char == u":":
        return "Han"
    script = unicodedata2.script_cat(char)[0]
    if script in ("Latin", "Common"):
        return "default"
    return script