def process_unicode(uni):
    """Yield the tokens of a unicode string, bi-gramming where required.

    The input is NFKC-normalized and split into words with ``rx_U``; every
    word is then split into sub-words with ``pattern``.  A sub-word whose
    first character is not matched by ``rx_all`` is yielded unchanged; any
    other sub-word is handed to ``bigram(sub_word, 0)`` and the items it
    produces are yielded one by one.

    This is a generator: results are produced lazily rather than returned
    as a list.
    """
    text = unicodedata.normalize('NFKC', uni)
    for chunk in rx_U.findall(text):
        for found in pattern.finditer(chunk):
            piece = found.group()
            if rx_all.match(piece[0]):
                # NOTE(review): presumably rx_all flags the scripts that need
                # bi-gramming -- confirm against its definition.
                yield from bigram(piece, 0)
            else:
                yield piece
def process_unicode(uni):
    """Yield the tokens of a unicode string, bi-gramming where required.

    Steps:
      1. Normalize ``uni`` to NFKC.
      2. Split the normalized text into words with ``rx_U``.
      3. Split each word into sub-words with ``pattern``.
      4. Yield a sub-word as-is when ``rx_all`` does not match its first
         character; otherwise yield every item of ``bigram(sub_word, 0)``.

    This is a generator: results are produced lazily rather than returned
    as a list.
    """
    text = unicodedata.normalize('NFKC', uni)
    for token in rx_U.findall(text):
        for hit in pattern.finditer(token):
            part = hit.group()
            if not rx_all.match(part[0]):
                yield part
                continue
            # Explicit loop (instead of ``yield from``) is kept on purpose so
            # this variant stays usable where delegation syntax is missing.
            for gram in bigram(part, 0):
                yield gram
def process_unicode_glob(uni):
    """Yield the tokens of a unicode string, bi-gramming with glob support.

    The input is NFKC-normalized and split into words with ``rxGlob_U``;
    every word is split into sub-words with ``pattern_g``, discarding the
    glob characters themselves.  For each remaining sub-word:

    * if ``rx_all`` does not match its first character, it is yielded
      unchanged;
    * if it is a single character, it is yielded with ``u"*"`` appended;
    * otherwise every item of ``bigram(sub_word, limit)`` is yielded, where
      ``limit`` is 1 for the last sub-word of the word and 0 for the others.

    This is a generator: results are produced lazily rather than returned
    as a list.
    """
    text = unicodedata.normalize('NFKC', uni)
    for chunk in rxGlob_U.findall(text):
        pieces = []
        for found in pattern_g.finditer(chunk):
            candidate = found.group()
            # Substring test on purpose: rejects "*", "?", "*?" and "".
            if candidate not in u"*?":
                pieces.append(candidate)

        last_index = len(pieces) - 1
        for index, piece in enumerate(pieces):
            if not rx_all.match(piece[0]):
                yield piece
                continue
            if len(piece) == 1:
                # A lone character cannot form a bigram; emit it as a
                # wildcard-suffixed term instead.
                yield piece + u"*"
                continue
            # NOTE(review): the meaning of ``limit`` is defined by bigram();
            # only the final sub-word of a word is given 1 -- confirm intent.
            limit = 1 if index == last_index else 0
            for gram in bigram(piece, limit):
                yield gram