def test_ignore_case_replace_longest(self):
    """Check ``replace_longest`` on a trie built with ``ignore_case=True``.

    The keys and the input text mix U+0130 (capital I with dot above) with
    "i" followed by U+0307 (combining dot above), so the text spells the
    keys using a different case form than the one that was inserted.

    The replacement is asserted twice: once without separators and once
    with the space character as the only separator.
    """
    if sys.version_info.major < 3:
        # Report the test as skipped rather than returning silently, so a
        # Python 2 run does not count it as a pass.
        # NOTE(review): presumably skipped because case mapping of these
        # characters differs on Python 2 -- confirm.
        self.skipTest("ignore_case replace_longest is only tested on Python 3")

    trie = Trie(ignore_case=True)
    # Map every inserted key to the value returned by insert(); the
    # replacement callback receives that value as its first argument.
    ids = {w: trie.insert(w) for w in [u"aİİ", u"aai̇", u"aai̇bİ"]}
    replaced = {
        ids[u"aİİ"]: u"a",
        ids[u"aai̇"]: u"b",
        ids[u"aai̇bİ"]: u"c",
    }

    # No separators: both "aaİ" and "aai̇bİ" are expected to be replaced.
    res = trie.replace_longest(u"aaİ aai̇bİaa", lambda x, start, end: replaced[x])
    self.assertEqual(res, u"b caa")

    sep = {ord(" ")}  # space as separator
    # With separators, "aai̇bİ" is followed by "aa" rather than a space and
    # is expected to stay untouched.
    res = trie.replace_longest(u"aaİ aai̇bİaa", lambda x, start, end: replaced[x], sep)
    self.assertEqual(res, u"b aai̇bİaa")
def test_pickle_trie(self):
    """Check that a trie still replaces correctly after a pickle round trip.

    A trie built with ``ignore_case=True`` is dumped to a file, loaded
    back, and the loaded object must give the same ``replace_longest``
    results as expected from the original, both without separators and
    with the space character as the only separator.
    """
    # Local imports keep this method self-contained.
    import os
    import tempfile

    trie = Trie(ignore_case=True)
    # The ids are taken before pickling; the assertions below rely on the
    # loaded trie reporting the same ids to the replacement callback.
    ids = {w: trie.insert(w) for w in [u"aİİ", u"aai̇", u"aai̇bİ"]}

    # Use a private temporary file instead of "trie.pkl" in the current
    # directory, and always remove it, so the test leaves nothing behind
    # and cannot collide with another run.
    fd, path = tempfile.mkstemp(suffix=".pkl")
    try:
        with os.fdopen(fd, "wb") as fo:
            pickle.dump(trie, fo)
        # Only data written by this test is unpickled here.
        with open(path, "rb") as fi:
            trie = pickle.load(fi)
    finally:
        os.remove(path)

    replaced = {
        ids[u"aİİ"]: u"a",
        ids[u"aai̇"]: u"b",
        ids[u"aai̇bİ"]: u"c",
    }

    res = trie.replace_longest(u"aaİ aai̇bİaa", lambda x, start, end: replaced[x])
    self.assertEqual(res, u"b caa")

    sep = {ord(" ")}  # space as separator
    res = trie.replace_longest(u"aaİ aai̇bİaa", lambda x, start, end: replaced[x], sep)
    self.assertEqual(res, u"b aai̇bİaa")
def test_replace_longest(self):
    """Exercise ``replace_longest`` with overlapping keys.

    Several keys share words ("New York", "York City", "City is", ...).
    The expected output is checked once without separators and once with
    the space character as the only separator.
    """
    trie = Trie()

    # (key, replacement) pairs, listed in insertion order.
    substitutions = [
        (u"New York", u"Beijing"),
        (u"New", u"Old"),
        (u"York", u"Yark"),
        (u"York City", u"Yerk Town"),
        (u"City", u"Country"),
        (u"City is", u"Province are"),
    ]

    # Key the replacements by the value insert() returns for each key.
    replaced = {}
    for key, replacement in substitutions:
        replaced[trie.insert(key)] = replacement

    def pick(x, start, end):
        # Only the id is needed; the match offsets are ignored.
        return replaced[x]

    text = u"New York City isA"

    self.assertEqual(
        trie.replace_longest(text, pick),
        u"Beijing Province areA",
    )

    space_only = {ord(" ")}  # space as separator
    self.assertEqual(
        trie.replace_longest(text, pick, space_only),
        u"Beijing Country isA",
    )
def test_replace_words(self):
    """Replace every word of the benchmark word list by its trie id.

    Each non-empty, stripped line of ``../bench/words.txt`` (relative to
    this file) is inserted into a trie. The whole file content is then
    passed through ``replace_longest`` with newline as the only separator
    and a callback returning ``str(id)``; the stripped result must equal
    the inserted ids joined by newlines.
    """
    import io

    path = os.path.join(os.path.dirname(__file__), "../bench/words.txt")
    # io.open with an explicit encoding yields unicode text on Python 2
    # and Python 3 alike, and does not depend on the locale's default
    # encoding (the earlier code decoded the file as UTF-8 on Python 2).
    with io.open(path, encoding="utf8") as fi:
        txt = fi.read()

    trie = Trie()
    ids = []
    # The text was read with newlines normalised to "\n", so splitting on
    # it yields the same lines that iterating over the file would.
    for line in txt.split("\n"):
        word = line.strip()
        if word:
            ids.append(trie.insert(word))

    sep = {ord("\n")}  # newline as separator
    ret = trie.replace_longest(txt, lambda v, start, end: str(v), sep).strip()
    self.assertEqual(ret, "\n".join(str(i) for i in ids))