def test_parse_with_spaces(self):
    """Check that multi-word patterns, including overlapping ones, are found.

    Every match is inspected through element 0 (the pattern text) and
    element 2 (the index of the match's last character in the parsed text).
    """
    trie = TrieDict()
    phrases = ["this is cool", "cool", "is is cool"]
    for value, phrase in enumerate(phrases, 1):
        trie.add_pattern(phrase, value)
    trie.generate_suffix_links()

    #       0         1         2         3
    #       01234567890123456789012345678901
    text = "yo this is cool is is cool cool!"
    #          this is cool
    #                  cool       cool cool
    #                       is is cool
    hits = trie.parse(text, bound_chars=" !.;,")
    # Order by end index first and pattern text second, so matches that end
    # at the same position are compared in a deterministic order.
    hits.sort(key=lambda hit: (hit[2], hit[0]))

    expected = [
        ("cool", 14),
        ("this is cool", 14),
        ("cool", 25),
        ("is is cool", 25),
        ("cool", 30),
    ]
    self.assertEqual(len(hits), len(expected))
    for hit, (pattern, end_index) in zip(hits, expected):
        self.assertEqual(hit[0], pattern)
        self.assertEqual(hit[2], end_index)
def _test_generate_suffix_pointers(self):
    """Build a small trie from overlapping patterns and print it.

    This method makes no assertions; it only dumps the trie for manual
    inspection. The leading underscore is presumably there to keep it out
    of the default ``test*`` collection.
    """
    # Parenthesised single-argument print behaves exactly like the Python 2
    # print statement and is also valid on Python 3.
    print("test generate_suffix_pointers...")
    triedict = TrieDict()
    # "bcd" and "c" also occur inside "abcd", so suffix links are exercised.
    triedict.add_pattern("abcd")
    triedict.add_pattern("bcd")
    triedict.add_pattern("c")
    triedict.generate_suffix_links()
    print(triedict)
    print(triedict.to_string())
def test_unicode(self):
    """Smoke-test adding and parsing patterns that contain non-ASCII text.

    NOTE(review): the result of ``parse`` is never checked, so this test
    only fails if one of the calls raises. Consider asserting on the
    returned matches once the expected output is confirmed.
    """
    triedict = TrieDict()
    s0 = u"aaa aaa"
    # U+0101 (code point 257); written as a literal instead of unichr(257)
    # so the test also runs on Python 3, where unichr does not exist.
    a_uc = u"\u0101"
    s1 = a_uc * 3 + " " + a_uc * 3
    # Text to parse: the ASCII pattern followed by the non-ASCII pattern.
    s2 = s0 + " " + s1
    triedict.add_pattern(s0)
    triedict.add_pattern(s1)
    triedict.generate_suffix_links()
    triedict.parse(s2)
def _test_persist(self):
    """Save a trie, load it back, and print both for manual comparison.

    This method makes no assertions. The leading underscore is presumably
    there to keep it out of the default ``test*`` collection.

    NOTE(review): ``save`` is given the relative path "test.triedict" and
    nothing here removes it afterwards -- presumably a file is left in the
    working directory; confirm and clean up if this test is re-enabled.
    """
    triedict = TrieDict()
    triedict.add_pattern("blaaaa")
    triedict.add_pattern("blauu")
    triedict.generate_suffix_links()
    # Parenthesised single-argument print behaves exactly like the Python 2
    # print statement and is also valid on Python 3.
    print(triedict)
    print(triedict.to_string())

    triedict.save("test.triedict")
    triedict2 = TrieDict.load("test.triedict")
    # Printed so the reloaded trie can be compared by eye with the original.
    print(triedict2)
    print(triedict2.to_string())
def test_parse_with_bound_chars(self):
    """Compare parse results with and without boundary characters.

    With ``bound_chars`` set, the test expects only the stand-alone
    occurrences ("this", "word", "dude"). With ``bound_chars=None`` it also
    expects the occurrences inside "words" and "dudes". Every match is
    inspected through element 0 (the pattern text) and element 2 (the index
    of the match's last character).
    """
    trie = TrieDict()
    words = ["this", "this0", "word", "dude"]
    for value, word in enumerate(words, 1):
        trie.add_pattern(word, value)
    trie.generate_suffix_links()

    #       0         1         2         3
    #       0123456789012345678901234567890123456
    text = "this word...has words dudes, or dude!"
    #       this word                       dude   <- with bound_chars
    #          3    8          9     5         5   <- last digit of end index
    #       this word       word  dude      dude   <- bound_chars=None
    cases = [
        (" !.;,", [("this", 3), ("word", 8), ("dude", 35)]),
        (None, [("this", 3), ("word", 8), ("word", 19),
                ("dude", 25), ("dude", 35)]),
    ]
    for bound_chars, expected in cases:
        hits = trie.parse(text, bound_chars=bound_chars)
        # Sort by end index so hits line up with the expected list.
        hits.sort(key=lambda hit: hit[2])
        self.assertEqual(len(hits), len(expected))
        for hit, (pattern, end_index) in zip(hits, expected):
            self.assertEqual(hit[0], pattern)
            self.assertEqual(hit[2], end_index)