def _test_parse(self):
    """Expect ``TrieDict.parse`` to raise ``ValueError`` for this setup.

    Unlike the neighbouring parse tests, this one never calls
    ``generate_suffix_links()`` before parsing.
    NOTE(review): that omission is presumably what triggers the
    ``ValueError`` -- confirm against ``TrieDict.parse``.

    The leading underscore keeps the method out of unittest's default
    ``test*`` discovery, so it only runs when invoked explicitly.
    """
    triedict = TrieDict()
    patterns = ["abcd", "bcd", "c"]
    for value, pattern in enumerate(patterns, 1):
        triedict.add_pattern(pattern, value)

    text = "a abcd c bcd"
    # Debug output: the patterns, then a ones-digit index ruler printed
    # directly above the text so match positions can be read off by eye.
    print(patterns)
    print("".join(str(i % 10) for i in range(len(text))))
    print(text)

    try:
        result = triedict.parse(text)
    except ValueError:
        pass  # expected outcome
    else:
        # Show what came back, then fail explicitly. self.fail is used
        # instead of a bare ``assert False`` because assert statements are
        # removed under ``python -O``, which would let this test pass
        # silently.
        print(result)
        self.fail("parse() returned instead of raising ValueError")
def test_parse_with_spaces(self):
    """Parse text against patterns that themselves contain spaces.

    Exactly five matches are expected when parsing with
    ``bound_chars=" !.;,"``.  Each match is checked on element 0 (the
    pattern text) and element 2 (compared against the index of the
    match's last character in the text).  The occurrence of
    "is is cool" that begins inside the word "this" (it would end at
    index 14) is not among the expected matches.
    """
    triedict = TrieDict()
    for value, pattern in enumerate(["this is cool", "cool", "is is cool"], 1):
        triedict.add_pattern(pattern, value)
    triedict.generate_suffix_links()

    # Index ruler above the text, expected pattern positions below it.
    #       0         1         2         3
    #       01234567890123456789012345678901
    text = "yo this is cool is is cool cool!"
    #          this is cool
    #                  cool       cool cool
    #                       is is cool

    hits = triedict.parse(text, bound_chars=" !.;,")
    # Order by end index first, then by pattern text, so that matches
    # ending on the same character come out in a deterministic order.
    hits.sort(key=lambda hit: (hit[2], hit[0]))

    expected = [
        ("cool", 14),
        ("this is cool", 14),
        ("cool", 25),
        ("is is cool", 25),
        ("cool", 30),
    ]
    self.assertEqual(len(hits), 5)
    for hit, (pattern, end) in zip(hits, expected):
        self.assertEqual(hit[0], pattern)
        self.assertEqual(hit[2], end)
def test_unicode(self):
    """Smoke-test ``parse`` on text mixing ASCII and non-ASCII characters.

    Registers one pure-ASCII pattern and one built from code point
    U+0101, then parses the two joined by a space.  No assertions are
    made on the result; the test passes as long as nothing raises.
    """
    triedict = TrieDict()

    a_macron = u"\u0101"  # code point 257, i.e. the same value as unichr(257)
    ascii_pattern = u"aaa aaa"
    accented_pattern = a_macron * 3 + " " + a_macron * 3
    text = ascii_pattern + " " + accented_pattern

    for pattern in (ascii_pattern, accented_pattern):
        triedict.add_pattern(pattern)
    triedict.generate_suffix_links()

    # Result intentionally unused: only the absence of an exception matters.
    triedict.parse(text)
def test_parse_with_bound_chars(self):
    """Compare ``parse`` results with and without ``bound_chars``.

    With ``bound_chars=" !.;,"`` three matches are expected; the
    occurrences of "word" inside "words" and "dude" inside "dudes" are
    not among them.  With ``bound_chars=None`` those two occurrences are
    expected as well, for five matches in total.  Each match is checked
    on element 0 (the pattern text) and element 2 (compared against the
    index of the match's last character in the text).
    """
    triedict = TrieDict()
    for value, pattern in enumerate(["this", "this0", "word", "dude"], 1):
        triedict.add_pattern(pattern, value)

    # Index ruler above the text.  Below it: expected matches with
    # bound_chars (first row) and with bound_chars=None (second row).
    #       0         1         2         3
    #       0123456789012345678901234567890123456
    text = "this word...has words dudes, or dude!"
    #       this word                       dude
    #       this word       word  dude      dude
    triedict.generate_suffix_links()

    def check(hits, expected):
        # Sort by end index, then verify the count and each (pattern, end).
        hits.sort(key=lambda hit: hit[2])
        self.assertEqual(len(hits), len(expected))
        for hit, (pattern, end) in zip(hits, expected):
            self.assertEqual(hit[0], pattern)
            self.assertEqual(hit[2], end)

    check(
        triedict.parse(text, bound_chars=" !.;,"),
        [("this", 3), ("word", 8), ("dude", 35)],
    )
    check(
        triedict.parse(text, bound_chars=None),
        [("this", 3), ("word", 8), ("word", 19), ("dude", 25), ("dude", 35)],
    )