Beispiel #1
0
 def test_parse_with_spaces(self):
     """Check matches for patterns that themselves contain spaces.

     Three patterns are registered with the values 1..3, the sample text
     is parsed with ``bound_chars=" !.;,"``, and the hits are compared
     against a fixed table after sorting by (element 2, element 0).
     """
     triedict = TrieDict()
     for value, pattern in enumerate(["this is cool", "cool", "is is cool"], 1):
         triedict.add_pattern(pattern, value)
     triedict.generate_suffix_links()

     # Column ruler for the text below, followed by the expected hits.
     #       0         1         2         3
     #       01234567890123456789012345678901
     text = "yo this is cool is is cool cool!"
     #          this is cool
     #                  cool       cool cool
     #                       is is cool
     matched = triedict.parse(text, bound_chars=" !.;,")
     # Two pairs of hits share an end offset, so the pattern text is used
     # as a tie-breaker to obtain a deterministic order.
     matched.sort(key=lambda hit: (hit[2], hit[0]))

     # (pattern text, offset of the last matched character in ``text``)
     expected = [
         ("cool", 14),
         ("this is cool", 14),
         ("cool", 25),
         ("is is cool", 25),
         ("cool", 30),
     ]
     self.assertEqual(len(matched), len(expected))
     for hit, (pattern, end) in zip(matched, expected):
         self.assertEqual(hit[0], pattern)
         self.assertEqual(hit[2], end)
Beispiel #2
0
 def _test_generate_suffix_pointers(self):
     """Print a small trie after its suffix links have been generated.

     Manual inspection aid with no assertions.  The name does not start
     with ``test``, so the default unittest loader will not collect it.
     """
     # Single-argument print(...) behaves exactly like the Python 2 print
     # statement and is also valid Python 3 syntax.
     print("test generate_suffix_pointers...")
     triedict = TrieDict()
     # "bcd" is a proper suffix of "abcd", and "c" occurs inside both.
     for pattern in ("abcd", "bcd", "c"):
         triedict.add_pattern(pattern)
     triedict.generate_suffix_links()
     print(triedict)
     print(triedict.to_string())
Beispiel #3
0
 def test_unicode(self):
     """Smoke test: add and parse patterns containing a non-ASCII character.

     No assertions are made on the parse result; the test only fails if
     adding the patterns, generating the suffix links, or parsing raises.
     """
     triedict = TrieDict()
     s0 = u"aaa aaa"
     # U+0101, the same character as unichr(257).  Written as a literal
     # because ``unichr`` does not exist on Python 3, while u"..." literals
     # are accepted by Python 2 and by Python 3.3+.
     a_uc = u"\u0101"
     s1 = a_uc * 3 + u" " + a_uc * 3
     s2 = s0 + u" " + s1
     triedict.add_pattern(s0)
     triedict.add_pattern(s1)
     triedict.generate_suffix_links()
     # NOTE(review): the result is intentionally not inspected here; add
     # assertions once the expected matches for this input are confirmed.
     triedict.parse(s2)
Beispiel #4
0
    def _test_persist(self):
        """Save a trie to disk, load it back, and print both for comparison.

        Manual inspection aid with no assertions.  The name does not start
        with ``test``, so the default unittest loader will not collect it.
        """
        triedict = TrieDict()
        triedict.add_pattern("blaaaa")
        triedict.add_pattern("blauu")
        triedict.generate_suffix_links()
        # Single-argument print(...) behaves exactly like the Python 2 print
        # statement and is also valid Python 3 syntax.
        print(triedict)
        print(triedict.to_string())

        # The path is relative, so the file lands in the current working
        # directory; it is not removed afterwards.
        triedict.save("test.triedict")

        triedict2 = TrieDict.load("test.triedict")
        print(triedict2)
        print(triedict2.to_string())
Beispiel #5
0
    def test_parse_with_bound_chars(self):
        """Compare parse results with and without ``bound_chars``.

        The same text is parsed twice: once with ``bound_chars=" !.;,"``
        (three hits expected) and once with ``bound_chars=None`` (five hits
        expected).  Hits are sorted by element 2 before being checked.
        """
        triedict = TrieDict()
        for value, pattern in enumerate(["this", "this0", "word", "dude"], 1):
            triedict.add_pattern(pattern, value)
        triedict.generate_suffix_links()

        # Column ruler for the text below.
        #       0         1         2         3
        #       0123456789012345678901234567890123456
        text = "this word...has words dudes, or dude!"

        # Each entry: (pattern text, offset of the last matched character).
        # With bound_chars, only the stand-alone words are expected.
        bounded = [("this", 3), ("word", 8), ("dude", 35)]
        # With bound_chars=None, "word" inside "words" (ending at 19) and
        # "dude" inside "dudes" (ending at 25) are expected as well.
        unbounded = [
            ("this", 3),
            ("word", 8),
            ("word", 19),
            ("dude", 25),
            ("dude", 35),
        ]

        for bound_chars, expected in ((" !.;,", bounded), (None, unbounded)):
            matched = triedict.parse(text, bound_chars=bound_chars)
            matched.sort(key=lambda hit: hit[2])
            self.assertEqual(len(matched), len(expected))
            for hit, (pattern, end) in zip(matched, expected):
                self.assertEqual(hit[0], pattern)
                self.assertEqual(hit[2], end)