Beispiel #1
0
 def _test_parse(self):
     """Expect ``TrieDict.parse`` to raise ``ValueError`` for this trie.

     The patterns are added, but ``generate_suffix_links()`` is never
     called before parsing.
     NOTE(review): presumably that missing call is what triggers the
     ``ValueError`` -- confirm against ``TrieDict.parse``.
     NOTE(review): the leading underscore means unittest's default
     ``test*`` discovery will not collect this method -- confirm that is
     intentional.
     """
     triedict = TrieDict()
     patterns = ["abcd", "bcd", "c"]
     for number, pattern in enumerate(patterns, 1):
         triedict.add_pattern(pattern, number)
     text = "a abcd c bcd"

     # Debug output: the patterns, then a ruler of column digits printed
     # directly above the text being parsed.
     print(patterns)
     print("".join(str(column % 10) for column in range(len(text))))
     print(text)

     try:
         result = triedict.parse(text)
     except ValueError:
         return  # expected outcome
     # Fail explicitly instead of using ``assert False``: assert statements
     # are stripped under ``python -O``, which would let the test pass
     # silently when no exception is raised.
     self.fail("parse() returned %r instead of raising ValueError" % (result,))
Beispiel #2
0
 def test_parse_with_spaces(self):
     """Patterns containing spaces are found when parsing with bound chars.

     Three patterns are registered (two of them multi-word), the text is
     parsed with ``bound_chars=" !.;,"`` and each hit is checked for its
     pattern (element 0) and its index (element 2).  The expected indices
     are those of the last character of each occurrence in the text, as
     shown by the ruler comments below.
     """
     triedict = TrieDict()
     for number, pattern in enumerate(["this is cool", "cool", "is is cool"], 1):
         triedict.add_pattern(pattern, number)

     #       0         1         2         3
     #       01234567890123456789012345678901
     text = "yo this is cool is is cool cool!"
     #          this is cool
     #                  cool       cool cool
     #                       is is cool
     triedict.generate_suffix_links()
     hits = triedict.parse(text, bound_chars=" !.;,")
     # Order by index first, then by pattern text, so that hits sharing an
     # index are compared in a deterministic order.
     hits.sort(key=lambda hit: (hit[2], hit[0]))

     expected = [
         ("cool", 14),
         ("this is cool", 14),
         ("cool", 25),
         ("is is cool", 25),
         ("cool", 30),
     ]
     self.assertEqual(len(hits), len(expected))
     for position, (pattern, end_index) in enumerate(expected):
         self.assertEqual(hits[position][0], pattern)
         self.assertEqual(hits[position][2], end_index)
Beispiel #3
0
 def test_unicode(self):
     """Smoke-test adding and parsing patterns with a non-ASCII character.

     No assertion is made on the parse result; the test only fails if one
     of the ``TrieDict`` calls raises.
     """
     ascii_pattern = u"aaa aaa"
     a_macron = unichr(257)  # code point U+0101
     triple = a_macron * 3
     unicode_pattern = triple + " " + triple
     # Text to parse: both patterns, separated by a single space.
     text = ascii_pattern + " " + unicode_pattern

     triedict = TrieDict()
     triedict.add_pattern(ascii_pattern)
     triedict.add_pattern(unicode_pattern)
     triedict.generate_suffix_links()
     triedict.parse(text)
Beispiel #4
0
    def test_parse_with_bound_chars(self):
        """Compare parsing with and without ``bound_chars``.

        With ``bound_chars=" !.;,"`` three hits are expected ("this",
        "word", "dude").  With ``bound_chars=None`` five hits are expected,
        which additionally include "word" inside "words" and "dude" inside
        "dudes".  Element 0 of a hit is the pattern; element 2 is checked
        against the index of the occurrence's last character in the text.
        """
        triedict = TrieDict()
        for number, pattern in enumerate(["this", "this0", "word", "dude"], 1):
            triedict.add_pattern(pattern, number)

        #       0         1         2         3
        #       0123456789012345678901234567890123456
        text = "this word...has words dudes, or dude!"
        #       this word                       dude
        #          3    8          9     5         5
        #       this word       word  dude      dude
        triedict.generate_suffix_links()

        # Each case pairs a bound_chars argument with the (pattern, index)
        # hits expected once the result is sorted by index.
        cases = [
            (" !.;,", [("this", 3), ("word", 8), ("dude", 35)]),
            (None, [("this", 3), ("word", 8), ("word", 19),
                    ("dude", 25), ("dude", 35)]),
        ]
        for bound_chars, expected in cases:
            hits = triedict.parse(text, bound_chars=bound_chars)
            hits.sort(key=lambda hit: hit[2])
            self.assertEqual(len(hits), len(expected))
            for position, (pattern, end_index) in enumerate(expected):
                self.assertEqual(hits[position][0], pattern)
                self.assertEqual(hits[position][2], end_index)