Example
import unittest

# NOTE: the import path below is an assumption; point it at the module that
# defines BaseTokenizer and DEFAULT_PRE_RULES in your project.
from cornac.data.text import BaseTokenizer, DEFAULT_PRE_RULES


class TestBaseTokenizer(unittest.TestCase):
    def setUp(self):
        self.tok = BaseTokenizer()

    def test_init(self):
        self.assertEqual(self.tok.sep, ' ')

    def test_tokenize(self):
        tokens = self.tok.tokenize('a b c')
        self.assertListEqual(tokens, ['a', 'b', 'c'])

    def test_batch_tokenize(self):
        token_list = self.tok.batch_tokenize(['a b c', 'd e f'])
        self.assertListEqual(token_list, [['a', 'b', 'c'], ['d', 'e', 'f']])

    def test_default_rules(self):
        tok = BaseTokenizer(pre_rules=DEFAULT_PRE_RULES)
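        # with the default pre-rules, HTML tags, punctuation, and digits are
        # stripped and the text is lowercased before splitting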
        token_list = tok.tokenize('<t>a</t> B |{ C ]?&$  d123 E')
        self.assertListEqual(token_list, ['a', 'b', 'c', 'd', 'e'])

    def test_stopwords(self):
        text = 'this is a nice house'

        tok = BaseTokenizer(stop_words='english')
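        # the built-in English stop-word list drops 'this', 'is', and 'a'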
        self.assertListEqual(tok.tokenize(text), ['nice', 'house'])

        tok = BaseTokenizer(stop_words=['is', 'a'])
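        # a custom stop-word list removes only the given words, so 'this' is kept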
        self.assertListEqual(tok.tokenize(text), ['this', 'nice', 'house'])

        # an unsupported stop-word set name is expected to raise
        with self.assertRaises(ValueError):
            BaseTokenizer(stop_words='vietnamese')
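If the test class is saved as a standalone file, the standard unittest entry point makes it runnable directly:

if __name__ == '__main__':
    unittest.main()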