def test_words_tokenizes_the_sentence_correctly(self):
     """Check ``words()`` output for a few representative sentences.

     Each case pairs an input sentence with the token list expected under
     the ``'words'`` key of the dictionary returned by ``words()``.  The
     cases cover a plain sentence, a contraction (``isn't``) and
     comma punctuation.
     """
     cases = [
         ("This is a sentence.",
          ['this', 'is', 'a', 'sentence', '.']),
         ("This isn't a sentence.",
          ['this', 'is', "n't", 'a', 'sentence', '.']),
         ("And, I have commas.",
          ['and', ',', 'i', 'have', 'commas', '.']),
     ]
     for sentence, expected_tokens in cases:
         # The second constructor argument is None, exactly as in every case
         # this test has always exercised.
         instance = TextClassificationInstance(sentence, None)
         assert instance.words() == {'words': expected_tokens}
Ejemplo n.º 2
0
 def test_words_tokenizes_the_sentence_correctly(self):
     """Check ``words()`` output under three different tokenizers.

     The same instance is queried three times: once with whatever tokenizer
     is installed on ``TextInstance`` when the test starts, once after
     switching to the ``'characters'`` tokenizer, and once after switching
     to the ``'words and characters'`` tokenizer.

     ``TextInstance.tokenizer`` is assigned on the class, so it is shared
     state.  The value present at the start of the test is saved and put
     back in a ``finally`` block so that a failing assertion cannot leave a
     non-default tokenizer installed for subsequent tests.
     """
     t = TextClassificationInstance("This is a sentence.", None)
     assert t.words() == {'words': ['this', 'is', 'a', 'sentence', '.']}

     # Remember the tokenizer that was active so it can be restored below.
     original_tokenizer = TextInstance.tokenizer
     try:
         TextInstance.tokenizer = tokenizers['characters'](Params({}))
         # Expected: every character of the original sentence, including
         # spaces and the capital 'T', reported under the 'words' key.
         assert t.words() == {
             'words': [
                 'T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 's', 'e',
                 'n', 't', 'e', 'n', 'c', 'e', '.'
             ]
         }

         TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
         # Expected: both a 'words' and a 'characters' entry; here the
         # characters are lowercase and contain no spaces.
         assert t.words() == {
             'words': ['this', 'is', 'a', 'sentence', '.'],
             'characters': [
                 't', 'h', 'i', 's', 'i', 's', 'a', 's', 'e', 'n', 't', 'e',
                 'n', 'c', 'e', '.'
             ]
         }
     finally:
         # Undo the class-level mutation regardless of test outcome.
         TextInstance.tokenizer = original_tokenizer