Example #1
    def test_spacy_whitespace_tokenizer(self):
        tokenizer = WordTokenizer(tokenizer="spacy-whitespace")
        tokenized = tokenizer.tokenize(
            "(1999). & P., W. The Control of Discrete Event Systems.")
        assert tokenized == [
            "(1999).",
            "&",
            "P.,",
            "W.",
            "The",
            "Control",
            "of",
            "Discrete",
            "Event",
            "Systems.",
        ]
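Judging by the expected output, the "spacy-whitespace" backend splits on whitespace only and leaves punctuation attached to each token. A minimal sketch of that behaviour in plain Python (an assumption about the backend, not its actual implementation):

    text = "(1999). & P., W. The Control of Discrete Event Systems."
    # Whitespace-only splitting keeps "(1999)." and "P.," intact as single tokens.
    tokens = text.split()
    assert tokens == [
        "(1999).", "&", "P.,", "W.", "The",
        "Control", "of", "Discrete", "Event", "Systems.",
    ]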
Example #2
    def test_other_tokenizer(self):
        tokenizer = WordTokenizer(tokenizer="nltk")
        assert tokenizer.tokenize("First string") is None
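The test expects None here, which suggests that "nltk" is not a supported backend for this WordTokenizer (an inference from the assertion alone). For comparison, NLTK's own tokenizer handles the string directly; its Punkt data must be downloaded once beforehand:

    from nltk.tokenize import word_tokenize

    # Requires the Punkt tokenizer data, e.g. nltk.download("punkt").
    assert word_tokenize("First string") == ["First", "string"]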
Example #3
    def test_sample_word_tokenization(self):
        sample_sentence = "I like big apple."
        tokenizer = WordTokenizer()
        tokens = tokenizer.tokenize(sample_sentence)

        assert tokens == ["I", "like", "big", "apple", "."]
Example #4
    def test_sample_apostrophe_tokenization(self):
        sample_sentence = "I don't like apples."
        tokenizer = WordTokenizer()
        tokens = tokenizer.tokenize(sample_sentence)

        assert tokens == ["I", "do", "n't", "like", "apples", "."]
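The contraction "don't" comes back as "do" + "n't", which matches spaCy's rule-based handling of English contractions, so the default backend is presumably spaCy (an assumption based on this output). The same split can be reproduced with a bare spaCy pipeline:

    import spacy

    # A blank English pipeline still includes spaCy's rule-based tokenizer,
    # whose exception rules split "don't" into "do" and "n't".
    nlp = spacy.blank("en")
    assert [t.text for t in nlp("I don't like apples.")] == [
        "I", "do", "n't", "like", "apples", ".",
    ]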