def test_spacy_whitespace_tokenizer(self):
    # The spaCy whitespace tokenizer splits on whitespace only,
    # so punctuation stays attached to the adjacent token.
    tokenizer = WordTokenizer(tokenizer="spacy-whitespace")
    tokenized = tokenizer.tokenize(
        "(1999). & P., W. The Control of Discrete Event Systems."
    )
    assert tokenized == [
        "(1999).",
        "&",
        "P.,",
        "W.",
        "The",
        "Control",
        "of",
        "Discrete",
        "Event",
        "Systems.",
    ]

def test_other_tokenizer(self):
    # A tokenizer name the wrapper does not support returns None
    # rather than a token list.
    tokenizer = WordTokenizer(tokenizer="nltk")
    assert tokenizer.tokenize("First string") is None

def test_sample_word_tokenization(self):
    sample_sentence = "I like big apple."
    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize(sample_sentence)
    assert tokens == ["I", "like", "big", "apple", "."]

def test_sample_apostrophe_tokenization(self):
    # The default tokenizer splits contractions: "don't" -> "do" + "n't".
    sample_sentence = "I don't like apples."
    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize(sample_sentence)
    assert tokens == ["I", "do", "n't", "like", "apples", "."]