def test_WordAnalyser_unigrams_with_stopwords(self):
    """Check the unigram output for a plain sentence.

    The analyser is configured for unigrams only; the expected result
    omits 'Some' and 'to' and keeps the other words in document order.
    """
    WordAnalyzer.init(
        tokenizer=self.word_tokenizer,
        preprocess=self.preprocess,
        ngram_range=(1, 1),
    )

    text = 'Some test words to ignore safely'
    produced = WordAnalyzer.analyzer(text)

    self.assertListEqual(['test', 'words', 'ignore', 'safely'], produced)
def test_WordAnalyser_unigrams_with_punctuation(self):
    """Check the unigram output for a sentence containing punctuation.

    The expected result keeps the hyphenated token 'except-hyphens'
    intact, yields 'words' without its trailing comma, and contains
    neither "someone's" nor the words 'Some', 'to', 'but', 'including'.
    """
    WordAnalyzer.init(
        tokenizer=self.word_tokenizer,
        preprocess=self.preprocess,
        ngram_range=(1, 1),
    )

    text = "Some test words, to ignore except-hyphens but including someone's ownership"
    wanted = ['test', 'words', 'ignore', 'except-hyphens', 'ownership']

    produced = WordAnalyzer.analyzer(text)

    self.assertListEqual(wanted, produced)
def test_WordAnalyser_ngrams_dont_cross_punctuation_or_stop_words(self):
    """Check 1- to 3-gram output around a comma and dropped words.

    The expected list holds the unigrams first, then the longer n-grams.
    None of the expected n-grams joins tokens from opposite sides of the
    comma, or from opposite sides of 'but' / "someone's", which do not
    appear in the output at all.
    """
    WordAnalyzer.init(
        tokenizer=self.word_tokenizer,
        preprocess=self.preprocess,
        ngram_range=(1, 3),
    )

    text = "Some test words, except-hyphens metal but someone's metal fish bucket"
    wanted = [
        # single tokens, in document order
        'test', 'words', 'except-hyphens', 'metal', 'metal', 'fish', 'bucket',
        # two-token sequences
        'test words', 'except-hyphens metal', 'metal fish', 'fish bucket',
        # three-token sequence
        'metal fish bucket',
    ]

    produced = WordAnalyzer.analyzer(text)

    self.assertListEqual(wanted, produced)