def test_min_df(self):
    """FrequencyFilter(min_df=...) keeps only tokens whose document
    frequency is at least the threshold (relative when < 1, absolute
    otherwise), and the applied preprocessor chain has two steps."""
    for threshold in (.5, 2):
        with self.subTest(min_df=threshold):
            freq_filter = preprocess.FrequencyFilter(min_df=threshold)
            result = freq_filter(self.corpus)
            n_docs = len(result.documents)
            # Relative thresholds scale with corpus size; absolute ones don't.
            lower = n_docs * threshold if threshold < 1 else threshold
            self.assertFrequencyRange(result, lower, n_docs)
            self.assertEqual(len(result.used_preprocessor.preprocessors), 2)
def test_max_df(self):
    """FrequencyFilter(max_df=...) drops tokens whose document frequency
    exceeds the threshold (relative when < 1, absolute otherwise), and the
    applied preprocessor chain has two steps."""
    n_docs = len(self.corpus.documents)
    for threshold in (.3, 2):
        with self.subTest(max_df=threshold):
            freq_filter = preprocess.FrequencyFilter(max_df=threshold)
            filtered = freq_filter(self.corpus)
            # Relative thresholds scale with corpus size; absolute ones don't.
            upper = n_docs * threshold if threshold < 1 else threshold
            self.assertFrequencyRange(filtered, 1, upper)
            self.assertEqual(len(filtered.used_preprocessor.preprocessors), 2)
def test_max_df(self):
    """Changing max_df on an already-constructed FrequencyFilter takes
    effect on the next run of the enclosing Preprocessor.

    NOTE(review): this duplicates another test_max_df in SOURCE — if both
    live in the same TestCase, the later definition shadows the earlier
    one; confirm against the full file.
    """
    freq_filter = preprocess.FrequencyFilter(max_df=.3)
    pipeline = Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        filters=[freq_filter],
    )
    n_docs = len(self.corpus.documents)
    # First pass: relative threshold — upper bound scales with corpus size.
    self.assertFrequencyRange(pipeline(self.corpus), 1, n_docs * .3)
    # Second pass: mutate to an absolute threshold and reprocess.
    freq_filter.max_df = 2
    self.assertFrequencyRange(pipeline(self.corpus), 1, 2)
def test_min_df(self):
    """Changing min_df on an already-constructed FrequencyFilter takes
    effect on the next run of the enclosing Preprocessor.

    NOTE(review): this duplicates another test_min_df in SOURCE — if both
    live in the same TestCase, the later definition shadows the earlier
    one; confirm against the full file.
    """
    freq_filter = preprocess.FrequencyFilter(min_df=.5)
    pipeline = Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        filters=[freq_filter],
    )
    # First pass: relative threshold — lower bound scales with corpus size.
    result = pipeline(self.corpus)
    n_docs = len(result.documents)
    self.assertFrequencyRange(result, n_docs * .5, n_docs)
    # Second pass: mutate to an absolute threshold and reprocess.
    freq_filter.min_df = 2
    result = pipeline(self.corpus)
    self.assertFrequencyRange(result, 2, len(result.documents))
def test_str(self):
    """The string form of each filter mentions its configuration:
    stopword language, max_df value, and the filter kind — while a
    keep_n of None is omitted rather than rendered as 'None'."""
    stopwords = preprocess.StopwordsFilter('french')
    self.assertIn('french', str(stopwords).lower())

    freq = preprocess.FrequencyFilter(keep_n=None)
    # An unset keep_n must not leak 'None' into the description.
    self.assertNotIn('none', str(freq).lower())
    for value in (.5, .2):
        freq.max_df = value
        self.assertIn(str(value), str(freq))

    lexicon = preprocess.LexiconFilter()
    self.assertIn('lexicon', str(lexicon).lower())
def test_pickle_corpus(self):
    """Corpus must be picklable (for save data widget) gh-590 —
    including after a full preprocessing pipeline has been applied."""
    corpus = Corpus.from_file('book-excerpts')
    # it must also work with preprocessed corpus
    self.pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        preprocess.SnowballStemmer(),
        preprocess.FrequencyFilter(),
        preprocess.StopwordsFilter(),
    ]
    for step in self.pp_list:
        corpus = step(corpus)
    # Raises if any preprocessing step left an unpicklable reference behind.
    pickle.dumps(corpus)
def test_keep_n(self):
    """FrequencyFilter(keep_n=5) leaves exactly five distinct tokens
    across the whole processed corpus."""
    pipeline = Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        filters=[preprocess.FrequencyFilter(keep_n=5)],
    )
    result = pipeline(self.corpus)
    vocabulary = set(itertools.chain.from_iterable(result.tokens))
    self.assertEqual(len(vocabulary), 5)