Example 1
    def test_min_df(self):
        # A float min_df is a relative threshold: keep only tokens that
        # appear in at least 50% of documents.
        ff = preprocess.FrequencyFilter(min_df=.5)
        processed = ff(self.corpus)
        size = len(processed.documents)
        self.assertFrequencyRange(processed, size * .5, size)
        self.assertEqual(len(processed.used_preprocessor.preprocessors), 2)

        # An int min_df is an absolute threshold: keep only tokens that
        # appear in at least 2 documents.
        ff = preprocess.FrequencyFilter(min_df=2)
        processed = ff(self.corpus)
        size = len(processed.documents)
        self.assertFrequencyRange(processed, 2, size)
        self.assertEqual(len(processed.used_preprocessor.preprocessors), 2)
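assertFrequencyRange is a helper defined on the test class and not shown in these excerpts; a minimal sketch of what such a check might look like, assuming tokens are exposed as corpus.tokens (as Example 7 suggests):

    def assertFrequencyRange(self, corpus, min_df, max_df):
        from collections import Counter
        # Document frequency of a token = number of documents containing it.
        df = Counter(t for tokens in corpus.tokens for t in set(tokens))
        self.assertTrue(all(min_df <= n <= max_df for n in df.values()))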
Example 2
    def test_max_df(self):
        # A float max_df keeps tokens that appear in at most 30% of documents.
        ff = preprocess.FrequencyFilter(max_df=.3)
        size = len(self.corpus.documents)

        corpus = ff(self.corpus)
        self.assertFrequencyRange(corpus, 1, size * .3)
        self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

        # An int max_df keeps tokens that appear in at most 2 documents.
        ff = preprocess.FrequencyFilter(max_df=2)
        corpus = ff(self.corpus)
        self.assertFrequencyRange(corpus, 1, 2)
        self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
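Taken together, Examples 1 and 2 show the convention: a float threshold is read as a fraction of documents, an int as an absolute document count. A minimal standalone sketch, assuming the orangecontrib.text package layout (the imports are not shown in these excerpts):

    from orangecontrib.text import preprocess
    from orangecontrib.text.corpus import Corpus

    corpus = Corpus.from_file('book-excerpts')
    # Keep tokens that occur in at least 2 and at most 30% of documents.
    ff = preprocess.FrequencyFilter(min_df=2, max_df=.3)
    filtered = ff(corpus)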
Example 3
    def test_max_df(self):
        ff = preprocess.FrequencyFilter(max_df=.3)
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                         filters=[ff])
        size = len(self.corpus.documents)

        corpus = p(self.corpus)
        self.assertFrequencyRange(corpus, 1, size * .3)

        # Filter parameters can be updated in place and the same
        # preprocessor re-applied.
        ff.max_df = 2
        corpus = p(self.corpus)
        self.assertFrequencyRange(corpus, 1, 2)
Example 4
    def test_min_df(self):
        ff = preprocess.FrequencyFilter(min_df=.5)
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                         filters=[ff])
        processed = p(self.corpus)
        size = len(processed.documents)
        self.assertFrequencyRange(processed, size * .5, size)

        ff.min_df = 2
        processed = p(self.corpus)
        size = len(processed.documents)
        self.assertFrequencyRange(processed, 2, size)
Example 5
    def test_str(self):
        f = preprocess.StopwordsFilter('french')
        self.assertIn('french', str(f).lower())

        f = preprocess.FrequencyFilter(keep_n=None)
        self.assertNotIn('none', str(f).lower())
        f.max_df = .5
        self.assertIn('0.5', str(f))
        f.max_df = .2
        self.assertIn('0.2', str(f))

        f = preprocess.LexiconFilter()
        self.assertIn('lexicon', str(f).lower())
Example 6
    def test_pickle_corpus(self):
        """
        Corpus must be picklable (for save data widget)
        gh-590
        """
        c = Corpus.from_file('book-excerpts')

        # it must also work with a preprocessed corpus
        self.pp_list = [
            preprocess.LowercaseTransformer(),
            preprocess.WordPunctTokenizer(),
            preprocess.SnowballStemmer(),
            preprocess.FrequencyFilter(),
            preprocess.StopwordsFilter()
        ]
        for pp in self.pp_list:
            c = pp(c)
        pickle.dumps(c)
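The test stops at pickle.dumps; verifying a full round trip would be a one-line extension inside the same test:

        restored = pickle.loads(pickle.dumps(c))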
Example 7
    def test_keep_n(self):
        # keep_n limits the resulting vocabulary to 5 tokens.
        ff = preprocess.FrequencyFilter(keep_n=5)
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                         filters=[ff])
        processed = p(self.corpus)
        self.assertEqual(len(set(itertools.chain(*processed.tokens))), 5)
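Across the examples, the pattern is consistent: transformers, then a tokenizer, then filters, each applied to the corpus in turn (as in Example 6). A minimal end-to-end sketch, again assuming the orangecontrib.text layout:

    from orangecontrib.text import preprocess
    from orangecontrib.text.corpus import Corpus

    corpus = Corpus.from_file('book-excerpts')
    for pp in (preprocess.LowercaseTransformer(),
               preprocess.WordPunctTokenizer(),
               preprocess.FrequencyFilter(min_df=2)):
        corpus = pp(corpus)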