Esempio n. 1
0
 def test_stopwords_slovene(self):
     """Slovene stopwords are dropped from tokens; the filter is recorded."""
     stopword_filter = preprocess.StopwordsFilter('slovene')
     # 'in' ("and") is on the Slovene stopword list; 'abeceda' is not.
     self.assertFalse(stopword_filter._check('in'))
     self.assertTrue(stopword_filter._check('abeceda'))
     self.corpus.metas[0, 0] = 'kača je v hiši'
     filtered = stopword_filter(self.corpus)
     self.assertListEqual(["kača", "hiši"], filtered.tokens[0])
     # Tokenizer + this filter should both be registered on the result.
     self.assertEqual(2, len(filtered.used_preprocessor.preprocessors))
Esempio n. 2
0
 def test_stopwords(self):
     """English stopwords are dropped from tokens; the filter is recorded."""
     stopword_filter = preprocess.StopwordsFilter('english')
     # 'a' is an English stopword; an ordinary word passes the check.
     self.assertFalse(stopword_filter._check('a'))
     self.assertTrue(stopword_filter._check('filter'))
     self.corpus.metas[0, 0] = 'a snake is in a house'
     filtered = stopword_filter(self.corpus)
     self.assertListEqual(["snake", "house"], filtered.tokens[0])
     # Tokenizer + this filter should both be registered on the result.
     self.assertEqual(2, len(filtered.used_preprocessor.preprocessors))
Esempio n. 3
0
    def test_stopwords_slovene(self):
        """The Slovene stopword list flags stopwords and filters token lists."""
        stopword_filter = preprocess.StopwordsFilter('slovene')

        # 'in' ("and") is a stopword; 'abeceda' ("alphabet") is kept.
        self.assertFalse(stopword_filter.check('in'))
        self.assertTrue(stopword_filter.check('abeceda'))

        tokens = ["kača", "je", "v", "hiši", "in"]
        self.assertListEqual(["kača", "hiši"], stopword_filter(tokens))
Esempio n. 4
0
    def test_stopwords(self):
        """The English stopword list flags stopwords and filters token lists."""
        stopword_filter = preprocess.StopwordsFilter('english')

        # 'a' is a stopword; an ordinary content word is kept.
        self.assertFalse(stopword_filter.check('a'))
        self.assertTrue(stopword_filter.check('filter'))

        tokens = ["a", "snake", "is", "in", "a", "house"]
        self.assertListEqual(["snake", "house"], stopword_filter(tokens))
Esempio n. 5
0
    def test_str(self):
        """str() of each filter reflects its current configuration."""
        stopwords = preprocess.StopwordsFilter('french')
        self.assertIn('french', str(stopwords).lower())

        freq = preprocess.FrequencyFilter(keep_n=None)
        # keep_n=None must not leak a literal 'None' into the repr.
        self.assertNotIn('none', str(freq).lower())
        # str() should track later changes to max_df.
        for threshold in (.5, .2):
            freq.max_df = threshold
            self.assertIn(str(threshold), str(freq))

        lexicon = preprocess.LexiconFilter()
        self.assertIn('lexicon', str(lexicon).lower())
Esempio n. 6
0
 def test_filter_pos_tags(self):
     """POS tags stay aligned with tokens after stopword filtering."""
     pipeline = [
         preprocess.LowercaseTransformer(),
         preprocess.WordPunctTokenizer(),
         tag.AveragedPerceptronTagger(),
         preprocess.StopwordsFilter(),
     ]
     corpus = self.corpus
     with corpus.unlocked():
         corpus.metas[0, 0] = "This is the most beautiful day in the world"
     for step in pipeline:
         corpus = step(corpus)
     # Tokens and tags must remain parallel, both row-wise and token-wise.
     self.assertEqual(len(corpus.pos_tags), len(corpus.tokens))
     self.assertEqual(len(corpus.pos_tags[0]), len(corpus.tokens[0]))
     self.assertEqual(corpus.tokens[0], ["beautiful", "day", "world"])
     self.assertEqual(corpus.pos_tags[0], ["JJ", "NN", "NN"])
Esempio n. 7
0
    def test_pickle_corpus(self):
        """
        Corpus must be picklable (for save data widget)
        gh-590
        """
        corpus = Corpus.from_file('book-excerpts')

        # Preprocessing must not break picklability either.
        self.pp_list = [
            preprocess.LowercaseTransformer(),
            preprocess.WordPunctTokenizer(),
            preprocess.SnowballStemmer(),
            preprocess.FrequencyFilter(),
            preprocess.StopwordsFilter(),
        ]
        for step in self.pp_list:
            corpus = step(corpus)
        # Raises if any preprocessor left an unpicklable attribute behind.
        pickle.dumps(corpus)
Esempio n. 8
0
def pre_process(path):
    """Load a corpus from *path* and run the standard preprocessing pipeline.

    The pipeline lowercases the text, strips URLs and HTML, tokenizes on
    word characters, applies Porter stemming, and removes English stopwords
    as well as punctuation-only tokens.

    Parameters
    ----------
    path : str
        Path (or dataset name) accepted by ``Corpus.from_file``.

    Returns
    -------
    Corpus
        The preprocessed corpus.
    """
    corpus = orangecontrib.text.Corpus.from_file(path)

    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer()
        ],
        # Raw strings: the patterns contain regex escapes such as \w and \.
        # that are invalid escape sequences in non-raw literals (SyntaxWarning
        # on modern Python, slated to become an error). Regex semantics are
        # unchanged: \' and \" simply match the quote characters literally.
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(
                r'\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|\’|…|\-|–|—|\$|&|\*|>|<'
            )
        ])
    return p(corpus)
Esempio n. 9
0
    def test_stopwords(self):
        """The English stopword list flags stopwords but not ordinary words."""
        # Renamed from `filter` so the local does not shadow the builtin.
        stopword_filter = preprocess.StopwordsFilter('english')

        self.assertFalse(stopword_filter.check('a'))
        self.assertTrue(stopword_filter.check('filter'))