def test_stopwords_slovene(self):
    """Slovene stopwords are dropped from corpus tokens; content words remain."""
    sw_filter = preprocess.StopwordsFilter('slovene')
    # 'in' ("and") is a Slovene stopword, 'abeceda' ("alphabet") is not
    self.assertFalse(sw_filter._check('in'))
    self.assertTrue(sw_filter._check('abeceda'))
    self.corpus.metas[0, 0] = 'kača je v hiši'
    filtered = sw_filter(self.corpus)
    self.assertListEqual(["kača", "hiši"], filtered.tokens[0])
    # applying the filter records two preprocessors (tokenizer + this filter)
    self.assertEqual(2, len(filtered.used_preprocessor.preprocessors))
def test_stopwords(self):
    """English stopwords are dropped from corpus tokens; content words remain."""
    sw_filter = preprocess.StopwordsFilter('english')
    # 'a' is an English stopword, 'filter' is not
    self.assertFalse(sw_filter._check('a'))
    self.assertTrue(sw_filter._check('filter'))
    self.corpus.metas[0, 0] = 'a snake is in a house'
    filtered = sw_filter(self.corpus)
    self.assertListEqual(["snake", "house"], filtered.tokens[0])
    # applying the filter records two preprocessors (tokenizer + this filter)
    self.assertEqual(2, len(filtered.used_preprocessor.preprocessors))
def test_stopwords_slovene(self):
    """Calling the Slovene filter on a token list removes the stopwords."""
    sw = preprocess.StopwordsFilter('slovene')
    # 'in' ("and") is a stopword; 'abeceda' ("alphabet") is not
    self.assertFalse(sw.check('in'))
    self.assertTrue(sw.check('abeceda'))
    tokens = ["kača", "je", "v", "hiši", "in"]
    self.assertListEqual(["kača", "hiši"], sw(tokens))
def test_stopwords(self):
    """Calling the English filter on a token list removes the stopwords."""
    sw = preprocess.StopwordsFilter('english')
    # 'a' is a stopword; 'filter' is not
    self.assertFalse(sw.check('a'))
    self.assertTrue(sw.check('filter'))
    tokens = ["a", "snake", "is", "in", "a", "house"]
    self.assertListEqual(["snake", "house"], sw(tokens))
def test_str(self):
    """String representations of filters reflect their configuration."""
    self.assertIn('french', str(preprocess.StopwordsFilter('french')).lower())
    freq = preprocess.FrequencyFilter(keep_n=None)
    # an unset keep_n must not leak 'None' into the description
    self.assertNotIn('none', str(freq).lower())
    # max_df shows up verbatim in the description
    for df in (.5, .2):
        freq.max_df = df
        self.assertIn(str(df), str(freq))
    self.assertIn('lexicon', str(preprocess.LexiconFilter()).lower())
def test_filter_pos_tags(self):
    """POS tags stay aligned with tokens after stopword filtering."""
    corpus = self.corpus
    with corpus.unlocked():
        corpus.metas[0, 0] = "This is the most beautiful day in the world"
    pipeline = (
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        tag.AveragedPerceptronTagger(),
        preprocess.StopwordsFilter(),
    )
    for step in pipeline:
        corpus = step(corpus)
    # one tag list per document, one tag per surviving token
    self.assertEqual(len(corpus.tokens), len(corpus.pos_tags))
    self.assertEqual(len(corpus.tokens[0]), len(corpus.pos_tags[0]))
    self.assertEqual(corpus.tokens[0], ["beautiful", "day", "world"])
    self.assertEqual(corpus.pos_tags[0], ["JJ", "NN", "NN"])
def test_pickle_corpus(self):
    """
    Corpus must be picklable (for save data widget)
    gh-590
    """
    corpus = Corpus.from_file('book-excerpts')
    # it must also work with preprocessed corpus
    self.pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        preprocess.SnowballStemmer(),
        preprocess.FrequencyFilter(),
        preprocess.StopwordsFilter(),
    ]
    for step in self.pp_list:
        corpus = step(corpus)
    pickle.dumps(corpus)
def pre_process(path):
    """Load a corpus from *path* and run the standard preprocessing pipeline.

    The pipeline lowercases text, strips URLs and HTML, tokenizes on word
    characters, applies Porter stemming, and drops English stopwords plus
    punctuation tokens.

    Parameters
    ----------
    path : str
        Path passed to ``Corpus.from_file``.

    Returns
    -------
    Corpus
        The preprocessed corpus.
    """
    corpus = orangecontrib.text.Corpus.from_file(path)
    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer()
        ],
        # raw strings: '\w' etc. are invalid escape sequences in non-raw
        # literals and raise warnings on modern Python; the runtime pattern
        # is unchanged.
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            # triple-quoted raw string so both quote characters can appear
            # literally; the escaped quotes of the original (\' and \") are
            # written plain here, yielding a byte-identical pattern.
            preprocess.RegexpFilter(
                r"""\.|,|:|;|!|\?|\(|\)|\||\+|'|"|‘|’|“|”|'|\’|…|\-|–|—|\$|&|\*|>|<"""
            )
        ])
    return p(corpus)
def test_stopwords(self):
    """check() rejects English stopwords and accepts other words."""
    sw = preprocess.StopwordsFilter('english')
    # 'a' is a stopword; 'filter' is not
    self.assertFalse(sw.check('a'))
    self.assertTrue(sw.check('filter'))