def test_regex_filter(self):
    """Check RegexpFilter pattern validation and corpus-level token filtering.

    Each filtering scenario also checks that the resulting corpus reports
    two entries in ``used_preprocessor.preprocessors``.
    """
    # A lone quantifier is rejected; the escaped question mark is accepted.
    # The raw string keeps the backslash without an invalid-escape warning.
    self.assertFalse(preprocess.RegexpFilter.validate_regexp('?'))
    self.assertTrue(preprocess.RegexpFilter.validate_regexp(r'\?'))

    # '.' matches every token, so the first document ends up empty.
    reg_filter = preprocess.RegexpFilter(r'.')
    filtered = reg_filter(self.corpus)
    self.assertEqual(0, len(filtered.tokens[0]))
    self.assertEqual(len(filtered.used_preprocessor.preprocessors), 2)

    # Overwrite the first meta value so the first document is 'foo bar';
    # the write is done while the corpus is unlocked.
    reg_filter = preprocess.RegexpFilter('foo')
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = 'foo bar'
    filtered = reg_filter(self.corpus)
    self.assertEqual(filtered.tokens[0], ['bar'])
    self.assertEqual(len(filtered.used_preprocessor.preprocessors), 2)

    # The pattern is anchored: ' http' (leading space) must survive while
    # 'https' and 'http' are removed.
    reg_filter = preprocess.RegexpFilter('^http')
    corpus = BASE_TOKENIZER(self.corpus)
    corpus._tokens[0] = ['https', 'http', ' http']
    filtered = reg_filter(corpus)
    self.assertEqual(filtered.tokens[0], [' http'])
    self.assertEqual(len(filtered.used_preprocessor.preprocessors), 2)
def test_regex_filter(self):
    """Check RegexpFilter on plain token lists and its pattern validation."""
    # '.' matches every token, so nothing from the first document remains.
    reg_filter = preprocess.RegexpFilter(r'.')
    filtered = reg_filter(self.corpus.tokens[0])
    self.assertFalse(filtered)

    # The same filter instance is reused; only its pattern is swapped.
    reg_filter.pattern = 'foo'
    self.assertCountEqual(reg_filter(['foo', 'bar']), ['bar'])

    # The pattern is anchored: ' http' (leading space) must survive.
    reg_filter.pattern = '^http'
    self.assertCountEqual(
        reg_filter(['https', 'http', ' http']),
        [' http'],
    )

    # A lone quantifier is rejected; the escaped question mark is accepted.
    # The raw string keeps the backslash without an invalid-escape warning.
    self.assertFalse(preprocess.RegexpFilter.validate_regexp('?'))
    self.assertTrue(preprocess.RegexpFilter.validate_regexp(r'\?'))
def pre_process(path):
    """Load a corpus from *path* and run it through a fixed pipeline.

    The pipeline lowercases the text, removes URLs, applies the HTML
    transformer, tokenizes on word characters, applies the Porter stemmer,
    and finally filters English stopwords and punctuation tokens.

    Args:
        path: Location passed unchanged to ``Corpus.from_file``.

    Returns:
        Whatever the configured ``Preprocessor`` returns when called on the
        loaded corpus.
    """
    corpus = orangecontrib.text.Corpus.from_file(path)

    # Raw literals keep regex backslashes intact without relying on Python
    # passing unknown escape sequences through (a SyntaxWarning on 3.12+).
    # The triple-quoted form lets both quote characters appear unescaped, so
    # the pattern text is exactly what the previous non-raw literal produced
    # (including its repeated apostrophe alternative).
    punctuation_pattern = (
        r'''\.|,|:|;|!|\?|\(|\)|\||\+|'|"|‘|’|“|”|'|\’|…|\-|–|—|\$|&|\*|>|<'''
    )

    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer(),
        ],
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(punctuation_pattern),
        ])
    return p(corpus)
def setUp(self):
    """Create the fixtures used by the tests: a corpus and a regexp filter."""
    deerwester = Corpus.from_file('deerwester')
    self.corpus = deerwester

    foo_filter = preprocess.RegexpFilter('foo')
    self.regexp = foo_filter