def test_regex_filter(self):
        """Check RegexpFilter pattern validation and corpus-level filtering."""
        # A lone quantifier is rejected; the escaped form is accepted.
        # The raw literal avoids the invalid "\?" escape of a plain string.
        self.assertFalse(preprocess.RegexpFilter.validate_regexp('?'))
        self.assertTrue(preprocess.RegexpFilter.validate_regexp(r'\?'))

        # With the pattern '.', the first document ends up with no tokens.
        reg_filter = preprocess.RegexpFilter(r'.')
        filtered = reg_filter(self.corpus)
        self.assertEqual(0, len(filtered.tokens[0]))
        # NOTE(review): presumably a default tokenizer plus this filter --
        # confirm against Corpus.used_preprocessor.
        self.assertEqual(len(filtered.used_preprocessor.preprocessors), 2)

        # Overwrite the first meta value so the first document reads
        # 'foo bar'; the 'foo' filter should leave only 'bar'.
        reg_filter = preprocess.RegexpFilter('foo')
        with self.corpus.unlocked():
            self.corpus.metas[0, 0] = 'foo bar'
        filtered = reg_filter(self.corpus)
        self.assertEqual(filtered.tokens[0], ['bar'])
        self.assertEqual(len(filtered.used_preprocessor.preprocessors), 2)

        # Anchored pattern: tokens starting with 'http' are dropped, while
        # ' http' (leading space) is kept. Tokens are injected directly into
        # the already-tokenized corpus.
        reg_filter = preprocess.RegexpFilter('^http')
        corpus = BASE_TOKENIZER(self.corpus)
        corpus._tokens[0] = ['https', 'http', ' http']
        filtered = reg_filter(corpus)
        self.assertEqual(filtered.tokens[0], [' http'])
        self.assertEqual(len(filtered.used_preprocessor.preprocessors), 2)
    def test_regex_filter(self):
        """Check RegexpFilter on plain token lists and pattern validation."""
        # With the pattern '.', filtering the first document's tokens
        # yields a falsy (empty) result.
        reg_filter = preprocess.RegexpFilter(r'.')
        filtered = reg_filter(self.corpus.tokens[0])
        self.assertFalse(filtered)

        # The same filter object is reused; only its pattern is swapped.
        reg_filter.pattern = 'foo'
        self.assertCountEqual(reg_filter(['foo', 'bar']), ['bar'])

        # Anchored pattern: ' http' (leading space) survives the filter.
        reg_filter.pattern = '^http'
        self.assertCountEqual(reg_filter(['https', 'http', ' http']), [' http'])

        # A lone quantifier is rejected; the escaped form is accepted.
        # The raw literal avoids the invalid "\?" escape of a plain string.
        self.assertFalse(preprocess.RegexpFilter.validate_regexp('?'))
        self.assertTrue(preprocess.RegexpFilter.validate_regexp(r'\?'))
Exemple #3
0
def pre_process(path):
    """Load the corpus stored at *path* and run the preprocessing pipeline.

    The pipeline is built from lowercasing, URL removal and HTML
    transformers, a word-character regexp tokenizer, a Porter stemmer, an
    English stop-word filter and a regexp filter for punctuation tokens.

    Returns the result of applying the preprocessor to the loaded corpus.
    """
    corpus = orangecontrib.text.Corpus.from_file(path)

    # Raw literals keep the regex backslashes intact without relying on
    # Python's deprecated handling of unknown escape sequences. The filter
    # pattern is triple-quoted because it contains both ' and "; its value
    # is unchanged.
    punctuation_pattern = (
        r'''\.|,|:|;|!|\?|\(|\)|\||\+|'|"|‘|’|“|”|'|\’|…|\-|–|—|\$|&|\*|>|<'''
    )

    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer()
        ],
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(punctuation_pattern)
        ])
    return p(corpus)
 def setUp(self):
     """Create the fixtures: the 'deerwester' corpus and a 'foo' filter."""
     deerwester = Corpus.from_file('deerwester')
     foo_filter = preprocess.RegexpFilter('foo')
     self.corpus = deerwester
     self.regexp = foo_filter