def pre_process(path):
    """Load a corpus from *path* and run it through a preprocessing pipeline.

    The pipeline is configured with, in the order given to ``Preprocessor``:

    * transformers: lowercasing, URL removal and HTML handling;
    * a regular-expression tokenizer that keeps runs of word characters;
    * a Porter stemmer as normalizer;
    * filters: English stop words, plus a regular-expression filter listing
      punctuation and typographic symbols.

    Args:
        path: Location passed unchanged to ``Corpus.from_file``.

    Returns:
        The result of calling the configured ``Preprocessor`` on the loaded
        corpus (presumably the preprocessed corpus -- confirm against the
        installed orange3-text version).
    """
    corpus = orangecontrib.text.Corpus.from_file(path)
    pipeline = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer(),
        ],
        # Raw strings: the regex escapes (\w, \., \?, ...) are not valid
        # Python string escapes, so non-raw literals emit SyntaxWarning on
        # Python 3.12+. The compiled patterns are unchanged.
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(
                r'\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|\’|…|\-|–|—|\$|&|\*|>|<'
            ),
        ])
    return pipeline(corpus)
def test_preprocess_words(self):
    """Check ``_preprocess_words`` against a corpus in two stages.

    Stage one applies lowercasing, accent stripping, URL removal and HTML
    handling to the corpus; stage two additionally applies a Snowball
    stemmer. After each stage the words returned by ``_preprocess_words``
    are compared with the expected list.
    """
    corpus = Corpus.from_file("book-excerpts")

    # Stage 1: text transformers only. The bare URL is expected to vanish
    # from the output, hence seven inputs but six expected words.
    raw_words = [
        "House",
        "dóctor",
        "boy",
        "way",
        "Rum https://google.com",
        "https://google.com",
        "<p>abra<b>cadabra</b><p>",
    ]
    for step in (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.UrlRemover(),
        preprocess.HtmlTransformer(),
    ):
        corpus = step(corpus)
    expected = ["house", "doctor", "boy", "way", "rum", "abracadabra"]
    actual = _preprocess_words(corpus, raw_words, dummy_callback)
    self.assertListEqual(expected, actual)

    # Stage 2: stemming is stacked on top of the already-transformed corpus.
    raw_words = ["House", "dóctor", "boys", "way", "Rum"]
    for step in (preprocess.SnowballStemmer(),):
        corpus = step(corpus)
    expected = ["hous", "doctor", "boy", "way", "rum"]
    actual = _preprocess_words(corpus, raw_words, dummy_callback)
    self.assertListEqual(expected, actual)
def test_html(self):
    """``HtmlTransformer._preprocess`` should return only the text of the markup."""
    markup = '<p>abra<b>cadabra</b><p>'
    stripped = preprocess.HtmlTransformer()._preprocess(markup)
    self.assertEqual(stripped, 'abracadabra')