def test_inplace(self):
    p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w'))
    corpus = p(self.corpus, inplace=True)
    self.assertIs(corpus, self.corpus)

    # Not in place, same settings: a distinct but equal corpus.
    corpus = p(self.corpus, inplace=False)
    self.assertIsNot(corpus, self.corpus)
    self.assertEqual(corpus, self.corpus)

    # Different tokenizer pattern: distinct and no longer equal.
    p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'))
    corpus = p(self.corpus, inplace=False)
    self.assertIsNot(corpus, self.corpus)
    self.assertNotEqual(corpus, self.corpus)

def test_preprocess(self):
    pr = preprocess.Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        pos_tagger=tag.AveragedPerceptronTagger())
    corpus = Corpus.from_file('deerwester')
    pr(corpus, inplace=True)
    self.assertIsNotNone(corpus.pos_tags)

def test_reset_pos_tags(self):
    corpus = Corpus.from_file('deerwester')
    tagger = tag.AveragedPerceptronTagger()
    tagged_corpus = tagger(corpus)
    self.assertTrue(len(tagged_corpus.pos_tags))

    # Re-tokenizing invalidates existing tags, so they must be reset.
    tokenizer = preprocess.RegexpTokenizer(pattern=r'\w')
    tokenized_corpus = tokenizer(corpus)
    self.assertFalse(tokenized_corpus.pos_tags)

def test_ngrams(self):
    vect = BowVectorizer()
    corpus = Corpus.from_file('deerwester')
    corpus = preprocess.RegexpTokenizer(r'\w+')(corpus)
    corpus = preprocess.NGrams(ngrams_range=(1, 3))(corpus)
    result = vect.transform(corpus)
    attrs = [attr.name for attr in result.domain.attributes]
    # Unigrams, bigrams and trigrams should all appear as attributes.
    self.assertIn(corpus.tokens[0][1], attrs)
    self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
    self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)

def test_max_df(self):
    ff = preprocess.FrequencyFilter(max_df=.3)
    p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                     filters=[ff])
    size = len(self.corpus.documents)

    corpus = p(self.corpus)
    self.assertFrequencyRange(corpus, 1, size * .3)

    ff.max_df = 2
    corpus = p(self.corpus)
    self.assertFrequencyRange(corpus, 1, 2)

def test_min_df(self):
    ff = preprocess.FrequencyFilter(min_df=.5)
    p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                     filters=[ff])

    processed = p(self.corpus)
    size = len(processed.documents)
    self.assertFrequencyRange(processed, size * .5, size)

    ff.min_df = 2
    processed = p(self.corpus)
    size = len(processed.documents)
    self.assertFrequencyRange(processed, 2, size)

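# `assertFrequencyRange` (used by test_max_df and test_min_df above) is a
# custom assertion that is not defined in this excerpt. A minimal sketch of
# a compatible helper, assuming min_df/max_df bound the *document* frequency
# of each surviving token; this implementation is an assumption, not the
# project's actual code:
def assertFrequencyRange(self, corpus, min_fr, max_fr):
    from collections import Counter
    # Document frequency: the number of documents each token occurs in.
    df = Counter(token for doc in corpus.tokens for token in set(doc))
    self.assertTrue(all(min_fr <= f <= max_fr for f in df.values()))
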
def test_copy(self):
    corpus = Corpus.from_file('deerwester')
    p = preprocess.RegexpTokenizer(r'\w+\s}')

    copied = corpus.copy()
    copied = p(copied)
    self.assertIsNot(copied, corpus)
    self.assertNotEqual(copied, corpus)

    p(corpus)
    copied = corpus.copy()
    self.assertIsNot(copied, corpus)

def test_copy(self):
    corpus = Corpus.from_file('deerwester')
    p = preprocess.Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+\s}'))

    copied = corpus.copy()
    p(copied, inplace=True)
    self.assertIsNot(copied, corpus)
    self.assertNotEqual(copied, corpus)

    p(corpus, inplace=True)
    copied = corpus.copy()
    self.assertIsNot(copied, corpus)
    self.assertEqual(copied, corpus)

def pre_process(path):
    corpus = orangecontrib.text.Corpus.from_file(path)
    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer()
        ],
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(
                r'\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|\’|…|\-|–|—|\$|&|\*|>|<'
            )
        ])
    return p(corpus)

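# A possible invocation of the pipeline above, using the 'deerwester'
# sample corpus that the surrounding tests load via Corpus.from_file
# (illustrative only):
#
#     corpus = pre_process('deerwester')
#     print(corpus.tokens[0][:10])
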
def test_empty_corpus(self):
    p = preprocess.Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(pattern='unmatchable'))
    empty = p(self.corpus)
    self.assertIsNone(self.model.fit(empty))

def test_call_with_bad_input(self):
    tokenizer = preprocess.RegexpTokenizer(pattern=r'\w+')
    self.assertRaises(TypeError, tokenizer.tokenize, 1)
    self.assertRaises(TypeError, tokenizer.tokenize, ['1', 2])

def test_can_pickle(self):
    tokenizer = preprocess.RegexpTokenizer(pattern=r'\w')
    pickle.loads(pickle.dumps(tokenizer))

def test_can_deepcopy(self):
    tokenizer = preprocess.RegexpTokenizer(pattern=r'\w')
    copied = copy.deepcopy(tokenizer)
    corpus = Corpus.from_file('deerwester')
    self.assertTrue(all(tokenizer(corpus).tokens == copied(corpus).tokens))

def test_skip_empty_strings(self):
    # The pattern can match the empty string; empty matches must not
    # end up among the tokens.
    pattern = r'[^h ]*'
    tokenizer = preprocess.RegexpTokenizer(pattern=pattern)
    tokenizer.tokenizer = tokenizer.tokenizer_cls(pattern)
    tokens = tokenizer._preprocess('whatever')
    self.assertNotIn('', tokens)

def test_str(self):
    tokenizer = preprocess.RegexpTokenizer(pattern=r'\S+')
    self.assertEqual('Regexp', str(tokenizer))

def test_call_with_bad_input(self):
    pattern = r'\w+'
    tokenizer = preprocess.RegexpTokenizer(pattern=pattern)
    tokenizer.tokenizer = tokenizer.tokenizer_cls(pattern)
    self.assertRaises(TypeError, tokenizer._preprocess, 1)
    self.assertRaises(TypeError, tokenizer._preprocess, ['1', 2])

def test_inplace(self):
    p = preprocess.RegexpTokenizer(r'\w')
    corpus = p(self.corpus)
    self.assertIsNot(corpus, self.corpus)

def test_keep_n(self):
    ff = preprocess.FrequencyFilter(keep_n=5)
    p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                     filters=[ff])
    processed = p(self.corpus)
    self.assertEqual(len(set(itertools.chain(*processed.tokens))), 5)

def test_on_change(self):
    tokenizer = preprocess.RegexpTokenizer(pattern=r'\w+')
    tokenizer.on_change = counted(tokenizer.on_change)
    tokenizer.pattern = r'\S+'
    self.assertEqual(tokenizer.on_change.calls, 1)
    self.assertEqual(tokenizer.pattern, r'\S+')

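# `counted` (used in test_on_change above) is a call-counting helper that is
# not shown in this excerpt. A minimal sketch of a compatible implementation,
# assuming it only needs to expose a `.calls` attribute:
import functools

def counted(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        # Track how many times the wrapped callable has been invoked.
        wrapper.calls += 1
        return f(*args, **kwargs)
    wrapper.calls = 0
    return wrapper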