    def test_inplace(self):
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w'))
        corpus = p(self.corpus, inplace=True)
        self.assertIs(corpus, self.corpus)

        corpus = p(self.corpus, inplace=False)
        self.assertIsNot(corpus, self.corpus)
        self.assertEqual(corpus, self.corpus)

        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'))
        corpus = p(self.corpus, inplace=False)
        self.assertIsNot(corpus, self.corpus)
        self.assertNotEqual(corpus, self.corpus)
Example #2
    def test_preprocess(self):
        pr = preprocess.Preprocessor(
            tokenizer=preprocess.RegexpTokenizer(r'\w+'),
            pos_tagger=tag.AveragedPerceptronTagger())
        corpus = Corpus.from_file('deerwester')
        pr(corpus, inplace=True)
        self.assertIsNotNone(corpus.pos_tags)

    def test_reset_pos_tags(self):
        corpus = Corpus.from_file('deerwester')
        tagger = tag.AveragedPerceptronTagger()
        tagged_corpus = tagger(corpus)
        self.assertTrue(len(tagged_corpus.pos_tags))
        tokenizer = preprocess.RegexpTokenizer(pattern=r'\w')
        tokenized_corpus = tokenizer(corpus)
        self.assertFalse(tokenized_corpus.pos_tags)

    def test_ngrams(self):
        vect = BowVectorizer()
        corpus = Corpus.from_file('deerwester')
        corpus = preprocess.RegexpTokenizer(r'\w+')(corpus)
        corpus = preprocess.NGrams(ngrams_range=(1, 3))(corpus)
        result = vect.transform(corpus)
        attrs = [attr.name for attr in result.domain.attributes]
        self.assertIn(corpus.tokens[0][1], attrs)
        self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
        self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)

    def test_max_df(self):
        ff = preprocess.FrequencyFilter(max_df=.3)
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                         filters=[ff])
        size = len(self.corpus.documents)

        corpus = p(self.corpus)
        self.assertFrequencyRange(corpus, 1, size * .3)

        ff.max_df = 2
        corpus = p(self.corpus)
        self.assertFrequencyRange(corpus, 1, 2)

    def test_min_df(self):
        ff = preprocess.FrequencyFilter(min_df=.5)
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                         filters=[ff])
        processed = p(self.corpus)
        size = len(processed.documents)
        self.assertFrequencyRange(processed, size * .5, size)

        ff.min_df = 2
        processed = p(self.corpus)
        size = len(processed.documents)
        self.assertFrequencyRange(processed, 2, size)
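
The two tests above exercise both forms of threshold that FrequencyFilter appears to accept: a float is read as a proportion of the documents, an integer as an absolute document count. A minimal sketch with illustrative values (not taken from the original tests):

# Hedged sketch: float thresholds act as document proportions,
# int thresholds as absolute document counts, as the tests above imply.
proportional = preprocess.FrequencyFilter(min_df=0.5, max_df=0.9)
absolute = preprocess.FrequencyFilter(min_df=2, max_df=10)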
Example #7
    def test_copy(self):
        corpus = Corpus.from_file('deerwester')

        p = preprocess.RegexpTokenizer(r'\w+\s}')
        copied = corpus.copy()
        copied = p(copied)
        self.assertIsNot(copied, corpus)
        self.assertNotEqual(copied, corpus)

        p(corpus)
        copied = corpus.copy()
        self.assertIsNot(copied, corpus)
Example #8
    def test_copy(self):
        corpus = Corpus.from_file('deerwester')

        p = preprocess.Preprocessor(
            tokenizer=preprocess.RegexpTokenizer(r'\w+\s}'))
        copied = corpus.copy()
        p(copied, inplace=True)
        self.assertIsNot(copied, corpus)
        self.assertNotEqual(copied, corpus)

        p(corpus, inplace=True)
        copied = corpus.copy()
        self.assertIsNot(copied, corpus)
        self.assertEqual(copied, corpus)
Example #9
def pre_process(path):
    corpus = orangecontrib.text.Corpus.from_file(path)

    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer()
        ],
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(
                r'\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|\’|…|\-|–|—|\$|&|\*|>|<'
            )
        ])
    return p(corpus)
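
A possible way to exercise the pipeline above; 'deerwester' is the small sample corpus shipped with Orange3-text, and the prints are purely illustrative:

if __name__ == '__main__':
    # Hedged usage sketch for pre_process(): any path accepted by
    # Corpus.from_file would work in place of the bundled 'deerwester'.
    processed = pre_process('deerwester')
    print(len(processed.documents))        # documents kept after preprocessing
    print(list(processed.tokens[0])[:10])  # first few lowercased, stemmed tokens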
Example #10
    def test_empty_corpus(self):
        p = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer(
            pattern='unmatchable'))
        empty = p(self.corpus)
        self.assertIsNone(self.model.fit(empty))

    def test_call_with_bad_input(self):
        tokenizer = preprocess.RegexpTokenizer(pattern=r'\w+')
        self.assertRaises(TypeError, tokenizer.tokenize, 1)
        self.assertRaises(TypeError, tokenizer.tokenize, ['1', 2])

    def test_can_pickle(self):
        tokenizer = preprocess.RegexpTokenizer(pattern=r'\w')
        pickle.loads(pickle.dumps(tokenizer))

    def test_can_deepcopy(self):
        tokenizer = preprocess.RegexpTokenizer(pattern=r'\w')
        copied = copy.deepcopy(tokenizer)
        corpus = Corpus.from_file('deerwester')
        self.assertTrue(all(tokenizer(corpus).tokens == copied(corpus).tokens))

    def test_skip_empty_strings(self):
        pattern = r'[^h ]*'
        tokenizer = preprocess.RegexpTokenizer(pattern=pattern)
        tokenizer.tokenizer = tokenizer.tokenizer_cls(pattern)
        tokens = tokenizer._preprocess('whatever')
        self.assertNotIn('', tokens)

    def test_str(self):
        tokenizer = preprocess.RegexpTokenizer(pattern=r'\S+')
        self.assertEqual('Regexp', str(tokenizer))

    def test_call_with_bad_input(self):
        pattern = r'\w+'
        tokenizer = preprocess.RegexpTokenizer(pattern=pattern)
        tokenizer.tokenizer = tokenizer.tokenizer_cls(pattern)
        self.assertRaises(TypeError, tokenizer._preprocess, 1)
        self.assertRaises(TypeError, tokenizer._preprocess, ['1', 2])

    def test_inplace(self):
        p = preprocess.RegexpTokenizer(r'\w')
        corpus = p(self.corpus)
        self.assertIsNot(corpus, self.corpus)

    def test_keep_n(self):
        ff = preprocess.FrequencyFilter(keep_n=5)
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                         filters=[ff])
        processed = p(self.corpus)
        self.assertEqual(len(set(itertools.chain(*processed.tokens))), 5)

    def test_on_change(self):
        tokenizer = preprocess.RegexpTokenizer(pattern=r'\w+')
        tokenizer.on_change = counted(tokenizer.on_change)
        tokenizer.pattern = r'\S+'
        self.assertEqual(tokenizer.on_change.calls, 1)
        self.assertEqual(tokenizer.pattern, r'\S+')
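
The counted helper used in test_on_change is not part of these snippets; presumably it wraps a callable and records how many times it was invoked. A minimal sketch of such a helper, which may differ from the one in the actual test suite:

import functools

def counted(func):
    # Wrap `func` and count invocations on the wrapper itself, so a test
    # can read wrapper.calls after triggering the callback.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        wrapper.calls += 1
        return func(*args, **kwargs)
    wrapper.calls = 0
    return wrapper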