Esempio n. 1
0
 def setUp(self):
     """Load the 'deerwester' corpus and prepare the list of preprocessors.

     The list is stored on ``self.pp_list`` in the order: lowercase
     transformer, word-punct tokenizer, Snowball stemmer, n-grams and
     averaged perceptron POS tagger.
     """
     self.corpus = Corpus.from_file("deerwester")
     steps = [
         preprocess.LowercaseTransformer(),
         preprocess.WordPunctTokenizer(),
         preprocess.SnowballStemmer(),
         preprocess.NGrams(),
         tag.AveragedPerceptronTagger(),
     ]
     self.pp_list = steps
Esempio n. 2
0
 def test_preprocess(self):
     """A Preprocessor given a POS tagger must populate ``corpus.pos_tags``.

     The preprocessor is called with ``inplace=True``, so the assertion is
     made on the same corpus object that was passed in.
     """
     pr = preprocess.Preprocessor(
         # Raw string: '\w' in a plain literal is an invalid escape sequence
         # and triggers a warning on modern Python versions.
         tokenizer=preprocess.RegexpTokenizer(r'\w+'),
         pos_tagger=tag.AveragedPerceptronTagger())
     corpus = Corpus.from_file('deerwester')
     pr(corpus, inplace=True)
     self.assertIsNotNone(corpus.pos_tags)
Esempio n. 3
0
 def test_reset_pos_tags(self):
     """Tagging gives non-empty ``pos_tags``; tokenizing gives falsy ones.

     Both the tagger and the tokenizer are applied to the same freshly
     loaded corpus object.
     """
     source = Corpus.from_file('deerwester')

     pos_tagger = tag.AveragedPerceptronTagger()
     with_tags = pos_tagger(source)
     self.assertTrue(len(with_tags.pos_tags))

     # NOTE(review): the tokenizer receives the original corpus, not the
     # tagger's return value -- confirm this is the intended input.
     char_tokenizer = preprocess.RegexpTokenizer(pattern=r'\w')
     retokenized = char_tokenizer(source)
     self.assertFalse(retokenized.pos_tags)
Esempio n. 4
0
 def test_POSTagger(self):
     """``tag_corpus`` must yield exactly one POS tag per token.

     Checks that the result exposes ``pos_tags``, that there are as many
     tag sequences as token sequences, and that each pair has equal length.
     """
     corpus = Corpus.from_file('deerwester')
     tagger = tag.AveragedPerceptronTagger()
     result = tagger.tag_corpus(corpus)
     self.assertTrue(hasattr(result, 'pos_tags'))
     # zip() stops at the shorter input, so a differing number of documents
     # would otherwise pass unnoticed; compare the outer lengths first.
     self.assertEqual(len(result.tokens), len(result.pos_tags))
     for tokens, tags in zip(result.tokens, result.pos_tags):
         self.assertEqual(len(tokens), len(tags))
Esempio n. 5
0
 def test_pos_filter(self):
     """Filtering the tagged fixture corpus by "NN" keeps 5 items in doc 0.

     The corpus from ``self.corpus`` is tokenized and POS-tagged before the
     filter is applied; tokens and tags of the first document are both
     expected to have length 5 afterwards.
     """
     nn_filter = preprocess.PosTagFilter("NN")
     tagged = self.corpus
     for step in (preprocess.WordPunctTokenizer(),
                  tag.AveragedPerceptronTagger()):
         tagged = step(tagged)

     nouns_only = nn_filter(tagged)
     self.assertTrue(len(nouns_only.pos_tags))
     first_tags = nouns_only.pos_tags[0]
     self.assertEqual(len(first_tags), 5)
     first_tokens = nouns_only.tokens[0]
     self.assertEqual(len(first_tokens), 5)
Esempio n. 6
0
 def test_filter_pos_tags(self):
     """Tokens and POS tags must stay aligned after a stopword filter.

     The first meta value of the fixture corpus is overwritten with a known
     sentence, then the corpus is lowercased, tokenized, tagged and filtered
     for stopwords. Tokens and tags are compared overall and for the first
     document.
     """
     steps = [
         preprocess.LowercaseTransformer(),
         preprocess.WordPunctTokenizer(),
         tag.AveragedPerceptronTagger(),
         preprocess.StopwordsFilter(),
     ]
     processed = self.corpus
     # Metas are edited inside the corpus' ``unlocked()`` context.
     with processed.unlocked():
         processed.metas[0, 0] = "This is the most beautiful day in the world"
     for step in steps:
         processed = step(processed)

     self.assertEqual(len(processed.tokens), len(processed.pos_tags))
     self.assertEqual(len(processed.tokens[0]), len(processed.pos_tags[0]))
     self.assertEqual(processed.tokens[0], ["beautiful", "day", "world"])
     self.assertEqual(processed.pos_tags[0], ["JJ", "NN", "NN"])
Esempio n. 7
0
 def setUp(self):
     """Provide the 'deerwester' corpus and a POS tagger as test fixtures."""
     self.corpus = Corpus.from_file('deerwester')
     self.tagger = tag.AveragedPerceptronTagger()