コード例 #1
0
 def setUp(self):
     """Prepare the shared fixtures: a corpus and a list of preprocessors.

     The corpus is loaded from the "deerwester" data set, and ``pp_list``
     holds one freshly constructed instance of each preprocessor below.
     """
     self.corpus = Corpus.from_file("deerwester")
     # Classes are instantiated in the order listed here.
     preprocessor_classes = (
         preprocess.LowercaseTransformer,
         preprocess.WordPunctTokenizer,
         preprocess.SnowballStemmer,
         preprocess.NGrams,
         tag.AveragedPerceptronTagger,
     )
     self.pp_list = [make() for make in preprocessor_classes]
コード例 #2
0
 def test_call_with_tokens(self):
     """Run ``self.transformer`` on an already tokenized corpus.

     The expected tokens are the words of the first document spelled
     backwards, while the document text itself must stay untouched.
     """
     expected_tokens = ['namuH', 'enihcam', 'ecafretni', 'rof', 'bal', 'cba',
                        'retupmoc', 'snoitacilppa']
     expected_text = 'Human machine interface for lab abc computer applications'

     tokenized = preprocess.WordPunctTokenizer()(self.corpus)
     result = self.transformer(tokenized)

     self.assertEqual(result.tokens[0], expected_tokens)
     self.assertTrue(result.has_tokens())
     # The raw document is not modified by the transformation.
     self.assertEqual(result.documents[0], expected_text)
     # Presumably one entry for the tokenizer and one for the transformer.
     self.assertEqual(len(result.used_preprocessor.preprocessors), 2)
コード例 #3
0
 def set_corpus(self, data=None):
     """Store the input corpus and a preprocessed copy of it, then commit.

     Parameters
     ----------
     data
         The incoming corpus, or ``None`` when there is no input. With
         ``None`` the preprocessed corpus is reset to ``None`` as well
         instead of running the preprocessors on a missing corpus.
     """
     self.corpus = data
     if data is None:
         # Nothing to preprocess; drop any previously cached result.
         self.pp_corpus = None
     else:
         # Create the preprocessed corpus upon setting data to avoid
         # preprocessing at each method run.
         pp_list = [
             preprocess.LowercaseTransformer(),
             preprocess.WordPunctTokenizer(),
         ]
         self.pp_corpus = PreprocessorList(pp_list)(data)
     self.commit()
コード例 #4
0
 def test_pos_filter(self):
     """Filter a tokenized, POS-tagged corpus with ``PosTagFilter("NN")``.

     After filtering, the first document is expected to keep five tokens
     and five matching POS tags.
     """
     noun_filter = preprocess.PosTagFilter("NN")
     steps = (
         preprocess.WordPunctTokenizer(),
         tag.AveragedPerceptronTagger(),
     )

     # Tokenize first, then tag, feeding each result into the next step.
     tagged = self.corpus
     for step in steps:
         tagged = step(tagged)

     result = noun_filter(tagged)
     self.assertTrue(len(result.pos_tags))
     self.assertEqual(len(result.pos_tags[0]), 5)
     self.assertEqual(len(result.tokens[0]), 5)
コード例 #5
0
 def set_corpus(self, data=None):
     """Store the input corpus together with a tokenized version, then commit.

     ``pp_corpus`` ends up as ``None`` when there is no corpus, as the
     corpus itself when it already reports having tokens, and otherwise
     as the result of lowercasing and word-punct tokenizing the corpus.
     """
     self.corpus = data
     self.pp_corpus = None

     if self.corpus is None:
         pass
     elif self.corpus.has_tokens():
         # Already tokenized: reuse the corpus as-is.
         self.pp_corpus = self.corpus
     else:
         # Preprocess once here so that it is not repeated at each
         # method run.
         default_steps = [
             preprocess.LowercaseTransformer(),
             preprocess.WordPunctTokenizer()
         ]
         pipeline = PreprocessorList(default_steps)
         self.pp_corpus = pipeline(self.corpus)

     self.commit.now()
コード例 #6
0
 def test_filter_pos_tags(self):
     """Check that tokens and POS tags stay aligned after a stopword filter.

     The first meta value is overwritten with a known sentence; after
     lowercasing, tokenizing, tagging and stopword removal the first
     document is expected to keep three tokens with three matching tags.
     """
     steps = (
         preprocess.LowercaseTransformer(),
         preprocess.WordPunctTokenizer(),
         tag.AveragedPerceptronTagger(),
         preprocess.StopwordsFilter(),
     )

     processed = self.corpus
     # The corpus has to be unlocked before its metas can be written.
     with processed.unlocked():
         processed.metas[0, 0] = "This is the most beautiful day in the world"

     for step in steps:
         processed = step(processed)

     tokens, pos_tags = processed.tokens, processed.pos_tags
     self.assertEqual(len(tokens), len(pos_tags))
     self.assertEqual(len(tokens[0]), len(pos_tags[0]))
     self.assertEqual(tokens[0], ["beautiful", "day", "world"])
     self.assertEqual(pos_tags[0], ["JJ", "NN", "NN"])
コード例 #7
0
 def test_preprocessed(self):
     """Feed an already preprocessed corpus to ``OWSentimentAnalysis``.

     ``pp_corpus`` must be truthy after the corpus is sent and after the
     Liu Hu method with the "English" language is selected, and it must
     be ``None`` once the input is removed.
     """
     widget = self.create_widget(OWSentimentAnalysis)

     # Lowercase and tokenize a copy so the shared fixture is not replaced.
     preprocessed = self.corpus.copy()
     steps = (
         preprocess.LowercaseTransformer(),
         preprocess.WordPunctTokenizer(),
     )
     for step in steps:
         preprocessed = step(preprocessed)

     corpus_input = widget.Inputs.corpus
     self.send_signal(corpus_input, preprocessed)
     self.assertTrue(widget.pp_corpus)

     widget.liu_hu.click()
     simulate.combobox_activate_item(widget.liu_lang, "English")
     self.assertTrue(widget.pp_corpus)

     # Sending None clears the input.
     self.send_signal(widget.Inputs.corpus, None)
     self.assertIsNone(widget.pp_corpus)
コード例 #8
0
ファイル: test_corpus.py プロジェクト: Gr06/orange3-text
    def test_pickle_corpus(self):
        """
        A corpus has to be picklable, also after preprocessing
        (needed for the save data widget), see gh-590.

        The test passes when ``pickle.dumps`` does not raise.
        """
        corpus = Corpus.from_file('book-excerpts')

        # Run the corpus through every preprocessor before pickling it.
        self.pp_list = [
            preprocess.LowercaseTransformer(),
            preprocess.WordPunctTokenizer(),
            preprocess.SnowballStemmer(),
            preprocess.FrequencyFilter(),
            preprocess.StopwordsFilter(),
        ]
        for step in self.pp_list:
            corpus = step(corpus)

        pickle.dumps(corpus)