Beispiel #1
0
    def testInit(self):
        """Check that corpus-built and dictionary-built TfidfModels agree.

        Relies on the module-level ``corpus`` and ``dictionary`` fixtures.
        """
        # Build the transformation from the global `corpus`.
        corpus_model = tfidfmodel.TfidfModel(corpus)

        # The model's document frequencies must equal the dictionary's, and
        # its idfs must equal the values recomputed from those frequencies.
        self.assertEqual(corpus_model.dfs, dictionary.dfs)
        expected_idfs = tfidfmodel.precompute_idfs(
            corpus_model.wglobal, dictionary.dfs, len(corpus)
        )
        self.assertEqual(corpus_model.idfs, expected_idfs)

        # Build a second model straight from the term->docfreq mapping held by
        # the global `dictionary`; both routes must yield identical idfs.
        dictionary_model = tfidfmodel.TfidfModel(dictionary=dictionary)
        self.assertEqual(corpus_model.idfs, dictionary_model.idfs)
    def testInit(self):
        """Verify TfidfModel initialisation from a corpus and from a dictionary.

        Uses the module-level ``corpus`` and ``dictionary`` objects.

        NOTE(review): an identical ``testInit`` is defined directly above this
        one; if both live in the same class, this later definition is the one
        that takes effect -- confirm whether the duplicate is intended.
        """
        n_docs = len(corpus)

        # Model obtained by analysing the global `corpus`.
        from_corpus = tfidfmodel.TfidfModel(corpus)

        # dfs <-> idfs consistency: same document frequencies as the
        # dictionary, and idfs reproducible via precompute_idfs.
        self.assertEqual(from_corpus.dfs, dictionary.dfs)
        self.assertEqual(
            from_corpus.idfs,
            tfidfmodel.precompute_idfs(from_corpus.wglobal, dictionary.dfs, n_docs),
        )

        # Model obtained by supplying the term->docfreq mapping directly via
        # the global `dictionary`.
        from_dictionary = tfidfmodel.TfidfModel(dictionary=dictionary)
        self.assertEqual(from_corpus.idfs, from_dictionary.idfs)
Beispiel #3
0
 def stem_model(self):
   """Collapse the model's vocabulary onto stemmed terms and save the model.

   Each term id in ``self.model.dfs`` is mapped to its word through
   ``self.model.id2word`` and stemmed with ``self.stemmer.stem``. Terms
   that share a stem are merged into one entry of a new
   ``corpora.Dictionary`` whose document frequency is the sum of the merged
   terms' frequencies. The model's ``id2word``, ``dfs`` and ``idfs`` are
   then replaced, and the model is written to
   ``models/all_lancaster.tfidfmodel``.

   Progress messages and the size of the new dictionary are printed to
   stdout. Nothing is returned.

   NOTE(review): summing per-term document frequencies counts a document
   once for every surface form of a stem it contains, so a merged value
   can overstate the true document frequency -- confirm this
   approximation is acceptable for the idf weights.
   """
   # Single-argument print(...) behaves identically on Python 2 and 3.
   print("stemming")
   old_id2word = self.model.id2word
   new_id2word = corpora.Dictionary()
   # Create a new dictionary with the stemmed terms and summed document
   # frequencies. `.items()` (rather than `.iteritems()`) works on both
   # Python 2 and Python 3.
   for termid, freq in self.model.dfs.items():
     stemmed_word = self.stemmer.stem(old_id2word[termid])
     # First sighting of a stem allocates the next sequential id; later
     # sightings reuse the id already stored.
     stemmed_id = new_id2word.token2id.setdefault(
       stemmed_word, len(new_id2word.token2id))
     # Add the df from the old dictionary onto the stem's running total.
     new_id2word.dfs[stemmed_id] = new_id2word.dfs.get(stemmed_id, 0) + freq
   # Corpus-level counters are carried over unchanged from the old dictionary.
   new_id2word.num_docs = old_id2word.num_docs
   new_id2word.num_nnz = old_id2word.num_nnz
   new_id2word.num_pos = old_id2word.num_pos
   self.model.id2word = new_id2word
   self.model.dfs = self.model.id2word.dfs
   # The idfs depend on the dfs, so they are recomputed for the merged terms.
   self.model.idfs = precompute_idfs(self.model.wglobal, self.model.dfs, self.model.num_docs)
   self.model.save('models/all_lancaster.tfidfmodel')
   print(len(new_id2word))
   print("done stemming")