def testInit(self):
    """Check that TfidfModel yields the same idfs from a corpus or a dictionary."""
    # Build the model by scanning the documents (relies on the global `corpus`).
    corpus_model = tfidfmodel.TfidfModel(corpus)

    # The document frequencies collected from the corpus must match the ones
    # stored in the global `dictionary` ...
    self.assertEqual(corpus_model.dfs, dictionary.dfs)

    # ... and the idfs must equal what precompute_idfs derives from them.
    expected_idfs = tfidfmodel.precompute_idfs(
        corpus_model.wglobal,
        dictionary.dfs,
        len(corpus),
    )
    self.assertEqual(corpus_model.idfs, expected_idfs)

    # Build a second model straight from the term->docfreq mapping held by
    # the global `dictionary`; its idfs must agree with the corpus-based model.
    dictionary_model = tfidfmodel.TfidfModel(dictionary=dictionary)
    self.assertEqual(corpus_model.idfs, dictionary_model.idfs)
def stem_model(self, save_path='models/all_lancaster.tfidfmodel'):
    """Collapse the model's vocabulary onto word stems and recompute its idfs.

    Every term id in ``self.model.dfs`` is looked up in ``self.model.id2word``,
    stemmed with ``self.stemmer.stem`` and assigned an id in a fresh
    ``corpora.Dictionary``. Terms that share a stem share one id, and their
    document frequencies are summed. The model's ``id2word``, ``dfs`` and
    ``idfs`` are then replaced in place and the model is saved.

    NOTE: summing document frequencies over-counts any document that contains
    several distinct words with the same stem; exact counts would require
    re-scanning the corpus, which is not available here.

    Args:
        save_path: File name passed to ``self.model.save``. Defaults to the
            location that was previously hard-coded.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("stemming")

    old_id2word = self.model.id2word
    new_id2word = corpora.Dictionary()
    token2id = new_id2word.token2id
    stemmed_dfs = new_id2word.dfs

    # Create a new dictionary with the stemmed terms and summed document
    # frequencies.
    for termid, freq in self.model.dfs.items():
        stemmed_word = self.stemmer.stem(old_id2word[termid])
        # Reuse the id of an already-seen stem, otherwise allocate the next
        # consecutive id (the current vocabulary size).
        stemmed_id = token2id.setdefault(stemmed_word, len(token2id))
        # Add the df from the old dictionary.
        stemmed_dfs[stemmed_id] = stemmed_dfs.get(stemmed_id, 0) + freq

    # Corpus-level statistics are carried over unchanged from the old
    # dictionary.
    new_id2word.num_docs = old_id2word.num_docs
    new_id2word.num_nnz = old_id2word.num_nnz
    new_id2word.num_pos = old_id2word.num_pos

    # Point the model at the stemmed vocabulary and refresh its idf weights.
    self.model.id2word = new_id2word
    self.model.dfs = new_id2word.dfs
    self.model.idfs = precompute_idfs(
        self.model.wglobal, self.model.dfs, self.model.num_docs)
    self.model.save(save_path)

    print(len(new_id2word))
    print("done stemming")