def testPersistence(self):
    model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
    model.save(testfile())
    model2 = tfidfmodel.TfidfModel.load(testfile())
    self.assertTrue(model.idfs == model2.idfs)
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
def testPersistenceCompressed(self):
    fname = testfile() + '.gz'
    model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
    model.save(fname)
    model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
    self.assertTrue(model.idfs == model2.idfs)
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
def test_init(self):
    # create the transformation model by analyzing a corpus
    # uses the global `corpus`!
    model1 = tfidfmodel.TfidfModel(common_corpus)
    dfs = common_dictionary.dfs

    # make sure the dfs<->idfs transformation works
    self.assertEqual(model1.dfs, dfs)
    self.assertEqual(model1.idfs, tfidfmodel.precompute_idfs(model1.wglobal, dfs, len(common_corpus)))

    # create the transformation model by directly supplying a term->docfreq
    # mapping from the global var `dictionary`.
    model2 = tfidfmodel.TfidfModel(dictionary=common_dictionary)
    self.assertEqual(model1.idfs, model2.idfs)
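# For reference, a minimal sketch (an assumption, not part of the test above) of
# the default global weight that `precompute_idfs` applies: gensim's stock
# `wglobal` is a base-2 inverse document frequency. `dfs` maps termid -> docfreq.
import math

def manual_idfs(dfs, total_docs):
    # idf(term) = log2(total_docs / docfreq); should match model1.idfs above
    # only under the default settings.
    return {termid: math.log(total_docs / df, 2) for termid, df in dfs.items()}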
def testTransform(self):
    # create the transformation model
    model = tfidfmodel.TfidfModel(self.corpus, normalize=True)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]

    expected = [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
    self.assertTrue(numpy.allclose(transformed, expected))
def analyse(self, texts):
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf_model = tfidfmodel.TfidfModel(corpus, normalize=True)
    tfidf_corpus = tfidf_model[corpus]
    lsi_model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=400)
    return Analysed(dictionary, tfidf_corpus, lsi_model)
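# A hypothetical call of `analyse` above (the tokenised toy texts and the
# `analyser` instance are assumptions for illustration; `Analysed` is the
# result container the method already returns):
texts = [
    ['human', 'computer', 'interaction'],
    ['survey', 'user', 'computer', 'system'],
    ['graph', 'minors', 'trees'],
]
analysed = analyser.analyse(texts)  # Analysed(dictionary, tfidf_corpus, lsi_model)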
def tfidfmodel(self, bowlist=None, save=False, savename=None):
    """
    tf-idf is the product of two factors, tf and idf. tf (term frequency) is how
    often a word occurs in a sentence; a word that occurs many times in one
    sentence gets a large tf. idf (inverse document frequency) is the log of the
    reciprocal of the share of documents in the corpus that contain the word; a
    word that appears widely across the corpus gets a small idf. A word that
    occurs often in one sentence but rarely in the corpus as a whole is clearly
    important to that sentence, so its tf-idf is large.

    Build a tf-idf model.

    :param bowlist: the list of bag-of-words vectors
    :param save: whether to save the model
    :param savename: the file name for the saved model
    :return: the model itself
    """
    print('using Tfidfmodel')
    tfidf = tfidfmodel.TfidfModel(bowlist)
    if save:
        print('Saving TF-IDF model to file: {}'.format(savename))
        tfidf.save(savename)
    return tfidf
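# A tiny worked example of the arithmetic the docstring above describes
# (a sketch with made-up numbers, independent of the snippets here):
import math

tf = 3            # the word occurs 3 times in the sentence
n_docs = 100      # corpus size
df = 4            # only 4 documents contain the word
idf = math.log(n_docs / df, 2)   # rare in the corpus -> large idf (~4.64)
tfidf_weight = tf * idf          # frequent locally, rare globally -> large weight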
def vectorize(ori_data, train_file_path, test_file_path):
    train_data = ori_data['train']
    corpus = []
    for label, data in train_data:
        corpus.append(data)
    dic = dictionary.Dictionary(corpus)
    corpus = [dic.doc2bow(doc) for doc in corpus]
    tfidf = tfidfmodel.TfidfModel(corpus)

    # training data
    fp = open(train_file_path, 'w')
    for label, data in train_data:
        vec = tfidf[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()

    # test data
    fp = open(test_file_path, 'w')
    for label, data in ori_data['test']:
        vec = tfidf[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
def _update_keywords(self):
    """
    For each topic in the corpus, use TF-IDF to generate a list of the most
    important keywords, stored as a token-to-weight mapping. The number of
    keywords kept per topic is `self.num_keywords`.
    """
    corpus_bow = [self.dictionary.doc2bow(data['body']) for data in self.data.values()]
    tfidf = tfidfmodel.TfidfModel(corpus_bow, smartirs=self.tfidf_scheme)
    for tid, data in self.data.items():
        weights = tfidf[self.dictionary.doc2bow(data['body'])]
        weights.sort(key=lambda x: x[1], reverse=True)
        # generate token-to-weight mapping instead of id-to-weight mapping
        data['keywords'] = {
            self.dictionary[wid]: weight
            for wid, weight in weights[:self.num_keywords]
        }
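# A self-contained sketch of the same keyword-extraction idea outside the class
# (the toy documents and the 'ntc' scheme are assumptions for illustration):
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = [['cat', 'dog', 'cat'], ['dog', 'fish'], ['cat', 'bird', 'bird']]
dct = Dictionary(docs)
bow = [dct.doc2bow(doc) for doc in docs]
model = TfidfModel(bow, smartirs='ntc')

# top-2 keywords of the first document, as token -> weight
weights = sorted(model[bow[0]], key=lambda x: x[1], reverse=True)
keywords = {dct[wid]: w for wid, w in weights[:2]}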
def test_wlocal_wglobal(self):
    def wlocal(tf):
        assert isinstance(tf, np.ndarray)
        return iter(tf + 1)

    def wglobal(df, total_docs):
        return 1

    docs = [corpus[1], corpus[2]]
    model = tfidfmodel.TfidfModel(corpus, wlocal=wlocal, wglobal=wglobal, normalize=False)
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(termid, weight + 1) for termid, weight in docs[0]],
        [(termid, weight + 1) for termid, weight in docs[1]],
    ]

    self.assertTrue(np.allclose(sorted(transformed_docs[0]), sorted(expected_docs[0])))
    self.assertTrue(np.allclose(sorted(transformed_docs[1]), sorted(expected_docs[1])))
def testPersistence(self):
    model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
    model.save(testfile())
    model2 = tfidfmodel.TfidfModel.load(testfile())
    self.assertTrue(model.idfs == model2.idfs)
def tfidf_calc(doc_corpus, num_features):
    tfidf_model = tfidfmodel.TfidfModel(doc_corpus)
    tfidf = tfidf_model[doc_corpus]
    index = docsim.Similarity('', tfidf, num_features=num_features)
    return tfidf, index[tfidf]
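# A hypothetical call of `tfidf_calc` above (the bag-of-words corpus is made up;
# num_features should be the vocabulary size, i.e. len(dictionary)):
doc_corpus = [[(0, 1), (1, 2)], [(1, 1), (2, 3)]]
tfidf_corpus, sim_matrix = tfidf_calc(doc_corpus, num_features=3)
# sim_matrix[i][j] is the cosine similarity between documents i and j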
def sim_update(results):
    """
    Update models.

    :param results:
    :return:
    """
    shutil.rmtree(lsitemp, ignore_errors=True)
    mkdir(lsitemp)
    t_total_begin = time.time()

    # print("Checking repeat ...")
    # results_temp = check_repet_new(results)
    results_temp = results
    # print("Check repeat complete!")

    print("Prefix mapping ...")
    results = prefix_map(results_temp)
    print("Prefix map complete!")
    del results_temp

    print("Building LSI model ...")
    # Extended dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')

    # Load models
    corpus_raw = corpora.MmCorpus(lsipath + 'viva.mm')
    lsi = lsimodel.LsiModel.load(lsipath + 'viva.lsi')

    # Map the corpus from the .mm file into the LSI space
    mkdir(news_post_add)

    # Preprocess the text to get corpus_add.
    for postfile in results:
        deltags = stripTags(postfile['text'])
        text_del = delstopwords("".join(deltags.split()))
        # text_vec = jieba.lcut(text_del)
        with open(news_post_add + postfile['name'], 'w') as fp:
            fp.write(text_del)

    files = os.listdir(news_post_add)
    for i in files:
        shutil.copy(news_post_add + i, docpath)

    from dict_stream_train import getDictionary
    dict2 = getDictionary(lsipath=lsitemp, docpath=news_post_add)
    dict2 = corpora.Dictionary.load(lsitemp + 'viva.dict')

    from corpus_stream_train import getCorpus
    corpus2 = getCorpus(lsipath=lsitemp, docpath=news_post_add)
    corpus2 = corpora.MmCorpus(lsitemp + 'viva.mm')

    dict2_to_dict1 = dictionary.merge_with(dict2)
    # dict2_to_dict1.save(lsipath + 'viva2.dict')
    # dict2_to_dict1 = corpora.Dictionary.load(lsipath + 'viva2.dict')

    merged_corpus = itertools.chain(corpus_raw, dict2_to_dict1[corpus2])
    corpora.MmCorpus.serialize(lsipath + 'viva.mm', [i for i in merged_corpus])
    merged_corpus = corpora.MmCorpus(lsipath + 'viva.mm')

    # Get TF-IDF vectors of the documents
    tfidf = tfidfmodel.TfidfModel(merged_corpus)
    print("Building tfidf model ...")
    corpus_tfidf = tfidf[merged_corpus]
    print("Building corpus_tfidf model ...")

    # Updated LSI model
    # lsi.add_documents(corpus_tfidf, chunksize=chunksize, decay=DECAY_FACTOR)
    # lsi.add_documents(corpus_tfidf, chunksize=chunksize)
    # print("Added documents to the LSI model ...")
    # Updated corpus
    # if not os.path.exists(lsipath):
    #     os.mkdir(lsipath)
    # corpus = corpora.MmCorpus.serialize(lsipath + 'viva.mm', itertools.chain(corpus_raw, corpus2))
    lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPIC,
                            chunksize=chunksize, power_iters=2, onepass=True)  # all other parameters left at their defaults
    lsi.save(lsipath + 'viva.lsi')
    lsi = models.lsimodel.LsiModel.load(lsipath + 'viva.lsi')

    index = similarities.docsim.Similarity(lsipath + 'viva.index', lsi[merged_corpus],
                                           num_features=NUM_TOPIC)

    # Save models
    index.save(lsipath + 'viva.index')
    print("LSI model saved!")

    # Print elapsed time
    t2 = time.time()
    print("Total elapsed time is:", t2 - t_total_begin, "s")
    print(globals()['__doc__'] % locals())
    sys.exit(1)

language = sys.argv[1]
method = sys.argv[2].strip().lower()

logging.info("loading corpus mappings")
config = dmlcorpus.DmlConfig('gensim_%s' % language,
                             resultDir=gensim_build.RESULT_DIR, acceptLangs=[language])

logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
logging.info("loaded %i word ids" % len(id2word))

corpus = MmCorpus(config.resultFile('bow.mm'))

if method == 'tfidf':
    model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
    model.save(config.resultFile('model_tfidf.pkl'))
elif method == 'lda':
    model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA)
    model.save(config.resultFile('model_lda.pkl'))
elif method == 'lsi':
    # first, transform word counts to tf-idf weights
    tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
    # then find the transformation from tf-idf to latent space
    model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, numTopics=DIM_LSI)
    model.save(config.resultFile('model_lsi.pkl'))
elif method == 'rp':
    # first, transform word counts to tf-idf weights
    tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
    # then find the transformation from tf-idf to latent space
    model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, numTopics=DIM_RP)
def test_consistency(self):
    docs = [corpus[1], corpus[2]]

    # Test that `nfc` yields the same docs as the default scheme.
    model = tfidfmodel.TfidfModel(corpus, smartirs='nfc')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    model = tfidfmodel.TfidfModel(corpus)
    expected_docs = [model[docs[0]], model[docs[1]]]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Testing all the variations of `wlocal`
    # tnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='tnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = docs[:]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # nnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='nnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = docs[:]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # lnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='lnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
        [(5, 2.0), (9, 1.0), (10, 1.0)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # dnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='dnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
        [(5, 2.0), (9, 1.0), (10, 1.0)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # ann
    model = tfidfmodel.TfidfModel(corpus, smartirs='ann')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
        [(5, 1.0), (9, 0.75), (10, 0.75)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # bnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='bnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
        [(5, 1), (9, 1), (10, 1)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Lnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='Lnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
        [(5, 1.4133901052), (9, 0.7066950526), (10, 0.7066950526)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Testing all the variations of `glocal`
    # nxn
    model = tfidfmodel.TfidfModel(corpus, smartirs='nxn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = docs[:]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # nfn
    model = tfidfmodel.TfidfModel(corpus, smartirs='nfn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 3.169925001442312), (4, 3.169925001442312), (5, 1.584962500721156),
         (6, 3.169925001442312), (7, 3.169925001442312), (8, 2.169925001442312)],
        [(5, 3.169925001442312), (9, 3.169925001442312), (10, 3.169925001442312)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # ntn
    model = tfidfmodel.TfidfModel(corpus, smartirs='ntn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 3.321928094887362), (4, 3.321928094887362), (5, 1.736965594166206),
         (6, 3.321928094887362), (7, 3.321928094887362), (8, 2.321928094887362)],
        [(5, 3.473931188332412), (9, 3.321928094887362), (10, 3.321928094887362)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # npn
    model = tfidfmodel.TfidfModel(corpus, smartirs='npn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 3.0), (4, 3.0), (5, 1.0), (6, 3.0), (7, 3.0), (8, 1.8073549220576042)],
        [(5, 2.0), (9, 3.0), (10, 3.0)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Testing all the variations of `normalize`
    # nnx
    model = tfidfmodel.TfidfModel(corpus, smartirs='nnx')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = docs[:]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # nnc
    model = tfidfmodel.TfidfModel(corpus, smartirs='nnc')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 0.4082482905), (4, 0.4082482905), (5, 0.4082482905),
         (6, 0.4082482905), (7, 0.4082482905), (8, 0.4082482905)],
        [(5, 0.81649658092772603), (9, 0.40824829046386302), (10, 0.40824829046386302)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x,
                                  wglobal=lambda x, y: x * x, smartirs='nnc')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x * x,
                                  wglobal=lambda x, y: x, smartirs='nnc')
    expected_docs = [model[docs[0]], model[docs[1]]]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # nnu
    slope = 0.2
    model = tfidfmodel.TfidfModel(corpus, smartirs='nnu', slope=slope)
    transformed_docs = [model[docs[0]], model[docs[1]]]
    average_unique_length = 1.0 * sum(len(set(text)) for text in texts) / len(texts)
    vector_norms = [
        (1.0 - slope) * average_unique_length + slope * 6.0,
        (1.0 - slope) * average_unique_length + slope * 3.0,
    ]
    expected_docs = [
        [(termid, weight / vector_norms[0]) for termid, weight in docs[0]],
        [(termid, weight / vector_norms[1]) for termid, weight in docs[1]],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # nnb
    slope = 0.2
    model = tfidfmodel.TfidfModel(dictionary=dictionary, smartirs='nnb', slope=slope)
    transformed_docs = [model[docs[0]], model[docs[1]]]
    average_character_length = sum(len(word) + 1.0 for text in texts for word in text) / len(texts)
    vector_norms = [
        (1.0 - slope) * average_character_length + slope * 36.0,
        (1.0 - slope) * average_character_length + slope * 25.0,
    ]
    expected_docs = [
        [(termid, weight / vector_norms[0]) for termid, weight in docs[0]],
        [(termid, weight / vector_norms[1]) for termid, weight in docs[1]],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
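# For readability, a summary of the SMART `smartirs` triples exercised above
# (our reading of the scheme; consult the gensim documentation for the
# authoritative table). The three letters select the term frequency, document
# frequency, and normalization variants, respectively:
#   term frequency:  n/t natural, l logarithmic, d double logarithm,
#                    a augmented, b boolean, L log average
#   document freq.:  n/x none, f idf, t zero-corrected idf, p probabilistic idf
#   normalization:   n/x none, c cosine, u pivoted unique, b pivoted character length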
from gensim.utils import to_unicode
from gensim.interfaces import TransformedCorpus
from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
                            ucicorpus, malletcorpus, textcorpus, indexedcorpus, dictionary)
from gensim.models import (tfidfmodel, word2vec, ldamodel)

print('start')
train_set = []
for line in open('articles.txt'):
    items = line.strip().split('\t', 1)
    if len(items) < 2:
        continue
    words = items[1].strip().split(' ')
    train_set.append(words)

print('construct dict')
dic = dictionary.Dictionary(train_set)

print('doc2bow')
corpus = [dic.doc2bow(text) for text in train_set]

print('tfidf')
tfidf = tfidfmodel.TfidfModel(corpus)

print('tfidf corpus')
corpus_tfidf = tfidf[corpus]

print('lda model')
lda = ldamodel.LdaModel(corpus_tfidf, id2word=dic, num_topics=1000,
                        iterations=1300, alpha=0.15, eta=0.01)

print('corpus_lda')
corpus_lda = lda[corpus_tfidf]
lda.save('lda_model')
def test_consistency(self):
    docs = [corpus[1], corpus[2]]

    # Test if `ntc` yields the default docs.
    model = tfidfmodel.TfidfModel(corpus, smartirs='ntc')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    model = tfidfmodel.TfidfModel(corpus)
    expected_docs = [model[docs[0]], model[docs[1]]]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Testing all the variations of `wlocal`
    # nnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='nnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = docs[:]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # lnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='lnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
        [(5, 2.0), (9, 1.0), (10, 1.0)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # ann
    model = tfidfmodel.TfidfModel(corpus, smartirs='ann')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
        [(5, 1.0), (9, 0.75), (10, 0.75)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # bnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='bnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
        [(5, 1), (9, 1), (10, 1)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Lnn
    model = tfidfmodel.TfidfModel(corpus, smartirs='Lnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
        [(5, 1.4133901052), (9, 0.7066950526), (10, 0.7066950526)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Testing all the variations of `glocal`
    # ntn
    model = tfidfmodel.TfidfModel(corpus, smartirs='ntn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 3.169925001442312), (4, 3.169925001442312), (5, 1.584962500721156),
         (6, 3.169925001442312), (7, 3.169925001442312), (8, 2.169925001442312)],
        [(5, 3.169925001442312), (9, 3.169925001442312), (10, 3.169925001442312)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # npn
    model = tfidfmodel.TfidfModel(corpus, smartirs='npn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 3.0), (4, 3.0), (5, 1.0), (6, 3.0), (7, 3.0), (8, 1.8073549220576042)],
        [(5, 2.0), (9, 3.0), (10, 3.0)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Testing all the variations of `normalize`
    # nnc
    model = tfidfmodel.TfidfModel(corpus, smartirs='nnc')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 0.4082482905), (4, 0.4082482905), (5, 0.4082482905),
         (6, 0.4082482905), (7, 0.4082482905), (8, 0.4082482905)],
        [(5, 0.81649658092772603), (9, 0.40824829046386302), (10, 0.40824829046386302)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x,
                                  wglobal=lambda x, y: x * x, smartirs='nnc')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x * x,
                                  wglobal=lambda x, y: x, smartirs='nnc')
    expected_docs = [model[docs[0]], model[docs[1]]]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
]
once_ids = [
    tokenid for tokenid, docfreq in dictionary.dfs.items()
    if docfreq < lower or docfreq > upper
]
dictionary.filter_tokens(stop_ids)  # remove stop words and words that appear only once
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print("dictionary size after reduction: %d" % len(dictionary))

## PROJECTION
texts = [[word for word in doc.split(" ")] for doc in reviews]
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = tf.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]
vecteurs = mtutils.corpus2csc(tfidf_corpus, num_terms=len(dictionary), num_docs=len(labels))
vecteurs = vecteurs.T

res = feat.chi2(vecteurs, labels)
ids = [index for index in np.where(res == min(res[0]))[1]]
dictionary.filter_tokens(ids)
dictionary.compactify()
print("dictionary size after chi-squared reduction: %d" % len(dictionary))

## PROJECTION
texts = [[word for word in doc.split(" ")] for doc in reviews]
def getTfidfLsiSims(corpus, confId, confIdtoIndex, dictionary, outputDir):
    print("Using gensim to get TFIDF vector and LSI vector for conferences in corpus")

    # tfidf
    tfidf = tfidfmodel.TfidfModel(corpus)  # initialize a tfidf transformation for corpus
    corpus_tfidf = tfidf[corpus]  # get tfidf vectors

    # lsi
    lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary,
                            num_topics=4)  # initialize an LSI transformation for corpus, with number of topics = 4
    corpus_lsi = lsi[corpus_tfidf]

    ####### not important, just printing
    print("Printing TF-IDF vectors in " + outputDir + '/conffTFIDF.txt')
    fTFIDFFile = open(outputDir + '/conffTFIDF.txt', 'w')
    j = 0
    for doc in corpus_tfidf:
        print(confId[j], doc, file=fTFIDFFile)
        j = j + 1
        if j % 100 == 0:
            print(j)
    tfidf.save(outputDir + '/conftfidf.mod')

    # print("length of corpus is", len(corpus))
    printvectors = False
    if printvectors:
        i = 0
        for doc in corpus_tfidf:
            print("tfidf doc", confId[i], doc)
            i += 1
        i = 0
        for doc in corpus_lsi:
            print("lsi doc", confId[i], doc)
            i += 1
    ####### not important

    # compute similarity of corpus against itself
    listofMethods = ['corpus_lsi', 'corpus_tfidf']
    for method in listofMethods:
        if method == 'corpus_lsi':
            cor = corpus_lsi
        elif method == 'corpus_tfidf':
            cor = corpus_tfidf
        index = similarities.MatrixSimilarity(cor)

        confSims = dict()
        confSimsDict = dict()  # dictionary of [confId1][confId2]
        j = 0
        sims = []
        for vec_tfidf in cor:
            sims = index[vec_tfidf]
            sims = sorted(enumerate(sims), key=lambda item: -item[1])
            confSims[confId[j]] = sims  # this line is not really needed
            confSimsDict[j] = dict(sims)
            # print("index: ", confIdtoIndex[confId[j]], "confId: ", confId[j], confSims[confId[j]])
            j += 1

        if method == 'corpus_lsi':
            cslsi = dict()
            for c1index in confSimsDict.keys():
                cslsi[confId[c1index]] = dict()
                for c2index in confSimsDict.keys():
                    cslsi[confId[c1index]][confId[c2index]] = confSimsDict[c1index][c2index]
        elif method == 'corpus_tfidf':
            cstfidf = dict()
            for c1index in confSimsDict.keys():
                cstfidf[confId[c1index]] = dict()
                for c2index in confSimsDict.keys():
                    cstfidf[confId[c1index]][confId[c2index]] = confSimsDict[c1index][c2index]

    return cstfidf, cslsi
def TestConsistency(self):
    docs = [corpus[1], corpus[2]]

    # Test if `ntc` yields the default docs.
    model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntc')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    model = tfidfmodel.TfidfModel(self.corpus)
    expected_docs = [model[docs[0]], model[docs[1]]]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Testing all the variations of `wlocal`
    # nnn
    model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
        [(5, 6), (9, 3), (10, 3)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # lnn
    model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
        [(5, 6.0), (9, 3.0), (10, 3.0)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # ann
    model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
        [(5, 3.0), (9, 2.25), (10, 2.25)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # bnn
    model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
        [(5, 3), (9, 3), (10, 3)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Lnn
    model = tfidfmodel.TfidfModel(self.corpus, smartirs='Lnn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1.4635792826230198), (4, 1.4635792826230198), (5, 2.19536892393453),
         (6, 1.4635792826230198), (7, 2.19536892393453), (8, 1.4635792826230198)],
        [(5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Testing all the variations of `glocal`
    # ntn
    model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563),
         (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)],
        [(5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # npn
    model = tfidfmodel.TfidfModel(self.corpus, smartirs='npn')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0),
         (6, 1.8073549220576042), (7, 1.0), (8, 1.8073549220576042)],
        [(5, 2.0), (9, 1.0), (10, 1.0)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    # Testing all the variations of `normalize`
    # nnc
    model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnc')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    expected_docs = [
        [(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646),
         (6, 0.34299717028501764), (7, 0.51449575542752646), (8, 0.34299717028501764)],
        [(5, 0.81649658092772603), (9, 0.40824829046386302), (10, 0.40824829046386302)],
    ]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

    model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x,
                                  wglobal=lambda x, y: x * x, smartirs='nnc')
    transformed_docs = [model[docs[0]], model[docs[1]]]
    model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x * x,
                                  wglobal=lambda x, y: x, smartirs='nnc')
    expected_docs = [model[docs[0]], model[docs[1]]]

    self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
    self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
def tfidf_bow(corpus_bow):
    tfidf = tfidfmodel.TfidfModel(corpus_bow)
    corpus_tfidf = tfidf[corpus_bow]
    return corpus_tfidf
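# Hypothetical end-to-end usage of `tfidf_bow` (the raw texts and the
# Dictionary step are assumptions for illustration):
from gensim.corpora import Dictionary
from gensim.models import tfidfmodel

texts = [['human', 'computer', 'interaction'], ['graph', 'minors', 'survey']]
dct = Dictionary(texts)
corpus_bow = [dct.doc2bow(text) for text in texts]
for doc in tfidf_bow(corpus_bow):
    print(doc)  # each doc is a list of (term id, tf-idf weight) pairs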