import pickle

import gensim

import stopword  # local module providing the stop-word filtering helper get_txt()


def get_tp():
    # Compute the topic probability distribution of each of the 16,000
    # articles and save the list so it can be reloaded later.
    topic_list = []
    diction = gensim.corpora.Dictionary.load("citeulike.dict")
    batch_lda = gensim.models.LdaModel.load("online_lda_100.lda")
    content = open(r"D:\citeulike\dic\temp.dat", "r").readlines()
    for i in content:
        text_list = stopword.get_txt(i)
        doc_bow = diction.doc2bow(text_list)
        doc_lda = batch_lda[doc_bow]
        topic_list.append(dict(doc_lda))  # store as a dict for easy lookup
    # Pickle files must be opened in binary mode ("wb", not "w").
    pickle.dump(topic_list, open(r"D:\citeulike\dic\all_onlinetopic_100.dump", "wb"))
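# Usage sketch (hypothetical; assumes get_tp() above has already written the
# dump file): reload the saved per-article topic distributions and inspect one.
if __name__ == "__main__":
    all_topics = pickle.load(open(r"D:\citeulike\dic\all_onlinetopic_100.dump", "rb"))
    print(all_topics[0])  # e.g. {3: 0.42, 17: 0.31, ...} -- topic id -> probability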
def get_topic(texts, lda):
    # Get the topic probability distribution of the input article and
    # expand each strong topic with its most similar topics.
    dict_sim1 = {}
    sim = pickle.load(open(r'D:\citeulike\dic\topic_' + lda[:-4] + '.dump', 'rb'))
    diction = gensim.corpora.Dictionary.load('citeulike.dict')
    batch_lda = gensim.models.LdaModel.load(r'D:\citeulike\dic' + '\\' + lda)
    text = input_article(texts)
    text_list = stopword.get_txt(text)
    doc_bow = diction.doc2bow(text_list)
    doc_lda = batch_lda[doc_bow]
    # Tuples are immutable, so convert each (topic_id, prob) pair to a list.
    tp_batch = []
    for pair in doc_lda:
        tp_batch.append(list(pair))
    # Normalize the probabilities of the topics above the 0.1 threshold.
    sum0 = 0
    t = 0
    for m in range(len(doc_lda)):
        if tp_batch[m][1] > 0.1:
            t += 1
            sum0 += tp_batch[m][1]
    for n in range(len(doc_lda)):
        if tp_batch[n][1] > 0.1:
            tp_batch[n][1] = tp_batch[n][1] / sum0
    for (tp_id, rate) in tp_batch:
        if rate > 0.1:
            dict_sim2 = {}
            wordsNum = int(round((10 - t) * rate))
            # Take the wordsNum most similar topics from the similarity
            # matrix, skipping the topic's similarity to itself.
            sim_1 = sorted(sim[tp_id], reverse=True)
            sim_2 = sim_1[1:wordsNum + 1]
            for i in sim_2:
                # index() recovers the topic id of each similarity value
                dict_sim2[sim[tp_id].index(i)] = i
            # Include the article's own topic, boosted by 1.0.
            if tp_id in dict_sim2:
                dict_sim2[tp_id] += 1.0
            else:
                dict_sim2[tp_id] = 1.0
            dict_sim1[tp_id] = dict_sim2
    # Returned form: {article topic i: {similar topic j: similarity, ...}, ...}
    return dict_sim1
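# get_topic() assumes a precomputed topic-topic similarity matrix `sim`, stored
# as a list of lists where sim[tp_id] holds topic tp_id's similarity to every
# other topic. A minimal sketch of how such a matrix could be built (a
# hypothetical helper, not part of the original pipeline): cosine similarity
# between the topic-word rows of the trained model, via get_topics() from
# newer gensim versions.
def build_topic_sim(lda_path):
    import numpy as np
    lda = gensim.models.LdaModel.load(lda_path)
    topics = lda.get_topics()  # shape: (num_topics, vocab_size)
    norms = np.linalg.norm(topics, axis=1)
    cos = topics.dot(topics.T) / np.outer(norms, norms)
    return cos.tolist()  # list of lists, matching how get_topic() indexes sim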
def word(page):
    # Sum the word probabilities of the recommended article `page` under the
    # given topics (txt, diction, batch_lda and topics are module globals).
    topic_rate_new = 0
    content_list = stopword.get_txt(txt[page])
    doc_bow = diction.doc2bow(content_list)
    doc_lda = batch_lda[doc_bow]
    for j in range(len(doc_lda)):
        word_rate = 0
        tp_id = doc_lda[j][0]
        if tp_id in topics:
            # show_topic() returns [(word, prob), ...]; convert to a dict.
            word_list = batch_lda.show_topic(tp_id, 500000)
            dic = dict(word_list)
            for w in set(content_list):
                # .get() guards against words missing from the topic's list
                word_rate += dic.get(w, 0.0)
            # Weight by the topic's probability and divide by the number of
            # distinct words, so word-rich articles don't dominate.
            topic_rate = doc_lda[j][1] * word_rate / len(set(content_list))
            topic_rate_new += topic_rate
    return topic_rate_new
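# Worked example of word()'s scoring formula with hypothetical numbers: one
# matching topic with probability 0.6 over an article with 4 distinct words
# whose probabilities under that topic are 0.08, 0.05, 0.02 and 0.01.
if __name__ == '__main__':
    topic_prob = 0.6
    word_probs = [0.08, 0.05, 0.02, 0.01]
    score = topic_prob * sum(word_probs) / len(word_probs)
    print(score)  # 0.6 * 0.16 / 4 = 0.024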
def get_tp(nom):
    # Build the expanded topic set for article `nom` (same expansion as
    # get_topic() above), then score every article against it.
    text_list = stopword.get_txt(content[nom])
    doc_bow = diction.doc2bow(text_list)
    doc_lda = batch_lda[doc_bow]
    tp_batch = []
    for pair in doc_lda:
        tp_batch.append(list(pair))
    # Normalize the probabilities of the topics above the 0.1 threshold.
    dict_sim1 = {}
    sum0 = 0
    t = 0
    for m in range(len(doc_lda)):
        if tp_batch[m][1] > 0.1:
            t += 1
            sum0 += tp_batch[m][1]
    for n in range(len(doc_lda)):
        if tp_batch[n][1] > 0.1:
            tp_batch[n][1] = tp_batch[n][1] / sum0
    for (tp_id, rate) in tp_batch:
        if rate > 0.1:
            dict_sim2 = {}
            wordsNum = int(round((10 - t) * rate))
            # Take the wordsNum most similar topics from the similarity matrix.
            sim_1 = sorted(sim[tp_id], reverse=True)
            sim_2 = sim_1[1:wordsNum + 1]
            for i in sim_2:
                # index() recovers the topic id of each similarity value
                dict_sim2[sim[tp_id].index(i)] = i
            '''
            if tp_id in dict_sim2:
                dict_sim2[tp_id] += 1.0
            else:
                dict_sim2[tp_id] = 1.0  # include the article's own topic with weight 1.0
            '''
            dict_sim1[tp_id] = dict_sim2
    # Collect the distinct topic ids of the expanded set.
    tp_list = []
    for i in dict_sim1.values():
        for j in i.keys():
            tp_list.append(j)
    tp_list2 = list(set(tp_list))
    tp = pickle.load(open(r'D:\citeulike\dic\all_online_lda_100.dump', 'rb'))
    # (formerly def to_article(rank)) Walk every article and compute its
    # relevance to the expanded topic set.
    rank_list = []
    for m in range(len(tp)):
        rate2 = 0
        t = 0
        nov_rate = rank_topic.seen('1', m)  # novelty weight for article m
        for n in tp_list2:
            if n in tp[m]:
                # accumulate the article's probability on each matched topic
                rate = tp[m][n]
                t = t + 1
            else:
                rate = 0
            rate2 += rate
        if t != 0:
            # Weight by novelty and divide by the number of matched topics,
            # so topic-rich articles don't automatically score higher.
            sum_rate = rate2 * nov_rate / t
        else:
            sum_rate = rate2 * nov_rate
        rank_list.append(sum_rate)  # similarity scores in article order
    return rank_list
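# Usage sketch (hypothetical; assumes the module globals content, diction,
# batch_lda, sim and rank_topic are initialized): turn the scores returned by
# get_tp(nom) into a top-10 list of article indices, highest score first.
if __name__ == '__main__':
    rank_list = get_tp(0)
    top10 = sorted(range(len(rank_list)), key=lambda m: rank_list[m], reverse=True)[:10]
    print(top10)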