def to_article(rank, lda):
    """Score every article against a ranked topic list.

    Parameters
    ----------
    rank : list
        Topic ids sorted by relevance, most relevant first.
        NOTE(review): ``value`` below provides only 10 position weights,
        so this assumes ``len(rank) <= 10`` -- confirm with callers.
    lda : str
        LDA dump file name; its last 4 characters (the extension) are
        stripped to build the pickle path.

    Returns
    -------
    list of float
        One similarity score per article, in article order.
    """
    # Positional weight for each ranked topic: 1.0, 0.9, ..., 0.1.
    value = np.arange(1, 0, -0.1)
    # FIX: open the pickle in binary mode and close the handle -- text
    # mode ("r") breaks pickle.load on Python 3 and the original leaked
    # the file object.
    with open(r"D:\citeulike\dic\all_" + lda[:-4] + ".dump", "rb") as fh:
        tp = pickle.load(fh)  # per-article {topic_id: probability} dicts

    # FIX: precompute topic -> position once instead of calling
    # rank.index(n) inside the double loop (O(len(rank)) per lookup).
    rank_pos = {topic: pos for pos, topic in enumerate(rank)}

    rank_list = []
    for m in range(len(tp)):
        nov_rate = rank_topic.seen("1", m)  # novelty weight for article m
        rate2 = 0.0
        t = 0  # how many ranked topics appear in article m
        for n in rank:
            if n in tp[m]:
                # Weighted product of rank position score and the
                # article's probability for that topic.
                rate2 += value[rank_pos[n]] * tp[m][n]
                t += 1
        # Divide by the number of matched topics so articles are not
        # favoured merely for covering many ranked topics; when t == 0,
        # rate2 is also 0, so the score is 0 either way.
        sum_rate = rate2 * nov_rate / t if t else rate2 * nov_rate
        rank_list.append(sum_rate)  # similarity values in article order
    return rank_list
def to_article(rank,lda): #rank: ranked topic list
    """Score every article against the ranked topic list.

    NOTE(review): this is a verbatim duplicate of the ``to_article``
    defined immediately above; being later in the module, this
    definition shadows the first. One of the two should be removed.

    :param rank: topic ids sorted by relevance, most relevant first
                 (assumed to hold at most 10 topics -- ``value`` below
                 only supplies 10 weights; TODO confirm).
    :param lda: dump file name; its last 4 chars (extension) are
                stripped to build the pickle path.
    :return: one similarity score per article, in article order.
    """
    value = np.arange(1,0,-0.1) # positional score per ranked topic: 1.0 .. 0.1
    tp = pickle.load(open(r'D:\citeulike\dic\all_'+lda[:-4]+'.dump', 'r'))
    # Walk every article; score it by the ranked topics it contains.
    rank_list = []
    for m in range(len(tp)):
        rate2 = 0
        t = 0
        nov_rate = rank_topic.seen('1', m) # multiply in the novelty weight
        for n in rank:
            if n in tp[m].keys():
                j = rank.index(n)
                # Product of the rank-position score and the article's
                # probability for this topic.
                rate = value[j] * tp[m][n]
                t = t +1
            else:
                rate = 0
            rate2 += rate
        if t != 0:
            # Weight by novelty and divide by the number of matched
            # topics, so articles with many topics don't dominate.
            sum_rate = rate2*nov_rate / t
        else:
            sum_rate =rate2*nov_rate
        rank_list.append(sum_rate) # similarity values in article order
    return rank_list
def get_tp(nom):
    """Expand document ``nom``'s topics with similar topics, then score
    every article by how much of that expanded topic set it covers.

    :param nom: key/index into the module-level ``content`` mapping.
    :return: one similarity score per article, in article order.
    """
    # Infer the document's topic distribution from its bag of words.
    text_list = stopword.get_txt(content[nom])
    doc_bow = diction.doc2bow(text_list)
    doc_lda = batch_lda[doc_bow]
    tp_batch = [list(pair) for pair in doc_lda]

    # Normalise the probabilities of topics above the 0.1 threshold so
    # the significant topics sum to 1.
    sum0 = 0
    t = 0  # number of significant (> 0.1) topics
    for m in range(len(tp_batch)):
        if tp_batch[m][1] > 0.1:
            t += 1
            sum0 += tp_batch[m][1]
    for n in range(len(tp_batch)):
        if tp_batch[n][1] > 0.1:
            tp_batch[n][1] = tp_batch[n][1] / sum0

    # For each significant topic, pull its most similar topics from the
    # similarity matrix; more probable topics contribute more neighbours.
    dict_sim1 = {}
    for tp_id, rate in tp_batch:
        if rate > 0.1:
            dict_sim2 = {}
            wordsNum = int(round((10 - t) * rate))
            row = sim[tp_id]
            # FIX: sort *indices* by similarity instead of sorting the
            # values and mapping each back with list.index(value) --
            # the original returned the same (first) index for tied
            # similarity values, silently dropping neighbours.
            # [1:] skips the top entry (the topic's self-similarity).
            order = sorted(range(len(row)), key=lambda k: row[k], reverse=True)
            for idx in order[1:wordsNum + 1]:
                dict_sim2[idx] = row[idx]
            dict_sim1[tp_id] = dict_sim2

    # Union of all neighbour topic ids across the significant topics.
    tp_list2 = list({j for neigh in dict_sim1.values() for j in neigh})

    # FIX: binary mode + context manager; text mode ("r") breaks
    # pickle.load on Python 3 and the original leaked the file handle.
    with open(r'D:\citeulike\dic\all_online_lda_100.dump', 'rb') as fh:
        tp = pickle.load(fh)  # per-article {topic_id: probability} dicts

    # Score every article by the expanded topic set (unweighted sum of
    # matching topic probabilities, averaged over matches).
    rank_list = []
    for m in range(len(tp)):
        rate2 = 0
        matched = 0  # FIX: fresh name -- the original reused `t` here
        nov_rate = rank_topic.seen('1', m)  # multiply in the novelty weight
        for n in tp_list2:
            if n in tp[m]:
                rate2 += tp[m][n]
                matched += 1
        # Average over matched topics so breadth alone is not rewarded;
        # when matched == 0, rate2 is also 0.
        if matched != 0:
            sum_rate = rate2 * nov_rate / matched
        else:
            sum_rate = rate2 * nov_rate
        rank_list.append(sum_rate)  # similarity values in article order
    return rank_list