Ejemplo n.º 1
0
def get_tp():
    """Compute the per-document topic distribution for the ~16000 articles
    and pickle the resulting list of {topic_id: probability} dicts for reuse.

    Relies on module-level imports/objects: gensim, stopword, pickle.
    """
    topic_list = []
    diction = gensim.corpora.Dictionary.load("citeulike.dict")
    batch_lda = gensim.models.LdaModel.load("online_lda_100.lda")
    # One article per line in the raw corpus file; close the handle promptly.
    with open(r"D:\citeulike\dic\temp.dat", "r") as src:
        content = src.readlines()
    for line in content:
        text_list = stopword.get_txt(line)  # tokenize + strip stopwords
        doc_bow = diction.doc2bow(text_list)
        doc_lda = batch_lda[doc_bow]
        # Store as a dict so probabilities can be looked up by topic id.
        topic_list.append(dict(doc_lda))
    # pickle requires a binary-mode handle; text mode "w" breaks on
    # Python 3 and mangles bytes on Windows.
    with open(r"D:\citeulike\dic\all_onlinetopic_100.dump", "wb") as dst:
        pickle.dump(topic_list, dst)
Ejemplo n.º 2
0
def get_tp():
    """Build and persist the topic distribution of every article.

    For each of the ~16000 articles, infer its LDA topic distribution and
    append it as a {topic_id: probability} dict; pickle the full list so
    later stages can load it instead of re-running inference.

    Relies on module-level imports/objects: gensim, stopword, pickle.
    """
    topic_list = []
    diction = gensim.corpora.Dictionary.load('citeulike.dict')
    batch_lda = gensim.models.LdaModel.load('online_lda_100.lda')
    # Read the corpus (one article per line) and release the handle.
    with open(r'D:\citeulike\dic\temp.dat', 'r') as corpus_file:
        content = corpus_file.readlines()
    for article in content:
        tokens = stopword.get_txt(article)  # tokenize + strip stopwords
        bow = diction.doc2bow(tokens)
        # dict form makes per-topic probability lookups cheap later on.
        topic_list.append(dict(batch_lda[bow]))
    # Binary mode is mandatory for pickle output ('w' fails on Python 3).
    with open(r'D:\citeulike\dic\all_onlinetopic_100.dump', 'wb') as out:
        pickle.dump(topic_list, out)
Ejemplo n.º 3
0
def get_topic(texts, lda):
    """Return the input article's dominant topics expanded with similar topics.

    Infers the LDA topic distribution of *texts*, keeps the topics whose
    probability exceeds 0.1 (renormalized), and for each of them pulls in the
    most similar topics from a precomputed similarity matrix.

    Args:
        texts: input passed to input_article() to obtain the article text.
        lda: LDA model file name; its stem selects the similarity-matrix dump.

    Returns:
        {article_topic_id: {similar_topic_id: similarity, ...}, ...}
        The article's own topic is included with similarity 1.0 (or +1.0 if
        it already appeared among its own neighbors).
    """
    dict_sim1 = {}
    # Topic-similarity matrix; pickle needs a binary-mode handle and the
    # original leaked the file object.
    with open(r'D:\citeulike\dic\topic_' + lda[:-4] + '.dump', 'rb') as fh:
        sim = pickle.load(fh)
    diction = gensim.corpora.Dictionary.load('citeulike.dict')
    batch_lda = gensim.models.LdaModel.load(r'D:\citeulike\dic' + '\\' + lda)
    text = input_article(texts)
    text_list = stopword.get_txt(text)
    doc_bow = diction.doc2bow(text_list)
    doc_lda = batch_lda[doc_bow]
    # Tuples are immutable; convert to lists so probabilities can be edited.
    tp_batch = [list(pair) for pair in doc_lda]
    # Renormalize over the dominant topics (probability > 0.1).
    sum0 = 0
    t = 0  # count of dominant topics
    for _, prob in tp_batch:
        if prob > 0.1:
            t += 1
            sum0 += prob
    for entry in tp_batch:
        if entry[1] > 0.1:
            entry[1] = entry[1] / sum0
    for tp_id, rate in tp_batch:
        if rate > 0.1:
            dict_sim2 = {}
            # How many similar topics to pull in for this dominant topic.
            wordsNum = int(round((10 - t) * rate))
            # Highest similarities first; index 0 is the topic itself, skip it.
            sim_1 = sorted(sim[tp_id], reverse=True)
            sim_2 = sim_1[1:wordsNum + 1]
            for value in sim_2:
                # list.index maps a similarity value back to its topic id
                dict_sim2[sim[tp_id].index(value)] = value
            # Include the article's own topic with similarity 1.0.
            if tp_id in dict_sim2:
                dict_sim2[tp_id] += 1.0
            else:
                dict_sim2[tp_id] = 1.0
            dict_sim1[tp_id] = dict_sim2
    return dict_sim1
Ejemplo n.º 4
0
def get_topic(texts, lda):
    """Expand the input article's topic distribution with similar topics.

    Keeps topics with inferred probability above 0.1 (renormalized among
    themselves) and, per kept topic, looks up its nearest neighbors in a
    precomputed topic-similarity matrix.

    Args:
        texts: input forwarded to input_article() to get the article text.
        lda: LDA model file name; its stem also names the similarity dump.

    Returns:
        {article_topic_id: {similar_topic_id: similarity, ...}, ...}
        with the article's own topic included at similarity 1.0.
    """
    expanded = {}
    # pickle requires binary mode; the original also leaked this handle.
    with open(r'D:\citeulike\dic\topic_' + lda[:-4] + '.dump', 'rb') as dump:
        sim = pickle.load(dump)
    diction = gensim.corpora.Dictionary.load('citeulike.dict')
    batch_lda = gensim.models.LdaModel.load(r'D:\citeulike\dic' + '\\' + lda)
    tokens = stopword.get_txt(input_article(texts))
    doc_lda = batch_lda[diction.doc2bow(tokens)]
    # Mutable copies of the (topic_id, probability) pairs.
    pairs = [list(p) for p in doc_lda]
    # Renormalize probability mass across dominant topics (> 0.1).
    dominant_count = 0
    mass = 0
    for _, prob in pairs:
        if prob > 0.1:
            dominant_count += 1
            mass += prob
    for pair in pairs:
        if pair[1] > 0.1:
            pair[1] = pair[1] / mass
    for topic_id, weight in pairs:
        if weight > 0.1:
            neighbors = {}
            # Neighbor budget grows with the topic's normalized weight.
            budget = int(round((10 - dominant_count) * weight))
            # Sorted descending; position 0 is the topic itself, so skip it.
            ranked = sorted(sim[topic_id], reverse=True)
            for value in ranked[1:budget + 1]:
                # Recover the neighbor's topic id from its similarity value.
                neighbors[sim[topic_id].index(value)] = value
            # Always include the article's own topic at similarity 1.0.
            if topic_id in neighbors:
                neighbors[topic_id] += 1.0
            else:
                neighbors[topic_id] = 1.0
            expanded[topic_id] = neighbors
    return expanded
Ejemplo n.º 5
0
def word(page):
    """Score recommended article *page* against the given topic set.

    For each inferred topic of the article that is also in the global
    `topics` set, sum the probabilities of the article's distinct words
    under that topic, weight by the topic's probability, and normalize by
    the number of distinct words (so long articles don't dominate).

    Relies on module-level globals: txt, diction, batch_lda, topics,
    stopword.
    """
    topic_rate_new = 0
    content_list = stopword.get_txt(txt[page])
    doc_bow = diction.doc2bow(content_list)
    doc_lda = batch_lda[doc_bow]
    unique_words = set(content_list)  # hoisted: identical every iteration
    for tp_id, tp_prob in doc_lda:
        if tp_id in topics:
            # [(word, prob), ...] -> {word: prob} for O(1) lookups.
            dic = dict(batch_lda.show_topic(tp_id, 500000))
            # .get guards against article words missing from the topic's
            # word list (the original raised KeyError there).
            word_rate = sum(dic.get(w, 0) for w in unique_words)
            topic_rate_new += tp_prob * word_rate / len(unique_words)
    return topic_rate_new
    '''
Ejemplo n.º 6
0
def word(page):
    """Compute the topic-weighted word-probability score of article *page*.

    Only topics present in the global `topics` set contribute. Each
    contribution is topic_probability * (sum of word probabilities of the
    article's distinct words under the topic) / (distinct word count), the
    division preventing long articles from dominating the score.

    Relies on module-level globals: txt, diction, batch_lda, topics,
    stopword.
    """
    score = 0
    tokens = stopword.get_txt(txt[page])
    doc_lda = batch_lda[diction.doc2bow(tokens)]
    vocab = set(tokens)  # computed once; the original rebuilt it per topic
    for topic_id, topic_prob in doc_lda:
        if topic_id not in topics:
            continue
        # show_topic returns [(word, prob), ...]; dict it for fast lookup.
        word_probs = dict(batch_lda.show_topic(topic_id, 500000))
        # Default 0 for words outside the topic's word list (the original
        # indexed directly and could raise KeyError).
        total = sum(word_probs.get(w, 0) for w in vocab)
        score += topic_prob * total / len(vocab)
    return score
    '''
Ejemplo n.º 7
0
def get_tp(nom):
    """Rank every article in the corpus against article *nom*'s topics.

    Pipeline: infer *nom*'s topic distribution, keep and renormalize the
    dominant topics (> 0.1), expand each with its nearest neighbors from the
    topic-similarity matrix, then score each corpus article by how much of
    its own topic probability falls on that expanded topic set, weighted by
    a novelty factor.

    Relies on module-level globals: content, diction, batch_lda, sim,
    stopword, rank_topic, pickle.

    Returns:
        List of similarity scores, one per article, in corpus order.
    """
    text_list = stopword.get_txt(content[nom])
    doc_bow = diction.doc2bow(text_list)
    doc_lda = batch_lda[doc_bow]
    # Tuples are immutable; convert to lists so probabilities can be edited.
    tp_batch = [list(pair) for pair in doc_lda]
    # Renormalize over the dominant topics (probability > 0.1).
    dict_sim1 = {}
    sum0 = 0
    t = 0  # count of dominant topics
    for _, prob in tp_batch:
        if prob > 0.1:
            t += 1
            sum0 += prob
    for entry in tp_batch:
        if entry[1] > 0.1:
            entry[1] = entry[1] / sum0
    for tp_id, rate in tp_batch:
        if rate > 0.1:
            dict_sim2 = {}
            # Neighbor budget grows with the topic's normalized weight.
            wordsNum = int(round((10 - t) * rate))
            # Highest similarities first; index 0 is the topic itself, skip it.
            sim_1 = sorted(sim[tp_id], reverse=True)
            sim_2 = sim_1[1:wordsNum + 1]
            for value in sim_2:
                # list.index maps a similarity value back to its topic id
                dict_sim2[sim[tp_id].index(value)] = value
            dict_sim1[tp_id] = dict_sim2
    # Flatten the expansion to the unique set of topic ids to match against.
    tp_list = []
    for neighbors in dict_sim1.values():
        for topic_id in neighbors.keys():
            tp_list.append(topic_id)
    tp_list2 = list(set(tp_list))

    # Per-article topic distributions; pickle needs a binary-mode handle
    # (the original opened in text mode and leaked the file object).
    with open(r'D:\citeulike\dic\all_online_lda_100.dump', 'rb') as fh:
        tp = pickle.load(fh)
    # Score each article: sum its probabilities on matching topics, weight
    # by novelty, and divide by the match count so articles touching many
    # topics are not automatically favored.
    rank_list = []
    for m in range(len(tp)):
        rate2 = 0
        t = 0
        nov_rate = rank_topic.seen('1', m)  # novelty weight for article m
        for n in tp_list2:
            if n in tp[m].keys():
                rate = tp[m][n]
                t = t + 1
            else:
                rate = 0
            rate2 += rate
        if t != 0:
            sum_rate = rate2 * nov_rate / t
        else:
            sum_rate = rate2 * nov_rate
        rank_list.append(sum_rate)  # scores in corpus order
    return rank_list