Example #1
def clusterResult(doc=None, num=5, sim_num=3):
    # doc is the raw text (already preprocessed and tokenized)
    # compute tf-idf for the text, then expand it with w2v
    # plan: take the top 5 keywords and expand each with 3 similar words
    # num     -- number of keywords
    # sim_num -- number of expansion words per keyword
    if doc is None:  # avoid the mutable-default-argument pitfall
        doc = {}
    model = w2v.load_model_binary(r"")
    doc = tfidf.expend_word(model, doc, num, sim_num)

    # return the corresponding clustering result
    matrix = []
    ldaa.doc = doc
    # build the LDA model and the bag-of-words corpus
    # (k1 and k2 below are module-level constants defined elsewhere in the source file)
    print("building topic model")
    lda, corpus = ldaa.ldaAdapter(k1)
    # each row of lda.inference(corpus)[0] is one document's topic distribution
    for values in lda.inference(corpus)[0]:
        matrix.append(list(values))

    # feed the matrix into kMeans for clustering
    print("starting kMeans clustering")
    estimator = kmn.kMeansByFeature(k2, matrix)

    labels = list(estimator.labels_)
    return labels
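

# A minimal, self-contained sketch of the same pipeline step (document-topic
# matrix -> kMeans) using gensim and scikit-learn in place of the project's
# ldaa/kmn wrappers; all names and data below are illustrative assumptions,
# not the original project's API.
from gensim import corpora, models
from sklearn.cluster import KMeans
import numpy as np

texts = [["cat", "dog", "pet"], ["stock", "market", "trade"], ["dog", "pet", "food"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]

lda_model = models.LdaModel(bow_corpus, num_topics=2, id2word=dictionary)
doc_topic = lda_model.inference(bow_corpus)[0]  # unnormalized document-topic matrix
print(list(KMeans(n_clusters=2, n_init=10).fit(np.array(doc_topic)).labels_))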
Example #2
def clusterResult(doc=None, num=3, save_path=r"", result_save=False):
    # process the documents and return cluster labels
    if doc is None:  # avoid the mutable-default-argument pitfall
        doc = {}
    word_list, weight = tfidf.cal_tf_idf(doc)
    model = w2v.load_model_binary(save_path)
    print("generating vec_list")
    vec_list = w2v.wordlist_to_vec(model, word_list)  # 2-D matrix: one vector per word

    print("完成生成vec_list,计算doc_vec")
    # 每个文档进行处理,生成向量
    doc_vec = []
    t = 0
    for doc_weight in weight:
        if t % 10 == 0:
            print("turn", t)
        t += 1
        doc_vec_list = []
        for i in range(len(doc_weight)):
            mul = doc_weight[i]
            doc_vec_list.append([x * mul for x in vec_list[i]])

        simple_vec = np.array([0.0] * len(doc_vec_list[0]))
        for vec in doc_vec_list:
            # accumulate with numpy vector addition
            simple_vec += np.array(vec)
        simple_vec = list(simple_vec)
        doc_vec.append(simple_vec)

    if result_save:
        save_dv(doc_vec)

    # run kMeans clustering
    estimator = kmn.kMeansByFeature(num, doc_vec)
    labels = list(estimator.labels_)

    return labels
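

# The weighted-sum loops above amount to a single matrix product: each document
# vector is its tf-idf weight row times the word-vector matrix. A sketch with
# hypothetical shapes (the real weight and vec_list come from the project's
# tfidf/w2v helpers):
import numpy as np

weight = np.array([[0.2, 0.8, 0.0],   # tf-idf weights: one row per document
                   [0.5, 0.0, 0.5]])
vec_list = np.random.rand(3, 4)       # word vectors: one row per vocabulary word
doc_vec = weight @ vec_list
print(doc_vec.shape)  # (2, 4): one vector per document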
Example #3
def calWordSimilarity(voc_filename, model_path):
    # build a word-similarity matrix from w2v and a vocabulary file; expected to be very slow
    voc_path = os.path.abspath(rootPath + "resource\\" + voc_filename)
    voc_list = []
    # read the vocabulary file, one word per line
    try:
        with open(voc_path, "r") as pf:
            for line in pf.readlines():
                voc_list.append(line.strip())
    except IOError:
        print("failed to read the vocabulary file")
    print("vocabulary loaded")

    # as described, a word-similarity matrix is needed here
    # plan: use word2vec directly
    print("loading the word2vec model")
    model = w2v.load_model_binary(save_path=model_path)
    print("computing word similarities")
    sim_matrix = w2v.word_sim_matrix(model, voc_list)
    print("word similarity matrix computed")

    # write the matrix to a file named after the vocabulary file's suffix
    sim_save_filename = "word_sim_{}.txt"\
        .format(voc_filename[voc_filename.rindex("_") + 1:voc_filename.rindex(".")])
    save_path = os.path.abspath(rootPath + "resource\\" + sim_save_filename)
    print(save_path)

    try:
        with open(save_path, "w+", encoding="utf-8") as file:
            for sim in sim_matrix:
                line = " ".join([str(x) for x in sim])
                file.write(line + "\n")
                print(line)
    except IOError:
        print("failed to write the file")

    print("word similarity matrix file written")
    return sim_matrix
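

# w2v.word_sim_matrix is presumably pairwise cosine similarity over the
# vocabulary. A minimal sketch of that computation with gensim's KeyedVectors
# (an assumption about the underlying model type, not the project's code):
import numpy as np
from gensim.models import KeyedVectors

# model = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)  # hypothetical path
def word_sim_matrix_sketch(model, voc_list):
    # voc_list is assumed to contain only in-vocabulary words
    n = len(voc_list)
    sim = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            sim[i, j] = model.similarity(voc_list[i], voc_list[j])
    return sim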
Example #4
def clusterResult_gibbs(k, topic, model, doc, iterator=30):
    # NOTE: snippet truncated here; signature reconstructed from the call in
    # __main__ below, iterator default assumed (model unused in visible code)
    # build the LDA model and the bag-of-words corpus
    print("building topic model")
    word_list, r_model = ldaa.lda_model(doc, k, iterator)

    # get the document-topic distribution
    doc_topic = r_model.doc_topic_
    writeResult(doc_topic, "12_LDA+wiki.txt")
    # convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    estimator = kmn.kMeansByFeature(topic, doc_topic_list)
    labels = estimator.labels_

    return list(labels)
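

# r_model.doc_topic_ matches the API of the `lda` package (collapsed Gibbs
# sampling). A minimal sketch of the same step with that package and
# scikit-learn, on hypothetical count data:
import numpy as np
import lda
from sklearn.cluster import KMeans

X = np.random.randint(0, 5, size=(6, 20))   # hypothetical doc-term count matrix
gibbs = lda.LDA(n_topics=3, n_iter=100, random_state=1)
gibbs.fit(X)
print(list(KMeans(n_clusters=3, n_init=10).fit(gibbs.doc_topic_).labels_))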


if __name__ == "__main__":
    r_k = 12
    r_topic = 12

    file_name = "data12.csv"
    doc = du.getDocAsWordArray(file_name, 5)
    # get the original category labels
    former = du.getFormerCategory(file_name)
    model = w2v.load_model_binary(r"E:\学校\快乐推荐\word2vec\saveVec")

    result = clusterResult_gibbs(r_k, r_topic, model, doc)
    result_list = cr.printResult(r_topic, result, former)
    print(result_list)

Example #5
    print("开始拓展")
    for i in range(len(weight)):
        # 将id,频率fre,weight,zip起来sort
        w = weight[i]
        fre = frequency_array[i]
        id_fre_w = list(zip(range(len(weight[i])), fre, w))
        id_fre_w.sort(key=sort_method, reverse=True)  # 倒序排序
        t = 0
        for j in range(len(id_fre_w)):
            if t < num:
                if id_fre_w[j][1] != 0:
                    ex_list = w2v.word_expand(model, word_dic[id_fre_w[j][0]], sim_num)
                    for e in ex_list:
                        doc[name[i]] += (" "+e)
                    t += 1
            else:
                break
    return doc
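

# w2v.word_expand presumably returns the sim_num nearest neighbours of a word
# in vector space. A sketch of such a helper with gensim >= 4 KeyedVectors
# (the underlying model type is an assumption, not the project's code):
from gensim.models import KeyedVectors

# model = KeyedVectors.load("saveVec")  # hypothetical path
def word_expand_sketch(model, word, sim_num):
    # skip out-of-vocabulary words instead of raising KeyError
    if word not in model.key_to_index:
        return []
    return [w for w, _ in model.most_similar(word, topn=sim_num)]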


if __name__ == "__main__":
    doc = {"1": "this is the first document",
           "2": "this is the second second document",
           "3": "and the third one",
           "4": "is this the first document",
           "5": "no of course"}
    # new_d = keyword_by_tf_idf(doc)
    model = w2v.load_model_binary(r"")
    doc = expend_word(model, doc, 5, 3)
    print(doc)
def simple_lda_result(k, filename, iterator=30):
    # load the data
    doc = du.getDocAsWordArray(filename, 5)
    # get the original category labels
    former = du.getFormerCategory(filename)

    c_result = s_lda.lda_result(k, doc, iterator)
    result_list = printResult(k, c_result, former)
    print(result_list)
    return c_result, result_list


if __name__ == "__main__":
    topic = 10
    kkt = 10
    file_name = "C10.csv"

    iterate = 10
    key_num = 5
    sim_num = 3

    save = r""
    model = w2v.load_model_binary(save)
    cluster_result, result = tf_idf_expand_lda_gibbs(topic,
                                                     kkt,
                                                     file_name,
                                                     key_num,
                                                     sim_num,
                                                     model,
                                                     iterator=500)
    print(result)