Example #1
def clusterResult(doc={}, num=5, sim_num=3):
    # doc is the raw text (already tokenized during preprocessing)
    # Compute TF-IDF over the text, then expand it with word2vec:
    # num is the number of keywords to keep (default 5),
    # sim_num is the number of expansion words per keyword (default 3)
    model = w2v.load_model_binary(r"")
    doc = tfidf.expend_word(model, doc, num, sim_num)

    # Return the corresponding cluster labels
    matrix = []
    ldaa.doc = doc
    # Build the LDA model and the bag-of-words corpus
    # (k1 and k2 are constants defined elsewhere in the source module)
    print("Building topic model")
    lda, corpus = ldaa.ldaAdapter(k1)
    for index, values in enumerate(lda.inference(corpus)[0]):
        topicmatrix = []
        # topic distribution of the corresponding document
        for topic, value in enumerate(values):
            topicmatrix.append(value)
        matrix.append(topicmatrix)

    # Feed the matrix into k-means for clustering
    print("Starting k-means clustering")
    data = np.array(matrix)
    estimator = kmn.kMeansByFeature(k2, data)

    labels = list(estimator.labels_)
    return labels
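kmn.kMeansByFeature appears in every example but is never shown; here is a minimal sketch, assuming it is a thin wrapper around scikit-learn's KMeans (the signature is inferred from the call sites above):

from sklearn.cluster import KMeans

def kMeansByFeature(n_clusters, feature_matrix):
    # Fit k-means on a 2-D feature matrix (one row per document) and
    # return the fitted estimator; callers read estimator.labels_
    estimator = KMeans(n_clusters=n_clusters)
    estimator.fit(feature_matrix)
    return estimator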
Example #2
def clusterResult(doc={}, num=3, save_path=r"", result_save=False):
    # Process the documents and return their cluster labels
    word_list, weight = tfidf.cal_tf_idf(doc)
    model = w2v.load_model_binary(save_path)
    print("Generating vec_list")
    vec_list = w2v.wordlist_to_vec(model, word_list)  # 2-D matrix, one word vector per row

    print("完成生成vec_list,计算doc_vec")
    # 每个文档进行处理,生成向量
    doc_vec = []
    t = 0
    for doc_weight in weight:
        if t % 10 == 0:
            print("turn", t)
        t += 1
        doc_vec_list = []
        for i in range(len(doc_weight)):
            mul = doc_weight[i]
            doc_vec_list.append([x * mul for x in vec_list[i]])

        simple_vec = np.array([0.0] * len(doc_vec_list[0]))
        for vec in doc_vec_list:
            # 使用np进行矩阵相加
            simple_vec += np.array(vec)
        simple_vec = list(simple_vec)
        doc_vec.append(simple_vec)

    if result_save:
        save_dv(doc_vec)

    # Cluster the document vectors
    estimator = kmn.kMeansByFeature(num, doc_vec)
    labels = list(estimator.labels_)

    return labels
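The nested loops above compute, for each document, a weighted sum of word vectors; that is exactly a matrix product. A minimal equivalent sketch, assuming weight has shape (n_docs, n_words) and vec_list has shape (n_words, vec_dim) with rows aligned to word_list:

import numpy as np

def doc_vectors(weight, vec_list):
    # Row i of the result is the TF-IDF-weighted sum of the word
    # vectors of document i: doc_vec = weight @ vec_list
    return (np.asarray(weight) @ np.asarray(vec_list)).tolist()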
Example #3
def clusterResultByFile(num=3):
    # Read the saved document vectors straight from file and cluster them
    doc_vec = load_dv()

    estimator = kmn.kMeansByFeature(num, doc_vec)
    labels = list(estimator.labels_)

    return labels
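save_dv and load_dv are not shown; a minimal sketch, assuming they persist the document-vector matrix as a plain text file (the file name is hypothetical):

import numpy as np

def save_dv(doc_vec, path="doc_vec.txt"):
    # One whitespace-separated row per document vector
    np.savetxt(path, np.asarray(doc_vec))

def load_dv(path="doc_vec.txt"):
    return np.loadtxt(path).tolist()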
Example #4
def lda_kmn_result(doc, iterator=1000):

    # Return the corresponding cluster labels
    # Build the LDA model and the bag-of-words corpus
    # (k1 and k2 are constants defined elsewhere in the source module)
    print("Building topic model")
    word_list, r_model = ldaa.lda_model(doc, k1, iterator)

    # Document-topic distribution
    doc_topic = r_model.doc_topic_

    # Convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    estimator = kmn.kMeansByFeature(k2, doc_topic_list)
    labels = estimator.labels_

    return list(labels)
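ldaa.lda_model is not shown either; the doc_topic_ attribute suggests the lda package (Gibbs-sampling LDA). A minimal sketch under that assumption, with scikit-learn's CountVectorizer standing in for the bag-of-words step and documents given as whitespace-joined token strings:

import lda
from sklearn.feature_extraction.text import CountVectorizer

def lda_model(doc, n_topics, n_iter):
    # doc: list of pre-tokenized documents joined back into strings
    vectorizer = CountVectorizer()
    corpus = vectorizer.fit_transform(doc).toarray()  # term-count matrix
    model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
    model.fit(corpus)  # exposes model.doc_topic_ after fitting
    return list(vectorizer.get_feature_names_out()), model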
Example #5
def lda_kmn_result(k, topic, doc, former, iterator=1000):

    # Return the evaluation scores averaged over repeated clustering runs
    # Build the LDA model and the bag-of-words corpus
    print("Building topic model")
    word_list, r_model = ldaa.lda_model(doc, topic, iterator)

    # Document-topic distribution
    doc_topic = r_model.doc_topic_
    # Convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()

    # Average printResult's six scores over 30 k-means runs to smooth
    # out k-means initialization randomness
    result = np.zeros(6)
    for i in range(30):
        estimator = kmn.kMeansByFeature(topic, doc_topic_list)
        labels = estimator.labels_
        # Alternative: spectral clustering with discretized label assignment
        # labels = SC(assign_labels="discretize", gamma=1e-7, n_clusters=k).fit_predict(doc_topic)
        result += cr.printResult(k, labels, former)
    return result / 30
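cr.printResult is not shown; it evidently compares the predicted labels against the former (original) categories and returns six scores, which the loop above averages. A hypothetical stand-in using six common clustering metrics from scikit-learn (the actual metrics in the source may differ):

import numpy as np
from sklearn import metrics

def printResult(k, labels, former):
    # k is unused here; kept only to match the call sites above
    return np.array([
        metrics.adjusted_rand_score(former, labels),
        metrics.normalized_mutual_info_score(former, labels),
        metrics.homogeneity_score(former, labels),
        metrics.completeness_score(former, labels),
        metrics.v_measure_score(former, labels),
        metrics.fowlkes_mallows_score(former, labels),
    ])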
Example #6
def clusterResult_gibbs(k, topic, model, doc, num=5, sim_num=3, iterator=500):
    # doc is the raw text (already tokenized during preprocessing)
    # Compute TF-IDF over the text, then expand it with word2vec:
    # num is the number of keywords to keep (default 5),
    # sim_num is the number of expansion words per keyword (default 3)
    # model = w2v.load_model_binary(r"E:\学校\快乐推荐\word2vec\saveVec")
    print("Expanding document corpus")
    doc = tfidf.expend_word(model, doc, num, sim_num)

    # Return the corresponding cluster labels
    # Build the LDA model and the bag-of-words corpus
    print("Building topic model")
    word_list, r_model = ldaa.lda_model(doc, k, iterator)

    # Document-topic distribution
    doc_topic = r_model.doc_topic_
    writeResult(doc_topic, "12_LDA+wiki.txt")
    # Convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    estimator = kmn.kMeansByFeature(topic, doc_topic_list)
    labels = estimator.labels_

    return list(labels)
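writeResult is not shown; a minimal sketch, assuming it just dumps the document-topic matrix to a text file, one row per document:

import numpy as np

def writeResult(doc_topic, filename):
    # Persist the document-topic distribution for later inspection
    np.savetxt(filename, np.asarray(doc_topic))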
Example #7
def clusterResult_gibbs(model, doc={}, num=5, sim_num=3, iterator=500):
    # doc is the raw text (already tokenized during preprocessing)
    # Compute TF-IDF over the text, then expand it with word2vec:
    # num is the number of keywords to keep (default 5),
    # sim_num is the number of expansion words per keyword (default 3)
    # model = w2v.load_model_binary(r"")
    print("Expanding document corpus")
    doc = tfidf.expend_word(model, doc, num, sim_num)

    # Return the corresponding cluster labels
    # Build the Gibbs-sampling LDA model and the bag-of-words corpus
    # (k1 and k2 are constants defined elsewhere in the source module)
    print("Building topic model")
    word_list, r_model = ldag.lda_model(doc, k1, iterator)

    # Document-topic distribution
    doc_topic = r_model.doc_topic_

    # Convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    estimator = kmn.kMeansByFeature(k2, doc_topic_list)
    labels = estimator.labels_

    return list(labels)
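tfidf.expend_word is used in Examples #1, #6 and #7 but never shown. From the comments, it takes the top num TF-IDF keywords of each document and appends sim_num word2vec neighbours of each keyword. A minimal sketch, assuming a gensim 4 KeyedVectors model and documents given as token lists; top_keywords is a hypothetical helper standing in for the TF-IDF ranking:

def expend_word(model, docs, num, sim_num):
    expanded = []
    for tokens in docs:
        extra = []
        for word in top_keywords(tokens, num):  # hypothetical: top `num` TF-IDF terms
            if word in model.key_to_index:
                # append the sim_num nearest neighbours in embedding space
                extra += [w for w, _ in model.most_similar(word, topn=sim_num)]
        expanded.append(tokens + extra)
    return expanded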
Example #8
import cluster.cluster_result as cr
import data.data_util as du
import model.TF_IDFAdapter as tfidf
import numpy as np
from cluster import kmeans as kmn

if __name__ == "__main__":
    k = 4
    filename = "data4.csv"
    # 4 diping 3  8,12 diping 5
    doc = du.getDocAsWordArray(filename, 3)
    former = du.getFormerCategory(filename)

    word_dic, weight = tfidf.cal_tf_idf(doc)
    result = np.zeros(6)
    for i in range(30):
        estimator = kmn.kMeansByFeature(k, weight)
        labels = estimator.labels_
        result += cr.printResult(k, labels, former)
    result = result / 30

    print(result)
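tfidf.cal_tf_idf is not shown; it returns a vocabulary list and a per-document weight matrix. A minimal sketch with scikit-learn's TfidfVectorizer, assuming documents arrive as whitespace-joined token strings:

from sklearn.feature_extraction.text import TfidfVectorizer

def cal_tf_idf(docs):
    vectorizer = TfidfVectorizer()
    # weight[i][j] is the TF-IDF weight of word j in document i
    weight = vectorizer.fit_transform(docs).toarray()
    return list(vectorizer.get_feature_names_out()), weight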
    print("开始模型训练")
    b_model = BtmVnModel(t_k,
                         t_doc_word_id,
                         t_voc_list,
                         t_word_sim_matrix,
                         iterate=t_iterate,
                         threshold=t_threshold,
                         miu=t_miu)
    b_model.buildModel()
    print("训练完成")

    dis = b_model.getDoc_Topic()
    print("完成文档主题文档获取")

    cluster_result = kmn.kMeansByFeature(t_k, dis).labels_
    former_type = du.getFormerCategory("C10.csv")
    result = wce.printResult(t_k, cluster_result, former_type)
    print(result)

    # t_word = b_model.getTopic_word(10)

    suffix = voc_filename[str(voc_filename).rindex("_") +
                          1:str(voc_filename).rindex(".")]
    dis_save = "{}_iterate{}_threshold{}_miu{}_result.txt".format(
        suffix, t_iterate, t_threshold, t_miu)
    tp_save = "{}_iterate{}_threshold{}_miu{}_tp.txt".format(
        suffix, t_iterate, t_threshold, t_miu)
    # write_Doc_Topic_Matrix(dis, dis_save)
    # write_Topic_word(b_model.topic_word, tp_save)
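write_Doc_Topic_Matrix and write_Topic_word are referenced only in the commented-out calls above; minimal sketches, assuming they serialize the matrices to the generated text-file names:

import numpy as np

def write_Doc_Topic_Matrix(dis, path):
    # One row per document, one column per topic
    np.savetxt(path, np.asarray(dis))

def write_Topic_word(topic_word, path):
    # One row per topic, one column per vocabulary word
    np.savetxt(path, np.asarray(topic_word))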