Example No. 1
    try:
        # Open for reading only; "r+" (read/write) is not needed here.
        with open(path, "r") as pf:
            result = []  # holds the document distributions
            for line in pf:
                line = line.strip()
                if not line:
                    # Skip blank lines; float("") would raise ValueError.
                    continue
                nums = line.split(" ")
                result.append([float(i) for i in nums])
        return result
    except IOError:
        print("The file could not be opened")
        return None


if __name__ == "__main__":
    csv_filename = "data10.csv"

    doc = du.getDocAsWordArray(csv_filename)
    r_word_list, model = lda_model(doc, 10)
    doc_topic = list(model.doc_topic_)
    topic_word = list(model.topic_word_)
    writeResult(doc_topic, "10_lda_doc_topic.txt")
    """
    writeResult(doc_topic, "12_lda_doc_topic.txt")
    title = "LDA"
    print("Reducing dimensionality")
    d2_data = dr.dimension_down(doc_topic)
    cp.paintModelPoint(d2_data, title)
    """
Example No. 2
    # Convert the result into a form that can be evaluated; unclustered items are set to -1
    result = [-1] * whole_num
    for i in range(whole_num):
        for x in range(len(cluster_result)):
            if i in cluster_result[x]:
                result[i] = x
                break

    return result


if __name__ == "__main__":
    k = 12
    filename = "data1322.csv"
    # 4 diping 3  8,12 diping 5
    doc = du.getDocAsWordArray(filename, 5)

    cluster_result = clusterResult_qt(k, doc)

    former = du.getFormerCategory(filename)
    # print(former)

    # Convert the data into a form that allows direct accuracy checking
    c_r_result = []
    f_r = []
    for i in range(len(cluster_result)):
        if cluster_result[i] != -1:
            c_r_result.append(cluster_result[i])
            f_r.append(former[i])

    # print(len(c_r_result))
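
The nested loop above turns a list of clusters (each a collection of document indices) into a flat per-document label list, with -1 marking unclustered documents. A single pass with enumerate does the same mapping without the repeated membership test; a minimal sketch, assuming cluster_result has the same shape as in the snippet:

def clusters_to_labels(cluster_result, whole_num):
    # Documents not assigned to any cluster keep the label -1.
    labels = [-1] * whole_num
    for cluster_id, members in enumerate(cluster_result):
        for doc_index in members:
            labels[doc_index] = cluster_id
    return labels

# Example: 5 documents, two clusters, document 4 left unclustered.
print(clusters_to_labels([[0, 2], [1, 3]], 5))  # [0, 1, 0, 1, -1]
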
Example No. 3
                # Normalize this biterm's topic distribution so it sums to 1.
                dis_sum = sum(topic_distribution)
                for i in range(self.k):
                    topic_distribution[i] /= dis_sum
                # Accumulate the average over the document's bit_count biterms.
                for i in range(self.k):
                    doc_topic_dis[i] += topic_distribution[i] / bit_count
            distribution.append(doc_topic_dis)
        return distribution


# test---------------------------test
# Returns the document doc

if __name__ == "__main__":

    file_name = "data10.csv"
    document = du.getDocAsWordArray(file_name)
    doc_for_test = {}

    for name_simple in document:
        doc_for_test[name_simple] = document[name_simple]

    topic_num = 100

    start = time.perf_counter()
    model = BtmModel(topic_num, doc_for_test)
    model.buildModel()
    end = time.perf_counter()

    print("time used:{}".format(end - start))

    print("Topic-word output")
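
The loop in this example estimates a document's topic distribution BTM-style: each biterm's topic vector is normalized to sum to 1, and the normalized vectors are averaged over the document's biterms. A minimal sketch of just that aggregation step, outside the BtmModel class (the input here is made up for illustration):

def doc_topic_from_biterms(biterm_distributions):
    # biterm_distributions: one unnormalized topic vector per biterm.
    k = len(biterm_distributions[0])
    bit_count = len(biterm_distributions)
    doc_topic = [0.0] * k
    for dist in biterm_distributions:
        total = sum(dist)
        for i in range(k):
            doc_topic[i] += (dist[i] / total) / bit_count
    return doc_topic

# [0.5, 0.25, 0.25] and [0.25, 0.75, 0.0] average to [0.375, 0.5, 0.125]
print(doc_topic_from_biterms([[2.0, 1.0, 1.0], [1.0, 3.0, 0.0]]))
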
Example No. 4
    # Get the LDA model and the bag of words
    print("Building the topic model")
    word_list, r_model = ldaa.lda_model(doc, k, iterator)

    # Get the document-topic distribution
    doc_topic = r_model.doc_topic_

    # Convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    estimator = kmn.kMeansByFeature(topic, doc_topic_list)
    labels = estimator.labels_

    return list(labels)


if __name__ == "__main__":
    r_k = 8
    r_topic = 8

    file_name = "data8.csv"
    doc = du.getDocAsWordArray(file_name, 3)
    # Get the original label information

    former = du.getFormerCategory(file_name)
    model = w2v.load_model_binary(r"E:\学校\快乐推荐\word2vec\api_saveVec")

    result = clusterResult_gibbs(r_k, r_topic, model, doc)
    result_list = cr.printResult(r_topic, result, former)
    print(result_list)
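
This example clusters documents by their LDA document-topic rows. Below is a self-contained sketch of the same pipeline, using the lda package (whose doc_topic_ attribute the snippet relies on) and scikit-learn's KMeans in place of the kmn.kMeansByFeature wrapper; the small count matrix is made up for illustration:

import numpy as np
import lda
from sklearn.cluster import KMeans

# Hypothetical document-term count matrix: 4 documents, 6 vocabulary terms.
X = np.array([[3, 2, 0, 0, 1, 0],
              [2, 4, 1, 0, 0, 0],
              [0, 0, 0, 3, 2, 2],
              [0, 1, 0, 2, 3, 1]])

model = lda.LDA(n_topics=2, n_iter=200, random_state=1)
model.fit(X)
doc_topic = model.doc_topic_  # shape: (n_docs, n_topics)

# Cluster documents in topic space, mirroring the flow in the snippet above.
estimator = KMeans(n_clusters=2, n_init=10, random_state=0).fit(doc_topic)
print(list(estimator.labels_))  # one cluster label per document
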