def visualize(folder, dataset_id):
    # prefix_list = ['*', '*/information_retrieval', '*/information_retrieval/web_search']

    params = load_init_params(dataset_id)
    top_name = params['dataset_top_name']

    name = ""
    result_file = os.path.join(folder, 'result' + name + '.txt')
    # Render the spanning tree at several depth limits (max_level 2, 3 and 10).
    main(top_name, result_file, folder + "/SpanningTree" + name + "-" + folder[-14:-6] + '-our-overall-3',
         min_level=0, max_level=2)
    main(top_name, result_file, folder + "/SpanningTree" + name + "-" + folder[-14:-6] + '-our-overall-4',
         min_level=0, max_level=3)
    main(top_name, result_file, folder + "/SpanningTree" + name + "-" + folder[-14:-6] + '-our-overall-5',
         min_level=0, max_level=10)
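
# Purely illustrative: folder[-14:-6] above cuts 8 characters out of the tail
# of the result-folder name, so it relies on create_dir() producing a folder
# name with a fixed-width suffix. The naming scheme below is a hypothetical
# example, not taken from create_dir() itself.
example_folder = "./result/20240101-12345"
print(example_folder[-14:-6])  # -> '20240101' under this assumed naming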
Example #2
def generate_tree(spanning_tree, dataset_id):
    """
    生成具有层次的树结构,前提是确定了根节点
    :param spanning_tree: 无向无环图
    :return: 层次结构列表
    """
    params = load_init_params(dataset_id)
    top = params['dataset_top_name']

    used = []
    top_tree = ["*/top"]
    temp = ["*/" + top]
    print("构建树开始")
    while len(temp) != 0:
        print(temp)
        print(top_tree)
        print("==========")
        new_temp = []
        for item in temp:
            item_list = item.split("/")
            leave = item_list[-1]  # the last node on this path, to be expanded next
            if top == leave:
                item_list.remove(top)
            for edge in spanning_tree:
                item_1, item_2, _, _, _ = edge
                if item_1 in used or item_2 in used:
                    continue
                if leave == item_1:
                    new_edges = "/".join(item_list) + "/" + item_2
                    new_temp.append(new_edges)
                    top_tree.append(new_edges)
                    print(new_edges)
                elif leave == item_2:
                    new_edges = "/".join(item_list) + "/" + item_1
                    new_temp.append(new_edges)
                    top_tree.append(new_edges)
                    print(new_edges)
            used.append(leave)
        temp = new_temp

    return top_tree
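
# For context, a minimal self-contained sketch of the same breadth-first
# expansion, with the root name passed in directly instead of coming from
# load_init_params(). The toy edge tuples and node names below are made up
# purely for illustration; only the first two tuple elements are used, as in
# the unpacking above.
def expand_paths(spanning_tree, top):
    used, paths, frontier = [], ["*/top"], ["*/" + top]
    while frontier:
        next_frontier = []
        for path in frontier:
            parts = path.split("/")
            leaf = parts[-1]
            if leaf == top:
                parts.remove(top)  # the root name itself is not kept in the paths
            for a, b, *_ in spanning_tree:
                if a in used or b in used:
                    continue
                if leaf == a:
                    child = "/".join(parts) + "/" + b
                elif leaf == b:
                    child = "/".join(parts) + "/" + a
                else:
                    continue
                next_frontier.append(child)
                paths.append(child)
            used.append(leaf)
        frontier = next_frontier
    return paths


print(expand_paths([("travel", "food", 0, 0, 0),
                    ("travel", "museum", 0, 0, 0),
                    ("food", "noodles", 0, 0, 0)], "travel"))
# -> ['*/top', '*/food', '*/museum', '*/food/noodles']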
Example #3
def generateTree(dataset, dataset_id):
    """

    :param dataset: 数据集名称
    :param dataset_id: 具体数据集领域名称
    :return: 无返回值,生成文件
    """

    params = load_init_params(dataset_id)
    min_kl = params['min_kl']

    data_path = os.path.join("./data/raw_data", dataset, dataset_id + "_0.csv")
    word_index_path = os.path.join("./data/processed_data", dataset,
                                   dataset_id + "_word_index.txt")
    index_dict_path = os.path.join("./data/processed_data", dataset,
                                   dataset_id + "_index_dict.txt")
    mi_matrix_path = os.path.join('./data/processed_data', dataset,
                                  dataset_id + '_mi_matrix.csv')
    kl_matrix_path = os.path.join('./data/processed_data', dataset,
                                  dataset_id + '_kl_matrix.csv')

    folder = create_dir()

    # Generate node and edge information
    nodes, edges = generate_nodes_edges(word_index_path, index_dict_path,
                                        mi_matrix_path, kl_matrix_path)

    # Build the maximum spanning tree from the nodes and edges
    spanning_tree = Kruskal(nodes, edges, data_path, min_kl)
    print(spanning_tree)

    # After the root node is determined, generate root-to-leaf path information
    top_tree = generate_tree(spanning_tree, dataset_id)

    # Write the path information into the output folder
    write_file(top_tree, folder)

    # Visualize the spanning tree
    visualize(folder, dataset_id)
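
# For reference, the files this pipeline touches, taken directly from the
# path joins above (the "mafengwo"/"Beijing" values are hypothetical
# placeholders for dataset and dataset_id):
#   reads  ./data/raw_data/mafengwo/Beijing_0.csv
#   reads  ./data/processed_data/mafengwo/Beijing_word_index.txt
#   reads  ./data/processed_data/mafengwo/Beijing_index_dict.txt
#   reads  ./data/processed_data/mafengwo/Beijing_mi_matrix.csv
#   reads  ./data/processed_data/mafengwo/Beijing_kl_matrix.csv
#   writes the root-to-leaf path file and the visualizations into the folder
#   returned by create_dir()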
Example #4
    generateTree(dataset, dataset_domain)


if __name__ == '__main__':
    # params = load_init_params()
    # main(params)
    #
    # params = load_init_params("Beijing")
    # main(params)
    #
    # params = load_init_params("Guiyang")
    # main(params)
    #
    # params = load_init_params("Kunming")
    # main(params)
    #
    # params = load_init_params("Hangzhou")
    # main(params)
    #
    # params = load_init_params("nlp")
    # main(params)
    #
    # params = load_init_params("nlpcn")
    # main(params)
    #
    # params = load_init_params("ZhongGuoJinDaiShi")
    # main(params)

    params = load_init_params("g60763")
    main(params)
Example #5
        for word in self.word_index:
            self.word_set.add(word)
        self.PR = np.ones([len(self.word_set), 1])  # one PR score per word
        for i in range(self.iternum):
            self.PR = (1 - self.alpha) + self.alpha * np.dot(self.matrix, self.PR)

    # Output words and their corresponding weights
    def printResult(self):
        word_pr = {}
        for i in range(len(self.PR)):
            word_pr[self.index_dict[str(i)]] = self.PR[i][0]
        res = sorted(word_pr.items(), key=lambda x: x[1], reverse=True)

        return res


if __name__ == '__main__':
    dataset_domain = "g60763"
    params = load_init_params(dataset_domain)
    word_index_path = f".\\data\\processed_data\\{params['dataset']}\\{dataset_domain}_word_index.txt"
    index_dict_path = f".\\data\\processed_data\\{params['dataset']}\\{dataset_domain}_index_dict.txt"
    matrix_path = f".\\data\\processed_data\\{params['dataset']}\\{dataset_domain}_mi_matrix_norm.csv"
    tr = TextRank(word_index_path, index_dict_path, matrix_path, 3, 0.85,
                  700)  # create the TextRank object
    tr.createMatrix()
    tr.calPR()
    results = tr.printResult()
    print(results)
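
# The calPR loop above is the standard TextRank/PageRank-style update
# PR <- (1 - alpha) + alpha * M @ PR, iterated a fixed number of times.
# A minimal self-contained numeric sketch (the 3x3 matrix below is made up):
import numpy as np

alpha, iterations = 0.85, 100
M = np.array([[0.0, 0.5, 0.5],   # made-up, row-normalised link weights
              [0.5, 0.0, 0.5],
              [0.5, 0.5, 0.0]])
PR = np.ones((3, 1))
for _ in range(iterations):
    PR = (1 - alpha) + alpha * np.dot(M, PR)
print(PR.ravel())  # relative word importances under this toy matrix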
Example #6
def calcu_mi_kl(dataset, dataset_id):
    """
    计算实体之间的互信息量并且保存成供下一步最小生成树的生成
    :param dataset: 数据集名称:目前包含马蜂窝(mafengwo)以及知乎(zhihu)
    :param dataset_id: 具体数据集领域名称,目前包括:
                        马蜂窝:北京(Beijing),贵阳(Guiyang),杭州(Hangzhou),昆明(Kunming)
                        知乎:中文的自然语言处理(nlpcn),中英文的自然语言处理(nlp),中国近代史(ZhongGuoJinDaiShi)
    :return: 没有返回值,只写入四个文件
    """
    # Load the minimum term-frequency (tf) thresholds from the init params.
    params = load_init_params(dataset_id)
    min_enti_tf = params['min_enti_tf']
    min_feat_tf = params['min_feat_tf']

    data_path = os.path.join("./data/raw_data", dataset, dataset_id + "_0.csv")
    keywords_path = os.path.join("./data/raw_data", dataset,
                                 dataset_id + "_geo_noun.txt")
    feature_path = os.path.join("./data/raw_data", dataset,
                                dataset_id + "_non_geo_noun.txt")

    # Read the dataset
    print("Reading the dataset")
    sentences = createUsers(data_path)

    # Read the entity (keyword) set
    print("Reading the entity set")
    keywords_set = createWordList(keywords_path)

    # Read the feature (context-word) set
    print("Reading the feature word set")
    feature_set = createWordList(feature_path)
    print(feature_set)

    # Count keyword frequencies, drop low-frequency keywords, and update the keyword set
    print("[keywords] Counting keyword frequencies, dropping low-frequency keywords, updating the keyword set")
    word_frequency, keywords_set = calcu_wordFrenq(sentences, keywords_set,
                                                   min_enti_tf)

    # Count feature-word frequencies, drop low-frequency feature words, and update the feature-word set
    print("[features] Counting feature-word frequencies, dropping low-frequency feature words, updating the feature-word set")
    _, feature_set = calcu_wordFrenq(sentences, feature_set, min_feat_tf)

    feature_set.update(keywords_set)

    # Build the co-occurrence count matrix
    print("Building the co-occurrence count matrix")
    coocurrence_matrix, word_index, index_dict = create_coocurrence_matrix(
        sentences, keywords_set)

    # Compute the entity mutual-information matrix
    print("Computing the entity mutual-information matrix")
    mi_matrix, mi_matrix_norm = create_mi_matrix(keywords_set, word_frequency,
                                                 coocurrence_matrix,
                                                 index_dict)

    # Build the entity-feature matrix
    print("Building the entity-feature matrix")
    enti_feat_matrix = create_enti_feat_matrix(sentences, keywords_set,
                                               word_index, feature_set)

    # Compute the entity KL-divergence matrix
    kl_matrix, _, _ = create_kl_matrix(keywords_set, index_dict,
                                       enti_feat_matrix)

    # Save the output files
    print("Saving the MI files")
    mi_matrix_path = os.path.join('./data/processed_data', dataset,
                                  dataset_id + '_mi_matrix.csv')
    mi_pd = pd.DataFrame(mi_matrix)
    mi_pd.to_csv(mi_matrix_path)

    mi_matrix_norm_path = os.path.join('./data/processed_data', dataset,
                                       dataset_id + '_mi_matrix_norm.csv')
    mi_pd = pd.DataFrame(mi_matrix_norm)
    mi_pd.to_csv(mi_matrix_norm_path)

    print("保存 entity-feature 文件")
    enti_feat_matrix_path = os.path.join(
        './data/processed_data', dataset,
        dataset_id + '_entity_feature_matrix.csv')
    enti_feat_pd = pd.DataFrame(enti_feat_matrix)
    enti_feat_pd.to_csv(enti_feat_matrix_path)

    print("保存 KL 散度矩阵 文件")
    kl_matrix_path = os.path.join('./data/processed_data', dataset,
                                  dataset_id + '_kl_matrix.csv')
    kl_pd = pd.DataFrame(kl_matrix)
    kl_pd.to_csv(kl_matrix_path)

    print("保存 索引 文件")
    word_index_path = os.path.join("./data/processed_data", dataset,
                                   dataset_id + "_word_index.txt")
    with open(word_index_path, "w", encoding="UTF-8") as file:
        file.writelines(json.dumps(word_index, ensure_ascii=False) + "\n")

    index_dict_path = os.path.join("./data/processed_data", dataset,
                                   dataset_id + "_index_dict.txt")
    with open(index_dict_path, "w", encoding="UTF-8") as file:
        file.writelines(json.dumps(index_dict, ensure_ascii=False) + "\n")
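
# create_mi_matrix and create_kl_matrix are not shown in this snippet. As a
# hedged sketch only, a standard way to derive pairwise PMI from a
# co-occurrence matrix and KL divergence from entity-feature rows looks
# roughly like this; the function names and normalisation details below are
# assumptions, not the repository's actual implementation.
import numpy as np

def pmi_from_cooccurrence(C):
    # Pointwise mutual information log(p(x, y) / (p(x) * p(y))) for every
    # entity pair, from a raw co-occurrence count matrix C.
    total = C.sum()
    p_xy = C / total
    p_x = p_xy.sum(axis=1, keepdims=True)
    p_y = p_xy.sum(axis=0, keepdims=True)
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.log(p_xy / (p_x * p_y))
    pmi[~np.isfinite(pmi)] = 0.0  # zero out pairs that never co-occur
    return pmi

def kl_between_rows(F, eps=1e-12):
    # Pairwise KL divergence D(P_i || P_j) between entity-feature rows,
    # after normalising each row of F into a probability distribution.
    P = F / (F.sum(axis=1, keepdims=True) + eps) + eps
    n = P.shape[0]
    kl = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            kl[i, j] = np.sum(P[i] * np.log(P[i] / P[j]))
    return kl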
Example #7
    for i in range(n):
        if i == alter_id:
            continue
        if mi_matrix[topic_id][i] > 0:
            neighbor.add(i)
            # print(index_dict[str(i)], end=",")
            topic_pd = topic_pd + enti_feat_matrix[i]
    # print()

    return topic_pd
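
# What the loop above does, roughly: it adds into topic_pd the entity-feature
# row of every entity that has positive mutual information with the topic
# (excluding alter_id). A tiny numeric illustration with made-up rows:
import numpy as np

enti_feat_example = np.array([[1, 0],   # entity 0
                              [2, 1],   # entity 1
                              [0, 3]])  # entity 2
# If entities 1 and 2 both have positive MI with the topic (and neither is
# the excluded alter_id), their rows are accumulated into the topic vector.
print(enti_feat_example[1] + enti_feat_example[2])  # -> [2 4]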


dataset = "mafengwo"
dataset_id = "Beijing"

params = load_init_params(dataset_id)
min_kl = params['min_kl']

word_index_path = os.path.join("./data", dataset,
                               dataset_id + "_word_index.txt")
index_dict_path = os.path.join("./data", dataset,
                               dataset_id + "_index_dict.txt")
mi_matrix_path = os.path.join('./data', dataset, dataset_id + '_mi_matrix.csv')
kl_matrix_path = os.path.join('./data', dataset, dataset_id + '_kl_matrix.csv')
entity_feature_matrix_path = os.path.join(
    './data', dataset, dataset_id + '_entity_feature_matrix.csv')

# Generate node and edge information
nodes, edges = generate_nodes_edges(word_index_path, index_dict_path,
                                    mi_matrix_path, kl_matrix_path)