Python get_testset_path Exemples, utils.get_testset_path Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : auto_catalog_main_overlap.py Projet : TsingH-googl/PNR

def _binarize_above_threshold(grid, threshold):
    """Return a float mask of ``grid``: 1.0 where ``grid > threshold``, else 0.0.

    A single strict comparison is used on purpose.  Masking in two in-place
    passes (``<= t -> 0`` followed by ``>= t -> 1``) turns the whole grid into
    ones whenever ``t <= 0``, because the zeros written by the first pass
    satisfy the second test.
    """
    return np.where(np.asarray(grid) > threshold, 1.0, 0.0)


def auto_overlap(prex=None,
                 graph_name=None,
                 emb_method_name1=None,
                 emb_method_name2=None,
                 binNum=None):
    """Plot the thresholded rasterization grids of PNR and three hybrid methods.

    For the pair of base methods (``emb_method_name1``, ``emb_method_name2``)
    on ``graph_name`` this function:

    1. loads and normalises the two base score matrices,
    2. loads the training graph and the binary test matrix,
    3. loads the stored ``*_50_count.mat`` grids of PNR2 / plus / multiply /
       mlp,
    4. derives a threshold per grid with ``transfer_scores_PNR`` and
       ``get_list_thresold`` (``L`` = sum of the test matrix),
    5. binarises each grid (strictly above threshold -> 1, otherwise 0),
       blurs it with a 5x5 Gaussian kernel and hands it to
       ``plot_contourf_overlap``.

    If either base score file is missing, a message is printed and nothing
    else happens.

    Args:
        prex: Sub-directory (with trailing separator) appended to the
            hard-coded dataset/result roots.
        graph_name: Dataset name used to build every file name.
        emb_method_name1: Name of the first base method.
        emb_method_name2: Name of the second base method.
        binNum: Number of bins; passed to ``transfer_scores_PNR`` together
            with ``interval = 1.0 / binNum``.

    Returns:
        None.
    """
    time_start = time.time()
    print('----------------------------------------------------------')
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 +
          "," + emb_method_name2)

    # Raw strings: the backslashes are literal path separators, not escapes.
    results_base_dir = r'D:\hybridrec//results//'
    all_file_dir = r'D:\hybridrec\dataset\split_train_test//' + prex
    graph_dir = results_base_dir + prex + graph_name + "//"

    path_scores_method1 = graph_dir + graph_name + "_" + emb_method_name1 + "_scores.mat"
    path_scores_method2 = graph_dir + graph_name + "_" + emb_method_name2 + "_scores.mat"

    if not (os.path.exists(path_scores_method1)
            and os.path.exists(path_scores_method2)):
        print("dataset: " + graph_name + '----' + "baselines:" +
              emb_method_name1 + "," + emb_method_name2 + ': 分数未完全计算')
        return

    # Normalised base scores.
    scores_matrix_one = loadmat(path_scores_method1)['scores']
    scores_matrix_two = loadmat(path_scores_method2)['scores']
    # Methods outside all_embedding_methods are reduced to their strict upper
    # triangle (k=1 excludes the diagonal).
    if emb_method_name1 not in all_embedding_methods:
        scores_matrix_one = csr_matrix(np.triu(scores_matrix_one.A, k=1))
    if emb_method_name2 not in all_embedding_methods:
        scores_matrix_two = csr_matrix(np.triu(scores_matrix_two.A, k=1))
    scores_matrix_one_norm = normalize_matrix(
        csr_matrix1=csr_matrix(scores_matrix_one))
    scores_matrix_two_norm = normalize_matrix(
        csr_matrix1=csr_matrix(scores_matrix_two))

    # Binary train / test matrices.
    graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                         graph_name=graph_name,
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    graph_test_path = get_testset_path(base_dir=all_file_dir,
                                       graph_name=graph_name)
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = csr_matrix(np.triu(train_binary.A, k=1))
    test_binary = get_test_matrix_binary(graph_test_path=graph_test_path,
                                         N=train_binary.shape[0])
    E_test = np.sum(test_binary.A)

    # (label used in the plot title, file-name prefix, contour colour),
    # listed in plotting order.
    # NOTE(review): the grid files are always the "_50_count.mat" ones,
    # independent of binNum -- confirm callers pass binNum=50.
    methods = (
        ('PNR', 'PNR2_', 'Red'),
        ('plus', 'plus_', 'green'),
        ('multiply', 'multiply_', 'blue'),
        ('mlp', 'mlp_', 'purple'),
    )
    pair_tag = graph_name + "_" + emb_method_name1 + "_" + emb_method_name2
    interval = 1.0 / binNum

    # Phase 1: compute every binarised grid before anything is drawn.
    binary_grids = []
    for label, file_prefix, color in methods:
        count_path = graph_dir + file_prefix + pair_tag + "_50_count.mat"
        raster_grids = loadmat(count_path)["count"]
        nonexist_scores_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=raster_grids,
            interval=interval,
            binNum=binNum)
        threshold = get_list_thresold(nonexist_scores_list, L=E_test)
        binary_grids.append(
            (label, color,
             _binarize_above_threshold(raster_grids.A, threshold)))

    # Phase 2: smooth and plot.
    for label, color, grid in binary_grids:
        # (5, 5) is the Gaussian kernel size; sigma 0 lets OpenCV derive it.
        result = cv2.GaussianBlur(np.float32(grid), (5, 5), 0)
        title = graph_name + '-' + label + '-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=color)

    time_end = time.time()
    print("It takes : " + str((time_end - time_start) / 60.0) + "  mins.")

Exemple #2

0

Afficher le fichier

def auto_DNN(prex=None,
             graph_name=None,
             emb_method_name1=None,
             emb_method_name2=None,
             model_name=None,
             DNN_binNum=None):
    """Train a classifier on two base-method scores and evaluate the hybrid.

    The two normalised score matrices of ``emb_method_name1`` and
    ``emb_method_name2`` provide a 2-feature sample per node pair.  Pairs that
    are edges of the training graph are labelled 1; the rows returned by
    ``negative_samples`` are labelled 0.  The fitted model is passed to
    ``predicted_scores_DNN``; the resulting scores are saved, rasterised,
    plotted, evaluated with ``evaluators`` and written out through
    ``DNN_write_to_excel``.

    If either base score file is missing, a message is printed and nothing
    else happens.

    Args:
        prex: Sub-directory (with trailing separator) appended to the
            hard-coded dataset/result roots.
        graph_name: Dataset name used to build every file name.
        emb_method_name1: Name of the first base method.
        emb_method_name2: Name of the second base method.
        model_name: One of ``"mlp"``, ``"svm"``, ``"lr"``, ``"lgbm"``,
            ``"xgb"``, ``"ld"``, ``"rf"``.
        DNN_binNum: Number of bins passed to ``rasterization_grids`` and
            ``save_DNN_raster_scores``.

    Returns:
        None.

    Raises:
        ValueError: If the score files exist but ``model_name`` is not one of
            the supported names.
    """
    print('----------------------------------------------------------')
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 +
          "," + emb_method_name2)

    # Raw strings: the backslashes are literal path separators, not escapes.
    results_base_dir = r'D:\hybridrec//results//'
    all_file_dir = r'D:\hybridrec\dataset\split_train_test//' + prex
    results_dir = r'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'
    # Passed to negative_samples as ``ratio``; presumably the number of
    # negative samples per positive sample -- confirm against negative_samples.
    ratio = 1

    path_scores_method1 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name1 + "_scores.mat"
    path_scores_method2 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name2 + "_scores.mat"

    # Build the requested classifier; ``model`` stays None for unknown names.
    model = None
    if model_name == "mlp":
        # Two hidden layers with 10 and 20 units.
        model = MLPClassifier(hidden_layer_sizes=(10, 20),
                              activation='relu',
                              solver='adam',
                              max_iter=200,
                              alpha=0.01,
                              batch_size=256,
                              learning_rate='constant',
                              learning_rate_init=0.001,
                              shuffle=False,
                              random_state=2020,
                              early_stopping=True,
                              validation_fraction=0.2,
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=1e-08,
                              n_iter_no_change=10)
    elif model_name == "svm":
        # NOTE(review): without probability=True an SVC exposes no
        # predict_proba; if predicted_scores_DNN relies on it this model
        # will fail there -- confirm before enabling (it slows training).
        model = SVC(C=5, random_state=42)
    elif model_name == "lr":
        # The L1 penalty needs a solver that supports it; liblinear was the
        # default before scikit-learn 0.22, the later default (lbfgs) rejects
        # penalty='l1' at fit time.
        model = LogisticRegression(C=5,
                                   penalty='l1',
                                   solver='liblinear',
                                   tol=1e-6,
                                   random_state=42)
    elif model_name == "lgbm":
        model = LGBMClassifier(num_leaves=31,
                               learning_rate=0.1,
                               n_estimators=64,
                               random_state=42,
                               n_jobs=-1)
    elif model_name == "xgb":
        model = XGBClassifier(max_depth=5,
                              learning_rate=0.1,
                              n_jobs=-1,
                              nthread=-1,
                              gamma=0.06,
                              min_child_weight=5,
                              subsample=1,
                              colsample_bytree=0.9,
                              reg_alpha=0,
                              reg_lambda=0.5,
                              random_state=42)
    elif model_name == "ld":
        model = LinearDiscriminantAnalysis(solver='lsqr')
    elif model_name == "rf":
        model = RandomForestClassifier(n_estimators=50,
                                       max_depth=20,
                                       min_samples_split=2,
                                       min_samples_leaf=5,
                                       max_features="log2",
                                       random_state=12)

    if not (os.path.exists(path_scores_method1)
            and os.path.exists(path_scores_method2)):
        print("dataset: " + graph_name + '----' + "baselines:" +
              emb_method_name1 + "," + emb_method_name2 + ': 分数未完全计算')
        return

    # Fail before the expensive loading instead of with an unbound ``model``
    # at fit time.
    if model is None:
        raise ValueError("unsupported model_name: " + repr(model_name))

    # Normalised base scores.
    scores_matrix_one = loadmat(path_scores_method1)['scores']
    scores_matrix_two = loadmat(path_scores_method2)['scores']
    # Methods outside all_embedding_methods are reduced to their strict upper
    # triangle (k=1 excludes the diagonal).
    if emb_method_name1 not in all_embedding_methods:
        scores_matrix_one = csr_matrix(np.triu(scores_matrix_one.A, k=1))
    if emb_method_name2 not in all_embedding_methods:
        scores_matrix_two = csr_matrix(np.triu(scores_matrix_two.A, k=1))
    scores_matrix_one_norm = normalize_matrix(
        csr_matrix1=csr_matrix(scores_matrix_one))
    scores_matrix_two_norm = normalize_matrix(
        csr_matrix1=csr_matrix(scores_matrix_two))

    # Binary train / test matrices.
    graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                         graph_name=graph_name,
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    graph_test_path = get_testset_path(base_dir=all_file_dir,
                                       graph_name=graph_name)
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = csr_matrix(np.triu(train_binary.A, k=1))
    test_binary = get_test_matrix_binary(graph_test_path=graph_test_path,
                                         N=train_binary.shape[0])

    # The raw score matrices are no longer needed; free them early.
    del scores_matrix_one, scores_matrix_two
    gc.collect()

    # Existing (training) edges and their two base scores.
    exist_binary = csr_matrix(np.triu(train_binary.A, k=1))
    exist_scores_one_list = (np.array(
        scores_matrix_one_norm[exist_binary > 0], dtype=float))[0]
    exist_scores_two_list = (np.array(
        scores_matrix_two_norm[exist_binary > 0], dtype=float))[0]

    # Training set: positives (label 1) stacked on top of negatives (label 0).
    X_train_1 = (np.array([exist_scores_one_list,
                           exist_scores_two_list])).T
    X_train_0 = negative_samples(
        train_binary=train_binary,
        test_binary=test_binary,
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm,
        ratio=ratio)
    Y_train_1 = np.ones(X_train_1.shape[0], dtype=int)
    Y_train_0 = np.zeros(X_train_0.shape[0], dtype=int)
    X_train = np.vstack((np.array(X_train_1), np.array(X_train_0)))
    Y_train = np.hstack((Y_train_1, Y_train_0))

    time_start = time.time()

    model.fit(X_train, Y_train)

    # Sanity output: number of samples predicted as 1 among the negatives
    # and among the positives.
    preds_0 = model.predict(X_train_0)
    preds_1 = model.predict(X_train_1)
    print(np.sum(preds_0))
    print(np.sum(preds_1))

    # Hybrid scores from the fitted model.
    scores_matrix_DNN = predicted_scores_DNN(
        model=model,
        train_binary=train_binary,
        test_binary=test_binary,
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm)
    save_DNN_hybrid_scores(scores_matrix_DNN=scores_matrix_DNN,
                           method1=emb_method_name1,
                           method2=emb_method_name2,
                           graph_results_dir=graph_results_dir,
                           dataset_name=graph_name,
                           model_name=model_name)
    scores_matrix_DNN_norm = normalize_matrix(
        csr_matrix1=scores_matrix_DNN)

    # Rasterization grids of the hybrid scores.
    DNN_raster_grids = rasterization_grids(
        binNum=DNN_binNum,
        train_binary=train_binary,
        scores_matrix_DNN=scores_matrix_DNN_norm,
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm)
    DNN_raster_grids = normalize_matrix_full(
        csr_matrix1=csr_matrix(DNN_raster_grids))
    DNN_raster_grids = better_show_grids(csr_matrix1=DNN_raster_grids)
    save_DNN_raster_scores(rastser_grids=DNN_raster_grids,
                           method1=emb_method_name1,
                           method2=emb_method_name2,
                           graph_results_dir=graph_results_dir,
                           dataset_name=graph_name,
                           model_name=model_name,
                           DNN_binNum=DNN_binNum)
    source = np.float32(DNN_raster_grids.A)
    # (5, 5) is the Gaussian kernel size; sigma 0 lets OpenCV derive it.
    result = cv2.GaussianBlur(source, (5, 5), 0)
    title = graph_name + '-' + model_name + '-' + emb_method_name1 + '-' + emb_method_name2
    plot_contourf(result=result, title=title, binNum=10)

    # Plot the stored PNR grid as well, when it is available.
    PNR_path = results_base_dir + prex + graph_name + "//" + "PNR1_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
    if is_excel_file_exist(PNR_path):
        PNR_matrix = loadmat(PNR_path)["count"]
        PNR_matrix = better_show_grids(csr_matrix1=PNR_matrix)
        source = np.float32(PNR_matrix.A)
        result = cv2.GaussianBlur(source, (5, 5), 0)
        title = graph_name + '-PNR-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf(result=result, title=title, binNum=10)

    # Evaluate the hybrid scores on the node pairs that are not training
    # edges (strict upper triangle minus the existing edges).
    nonexist_binary = csr_matrix(
        np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)
    nonexist_scores_DNN_list = (np.array(
        scores_matrix_DNN[nonexist_binary > 0], dtype=float))[0]
    L_full = int(np.sum(test_binary))
    L_array = np.array([
        int(L_full / 20),
        int(L_full / 10),
        int(L_full / 5),
        int(L_full / 2), L_full
    ])
    AP_DNN, AUC_DNN, Precision_DNN, Recall_DNN, F1score_DNN = \
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_DNN_list,
                   L_array=L_array)

    # Persist precision, recall, F1 score and AP.
    DNN_write_to_excel(DL_name=model_name,
                       dataset_name=graph_name,
                       method1=emb_method_name1,
                       method2=emb_method_name2,
                       precision_DL=Precision_DNN,
                       recall_DL=Recall_DNN,
                       F1score_DL=F1score_DNN,
                       AP_DL=AP_DNN)

    time_end = time.time()
    print("It takes : " + str((time_end - time_start) / 60.0) + "  mins.")

Exemple #3

0

Afficher le fichier

                                            train_binary=train_binary,
                                            PNR=PNR2,
                                            interval=interval,
                                            binNum=binNum)




    # weighted hybird方法的分数，0.5均权直接相加
    scores_matrix_hybrid_norm = 0.5 * scores_matrix_one_norm + 0.5 * scores_matrix_two_norm
    nonexist_scores_hybrid_list = (np.array(scores_matrix_hybrid_norm[nonexist_binary > 0], dtype=float))[0]



    # 评估evaluation
    graph_test_path=get_testset_path(base_dir=all_file_dir, graph_name=graph_name)
    test_binary=get_test_matrix_binary(graph_test_path=graph_test_path, N=N)
    L_full = int(np.sum(test_binary))
    L_array = np.array([int(L_full/20),int(L_full/10),
                        int(L_full/5), int(L_full/2),
                        L_full])

    del scores_matrix_one_norm, scores_matrix_two_norm, exist_scores_one_list, exist_scores_two_list, scores_matrix_hybrid_norm
    gc.collect()



    AP_PNR, AUC_PNR, Precision_PNR, Recall_PNR, F1score_PNR=\
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_PNR_list,

Exemple #4

0

Afficher le fichier

def auto_PNR(prex=None,
             graph_name=None,
             emb_method_name1=None,
             emb_method_name2=None):

    print('----------------------------------------------------------')
    time_start = time.time()
    # 初始化训练集和测试集的路径
    # prex = 'preprocessing_code2//'  # 改这里
    all_file_dir = 'D:\hybridrec\dataset\split_train_test//' + prex

    binNum = 50  # 改这里

    emb_method_name1 = emb_method_name1.lower()  # 改这里
    emb_method_name2 = emb_method_name2.lower()  # 改这里
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 +
          "," + emb_method_name2)
    conf_method1 = None
    conf_method2 = None
    if emb_method_name1 in all_embedding_methods:
        config_path_method1 = 'conf/' + emb_method_name1 + '.properties'
        config_method1 = configparser.ConfigParser()
        config_method1.read(config_path_method1)
        conf_method1 = dict(config_method1.items("hyperparameters"))
    if emb_method_name2 in all_embedding_methods:
        config_path_method2 = 'conf/' + emb_method_name2 + '.properties'
        config_method2 = configparser.ConfigParser()
        config_method2.read(config_path_method2)
        conf_method2 = dict(config_method2.items("hyperparameters"))

    # 初始化embedding和scores的路径
    results_dir = 'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'

    # 计算emb method 1
    if not ((emb_method_name1 == 'arope') or
            (emb_method_name1 == 'graph2gauss') or
            (is_heuristic_method(emb_method_name1) == True)):
        graph_train_path = get_trainset_path(
            base_dir=all_file_dir,
            graph_name=graph_name,
            connected_pattern=get_connp(emb_method_name1),
            from_zeros_one=get_from_zeros_one(emb_method_name1))
        graph_results_path = graph_results_dir + graph_name + '_' + emb_method_name1 + '.emb'
        if not os.path.isfile(graph_results_path):
            run_emb_method(input=graph_train_path,
                           output=graph_results_path,
                           emb_method_name=emb_method_name1)

    # 计算emb method 2
    if not ((emb_method_name2 == 'arope') or
            (emb_method_name2 == 'graph2gauss') or
            (is_heuristic_method(emb_method_name2) == True)):
        graph_train_path = get_trainset_path(
            base_dir=all_file_dir,
            graph_name=graph_name,
            connected_pattern=get_connp(emb_method_name2),
            from_zeros_one=get_from_zeros_one(emb_method_name2))
        graph_results_path = graph_results_dir + graph_name + '_' + emb_method_name2 + '.emb'
        if not os.path.isfile(graph_results_path):
            run_emb_method(input=graph_train_path,
                           output=graph_results_path,
                           emb_method_name=emb_method_name2)

    # 计算scores1
    if conf_method1 != None:
        embedding_size_method1 = int(conf_method1['embedding_size'])
    if emb_method_name1 == 'splitter':
        scores_matrix_one = inner_product_scores_splitter(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name1 == 'attentionwalk') or (emb_method_name1
                                                   == 'grarep'):
        scores_matrix_one = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name1 == 'drne') or (emb_method_name1 == 'prune'):
        scores_matrix_one = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1,
            skiprows=0,
            delimiter=' ')  # embedding_size_method有一些是要+1有一些不需要的
    elif (emb_method_name1 == 'arope'):
        scores_matrix_one = inner_product_scores_arope(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif (emb_method_name1 == 'graph2gauss'):
        scores_matrix_one = energy_kl_scores_graph2gauss(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif is_heuristic_method(emb_method_name1):
        scores_matrix_one = heuristic_scores(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir,
            heuristic_method=emb_method_name1)
    else:
        scores_matrix_one = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1 + 1,
            skiprows=1,
            delimiter=' ')

    # 计算scores2
    if conf_method2 != None:
        embedding_size_method2 = int(conf_method2['embedding_size'])
    if emb_method_name2 == 'splitter':
        scores_matrix_two = inner_product_scores_splitter(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name2 == 'attentionwalk') or (emb_method_name2
                                                   == 'grarep'):
        scores_matrix_two = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name2 == 'drne') or (emb_method_name2 == 'prune'):
        scores_matrix_two = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2,
            skiprows=0,
            delimiter=' ')
    elif (emb_method_name2 == 'arope'):
        scores_matrix_two = inner_product_scores_arope(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif (emb_method_name2 == 'graph2gauss'):
        scores_matrix_two = energy_kl_scores_graph2gauss(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif is_heuristic_method(emb_method_name2):
        scores_matrix_two = heuristic_scores(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir,
            heuristic_method=emb_method_name2)
    else:
        scores_matrix_two = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2 + 1,
            skiprows=1,
            delimiter=' ')

    # scores取上三角（注意:1、前面需要保证所有的分数在右上角或占满整个矩阵。2、前面有些是右上角，有些是占满整个矩阵）
    # scores_matrix_one_full = scores_matrix_one.A
    # scores_matrix_two_full = scores_matrix_two.A
    # plot_matrix(matrix = scores_matrix_one_full)
    # plot_matrix(matrix = scores_matrix_two_full)
    scores_matrix_one = sp.csr_matrix(np.triu(scores_matrix_one.A,
                                              k=1))  # k=1表示不包括对角线
    scores_matrix_two = sp.csr_matrix(np.triu(scores_matrix_two.A, k=1))

    # 读入train的binary数据
    graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                         graph_name=graph_name,
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = sp.csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = sp.csr_matrix(np.triu(train_binary.A, k=1))
    # train_binary_full = train_binary.A
    # 或 train_binary = sp.csr_matrix(np.array(nx.to_numpy_matrix(G)))

    # 构建exist和nonexist的binary
    exist_binary = sp.csr_matrix(np.triu(train_binary.A, k=1))  # k=1表示不包括对角线
    nonexist_binary = sp.csr_matrix(
        np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)

    # 分数归一化到[0.0, 1.0]
    scores_matrix_one_norm = normalize_matrix(csr_matrix1=scores_matrix_one)
    scores_matrix_two_norm = normalize_matrix(csr_matrix1=scores_matrix_two)
    # plot_matrix(scores_matrix_one_norm.A)
    # plot_matrix(scores_matrix_two_norm.A)

    del scores_matrix_one, scores_matrix_two
    gc.collect()

    # 划分bin
    val_max = 1.0
    val_min = 0.0
    # bin_array = sorted(divide_bin(val_max = val_max, val_min = val_min, binNum = binNum))
    interval = float((val_max - val_min) / binNum)

    # 获取exist_binary和nonexist_binary的分数
    exist_scores_one_list = (np.array(scores_matrix_one_norm[exist_binary > 0],
                                      dtype=float))[0]
    nonexist_scores_one_list = (np.array(
        scores_matrix_one_norm[nonexist_binary > 0], dtype=float))[0]
    exist_scores_two_list = (np.array(scores_matrix_two_norm[exist_binary > 0],
                                      dtype=float))[0]
    nonexist_scores_two_list = (np.array(
        scores_matrix_two_norm[nonexist_binary > 0], dtype=float))[0]
    # # 变为稀疏矩阵
    # exist_scores_one_list_csr = sp.csr_matrix(exist_scores_one_list)
    # nonexist_scores_one_list_csr = sp.csr_matrix(nonexist_scores_one_list)
    # exist_scores_two_list_csr = sp.csr_matrix(exist_scores_two_list)
    # nonexist_scores_two_list_csr = sp.csr_matrix(nonexist_scores_two_list)

    # temp = scores_matrix_one_norm[exist_binary > 0][0] # 我怕在把分数变为list的时候出问题

    # Initialize two 2-D raster grids, each of size binNum x binNum
    exist_raster_grids = np.zeros((binNum, binNum))
    nonexist_raster_grids = np.zeros((binNum, binNum))

    # 计算落在exist_raster_grids栅格的existing links的数量
    exist_links_num = len(exist_scores_one_list)
    exist_row_col_zero_num = 0  # 那些两个矩阵的分数都是0的不作统计
    for i in range(exist_links_num):
        # row_index和col_index的范围从0-->binNum-1
        if (exist_scores_one_list[i] == 0.0) & (exist_scores_two_list[i]
                                                == 0.0):
            exist_row_col_zero_num = exist_row_col_zero_num + 1
            continue
        row_index = int(
            get_row_col_index(score=exist_scores_one_list[i],
                              interval=interval,
                              binNum=binNum))
        col_index = int(
            get_row_col_index(score=exist_scores_two_list[i],
                              interval=interval,
                              binNum=binNum))
        exist_raster_grids[row_index,
                           col_index] = exist_raster_grids[row_index,
                                                           col_index] + 1

    print("exist_row_col_zero_num:" + str(exist_row_col_zero_num))
    print('sum  exist_raster_grids:' + str(np.sum(exist_raster_grids)))

    # 计算落在nonexist_raster_grids栅格的nonexisting links的数量
    nonexist_links_num = len(nonexist_scores_one_list)
    nonexist_row_col_zero_num = 0  # 那些两个矩阵的分数都是0的不作统计
    for i in range(nonexist_links_num):
        # row_index和col_index的范围从0-->binNum-1
        if (nonexist_scores_one_list[i] <= 0.0) & (nonexist_scores_two_list[i]
                                                   <= 0.0):
            nonexist_row_col_zero_num = nonexist_row_col_zero_num + 1
            continue
        row_index = int(
            get_row_col_index(score=nonexist_scores_one_list[i],
                              interval=interval,
                              binNum=binNum))
        col_index = int(
            get_row_col_index(score=nonexist_scores_two_list[i],
                              interval=interval,
                              binNum=binNum))

        nonexist_raster_grids[row_index,
                              col_index] = nonexist_raster_grids[row_index,
                                                                 col_index] + 1

    print("nonexist_row_col_zero_num:" + str(nonexist_row_col_zero_num))
    print('sum  nonexist_raster_grids:' + str(np.sum(nonexist_raster_grids)))

    # 计算PNR分数
    N = train_binary.shape[0]
    print("Graph size：" + str(N) + '\n')
    L_T = np.sum(train_binary.A)
    O = N * (N - 1) / 2
    coefficient = (O - L_T) / L_T
    PNR1 = coefficient * (exist_raster_grids / (nonexist_raster_grids + 1)
                          )  # 分母加1避免出现inf或nan，不影响evaluation但是可能好看
    PNR2 = (exist_raster_grids / nonexist_raster_grids)  # inf和nan置为0
    PNR2[np.isnan(PNR2)] = 0
    PNR2[np.isinf(PNR2)] = 0
    PNR2 = coefficient * PNR2

    # Plotting (note: the heatmap axes start from the top-left corner, not the bottom-left corner one might expect)
    # sns.heatmap(PNR1, cmap='Reds')
    # plt.savefig(graph_results_dir + emb_method_name1 +'_'+ emb_method_name2 + '_' +'bin_' + str(binNum) + "_PNR1.jpg")
    # plt.show()
    # sns.heatmap(PNR2, cmap='Reds')
    # plt.savefig(graph_results_dir + emb_method_name1 +'_'+ emb_method_name2 + '_'+ 'bin_' + str(binNum) + "_PNR2.jpg")
    # plt.show()
    # plt.matshow(PNR1) # 好丑
    # plt.show()

    # 保存（exist_raster_grids、nonexist_raster_grids、PNR1、PNR2）
    save_ndarray_to_mat(exist_raster_grids, 'exist_raster_grids',
                        graph_results_dir, graph_name, emb_method_name1,
                        emb_method_name2, binNum)
    save_ndarray_to_mat(nonexist_raster_grids, 'nonexist_raster_grids',
                        graph_results_dir, graph_name, emb_method_name1,
                        emb_method_name2, binNum)
    save_ndarray_to_mat(PNR1, 'PNR1', graph_results_dir, graph_name,
                        emb_method_name1, emb_method_name2, binNum)
    save_ndarray_to_mat(PNR2, 'PNR2', graph_results_dir, graph_name,
                        emb_method_name1, emb_method_name2, binNum)

    # PNR调整分数(只调整non-existing link的部分)
    nonexist_scores_PNR_list = transfer_scores_PNR(
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm,
        train_binary=train_binary,
        PNR=PNR2,
        interval=interval,
        binNum=binNum)

    # Scores of the weighted hybrid method: the two normalized score matrices are added with equal weights of 0.5
    scores_matrix_hybrid_norm = 0.5 * scores_matrix_one_norm + 0.5 * scores_matrix_two_norm
    nonexist_scores_hybrid_list = (np.array(
        scores_matrix_hybrid_norm[nonexist_binary > 0], dtype=float))[0]

    # 评估evaluation
    graph_test_path = get_testset_path(base_dir=all_file_dir,
                                       graph_name=graph_name)
    test_binary = get_test_matrix_binary(graph_test_path=graph_test_path, N=N)
    L_full = int(np.sum(test_binary))
    L_array = np.array([
        int(L_full / 20),
        int(L_full / 10),
        int(L_full / 5),
        int(L_full / 2), L_full
    ])

    del scores_matrix_one_norm, scores_matrix_two_norm, exist_scores_one_list, exist_scores_two_list, scores_matrix_hybrid_norm
    gc.collect()


    AP_PNR, AUC_PNR, Precision_PNR, Recall_PNR, F1score_PNR=\
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_PNR_list,
                   L_array=L_array)
    AP_method1, AUC_method1, Precision_method1, Recall_method1, F1score_method1=\
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_one_list,
                   L_array=L_array)
    AP_method2, AUC_method2, Precision_method2, Recall_method2, F1score_method2=\
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_two_list,
                   L_array=L_array)
    AP_weighted, AUC_weighted, Precision_weighted, Recall_weighted, F1score_weighted=\
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_hybrid_list,
                   L_array=L_array)

    print('AP_PNR:  ' + str(AP_PNR))
    print('AP_method1:  ' + str(AP_method1))
    print('AP_method2:  ' + str(AP_method2))
    print('AP_weighted:  ' + str(AP_weighted))
    print('\n')
    print('AUC_PNR:  ' + str(AUC_PNR))
    print('AUC_method1:  ' + str(AUC_method1))
    print('AUC_method2:  ' + str(AUC_method2))
    print('AUC_weighted:  ' + str(AUC_weighted))
    print('\n')
    print('Precision_PNR:  ' + str(Precision_PNR))
    print('Precision_method1:  ' + str(Precision_method1))
    print('Precision_method2:  ' + str(Precision_method2))
    print('Precision_weighted:  ' + str(Precision_weighted))
    print('\n')
    print('Recall_PNR:  ' + str(Recall_PNR))
    print('Recall_method1:  ' + str(Recall_method1))
    print('Recall_method2:  ' + str(Recall_method2))
    print('Recall_weighted:  ' + str(Recall_weighted))
    print('\n')
    print('F1score_PNR:  ' + str(F1score_PNR))
    print('F1score_method1:  ' + str(F1score_method1))
    print('F1score_method2:  ' + str(F1score_method2))
    print('F1score_weighted:  ' + str(F1score_weighted))
    print('\n')

    write_to_excel(graph_name, emb_method_name1, emb_method_name2,
                   Precision_PNR, Precision_method1, Precision_method2,
                   Precision_weighted, Recall_PNR, Recall_method1,
                   Recall_method2, Recall_weighted, F1score_PNR,
                   F1score_method1, F1score_method2, F1score_weighted, AP_PNR,
                   AP_method1, AP_method2, AP_weighted, AUC_PNR, AUC_method1,
                   AUC_method2, AUC_weighted)

    time_end = time.time()
    print("time span:  " + str((time_end - time_start) / 60.00) + "  mins")
    # facebook_combined：bin=5, 1.5分钟
    # facebook_combined：cn和pearson\aa和cn花了3.5分钟
    # facebook_combined：graphdistance和cn花了11分钟
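    # facebook_combined: the PNR matrix for graphdistance and cn is all zeros
    # facebook_combined: attentionwalk and prone took 7.5 minutes
    # facebook_combined: every combination involving rootedpagerank performs poorly;
    # arope is slightly better than PNR; SDNE and PRUNE perform very poorly; drne and graph2gauss are also extremely poor, but perform extremely well after PNR fusion;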

    # blogcatalog:aa和ja花了3小时
    # （path based--katz和graphdistance都十分慢，neighbor based和rank based很快）

    # google 15000 nodes: 2.5小时
    print(
        '--------------------------------------------------------------------------------'
    )
    pass