def auto_overlap(prex=None, graph_name=None, emb_method_name1=None, emb_method_name2=None, binNum=None):
    """Plot the thresholded rasterization grids of four hybrid methods.

    For the PNR, "plus", "multiply" and "mlp" hybrids of two baseline
    methods, the stored count grid is loaded, a per-hybrid threshold is
    derived from the scores returned by ``transfer_scores_PNR``, every grid
    cell strictly above that threshold is set to 1 (all other cells to 0),
    the binary mask is Gaussian-blurred and passed to
    ``plot_contourf_overlap`` with its own colour.

    If either baseline score file is missing, a notice is printed and the
    function returns without doing anything else.

    Args:
        prex: Path fragment concatenated verbatim onto the hard-coded base
            directories (presumably ends with a path separator -- confirm
            against callers).
        graph_name: Dataset name; used in directory names, file names and
            plot titles.
        emb_method_name1: Name of the first baseline method.
        emb_method_name2: Name of the second baseline method.
        binNum: Number of bins per axis; the bin width handed to
            ``transfer_scores_PNR`` is ``1.0 / binNum``.

    Returns:
        None.
    """
    time_start = time.time()
    print('----------------------------------------------------------')
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 + "," + emb_method_name2)

    # Raw strings keep the exact same path text while avoiding the invalid
    # escape sequences (\h, \d, \s) of the plain literals.
    results_base_dir = r'D:\hybridrec//results//'
    all_file_dir = r'D:\hybridrec\dataset\split_train_test//' + prex
    scores_dir = results_base_dir + prex + graph_name + "//"

    path_scores_method1 = scores_dir + graph_name + "_" + emb_method_name1 + "_scores.mat"
    path_scores_method2 = scores_dir + graph_name + "_" + emb_method_name2 + "_scores.mat"
    if not (os.path.exists(path_scores_method1) and os.path.exists(path_scores_method2)):
        print("dataset: " + graph_name + '----' + "baselines:" + emb_method_name1 + "," + emb_method_name2 + ': 分数未完全计算')
        return

    # Normalised baseline scores.
    scores_matrix_one = loadmat(path_scores_method1)['scores']
    scores_matrix_two = loadmat(path_scores_method2)['scores']
    if emb_method_name1 not in all_embedding_methods:
        # k=1 keeps the strict upper triangle (diagonal excluded).
        scores_matrix_one = csr_matrix(np.triu(scores_matrix_one.A, k=1))
    if emb_method_name2 not in all_embedding_methods:
        scores_matrix_two = csr_matrix(np.triu(scores_matrix_two.A, k=1))
    # NOTE(review): the original author left a (truncated) remark that
    # dropping the csr_matrix() wrapper here causes a problem -- keep it.
    scores_matrix_one_norm = normalize_matrix(csr_matrix1=csr_matrix(scores_matrix_one))
    scores_matrix_two_norm = normalize_matrix(csr_matrix1=csr_matrix(scores_matrix_two))

    # Binary train / test adjacency (strict upper triangle for train).
    graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                         graph_name=graph_name,
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    graph_test_path = get_testset_path(base_dir=all_file_dir, graph_name=graph_name)
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = csr_matrix(np.triu(train_binary.A, k=1))
    test_binary = get_test_matrix_binary(graph_test_path=graph_test_path,
                                         N=train_binary.shape[0])

    # (title label, grid-file prefix, plot colour); list order is the
    # plotting order.
    hybrids = [
        ('PNR', 'PNR2_', 'Red'),
        ('plus', 'plus_', 'green'),
        ('multiply', 'multiply_', 'blue'),
        ('mlp', 'mlp_', 'purple'),
    ]
    # NOTE(review): the grid files are always the "_50_count.mat" ones,
    # independent of binNum -- presumably binNum is expected to be 50.
    pair_suffix = graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
    raster_grids = {
        label: loadmat(scores_dir + prefix + pair_suffix)["count"]
        for label, prefix, _ in hybrids
    }

    interval = float((1.0 - 0.0) / binNum)
    E_test = np.sum(test_binary.A)

    # Binarise every grid before plotting anything, so a failure in any
    # hybrid surfaces before the first figure is drawn.
    binary_grids = {}
    for label, _, _ in hybrids:
        grid = raster_grids[label]
        nonexist_scores_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=grid,
            interval=interval,
            binNum=binNum)
        # Threshold taken with L = |E_test|; the author also experimented
        # with L = |E_test| / 2.
        thresold = get_list_thresold(nonexist_scores_list, L=E_test)
        # One comparison decides both classes. Zeroing "<= threshold" and
        # then setting ">= threshold" to 1 would turn the freshly zeroed
        # cells into ones whenever the threshold is <= 0.
        binary_grids[label] = np.float32(grid.A > thresold)

    for label, _, color in hybrids:
        # 5x5 Gaussian kernel, standard-deviation argument 0.
        result = cv2.GaussianBlur(binary_grids[label], (5, 5), 0)
        title = graph_name + '-' + label + '-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=color)

    time_end = time.time()
    print("It takes : " + str((time_end - time_start) / 60.0) + " mins.")
def auto_DNN(prex=None, graph_name=None, emb_method_name1=None, emb_method_name2=None, model_name=None, DNN_binNum=None):
    """Train a classifier on two baselines' scores and evaluate it as a hybrid.

    Each sample is the pair (normalised score of method 1, normalised score
    of method 2) of one node pair. Positives are the training links,
    negatives come from ``negative_samples``. The fitted model is passed to
    ``predicted_scores_DNN``; the resulting score matrix is saved,
    rasterised, plotted (next to the stored PNR1 grid when that file
    exists), evaluated with ``evaluators`` and written out with
    ``DNN_write_to_excel``.

    If either baseline score file is missing, a notice is printed and the
    function returns without doing anything else.

    Args:
        prex: Path fragment concatenated verbatim onto the hard-coded base
            directories (presumably ends with a path separator -- confirm
            against callers).
        graph_name: Dataset name; used in paths, file names and titles.
        emb_method_name1: Name of the first baseline method.
        emb_method_name2: Name of the second baseline method.
        model_name: One of "mlp", "svm", "lr", "lgbm", "xgb", "ld", "rf".
        DNN_binNum: Bin count forwarded to ``rasterization_grids`` and
            ``save_DNN_raster_scores``.

    Returns:
        None.

    Raises:
        ValueError: If the score files exist but ``model_name`` is not one
            of the supported names.
    """
    print('----------------------------------------------------------')
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 + "," + emb_method_name2)

    # Raw strings keep the exact same path text while avoiding the invalid
    # escape sequences (\h, \d, \s) of the plain literals.
    results_base_dir = r'D:\hybridrec//results//'
    all_file_dir = r'D:\hybridrec\dataset\split_train_test//' + prex
    results_dir = r'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'
    scores_dir = results_base_dir + prex + graph_name + "//"

    # Forwarded to negative_samples; per the author the number of negatives
    # is `ratio` times the number of positives. Author's observation on
    # facebook_combined: a smaller ratio gave higher accuracy and ran faster.
    ratio = 1

    path_scores_method1 = scores_dir + graph_name + "_" + emb_method_name1 + "_scores.mat"
    path_scores_method2 = scores_dir + graph_name + "_" + emb_method_name2 + "_scores.mat"
    if not (os.path.exists(path_scores_method1) and os.path.exists(path_scores_method2)):
        print("dataset: " + graph_name + '----' + "baselines:" + emb_method_name1 + "," + emb_method_name2 + ': 分数未完全计算')
        return

    # Fail on an unsupported model name before any expensive loading.
    model = _dnn_build_model(model_name)

    # Normalised baseline scores.
    scores_matrix_one = loadmat(path_scores_method1)['scores']
    scores_matrix_two = loadmat(path_scores_method2)['scores']
    if emb_method_name1 not in all_embedding_methods:
        # k=1 keeps the strict upper triangle (diagonal excluded).
        scores_matrix_one = csr_matrix(np.triu(scores_matrix_one.A, k=1))
    if emb_method_name2 not in all_embedding_methods:
        scores_matrix_two = csr_matrix(np.triu(scores_matrix_two.A, k=1))
    scores_matrix_one_norm = normalize_matrix(csr_matrix1=csr_matrix(scores_matrix_one))
    scores_matrix_two_norm = normalize_matrix(csr_matrix1=csr_matrix(scores_matrix_two))

    # Binary train / test adjacency (strict upper triangle for train).
    graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                         graph_name=graph_name,
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    graph_test_path = get_testset_path(base_dir=all_file_dir, graph_name=graph_name)
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = csr_matrix(np.triu(train_binary.A, k=1))
    test_binary = get_test_matrix_binary(graph_test_path=graph_test_path,
                                         N=train_binary.shape[0])

    del scores_matrix_one, scores_matrix_two
    gc.collect()

    # train_binary is already strictly upper triangular, so it doubles as
    # the mask of existing links without another dense round trip.
    exist_binary = train_binary
    exist_scores_one_list = (np.array(scores_matrix_one_norm[exist_binary > 0], dtype=float))[0]
    exist_scores_two_list = (np.array(scores_matrix_two_norm[exist_binary > 0], dtype=float))[0]

    # Training set: positives (label 1) stacked on negatives (label 0).
    X_train_1 = (np.array([exist_scores_one_list, exist_scores_two_list])).T
    X_train_0 = negative_samples(train_binary=train_binary,
                                 test_binary=test_binary,
                                 scores_matrix_one_norm=scores_matrix_one_norm,
                                 scores_matrix_two_norm=scores_matrix_two_norm,
                                 ratio=ratio)
    Y_train_1 = np.ones(X_train_1.shape[0], dtype=int)
    Y_train_0 = np.zeros(X_train_0.shape[0], dtype=int)
    X_train = np.vstack((np.array(X_train_1), np.array(X_train_0)))
    Y_train = np.hstack((Y_train_1, Y_train_0))

    time_start = time.time()
    model.fit(X_train, Y_train)

    # Sanity output: number of negatives / positives predicted as 1.
    preds_0 = model.predict(X_train_0)
    preds_1 = model.predict(X_train_1)
    print(np.sum(preds_0))
    print(np.sum(preds_1))

    # Hybrid scores from the fitted model.
    scores_matrix_DNN = predicted_scores_DNN(
        model=model,
        train_binary=train_binary,
        test_binary=test_binary,
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm)
    save_DNN_hybrid_scores(scores_matrix_DNN=scores_matrix_DNN,
                           method1=emb_method_name1,
                           method2=emb_method_name2,
                           graph_results_dir=graph_results_dir,
                           dataset_name=graph_name,
                           model_name=model_name)
    scores_matrix_DNN_norm = normalize_matrix(csr_matrix1=scores_matrix_DNN)

    # Rasterization grid of the hybrid scores.
    DNN_raster_grids = rasterization_grids(
        binNum=DNN_binNum,
        train_binary=train_binary,
        scores_matrix_DNN=scores_matrix_DNN_norm,
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm)
    # Author's note: taking log10 of the grid here failed because of -inf.
    DNN_raster_grids = normalize_matrix_full(csr_matrix1=csr_matrix(DNN_raster_grids))
    DNN_raster_grids = better_show_grids(csr_matrix1=DNN_raster_grids)
    save_DNN_raster_scores(rastser_grids=DNN_raster_grids,
                           method1=emb_method_name1,
                           method2=emb_method_name2,
                           graph_results_dir=graph_results_dir,
                           dataset_name=graph_name,
                           model_name=model_name,
                           DNN_binNum=DNN_binNum)

    source = np.float32(DNN_raster_grids.A)
    # 5x5 Gaussian kernel, standard-deviation argument 0.
    result = cv2.GaussianBlur(source, (5, 5), 0)
    title = graph_name + '-' + model_name + '-' + emb_method_name1 + '-' + emb_method_name2
    plot_contourf(result=result, title=title, binNum=10)

    # Plot the stored PNR1 grid as well, when it is available.
    PNR_path = scores_dir + "PNR1_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
    if is_excel_file_exist(PNR_path):
        PNR_matrix = loadmat(PNR_path)["count"]
        PNR_matrix = better_show_grids(csr_matrix1=PNR_matrix)
        source = np.float32(PNR_matrix.A)
        result = cv2.GaussianBlur(source, (5, 5), 0)
        title = graph_name + '-PNR-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf(result=result, title=title, binNum=10)

    # Evaluate the hybrid on the node pairs that are not training links.
    nonexist_binary = csr_matrix(
        np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)
    nonexist_scores_DNN_list = (np.array(scores_matrix_DNN[nonexist_binary > 0], dtype=float))[0]

    L_full = int(np.sum(test_binary))
    L_array = np.array([
        int(L_full / 20),
        int(L_full / 10),
        int(L_full / 5),
        int(L_full / 2), L_full
    ])
    AP_DNN, AUC_DNN, Precision_DNN, Recall_DNN, F1score_DNN = evaluators(
        train_binary=train_binary,
        test_binary=test_binary,
        scores_list=nonexist_scores_DNN_list,
        L_array=L_array)

    # Persist precision, recall, F1 and AP.
    DNN_write_to_excel(DL_name=model_name,
                       dataset_name=graph_name,
                       method1=emb_method_name1,
                       method2=emb_method_name2,
                       precision_DL=Precision_DNN,
                       recall_DL=Recall_DNN,
                       F1score_DL=F1score_DNN,
                       AP_DL=AP_DNN)

    time_end = time.time()
    print("It takes : " + str((time_end - time_start) / 60.0) + " mins.")


def _dnn_build_model(model_name):
    """Return a fresh, unfitted classifier for ``model_name``; raise ValueError if unknown."""
    if model_name == "mlp":
        # hidden_layer_sizes=(10, 20): two hidden layers with 10 and 20 units.
        return MLPClassifier(hidden_layer_sizes=(10, 20),
                             activation='relu',
                             solver='adam',
                             max_iter=200,
                             alpha=0.01,
                             batch_size=256,
                             learning_rate='constant',
                             learning_rate_init=0.001,
                             shuffle=False,
                             random_state=2020,
                             early_stopping=True,
                             validation_fraction=0.2,
                             beta_1=0.9,
                             beta_2=0.999,
                             epsilon=1e-08,
                             n_iter_no_change=10)
    if model_name == "svm":
        # NOTE(review): the author marked this model as problematic. SVC
        # only offers predict_proba when built with probability=True --
        # check whether predicted_scores_DNN relies on it.
        return SVC(C=5, random_state=42)
    if model_name == "lr":
        # The L1 penalty needs a solver that supports it; liblinear does,
        # whereas the default solver of scikit-learn >= 0.22 (lbfgs) rejects it.
        return LogisticRegression(C=5,
                                  penalty='l1',
                                  solver='liblinear',
                                  tol=1e-6,
                                  random_state=42)
    if model_name == "lgbm":
        return LGBMClassifier(num_leaves=31,
                              learning_rate=0.1,
                              n_estimators=64,
                              random_state=42,
                              n_jobs=-1)
    if model_name == "xgb":
        return XGBClassifier(max_depth=5,
                             learning_rate=0.1,
                             n_jobs=-1,
                             nthread=-1,
                             gamma=0.06,
                             min_child_weight=5,
                             subsample=1,
                             colsample_bytree=0.9,
                             reg_alpha=0,
                             reg_lambda=0.5,
                             random_state=42)
    if model_name == "ld":
        return LinearDiscriminantAnalysis(solver='lsqr')
    if model_name == "rf":
        return RandomForestClassifier(n_estimators=50,
                                      max_depth=20,
                                      min_samples_split=2,
                                      min_samples_leaf=5,
                                      max_features="log2",
                                      random_state=12)
    raise ValueError("unsupported model_name: " + repr(model_name))
train_binary=train_binary, PNR=PNR2, interval=interval, binNum=binNum) # weighted hybird方法的分数,0.5均权直接相加 scores_matrix_hybrid_norm = 0.5 * scores_matrix_one_norm + 0.5 * scores_matrix_two_norm nonexist_scores_hybrid_list = (np.array(scores_matrix_hybrid_norm[nonexist_binary > 0], dtype=float))[0] # 评估evaluation graph_test_path=get_testset_path(base_dir=all_file_dir, graph_name=graph_name) test_binary=get_test_matrix_binary(graph_test_path=graph_test_path, N=N) L_full = int(np.sum(test_binary)) L_array = np.array([int(L_full/20),int(L_full/10), int(L_full/5), int(L_full/2), L_full]) del scores_matrix_one_norm, scores_matrix_two_norm, exist_scores_one_list, exist_scores_two_list, scores_matrix_hybrid_norm gc.collect() AP_PNR, AUC_PNR, Precision_PNR, Recall_PNR, F1score_PNR=\ evaluators(train_binary=train_binary, test_binary=test_binary, scores_list=nonexist_scores_PNR_list,
def auto_PNR(prex=None, graph_name=None, emb_method_name1=None, emb_method_name2=None):
    """Build the PNR hybrid of two link-prediction methods and evaluate it.

    Steps: obtain both methods' score matrices (running the embedding
    method first when its ``.emb`` file is missing), normalise them, count
    existing and non-existing training links per cell of a 50x50 grid over
    the two score axes, derive the PNR1 / PNR2 grids, save all four grids,
    re-score the non-existing links with PNR2 via ``transfer_scores_PNR``,
    and evaluate PNR, both baselines and their 0.5/0.5 average with
    ``evaluators``. Metrics are printed and passed to ``write_to_excel``.

    Args:
        prex: Path fragment concatenated verbatim onto the hard-coded base
            directories (presumably ends with a path separator -- confirm
            against callers).
        graph_name: Dataset name; used in paths and file names.
        emb_method_name1: Name of the first method (lower-cased here).
        emb_method_name2: Name of the second method (lower-cased here).

    Returns:
        None.

    Raises:
        ValueError: If a method needs an embedding size but no
            hyperparameter config was loaded for it.
    """
    print('----------------------------------------------------------')
    time_start = time.time()

    # Raw strings keep the exact same path text while avoiding the invalid
    # escape sequences (\h, \d, \s) of the plain literals.
    all_file_dir = r'D:\hybridrec\dataset\split_train_test//' + prex
    binNum = 50
    emb_method_name1 = emb_method_name1.lower()
    emb_method_name2 = emb_method_name2.lower()
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 + "," + emb_method_name2)

    conf_method1 = _pnr_load_hyperparameters(emb_method_name1)
    conf_method2 = _pnr_load_hyperparameters(emb_method_name2)

    results_dir = r'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'

    _pnr_ensure_embedding(emb_method_name1, all_file_dir, graph_results_dir, graph_name)
    _pnr_ensure_embedding(emb_method_name2, all_file_dir, graph_results_dir, graph_name)

    scores_matrix_one = _pnr_score_matrix(emb_method_name1, conf_method1,
                                          all_file_dir, graph_results_dir, graph_name)
    scores_matrix_two = _pnr_score_matrix(emb_method_name2, conf_method2,
                                          all_file_dir, graph_results_dir, graph_name)

    # Keep the strict upper triangle (k=1 excludes the diagonal). Author's
    # note: the scorers must leave their scores either in the upper
    # triangle or across the full matrix; some do one, some the other.
    scores_matrix_one = sp.csr_matrix(np.triu(scores_matrix_one.A, k=1))
    scores_matrix_two = sp.csr_matrix(np.triu(scores_matrix_two.A, k=1))

    # Binary training adjacency, strict upper triangle.
    graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                         graph_name=graph_name,
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = sp.csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = sp.csr_matrix(np.triu(train_binary.A, k=1))

    # train_binary is already strictly upper triangular, so it doubles as
    # the mask of existing links; its complement marks the other pairs.
    exist_binary = train_binary
    nonexist_binary = sp.csr_matrix(
        np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)

    # Per the author, normalize_matrix maps the scores into [0.0, 1.0].
    scores_matrix_one_norm = normalize_matrix(csr_matrix1=scores_matrix_one)
    scores_matrix_two_norm = normalize_matrix(csr_matrix1=scores_matrix_two)
    del scores_matrix_one, scores_matrix_two
    gc.collect()

    # Bin width over the normalised score range.
    val_max = 1.0
    val_min = 0.0
    interval = float((val_max - val_min) / binNum)

    # Scores of existing / non-existing links under both methods.
    exist_scores_one_list = (np.array(scores_matrix_one_norm[exist_binary > 0], dtype=float))[0]
    nonexist_scores_one_list = (np.array(scores_matrix_one_norm[nonexist_binary > 0], dtype=float))[0]
    exist_scores_two_list = (np.array(scores_matrix_two_norm[exist_binary > 0], dtype=float))[0]
    nonexist_scores_two_list = (np.array(scores_matrix_two_norm[nonexist_binary > 0], dtype=float))[0]

    # Links per grid cell; pairs with no score under either method are
    # skipped and only counted.
    exist_raster_grids, exist_row_col_zero_num = _pnr_count_links(
        exist_scores_one_list, exist_scores_two_list, interval, binNum,
        skip_nonpositive=False)
    print("exist_row_col_zero_num:" + str(exist_row_col_zero_num))
    print('sum exist_raster_grids:' + str(np.sum(exist_raster_grids)))

    nonexist_raster_grids, nonexist_row_col_zero_num = _pnr_count_links(
        nonexist_scores_one_list, nonexist_scores_two_list, interval, binNum,
        skip_nonpositive=True)
    print("nonexist_row_col_zero_num:" + str(nonexist_row_col_zero_num))
    print('sum nonexist_raster_grids:' + str(np.sum(nonexist_raster_grids)))

    # PNR grids.
    N = train_binary.shape[0]
    print("Graph size:" + str(N) + '\n')
    L_T = np.sum(train_binary.A)
    O = N * (N - 1) / 2
    coefficient = (O - L_T) / L_T
    # PNR1: +1 in the denominator avoids inf/nan; per the author this does
    # not affect evaluation but may look nicer when plotted.
    PNR1 = coefficient * (exist_raster_grids / (nonexist_raster_grids + 1))
    # PNR2: plain ratio, 0 where a cell holds no non-existing link (the
    # cells that would otherwise come out as inf or nan).
    PNR2 = coefficient * np.divide(exist_raster_grids,
                                   nonexist_raster_grids,
                                   out=np.zeros_like(exist_raster_grids),
                                   where=nonexist_raster_grids != 0)

    save_ndarray_to_mat(exist_raster_grids, 'exist_raster_grids',
                        graph_results_dir, graph_name, emb_method_name1,
                        emb_method_name2, binNum)
    save_ndarray_to_mat(nonexist_raster_grids, 'nonexist_raster_grids',
                        graph_results_dir, graph_name, emb_method_name1,
                        emb_method_name2, binNum)
    save_ndarray_to_mat(PNR1, 'PNR1', graph_results_dir, graph_name,
                        emb_method_name1, emb_method_name2, binNum)
    save_ndarray_to_mat(PNR2, 'PNR2', graph_results_dir, graph_name,
                        emb_method_name1, emb_method_name2, binNum)

    # PNR-adjusted scores (author: only non-existing links are adjusted).
    nonexist_scores_PNR_list = transfer_scores_PNR(
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm,
        train_binary=train_binary,
        PNR=PNR2,
        interval=interval,
        binNum=binNum)

    # Weighted hybrid baseline: equal-weight sum of both normalised scores.
    scores_matrix_hybrid_norm = 0.5 * scores_matrix_one_norm + 0.5 * scores_matrix_two_norm
    nonexist_scores_hybrid_list = (np.array(scores_matrix_hybrid_norm[nonexist_binary > 0], dtype=float))[0]

    # Evaluation set-up.
    graph_test_path = get_testset_path(base_dir=all_file_dir, graph_name=graph_name)
    test_binary = get_test_matrix_binary(graph_test_path=graph_test_path, N=N)
    L_full = int(np.sum(test_binary))
    L_array = np.array([
        int(L_full / 20),
        int(L_full / 10),
        int(L_full / 5),
        int(L_full / 2), L_full
    ])

    # Free the large intermediates before evaluating.
    del scores_matrix_one_norm, scores_matrix_two_norm, exist_scores_one_list, exist_scores_two_list, scores_matrix_hybrid_norm
    gc.collect()

    labels = ('PNR', 'method1', 'method2', 'weighted')
    score_lists = (nonexist_scores_PNR_list, nonexist_scores_one_list,
                   nonexist_scores_two_list, nonexist_scores_hybrid_list)
    results = {}
    for label, scores_list in zip(labels, score_lists):
        AP, AUC, Precision, Recall, F1score = evaluators(
            train_binary=train_binary,
            test_binary=test_binary,
            scores_list=scores_list,
            L_array=L_array)
        results[label] = {'AP': AP, 'AUC': AUC, 'Precision': Precision,
                          'Recall': Recall, 'F1score': F1score}

    for metric in ('AP', 'AUC', 'Precision', 'Recall', 'F1score'):
        for label in labels:
            print(metric + '_' + label + ': ' + str(results[label][metric]))
        print('\n')

    # write_to_excel takes the metrics positionally: Precision, Recall,
    # F1score, AP, AUC -- each as (PNR, method1, method2, weighted).
    excel_values = [results[label][metric]
                    for metric in ('Precision', 'Recall', 'F1score', 'AP', 'AUC')
                    for label in labels]
    write_to_excel(graph_name, emb_method_name1, emb_method_name2, *excel_values)

    time_end = time.time()
    print("time span: " + str((time_end - time_start) / 60.00) + " mins")
    # Author's run notes:
    # - facebook_combined: bin=5 took 1.5 minutes
    # - facebook_combined: cn+pearson and aa+cn took 3.5 minutes
    # - facebook_combined: graphdistance+cn took 11 minutes and produced an
    #   all-zero PNR matrix
    # - facebook_combined: attentionwalk+prone took 7.5 minutes
    # - facebook_combined: every pairing with rootedpagerank performed poorly
    # - arope was slightly better than PNR; SDNE and PRUNE were very poor;
    #   drne and graph2gauss were also very poor alone but did very well
    #   after PNR fusion
    # - blogcatalog: aa+ja took 3 hours
    # - path-based methods (katz, graphdistance) are very slow;
    #   neighbour-based and rank-based ones are fast
    # - google, 15000 nodes: 2.5 hours
    print(
        '--------------------------------------------------------------------------------'
    )


def _pnr_load_hyperparameters(emb_method_name):
    """Return the "hyperparameters" config section of an embedding method, or None for other methods."""
    if emb_method_name not in all_embedding_methods:
        return None
    config = configparser.ConfigParser()
    config.read('conf/' + emb_method_name + '.properties')
    return dict(config.items("hyperparameters"))


def _pnr_ensure_embedding(emb_method_name, all_file_dir, graph_results_dir, graph_name):
    """Run the embedding method unless it is arope/graph2gauss/heuristic or its .emb file already exists."""
    if emb_method_name in ('arope', 'graph2gauss') or is_heuristic_method(emb_method_name):
        return
    graph_train_path = get_trainset_path(
        base_dir=all_file_dir,
        graph_name=graph_name,
        connected_pattern=get_connp(emb_method_name),
        from_zeros_one=get_from_zeros_one(emb_method_name))
    graph_results_path = graph_results_dir + graph_name + '_' + emb_method_name + '.emb'
    if not os.path.isfile(graph_results_path):
        run_emb_method(input=graph_train_path,
                       output=graph_results_path,
                       emb_method_name=emb_method_name)


def _pnr_score_matrix(emb_method_name, conf, all_file_dir, graph_results_dir, graph_name):
    """Return the pairwise score matrix of one method, using the scorer that fits its output format."""
    embedding_size = None if conf is None else int(conf['embedding_size'])

    def from_embedding(scorer, col_end_offset, skiprows, delimiter):
        # Embedding files differ in layout: some need embedding_size + 1
        # columns and a skipped first row, others do not.
        if embedding_size is None:
            raise ValueError("no embedding_size configured for method " + repr(emb_method_name))
        return scorer(graph_results_dir=graph_results_dir,
                      dataset_name=graph_name,
                      emb_method_name=emb_method_name,
                      col_start=0,
                      col_end=embedding_size + col_end_offset,
                      skiprows=skiprows,
                      delimiter=delimiter)

    if emb_method_name == 'splitter':
        return from_embedding(inner_product_scores_splitter, 1, 1, ',')
    if emb_method_name in ('attentionwalk', 'grarep'):
        return from_embedding(inner_product_scores, 1, 1, ',')
    if emb_method_name in ('drne', 'prune'):
        return from_embedding(inner_product_scores, 0, 0, ' ')
    if emb_method_name == 'arope':
        return inner_product_scores_arope(all_file_dir=all_file_dir,
                                          graph_name=graph_name,
                                          graph_results_dir=graph_results_dir)
    if emb_method_name == 'graph2gauss':
        return energy_kl_scores_graph2gauss(all_file_dir=all_file_dir,
                                            graph_name=graph_name,
                                            graph_results_dir=graph_results_dir)
    if is_heuristic_method(emb_method_name):
        return heuristic_scores(all_file_dir=all_file_dir,
                                graph_name=graph_name,
                                graph_results_dir=graph_results_dir,
                                heuristic_method=emb_method_name)
    return from_embedding(inner_product_scores, 1, 1, ' ')


def _pnr_count_links(scores_one, scores_two, interval, binNum, skip_nonpositive):
    """Count links per (method-1 bin, method-2 bin) cell; return (grid, number of skipped links)."""
    grid = np.zeros((binNum, binNum))
    skipped = 0
    for score_one, score_two in zip(scores_one, scores_two):
        # Links without a score under both methods are left out of the grid.
        if skip_nonpositive:
            unscored = score_one <= 0.0 and score_two <= 0.0
        else:
            unscored = score_one == 0.0 and score_two == 0.0
        if unscored:
            skipped += 1
            continue
        row_index = int(get_row_col_index(score=score_one, interval=interval, binNum=binNum))
        col_index = int(get_row_col_index(score=score_two, interval=interval, binNum=binNum))
        grid[row_index, col_index] += 1
    return grid, skipped