def _build_DNN_model(model_name):
    """Instantiate the classifier selected by ``model_name``.

    Supported names: "mlp", "svm", "lr", "lgbm", "xgb", "ld", "rf".
    Construction is deferred through a dispatch table so that only the
    requested estimator class is looked up and built.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    builders = {
        # Two hidden layers with 10 and 20 neurons respectively.
        "mlp": lambda: MLPClassifier(
            hidden_layer_sizes=(10, 20), activation='relu', solver='adam',
            max_iter=200, alpha=0.01, batch_size=256,
            learning_rate='constant', learning_rate_init=0.001,
            shuffle=False, random_state=2020, early_stopping=True,
            validation_fraction=0.2, beta_1=0.9, beta_2=0.999,
            epsilon=1e-08, n_iter_no_change=10),
        # probability=True is required for SVC to expose predict_proba;
        # the original pipeline called predict_proba on every fitted model
        # and was flagged as failing for "svm".
        # NOTE(review): presumably predicted_scores_DNN needs probabilities
        # as well -- confirm against that helper.
        "svm": lambda: SVC(C=5, random_state=42, probability=True),
        # penalty may be 'l1' or 'l2'; the L1 penalty needs a solver that
        # supports it, so the solver is pinned explicitly.
        "lr": lambda: LogisticRegression(
            C=5, penalty='l1', solver='liblinear', tol=1e-6,
            random_state=42),
        "lgbm": lambda: LGBMClassifier(
            num_leaves=31, learning_rate=0.1, n_estimators=64,
            random_state=42, n_jobs=-1),
        "xgb": lambda: XGBClassifier(
            max_depth=5, learning_rate=0.1, n_jobs=-1, nthread=-1,
            gamma=0.06, min_child_weight=5, subsample=1,
            colsample_bytree=0.9, reg_alpha=0, reg_lambda=0.5,
            random_state=42),
        "ld": lambda: LinearDiscriminantAnalysis(solver='lsqr'),
        "rf": lambda: RandomForestClassifier(
            n_estimators=50, max_depth=20, min_samples_split=2,
            min_samples_leaf=5, max_features="log2", random_state=12),
    }
    if model_name not in builders:
        raise ValueError(
            "unknown model_name " + repr(model_name)
            + "; expected one of " + ", ".join(sorted(builders)))
    return builders[model_name]()


def auto_DNN(prex=None, graph_name=None, emb_method_name1=None,
             emb_method_name2=None, model_name=None, DNN_binNum=None):
    """Train a classifier on two baseline score matrices and evaluate it.

    The function loads the pre-computed ``*_scores.mat`` files of the two
    baselines, normalizes them, builds a training set from the scores of
    existing (positive) links plus sampled negatives, fits the classifier
    chosen by ``model_name``, and then saves the hybrid scores, saves and
    plots the rasterization grids, plots the PNR grid when its file exists,
    and writes the evaluation metrics through ``DNN_write_to_excel``.

    If either score file is missing, a message is printed and the function
    returns without building or training a model.

    Args:
        prex: sub-path that is concatenated directly onto the base
            directories (no separator is inserted after it).
        graph_name: dataset name; used in file names and plot titles.
        emb_method_name1: name of the first baseline method.
        emb_method_name2: name of the second baseline method.
        model_name: one of "mlp", "svm", "lr", "lgbm", "xgb", "ld", "rf".
        DNN_binNum: bin count forwarded to ``rasterization_grids`` and
            ``save_DNN_raster_scores``.

    Returns:
        None.

    Raises:
        ValueError: if both score files exist but ``model_name`` is not a
            supported name.
    """
    print('----------------------------------------------------------')
    print("dataset: " + graph_name + '\n' + "baselines:"
          + emb_method_name1 + "," + emb_method_name2)

    # Raw strings: the backslashes are literal path separators, not escapes.
    results_base_dir = r'D:\hybridrec//results//'
    all_file_dir = r'D:\hybridrec\dataset\split_train_test//' + prex
    results_dir = r'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'

    # Total number of negative samples is `ratio` times the number of
    # positive samples; tune here. (Observed on facebook_combined: the
    # smaller the ratio, the higher the prediction accuracy on positive and
    # negative samples and the less time it takes.)
    ratio = 1

    scores_dir = results_base_dir + prex + graph_name + "//"
    path_scores_method1 = (scores_dir + graph_name + "_"
                           + emb_method_name1 + "_scores.mat")
    path_scores_method2 = (scores_dir + graph_name + "_"
                           + emb_method_name2 + "_scores.mat")

    # Guard clause: nothing can be done until both baselines are scored.
    if not (os.path.exists(path_scores_method1)
            and os.path.exists(path_scores_method2)):
        print("dataset: " + graph_name + '----' + "baselines:"
              + emb_method_name1 + "," + emb_method_name2 + ': 分数未完全计算')
        return

    # Build the model only once it is needed, and fail fast on a bad name
    # before any of the expensive loading below.
    model = _build_DNN_model(model_name)

    # Load the raw scores and normalize them.
    scores_matrix_one = loadmat(path_scores_method1)['scores']
    scores_matrix_two = loadmat(path_scores_method2)['scores']
    # For methods not listed in all_embedding_methods keep only the strict
    # upper triangle (k=1 excludes the diagonal).
    if emb_method_name1 not in all_embedding_methods:
        scores_matrix_one = csr_matrix(np.triu(scores_matrix_one.A, k=1))
    if emb_method_name2 not in all_embedding_methods:
        scores_matrix_two = csr_matrix(np.triu(scores_matrix_two.A, k=1))
    scores_matrix_one_norm = normalize_matrix(
        csr_matrix1=csr_matrix(scores_matrix_one))
    scores_matrix_two_norm = normalize_matrix(
        csr_matrix1=csr_matrix(scores_matrix_two))

    # Build train_binary and test_binary.
    graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                         graph_name=graph_name,
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    graph_test_path = get_testset_path(base_dir=all_file_dir,
                                       graph_name=graph_name)
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = csr_matrix(np.triu(train_binary.A, k=1))
    test_binary = get_test_matrix_binary(graph_test_path=graph_test_path,
                                         N=train_binary.shape[0])

    # The un-normalized matrices are no longer needed; release them early.
    del scores_matrix_one, scores_matrix_two
    gc.collect()

    # Scores of the positive samples (existing training links); k=1
    # excludes the diagonal. Reused later for the evaluation step.
    exist_binary = csr_matrix(np.triu(train_binary.A, k=1))
    exist_scores_one_list = (np.array(
        scores_matrix_one_norm[exist_binary > 0], dtype=float))[0]
    exist_scores_two_list = (np.array(
        scores_matrix_two_norm[exist_binary > 0], dtype=float))[0]

    # Assemble the training samples: positives followed by negatives.
    X_train_1 = (np.array([exist_scores_one_list, exist_scores_two_list])).T
    X_train_0 = negative_samples(
        train_binary=train_binary,
        test_binary=test_binary,
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm,
        ratio=ratio)
    # Constant label vectors: 1 for positives, 0 for negatives.
    Y_train_1 = np.ones(X_train_1.shape[0], dtype=int)
    Y_train_0 = np.zeros(X_train_0.shape[0], dtype=int)
    X_train = np.vstack((np.array(X_train_1), np.array(X_train_0)))
    Y_train = np.hstack((Y_train_1, Y_train_0))

    time_start = time.time()

    # Train the model.
    model.fit(X_train, Y_train)

    # Sanity output: number of samples predicted as positive in each class.
    preds_0 = model.predict(X_train_0)
    preds_1 = model.predict(X_train_1)
    print(np.sum(preds_0))
    print(np.sum(preds_1))

    # Predict the hybrid score matrix and persist it.
    scores_matrix_DNN = predicted_scores_DNN(
        model=model,
        train_binary=train_binary,
        test_binary=test_binary,
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm)
    save_DNN_hybrid_scores(scores_matrix_DNN=scores_matrix_DNN,
                           method1=emb_method_name1,
                           method2=emb_method_name2,
                           graph_results_dir=graph_results_dir,
                           dataset_name=graph_name,
                           model_name=model_name)
    scores_matrix_DNN_norm = normalize_matrix(csr_matrix1=scores_matrix_DNN)

    # Compute the rasterization grids of the model's scores.
    DNN_raster_grids = rasterization_grids(
        binNum=DNN_binNum,
        train_binary=train_binary,
        scores_matrix_DNN=scores_matrix_DNN_norm,
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm)
    # NOTE: a log10 rescaling of the grids was tried here but produced -inf
    # and raised an error, so it is not applied.
    DNN_raster_grids = normalize_matrix_full(
        csr_matrix1=csr_matrix(DNN_raster_grids))
    DNN_raster_grids = better_show_grids(csr_matrix1=DNN_raster_grids)
    save_DNN_raster_scores(rastser_grids=DNN_raster_grids,
                           method1=emb_method_name1,
                           method2=emb_method_name2,
                           graph_results_dir=graph_results_dir,
                           dataset_name=graph_name,
                           model_name=model_name,
                           DNN_binNum=DNN_binNum)
    source = np.float32(DNN_raster_grids.A)
    result = cv2.GaussianBlur(source, (5, 5), 0)
    title = (graph_name + '-' + model_name + '-'
             + emb_method_name1 + '-' + emb_method_name2)
    plot_contourf(result=result, title=title, binNum=10)

    # Load and plot the PNR grids when they have been computed.
    PNR_path = (scores_dir + "PNR1_" + graph_name + "_" + emb_method_name1
                + "_" + emb_method_name2 + "_50_count.mat")
    if is_excel_file_exist(PNR_path):
        PNR_matrix = loadmat(PNR_path)["count"]
        PNR_matrix = better_show_grids(csr_matrix1=PNR_matrix)
        source = np.float32(PNR_matrix.A)
        # 5x5 Gaussian kernel, sigma argument 0.
        result = cv2.GaussianBlur(source, (5, 5), 0)
        title = (graph_name + '-PNR-'
                 + emb_method_name1 + '-' + emb_method_name2)
        plot_contourf(result=result, title=title, binNum=10)

    # Evaluate the model on the node pairs that are not training links:
    # the strict upper triangle minus the existing links.
    nonexist_binary = csr_matrix(
        np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)
    nonexist_scores_DNN_list = (np.array(
        scores_matrix_DNN[nonexist_binary > 0], dtype=float))[0]
    L_full = int(np.sum(test_binary))
    L_array = np.array([
        int(L_full / 20),
        int(L_full / 10),
        int(L_full / 5),
        int(L_full / 2),
        L_full,
    ])
    AP_DNN, AUC_DNN, Precision_DNN, Recall_DNN, F1score_DNN = evaluators(
        train_binary=train_binary,
        test_binary=test_binary,
        scores_list=nonexist_scores_DNN_list,
        L_array=L_array)

    # Write precision, recall, F1 score and AP to the excel file.
    DNN_write_to_excel(DL_name=model_name,
                       dataset_name=graph_name,
                       method1=emb_method_name1,
                       method2=emb_method_name2,
                       precision_DL=Precision_DNN,
                       recall_DL=Recall_DNN,
                       F1score_DL=F1score_DNN,
                       AP_DL=AP_DNN)

    time_end = time.time()
    print("It takes : " + str((time_end - time_start) / 60.0) + " mins.")
# 第二级:ProNE(IJCAI,2019,有两个emb可以作为优化空间,我看了原文,应该是用enhanced)、 # AttentionWalk(NIPS, 2018) # struc2vec(KDD,2017)、SDNE(KDD,2016)、GraRep(CIKM,2015)、 # LINE(WWW,2015)、 deepwalk(KDD,2014) # 第三级:Prune(NIPS,2017)、node2vec(KDD,2016) ############################### 第二个baseline########################## # 第一级:cn, ja, aa, ra, cosine, pearson, degreeproduct(没有其他超参数,分数是定的) # 第二级:simrank, (暂时不跑)rootedpagerank(很慢,而且效果很差) # (效果很差,暂时不跑)第三级:katz(很慢),graphdistance # 第四级:community # # # micro for i in range(len(graph_group_micro)): graph_name = graph_group_micro[i] if is_excel_file_exist(get_excel_save_path(dataset_name=graph_name, method1=emb_method_name1, method2=emb_method_name2))or \ is_excel_file_exist(get_excel_save_path(dataset_name=graph_name, method1=emb_method_name2, method2=emb_method_name1)):# 有些xls文件的命名可能是两个method调换了 print(graph_name + '-' + emb_method_name1 + '-' + emb_method_name2 + ": existed...") continue auto_PNR(prex=prex, graph_name=graph_name, emb_method_name1=emb_method_name1, emb_method_name2=emb_method_name2) pass # # min for i in range(len(graph_group_min)): graph_name = graph_group_min[i] if is_excel_file_exist(get_excel_save_path(dataset_name=graph_name, method1=emb_method_name1, method2=emb_method_name2))or \ is_excel_file_exist(get_excel_save_path(dataset_name=graph_name, method1=emb_method_name2, method2=emb_method_name1)):# 有些xls文件的命名可能是两个method调换了