# Step 3: find all the train edges.
# Assemble the square block matrix [[0, M_D], [M_D^T, 0]]; the samples are
# drawn from it.
d = np.zeros((N_d, N_d))
m = np.zeros((N_m, N_m))
adj111 = np.hstack([m, M_D])
adj222 = np.hstack([M_D.T, d])
adj_MD = np.vstack([adj111, adj222])
label_matrix = adj_MD  # dense copy of the labels, taken before the sparse conversion
# k=47, 56,59,61,85,129,144,187,204,221,270,272,289,321,330
# pos_sample_n = sum(adj_MD[0:577,k+576])
# print("number of samples", pos_sample_n)
adj_MD = sp.coo_matrix(adj_MD)
# Original note (translated): "split the data into the new (symmetric) adj
# matrix, train edges (without symmetric duplicates), validation edges,
# validation negative edges, test edges, test negative edges".
# NOTE(review): this variant of mask_test_edges is unpacked into only three
# values here, so that note looks stale -- confirm against the helper.
edges_all, edges_pos, edges_false = mask_test_edges(adj_MD)
# print(edges_pos.shape)
# Optional on-disk caching of the split (disabled):
# np.save('edges_all.npy',edges_all)
# np.save('edges_pos',edges_pos)
# np.save('edges_false',edges_false)
# edges_all = np.load('edges_all.npy')
# edges_pos = np.load('edges_pos.npy')
# edges_false = np.load('edges_false.npy')

# All samples: the positive edges stacked on top of the negative edges.
X_sample = np.vstack([edges_pos, edges_false])
print("edges_pos", len(edges_pos), "edges_neg", len(edges_false))
print("samples numb in miRNA-disease associations part",
      len(find_mi_D(X_sample)))
# Labels aligned with X_sample: 1 for each positive edge, 0 for each negative.
Y_sample = np.concatenate(
    [np.ones(len(edges_pos)), np.zeros(len(edges_false))])
###########################################################################################################################################################
TRAIN_TEST_SPLITS_FOLDER = './train-test-splits/'

# Sweep the hidden-edge fractions and build a train/validation/test split
# for each one.
for frac_hidden in FRAC_EDGES_HIDDEN:
    val_frac = 0.05
    test_frac = frac_hidden - val_frac

    # Split every network in turn.
    for g_name, graph_tuple in fb_graphs.items():
        adj, feat = graph_tuple[0], graph_tuple[1]

        current_graph = 'fb-{}-{}-hidden'.format(g_name, frac_hidden)
        # Report which network is being processed.
        print("Current graph: ", current_graph)

        # Re-seed before every split so each split is reproducible on its own.
        np.random.seed(RANDOM_SEED)

        # Build the split.
        train_test_split = mask_test_edges(
            adj,
            test_frac=test_frac,
            val_frac=val_frac,
            verbose=True,
        )

        # Persist the split next to the others.
        file_name = TRAIN_TEST_SPLITS_FOLDER + current_graph + '.pkl'
        with open(file_name, 'wb') as f:
            pickle.dump(train_test_split, f, protocol=2)
def calculate_all_scores(adj_sparse, features_matrix=None, directed=False,
                         test_frac=.3, val_frac=.1, random_state=0, verbose=1,
                         train_test_split_file=None, tf_dtype=tf.float32):
    """Run the enabled link-prediction methods on one graph and collect scores.

    Args:
        adj_sparse: sparse adjacency matrix of the full graph (its ``shape``
            and ``nnz`` are read for the summary printout).
        features_matrix: accepted for interface compatibility; only the
            disabled GAE section used it.
        directed: treated as a boolean. Selects ``mask_test_edges_directed``
            and ``nx.DiGraph`` instead of ``mask_test_edges`` and ``nx.Graph``.
        test_frac: fraction forwarded to the split helper as ``test_frac``.
        val_frac: fraction forwarded to the split helper as ``val_frac``.
        random_state: seed applied to both NumPy and TensorFlow.
        verbose: printouts are emitted when ``verbose >= 1``; the value is
            also forwarded to ``node2vec_scores``.
        train_test_split_file: optional path to a pickled split. When it is
            missing or unreadable a fresh split is generated instead.
        tf_dtype: accepted for interface compatibility; unused by the enabled
            methods.

    Returns:
        dict mapping method key ('aa', 'jc', 'pa', 'sc', 'n2v_edge_emb',
        'n2v_dot_prod') to the score dict returned by that method.
    """
    np.random.seed(random_state)  # Guarantee consistent train/test split
    tf.set_random_seed(random_state)  # Consistent GAE training

    # Prepare LP scores dictionary
    lp_scores = {}

    ### ---------- PREPROCESSING ---------- ###
    train_test_split = None
    if train_test_split_file is not None:
        try:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # at split files you produced yourself.
            with open(train_test_split_file, 'rb') as f:
                train_test_split = pickle.load(f)
            print('Found existing train-test split!')
        except Exception:
            # Deliberate best-effort: any failure to read the cached split
            # falls back to generating one. Unlike a bare ``except`` this no
            # longer swallows KeyboardInterrupt / SystemExit.
            train_test_split = None

    if train_test_split is None:
        print('Generating train-test split...')
        if directed:
            train_test_split = mask_test_edges_directed(
                adj_sparse, test_frac=test_frac, val_frac=val_frac)
        else:
            train_test_split = mask_test_edges(
                adj_sparse, test_frac=test_frac, val_frac=val_frac)

    (adj_train, train_edges, train_edges_false, val_edges, val_edges_false,
     test_edges, test_edges_false) = train_test_split  # Unpack tuple

    # g_train: new graph object with only non-hidden edges
    g_train = nx.DiGraph(adj_train) if directed else nx.Graph(adj_train)

    # Inspect train/test split
    if verbose >= 1:
        print("Total nodes:", adj_sparse.shape[0])
        # adj is symmetric, so nnz (num non-zero) = 2*num_edges
        print("Total edges:", int(adj_sparse.nnz / 2))
        print("Training edges (positive):", len(train_edges))
        print("Training edges (negative):", len(train_edges_false))
        print("Validation edges (positive):", len(val_edges))
        print("Validation edges (negative):", len(val_edges_false))
        print("Test edges (positive):", len(test_edges))
        print("Test edges (negative):", len(test_edges_false))
        print('')
        print("------------------------------------------------------")

    ### ---------- LINK PREDICTION BASELINES ---------- ###
    # Adamic-Adar
    aa_scores = adamic_adar_scores(g_train, train_test_split)
    lp_scores['aa'] = aa_scores
    if verbose >= 1:
        print('')
        print('Adamic-Adar Test ROC score: ', str(aa_scores['test_roc']))
        print('Adamic-Adar Test AP score: ', str(aa_scores['test_ap']))

    # Jaccard Coefficient
    jc_scores = jaccard_coefficient_scores(g_train, train_test_split)
    lp_scores['jc'] = jc_scores
    if verbose >= 1:
        print('')
        print('Jaccard Coefficient Test ROC score: ', str(jc_scores['test_roc']))
        print('Jaccard Coefficient Test AP score: ', str(jc_scores['test_ap']))

    # Preferential Attachment
    pa_scores = preferential_attachment_scores(g_train, train_test_split)
    lp_scores['pa'] = pa_scores
    if verbose >= 1:
        print('')
        print('Preferential Attachment Test ROC score: ', str(pa_scores['test_roc']))
        print('Preferential Attachment Test AP score: ', str(pa_scores['test_ap']))

    ### ---------- SPECTRAL CLUSTERING ---------- ###
    sc_scores = spectral_clustering_scores(train_test_split)
    lp_scores['sc'] = sc_scores
    if verbose >= 1:
        print('')
        print('Spectral Clustering Validation ROC score: ', str(sc_scores['val_roc']))
        print('Spectral Clustering Validation AP score: ', str(sc_scores['val_ap']))
        print('Spectral Clustering Test ROC score: ', str(sc_scores['test_roc']))
        print('Spectral Clustering Test AP score: ', str(sc_scores['test_ap']))

    ## ---------- NODE2VEC ---------- ###
    # node2vec settings
    # NOTE: When p = q = 1, this is equivalent to DeepWalk
    P = 1  # Return hyperparameter
    Q = 1  # In-out hyperparameter
    WINDOW_SIZE = 10  # Context size for optimization
    NUM_WALKS = 10  # Number of walks per source
    WALK_LENGTH = 80  # Length of walk per source
    DIMENSIONS = 128  # Embedding dimension
    # NOTE(review): fixed to False regardless of the ``directed`` argument;
    # confirm whether node2vec should follow ``directed``.
    DIRECTED = False  # Graph directed/undirected
    WORKERS = 8  # Num. parallel workers
    ITER = 1  # SGD epochs

    # Using bootstrapped edge embeddings + logistic regression
    n2v_edge_emb_scores = node2vec_scores(
        g_train, train_test_split,
        P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS, DIRECTED,
        WORKERS, ITER,
        "edge-emb",
        verbose)
    lp_scores['n2v_edge_emb'] = n2v_edge_emb_scores
    if verbose >= 1:
        print('')
        print('node2vec (Edge Embeddings) Validation ROC score: ', str(n2v_edge_emb_scores['val_roc']))
        print('node2vec (Edge Embeddings) Validation AP score: ', str(n2v_edge_emb_scores['val_ap']))
        print('node2vec (Edge Embeddings) Test ROC score: ', str(n2v_edge_emb_scores['test_roc']))
        print('node2vec (Edge Embeddings) Test AP score: ', str(n2v_edge_emb_scores['test_ap']))

    # Using dot products to calculate edge scores
    n2v_dot_prod_scores = node2vec_scores(
        g_train, train_test_split,
        P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS, DIRECTED,
        WORKERS, ITER,
        "dot-product",
        verbose)
    lp_scores['n2v_dot_prod'] = n2v_dot_prod_scores
    if verbose >= 1:
        print('')
        print('node2vec (Dot Product) Validation ROC score: ', str(n2v_dot_prod_scores['val_roc']))
        print('node2vec (Dot Product) Validation AP score: ', str(n2v_dot_prod_scores['val_ap']))
        print('node2vec (Dot Product) Test ROC score: ', str(n2v_dot_prod_scores['test_roc']))
        print('node2vec (Dot Product) Test AP score: ', str(n2v_dot_prod_scores['test_ap']))

    ### ---------- (VARIATIONAL) GRAPH AUTOENCODER ---------- ###
    # Disabled. The previous (commented-out) implementation called
    # gae_scores(adj_sparse, train_test_split, features_matrix, ...) twice,
    # once with "dot-product" and once with "edge-emb", using
    # LEARNING_RATE = 0.001, EPOCHS = 200, HIDDEN1_DIM = 32, HIDDEN2_DIM = 16,
    # DROPOUT = 0, and stored the results under lp_scores['gae'] and
    # lp_scores['gae_edge_emb'].

    ### ---------- RETURN RESULTS ---------- ###
    return lp_scores
# draw network
# nx.draw_networkx(Gr, with_labels=False, node_size=50, node_color='r')
# plt.show()


# ## 2. Preprocessing/Train-Test Split

# In[ ]:

from gae.preprocessing import mask_test_edges
np.random.seed(0)  # make sure train-test split is consistent between notebooks
adj_sparse = nx.to_scipy_sparse_matrix(Gr)


# In[ ]:

# Perform train-test split: hide 20% of the edges for testing and 10% for
# validation; adj_train keeps only the remaining edges.
adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = mask_test_edges(
        adj_sparse, test_frac=.2, val_frac=.1)
g_train = nx.from_scipy_sparse_matrix(
    adj_train)  # new graph object with only non-hidden edges


# In[67]:

# Inspect train/test split.
# print() calls replace the Python 2-only print statements so the cell also
# runs on Python 3.
print("Total nodes:", adj_sparse.shape[0])
# adj is symmetric, so nnz (num non-zero) = 2*num_edges
print("Total edges:", int(adj_sparse.nnz / 2))
print("Training edges (positive):", len(train_edges))
print("Training edges (negative):", len(train_edges_false))
print("Validation edges (positive):", len(val_edges))
print("Validation edges (negative):", len(val_edges_false))
print("Test edges (positive):", len(test_edges))
flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
flags.DEFINE_integer('features', 1, 'Whether to use features (1) or not (0).')
flags.DEFINE_string("checkpoint_dir", "checkpoints", "checkpoint directory")

model_str, dataset_str = FLAGS.model, FLAGS.dataset

# Load the adjacency matrix and node features for the selected dataset.
adj, features = load_data(dataset_str)

# Keep a copy of the original adjacency matrix with its diagonal removed.
adj_orig = adj - sp.dia_matrix(
    (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
adj_orig.eliminate_zeros()

# Continue with the training adjacency matrix returned by the split.
(adj_train, train_edges, val_edges, val_edges_false,
 test_edges, test_edges_false) = mask_test_edges(adj)
adj = adj_train

if FLAGS.features == 0:
    # Featureless mode: identity matrix as node features.
    features = sp.identity(features.shape[0])

# Some preprocessing
adj_norm = preprocess_graph(adj)

# Graph inputs are sparse placeholders; dropout defaults to 0.
placeholders = {
    key: tf.sparse_placeholder(tf.float32)
    for key in ('features', 'adj', 'adj_orig')
}
placeholders['dropout'] = tf.placeholder_with_default(0., shape=())
# Make sure every master node is present in G6, even if it has no edges.
for i in MasterNodes:
    if i not in G6.nodes():
        G6.add_node(i)
adj_sparse = nx.to_scipy_sparse_matrix(G6)


# In[7]:

from gae.preprocessing import mask_test_edges
np.random.seed(0)  # make sure train-test split is consistent between notebooks
adj_sparse = nx.to_scipy_sparse_matrix(G6)
# Hide 30% of the edges for testing; no validation edges are requested.
adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = mask_test_edges(
        adj_sparse, test_frac=.3, val_frac=.0, prevent_disconnect=True)


# In[6]:

# Inspect train/test split.
# print() calls replace the Python 2-only print statements so the cell also
# runs on Python 3.
print("Total nodes:", adj_sparse.shape[0])
# adj is symmetric, so nnz (num non-zero) = 2*num_edges
print("Total edges:", int(adj_sparse.nnz / 2))
print("Training edges (positive):", len(train_edges))
print("Training edges (negative):", len(train_edges_false))
print("Validation edges (positive):", len(val_edges))
print("Validation edges (negative):", len(val_edges_false))
print("Test edges (positive):", len(test_edges))
print("Test edges (negative):", len(test_edges_false))
g = nx.Graph(adj)  # re-create graph using node indices (0 to num_nodes-1)

# draw network
nx.draw_networkx(g, with_labels=False, node_size=50, node_color='r')
plt.show()

#############################################
# 2. Preprocessing/Train-Test Split

np.random.seed(0)  # make sure train-test split is consistent between notebooks
adj_sparse = nx.to_scipy_sparse_matrix(g)

# Perform train-test split
(adj_train, train_edges, train_edges_false, val_edges, val_edges_false,
 test_edges, test_edges_false) = mask_test_edges(
    adj_sparse, test_frac=.3, val_frac=.1)

# new graph object with only non-hidden edges
g_train = nx.from_scipy_sparse_matrix(adj_train)

# Inspect train/test split
print("Total nodes:", adj_sparse.shape[0])
# adj is symmetric, so nnz (num non-zero) = 2*num_edges
print("Total edges:", int(adj_sparse.nnz / 2))
for _caption, _edge_set in (
        ("Training edges (positive):", train_edges),
        ("Training edges (negative):", train_edges_false),
        ("Validation edges (positive):", val_edges),
        ("Validation edges (negative):", val_edges_false),
        ("Test edges (positive):", test_edges),
        ("Test edges (negative):", test_edges_false)):
    print(_caption, len(_edge_set))

#############################################
def gae(filename, output_dir):
    """Train a graph auto-encoder on one network and save a thresholded graph.

    Loads the network with ``load_network_data(filename)``, trains the model
    selected by the ``model`` flag, and reconstructs edge scores as
    ``emb . emb^T``. It then writes two artefacts:

    * ``<output_dir>/GAE_network.npy``: the thresholded reconstruction with
      its first row and column dropped.
    * ``<output_dir>/gae_network_statistics.pickle``: the result of
      ``compute_graph_statistics`` on the symmetrised thresholded graph.

    Side effects: defines TensorFlow flags, changes the working directory to
    its parent (``os.chdir('../')``) before writing, and prints progress.

    Args:
        filename: passed to ``load_network_data``.
        output_dir: directory for the artefacts, resolved after the
            ``chdir``.

    Raises:
        ValueError: if the ``features`` flag is not 0 (no feature loader is
            wired in) or the ``model`` flag is neither 'gcn_ae' nor 'gcn_vae'.
    """
    # Settings
    # NOTE(review): the flags are defined on every call; calling gae() twice
    # in one process presumably fails on duplicate definitions -- confirm.
    flags = tf.app.flags
    FLAGS = flags.FLAGS
    flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
    flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
    flags.DEFINE_integer('hidden1', 32, 'Number of units in hidden layer 1.')
    flags.DEFINE_integer('hidden2', 16, 'Number of units in hidden layer 2.')
    flags.DEFINE_float('weight_decay', 0.,
                       'Weight for L2 loss on embedding matrix.')
    flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).')
    flags.DEFINE_string('filename', 'email-Eu-core.mat', 'dataset')
    flags.DEFINE_string('model', 'gcn_vae', 'Model string.')
    flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
    flags.DEFINE_integer('features', 0,
                         'Whether to use features (1) or not (0).')

    model_str = FLAGS.model
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError(
            "unknown model {!r}; expected 'gcn_ae' or 'gcn_vae'".format(model_str))

    # Load data
    adj, R, edges = load_network_data(filename)
    num_edges = np.sum(adj)
    length = adj.shape[0]
    adj = sp.csr_matrix(adj)

    # This variant of mask_test_edges is unpacked into two values; only the
    # training adjacency matrix is used.
    adj_train, _ = mask_test_edges(adj)
    adj = adj_train

    if FLAGS.features == 0:
        features = sp.identity(adj.shape[0])  # featureless
    else:
        # The feature loader (load_data) is not used here, so there is
        # nothing to feed the model in this mode.
        raise ValueError('gae() only supports featureless mode (features=0)')

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    else:
        model = GCNModelVAE(placeholders, num_features, num_nodes,
                            features_nonzero)

    # Ratio of absent to present entries, used to re-weight the positives.
    n_cells = adj.shape[0] * adj.shape[0]
    pos_weight = float(n_cells - adj.sum()) / adj.sum()
    norm = n_cells / float((n_cells - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                      validate_indices=False), [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Initialize session; closed in ``finally`` so it is released even when
    # training or saving fails.
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer())

        # Train model
        for epoch in range(FLAGS.epochs):
            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                            placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            sess.run([opt.opt_op, opt.cost, opt.accuracy],
                     feed_dict=feed_dict)

        print("GAE Optimization Finished!")

        # Embeddings are read with dropout switched off.
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)

        # Reconstructed edge scores.
        adj_rec = np.array(np.dot(emb, emb.T))
        DD = np.sort(adj_rec.flatten())
        threshold = DD[int(-1 * num_edges)]
        # Vectorised form of the original nested loops: entry [j, i] is 0
        # where adj_rec[i, j] < threshold and 1 otherwise.
        network_C = np.ascontiguousarray(
            np.where(adj_rec.T < threshold, 0, 1), dtype=np.int8)

        os.chdir('../')
        np.save('{}/GAE_network.npy'.format(output_dir),
                network_C[1:length, :][:, 1:length])

        # Project the reconstruction through the matrices R[0..3, 0].
        # Only final_network[0] is consumed below; the projected levels were
        # used by the (disabled) draw_graph call.
        A_copy = adj_rec
        final_network = [A_copy]
        for i in range(1, 5):
            adjacent_matrix = tf.placeholder(tf.float32, shape=A_copy.shape)
            R_matrix = tf.placeholder(tf.float32, shape=R[i - 1, 0].shape)
            A_copy = sess.run(
                tf.matmul(tf.matmul(R_matrix, adjacent_matrix),
                          tf.transpose(R_matrix)),
                feed_dict={
                    R_matrix: R[i - 1, 0].todense(),
                    adjacent_matrix: A_copy
                })
            final_network.append(np.array(A_copy))
    finally:
        sess.close()

    # draw_graph(final_network, edges, output_dir)
    network_B = final_network[0]
    print('Generating graph by GAE algorithm.')
    DD = np.sort(network_B.flatten())[::-1]
    # presumably edges[0, 0] is the edge count of the original network --
    # TODO confirm against load_network_data
    threshold = DD[edges[0, 0]]
    network_C = np.where(network_B.T < threshold, 0, 1)
    # Symmetrise and clip back to a 0/1 adjacency matrix.
    _A_obs = network_C + network_C.T
    _A_obs[_A_obs > 1] = 1
    _A_obs = np.array(_A_obs)

    print('Computing metrics for graph generated by GAE')
    c = compute_graph_statistics(_A_obs)
    with open('{}/gae_network_statistics.pickle'.format(output_dir),
              'wb') as handle:
        pickle.dump(c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(c)
def __init__(self, graph_edgelist, num_actions, dimension, learning_rate=0.01, epochs=300, hidden1=32,
             hidden2=16, dropout=0., model_str='gcn_vae', use_features=0):
    """Train a graph auto-encoder on the graph and store its node embeddings.

    The graph is read with ``self.read_graph(graph_edgelist)`` and the
    selected model is trained for ``epochs`` steps. The resulting
    ``model.z_mean`` is stored in ``self.embeddings``.

    Args:
        graph_edgelist: graph description passed to ``self.read_graph``.
        num_actions: validated through ``BasisFunction._validate_num_actions``.
        dimension: embedding dimension passed to the model; must be >= 1.
        learning_rate: forwarded to the optimizer.
        epochs: number of weight updates to run.
        hidden1: size passed to the model as the first hidden layer.
        hidden2: only forwarded to the 'gcn_ae' model.
        dropout: dropout rate fed during training (0 when reading embeddings).
        model_str: 'gcn_ae' or 'gcn_vae'.
        use_features: 0 replaces the loaded features with an identity matrix.

    Raises:
        ValueError: if ``graph_edgelist`` is None, ``dimension`` < 1, or
            ``model_str`` is not a supported model.
    """
    if graph_edgelist is None:
        raise ValueError('graph cannot be None')

    if dimension < 1:
        raise ValueError('dimension must be >= 1')

    self.__num_actions = BasisFunction._validate_num_actions(num_actions)

    # Fail early with a clear message instead of an unrelated error later.
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError("model_str must be 'gcn_ae' or 'gcn_vae'")

    self._dimension = dimension

    adj, features = self.read_graph(graph_edgelist)

    # Only the training adjacency matrix of the split is needed here.
    # NOTE(review): unlike sibling scripts, ``adj`` is NOT replaced by
    # ``adj_train`` (the assignment was commented out), so normalisation and
    # weighting use the full graph while the labels use ``adj_train``.
    adj_train = mask_test_edges(adj)[0]

    if use_features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero, hidden1, hidden2, dimension)
    else:
        model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, hidden1, dimension)

    # Ratio of absent to present entries, used to re-weight the positives.
    n_cells = adj.shape[0] * adj.shape[0]
    pos_weight = float(n_cells - adj.sum()) / adj.sum()
    norm = n_cells / float((n_cells - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        labels = tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                      validate_indices=False), [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm,
                              learning_rate=learning_rate)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=labels,
                               model=model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm,
                               learning_rate=learning_rate)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Initialize session; closed in ``finally`` because nothing needs it
    # once the embeddings have been fetched.
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer())

        # Train model
        for epoch in range(epochs):
            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
            feed_dict.update({placeholders['dropout']: dropout})
            # Run single weight update
            sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

        print("GCN Optimization Finished!")

        # Embeddings are read with dropout switched off.
        feed_dict.update({placeholders['dropout']: 0})
        self.embeddings = sess.run(model.z_mean, feed_dict=feed_dict)
    finally:
        sess.close()
# Original adjacency matrix with its diagonal entries removed.
adj_orig = adj - sp.dia_matrix(
    (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
adj_orig.eliminate_zeros()

# Per-node count of non-zero entries in the corresponding row of adj.
degrees = [
    np.count_nonzero(adj.getrow(row).toarray())
    for row in range(adj.shape[0])
]
deg_matrix = sp.diags([degrees], [0])
logging.debug('Some preprocessing done')

np.random.seed(0)  # IMPORTANT: guarantees consistent train/test splits
(adj_train, train_edges, train_edges_false, val_edges, val_edges_false,
 test_edges, test_edges_false) = mask_test_edges(
    adj, test_frac=.3, val_frac=.1, verbose=True)
logging.debug('Splitting done')

# Scale the training adjacency matrix by the degree matrix on both sides.
# NOTE(review): this multiplies by the degrees themselves rather than by an
# inverse (square-root) degree -- confirm this is the intended normalisation.
# adj_norm = normalize(adj_train, axis=1)
deg_csr = deg_matrix.tocsr()
adj_norm = deg_csr * adj_train.tocsr() * deg_csr
logging.debug('Preprocessed graph')

# Add in diagonals
adj_label_mat = adj_orig + sp.eye(adj_orig.shape[0])
adj_label = sparse_to_tuple(adj_label_mat)

# Inspect train/test split
print("Total nodes:", adj.shape[0])
# adj is symmetric, so nnz (num non-zero) = 2*num_edges
print("Total edges:", int(adj.nnz / 2))
flags.DEFINE_string('model', 'gcn_ae', 'Model string.')
flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
flags.DEFINE_integer('features', 1, 'Whether to use features (1) or not (0).')

model_str, dataset_str = FLAGS.model, FLAGS.dataset

# Load the adjacency matrix and node features for the selected dataset.
adj, features = load_data(dataset_str)

# Keep a copy of the original adjacency matrix with its diagonal removed.
adj_orig = adj - sp.dia_matrix(
    (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
adj_orig.eliminate_zeros()

# Continue with the training adjacency matrix returned by the split.
(adj_train, train_edges, val_edges, val_edges_false,
 test_edges, test_edges_false) = mask_test_edges(adj)
adj = adj_train

if FLAGS.features == 0:
    # Featureless mode: identity matrix as node features.
    features = sp.identity(features.shape[0])

# Some preprocessing
adj_norm = preprocess_graph(adj)

# Graph inputs are sparse placeholders; dropout defaults to 0.
placeholders = {
    key: tf.sparse_placeholder(tf.float32)
    for key in ('features', 'adj', 'adj_orig')
}
placeholders['dropout'] = tf.placeholder_with_default(0., shape=())
def gcn_multilayer(self):
    """Train a graph auto-encoder on every layer of a multilayer network.

    For each ``(net_name, net)`` in ``self.nets`` a featureless model
    (identity features) selected by ``self.model_str`` is trained for
    ``self.n_iter`` iterations. Validation ROC/AP is printed per iteration
    and test ROC/AP at the end. ``model.embeddings`` is then written as text
    to ``self.out_dir + net_name + 'vectors.txt'``.

    Each network gets its own ``tf.Session``, which is closed once its
    vectors are saved.
    """
    # Result unused; the call is kept in case it has side effects.
    self.get_all_nodes()

    for net_name, net in self.nets.items():
        self.log.info('Run GCN For Net: %s' % net_name)
        # =============================================================
        adjacency_matrix = nx.adjacency_matrix(net).todense()
        nodes_count = adjacency_matrix.shape[0]
        features = sp.identity(nodes_count)  # featureless
        adj = sp.csr_matrix(adjacency_matrix)

        # Hold out validation/test edges; train on the remaining graph.
        adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
        adj = adj_train

        # Some pre processing
        adj_norm = preprocess_graph(adj)

        # Define placeholders
        placeholders = {
            'features': tf.sparse_placeholder(tf.float32),
            'adj': tf.sparse_placeholder(tf.float32),
            'adj_orig': tf.sparse_placeholder(tf.float32),
            'dropout': tf.placeholder_with_default(0., shape=())
        }

        num_nodes = adj.shape[0]

        features = sparse_to_tuple(features.tocoo())
        num_features = features[2][1]
        features_nonzero = features[1].shape[0]

        # Create model
        model = None
        if self.model_str == 'gcn_ae':
            model = GCNModelAE(placeholders, num_features, features_nonzero, self.hidden1, self.hidden2)
        elif self.model_str == 'gcn_vae':
            model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, self.hidden1, self.hidden2)

        # Ratio of absent to present entries, used to re-weight the positives.
        pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
        norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

        # Optimizer
        with tf.name_scope('optimizer'):
            if self.model_str == 'gcn_ae':
                opt = OptimizerAE(preds=model.reconstructions,
                                  labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                              validate_indices=False), [-1]),
                                  pos_weight=pos_weight,
                                  norm=norm)
            elif self.model_str == 'gcn_vae':
                opt = OptimizerVAE(preds=model.reconstructions,
                                   labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                               validate_indices=False), [-1]),
                                   model=model, num_nodes=num_nodes,
                                   pos_weight=pos_weight,
                                   norm=norm)

        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = sparse_to_tuple(adj_label)

        dropout = 0

        # Initialize session; closed in ``finally`` so one session does not
        # leak per network.
        sess = tf.Session()
        try:
            sess.run(tf.global_variables_initializer())

            def get_roc_score(edges_pos, edges_neg, emb=None):
                """Return (ROC AUC, average precision) for the given edges.

                Scores are sigmoid(emb . emb^T) at each edge. When ``emb`` is
                None the current ``model.z_mean`` is fetched with dropout 0
                (this updates the enclosing ``feed_dict``).
                """
                if emb is None:
                    feed_dict.update({placeholders['dropout']: 0})
                    emb = sess.run(model.z_mean, feed_dict=feed_dict)

                def sigmoid(x):
                    return 1 / (1 + np.exp(-x))

                # Predict on test set of edges
                adj_rec = np.dot(emb, emb.T)
                preds = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_pos]
                preds_neg = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_neg]

                preds_all = np.hstack([preds, preds_neg])
                labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
                roc_score = roc_auc_score(labels_all, preds_all)
                ap_score = average_precision_score(labels_all, preds_all)

                return roc_score, ap_score

            # Train model
            for epoch in range(self.n_iter):
                self.log.info('Iteration: %d' % epoch)
                t = time.time()
                # Construct feed dictionary
                feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
                feed_dict.update({placeholders['dropout']: dropout})
                # Run single weight update
                outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

                # Compute average loss
                avg_cost = outs[1]
                avg_accuracy = outs[2]

                roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false)

                print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
                      "train_acc=", "{:.5f}".format(avg_accuracy), "val_roc=", "{:.5f}".format(roc_curr),
                      "val_ap=", "{:.5f}".format(ap_curr),
                      "time=", "{:.5f}".format(time.time() - t))

            print("Optimization Finished!")

            roc_score, ap_score = get_roc_score(test_edges, test_edges_false)
            print('Test ROC score: ' + str(roc_score))
            print('Test AP score: ' + str(ap_score))

            # ------vector generation -----------------------------
            vectors = sess.run(model.embeddings, feed_dict=feed_dict)
        finally:
            sess.close()

        # NOTE(review): plain concatenation, so ``self.out_dir`` must end
        # with a path separator for the file to land inside the directory --
        # confirm before switching to a path join.
        fname = self.out_dir + net_name + 'vectors.txt'
        np.savetxt(fname, np.array(vectors), fmt="%s", delimiter='  ')

        self.log.info('Saving vectors: %s' % fname)
        # ==============================================================
        self.log.info('after exec gcn : %s' % net_name)

    self.log.info('Done!')
# TODO = ['fb-combined-0.75-hidden']

# Iterate over fractions of edges to hide
for frac_hidden in FRAC_EDGES_HIDDEN:
    val_frac = 0.1
    test_frac = frac_hidden - val_frac

    # Iterate over each graph.
    # ``.items()`` and ``print()`` replace the Python 2-only
    # ``.iteritems()`` and print statement so the script also runs on
    # Python 3.
    for g_name, graph_tuple in fb_graphs.items():
        adj = graph_tuple[0]
        feat = graph_tuple[1]

        current_graph = 'fb-{}-{}-hidden'.format(g_name, frac_hidden)

        # if current_graph in TODO:
        print("Current graph: ", current_graph)

        # Re-seed before every split so each split is reproducible on its own.
        np.random.seed(RANDOM_SEED)

        # Build the train/validation/test split for the current graph.
        train_test_split = mask_test_edges(adj, test_frac=test_frac, val_frac=val_frac,
                                           verbose=True)

        file_name = TRAIN_TEST_SPLITS_FOLDER + current_graph + '.pkl'

        # Save split
        with open(file_name, 'wb') as f:
            pickle.dump(train_test_split, f, protocol=2)
# NOTE(review): the original indentation of this fragment was lost. The
# leading prints are emitted unconditionally here; if they belonged to an
# ``if DEBUG:`` that opens above this fragment, re-indent them.
print(f"adj type, {type(adj)}")
print(f"adj.shape, {adj.shape}")
print(f"adj[:10,:10], {adj[:10, :10]}")
print(f"adj {adj}")
print(f"feature.shape, {features.shape}")

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix(
    (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
if DEBUG:
    print(f"ad_orig type, {type(adj_orig)}")
    print(f"adj_orig.shape, {adj_orig.shape}")
adj_orig.eliminate_zeros()

# This variant of mask_test_edges takes an explicit seed.
adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
    adj, random_seed=42)
if DEBUG:
    print(f"adj_train type, {type(adj_train)}")
    # Fixed: this line is labelled "shape" but printed type(adj_train.shape).
    print(f"adj_train shape, {adj_train.shape}")
    print(f"train_edges type, {type(train_edges)}")
    print('*' * 20)
    print(f"train_edges shape, {train_edges.shape}")
    print(f"val_edges shape, {val_edges.shape}")
    print(f"test_edges shape, {test_edges.shape}")
    print('*' * 20)
    print(f"val_edges_false type, {type(val_edges_false)}")
    print(f"test_edges_false type, {type(test_edges_false)}")
    print(f"len val edges false, {len(val_edges_false)}")
    print(f"len test edges false, {len(test_edges_false)}")
    print('*' * 20)
    print(f"val_edges[:20], {val_edges[:20]}")
def _print_lp_scores(method_name, scores, include_val=True):
    """Print one method's ROC/AP scores, preceded by a blank line.

    `scores` must provide 'test_roc' and 'test_ap', plus 'val_roc' and
    'val_ap' when `include_val` is true.
    """
    print('')
    if include_val:
        print('{} Validation ROC score: '.format(method_name), str(scores['val_roc']))
        print('{} Validation AP score: '.format(method_name), str(scores['val_ap']))
    print('{} Test ROC score: '.format(method_name), str(scores['test_roc']))
    print('{} Test AP score: '.format(method_name), str(scores['test_ap']))


def calculate_all_scores(adj_sparse, features_matrix=None, directed=False,
                         test_frac=.1, val_frac=.05, random_state=0, verbose=1,
                         train_test_split_file=None, tf_dtype=tf.float32):
    """Run every link-prediction method on one graph and collect the scores.

    Args:
        adj_sparse: sparse adjacency matrix of the full graph.
        features_matrix: node features forwarded to the GAE models (optional).
        directed: if true, use the directed edge-masking routine and build the
            training graph as an ``nx.DiGraph``.
        test_frac, val_frac: fractions of edges to hide; only used when no
            stored split could be loaded.
        random_state: seed applied to both NumPy and TensorFlow.
        verbose: when >= 1, print split statistics and per-method scores; the
            value is also forwarded to the node2vec and GAE scorers.
        train_test_split_file: path of a pickled split to reuse. If it is
            ``None`` or cannot be loaded, a new split is generated.
        tf_dtype: TensorFlow dtype passed to the dot-product GAE run.

    Returns:
        dict mapping method keys ('aa', 'jc', 'pa', 'sc', 'n2v_edge_emb',
        'n2v_dot_prod', 'gae', 'gae_edge_emb') to the score dictionary
        returned by the corresponding scoring function.
    """
    np.random.seed(random_state)  # Guarantee consistent train/test split
    tf.set_random_seed(random_state)  # Consistent GAE training

    # Link-prediction scores, keyed by method
    lp_scores = {}

    ### ---------- PREPROCESSING ---------- ###
    train_test_split = None
    if train_test_split_file is not None:
        try:
            # Reuse an existing split if one is stored on disk.
            # NOTE: pickle.load can execute arbitrary code; only point this at
            # split files you created yourself.
            with open(train_test_split_file, 'rb') as f:
                train_test_split = pickle.load(f)
        except Exception:
            # Best effort: a missing or unreadable file falls back to
            # generating a fresh split (KeyboardInterrupt/SystemExit are no
            # longer swallowed, unlike the previous bare `except:`).
            train_test_split = None
        else:
            print('Found existing train-test split!')

    if train_test_split is None:
        # Otherwise, generate the split on the fly
        print('Generating train-test split...')
        if directed:
            train_test_split = mask_test_edges_directed(adj_sparse, test_frac=test_frac, val_frac=val_frac)
        else:
            train_test_split = mask_test_edges(adj_sparse, test_frac=test_frac, val_frac=val_frac)

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split  # Unpack tuple

    # g_train: graph object built from the training adjacency matrix only
    g_train = nx.DiGraph(adj_train) if directed else nx.Graph(adj_train)

    # Inspect train/test split
    if verbose >= 1:
        print("Total nodes:", adj_sparse.shape[0])
        print("Total edges:", int(adj_sparse.nnz/2))  # adj is symmetric, so nnz (num non-zero) = 2*num_edges
        print("Training edges (positive):", len(train_edges))
        print("Training edges (negative):", len(train_edges_false))
        print("Validation edges (positive):", len(val_edges))
        print("Validation edges (negative):", len(val_edges_false))
        print("Test edges (positive):", len(test_edges))
        print("Test edges (negative):", len(test_edges_false))
        print('')
        print("------------------------------------------------------")

    ### ---------- LINK PREDICTION BASELINES ---------- ###
    # Adamic-Adar
    aa_scores = adamic_adar_scores(g_train, train_test_split)
    lp_scores['aa'] = aa_scores
    if verbose >= 1:
        _print_lp_scores('Adamic-Adar', aa_scores, include_val=False)

    # Jaccard Coefficient
    jc_scores = jaccard_coefficient_scores(g_train, train_test_split)
    lp_scores['jc'] = jc_scores
    if verbose >= 1:
        _print_lp_scores('Jaccard Coefficient', jc_scores, include_val=False)

    # Preferential Attachment
    pa_scores = preferential_attachment_scores(g_train, train_test_split)
    lp_scores['pa'] = pa_scores
    if verbose >= 1:
        _print_lp_scores('Preferential Attachment', pa_scores, include_val=False)

    ### ---------- SPECTRAL CLUSTERING ---------- ###
    sc_scores = spectral_clustering_scores(train_test_split)
    lp_scores['sc'] = sc_scores
    if verbose >= 1:
        _print_lp_scores('Spectral Clustering', sc_scores)
        print('')

    ### ---------- NODE2VEC ---------- ###
    # node2vec settings
    # NOTE: when p = q = 1, node2vec is equivalent to DeepWalk
    P = 1  # Return hyperparameter
    Q = 1  # In-out hyperparameter
    WINDOW_SIZE = 10  # Context size for optimization
    NUM_WALKS = 10  # Number of walks per source
    WALK_LENGTH = 80  # Length of walk per source
    DIMENSIONS = 128  # Embedding dimension
    # NOTE(review): fixed to False even when `directed` is true - confirm intended
    DIRECTED = False  # Graph directed/undirected
    WORKERS = 8  # Number of parallel workers
    ITER = 1  # SGD epochs

    # Bootstrapped edge embeddings + logistic regression
    n2v_edge_emb_scores = node2vec_scores(g_train, train_test_split,
                                          P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS,
                                          DIRECTED, WORKERS, ITER,
                                          "edge-emb",
                                          verbose)
    lp_scores['n2v_edge_emb'] = n2v_edge_emb_scores
    if verbose >= 1:
        _print_lp_scores('node2vec (Edge Embeddings)', n2v_edge_emb_scores)
        print('')

    # Dot product of node embeddings as the edge score
    n2v_dot_prod_scores = node2vec_scores(g_train, train_test_split,
                                          P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS,
                                          DIRECTED, WORKERS, ITER,
                                          "dot-product",
                                          verbose)
    lp_scores['n2v_dot_prod'] = n2v_dot_prod_scores
    if verbose >= 1:
        _print_lp_scores('node2vec (Dot Product)', n2v_dot_prod_scores)
        print('')

    ### ---------- (VARIATIONAL) GRAPH AUTOENCODER ---------- ###
    # GAE hyperparameters
    LEARNING_RATE = 0.01  # Default: 0.01
    EPOCHS = 250
    HIDDEN1_DIM = 32
    HIDDEN2_DIM = 16
    DROPOUT = 0

    # Dot-product decoder
    tf.set_random_seed(random_state)  # Consistent GAE training
    gae_results = gae_scores(adj_sparse, train_test_split, features_matrix,
                             LEARNING_RATE, EPOCHS, HIDDEN1_DIM, HIDDEN2_DIM, DROPOUT,
                             "dot-product",
                             verbose,
                             dtype=tf_dtype)  # honour the caller's dtype (default tf.float32, as before)
    lp_scores['gae'] = gae_results
    if verbose >= 1:
        _print_lp_scores('GAE (Dot Product)', gae_results)
        print("------------------------------------------------------")
        print("------------------------------------------------------")
        print('')

    # Edge embeddings
    # NOTE(review): this run relies on gae_scores' own default dtype, as in
    # the original code - confirm whether tf_dtype should be forwarded here too.
    tf.set_random_seed(random_state)  # Consistent GAE training
    gae_edge_emb_results = gae_scores(adj_sparse, train_test_split, features_matrix,
                                      LEARNING_RATE, EPOCHS, HIDDEN1_DIM, HIDDEN2_DIM, DROPOUT,
                                      "edge-emb",
                                      verbose)
    lp_scores['gae_edge_emb'] = gae_edge_emb_results
    if verbose >= 1:
        _print_lp_scores('GAE (Edge Embeddings)', gae_edge_emb_results)

    ### ---------- RETURN RESULTS ---------- ###
    return lp_scores
def main(filename, last_month_file='m4.csv'):
    """Learn node2vec embeddings for one edge-list file and build edge features.

    The file is read as a comma-separated edge list with integer node ids and
    every edge gets weight 1. node2vec embeddings are trained on the training
    graph produced by ``mask_test_edges``; an edge's feature vector is the
    element-wise (Hadamard) product of its two node embeddings.

    Args:
        filename: path of the edge-list file.
        last_month_file: file name that marks the final (evaluation) month.
            For this file the edges are split into validation/test sets
            (val_frac=0.33, test_frac=0.67); every other file is used
            entirely for training. Defaults to 'm4.csv', the value that was
            previously hard-coded.

    Returns:
        A 6-tuple ``(train_edge_embs, train_edge_labels, val_edge_embs,
        val_edge_labels, test_edge_embs, test_edge_labels)``. Positions that
        are not computed for the given file hold the placeholder 69:
        the first two for the last-month file, the last four otherwise.
        Labels are 1 for real edges and 0 for sampled false edges.
    """
    # Kept as function-level imports, as in the original notebook export
    from gae.preprocessing import mask_test_edges
    import node2vec
    from gensim.models import Word2Vec

    placeholder = 69  # sentinel callers receive for the parts not computed

    # ## 1. Read the graph
    graph = nx.read_edgelist(filename, nodetype=int, delimiter=",")
    for u, v in graph.edges():
        graph[u][v]['weight'] = 1

    # ## 2. Preprocessing / train-test split
    np.random.seed(0)  # make sure train-test split is consistent between notebooks
    adj_sparse = nx.to_scipy_sparse_matrix(graph)

    is_last_month = filename == last_month_file
    if is_last_month:
        # The previous code re-read the edge list here; that had no effect on
        # adj_sparse, so the redundant read was removed.
        (adj_train, _, _, val_edges, val_edges_false,
         test_edges, test_edges_false) = mask_test_edges(adj_sparse, test_frac=0.67, val_frac=0.33)
    else:
        (adj_train, train_edges, train_edges_false,
         _, _, _, _) = mask_test_edges(adj_sparse, test_frac=0, val_frac=0)
    # New graph object with only the non-hidden edges
    g_train = nx.from_scipy_sparse_matrix(adj_train)

    # ## 3. Train node2vec (learn node embeddings)
    # NOTE: When p = q = 1, this is equivalent to DeepWalk
    P = 1  # Return hyperparameter
    Q = 1  # In-out hyperparameter
    WINDOW_SIZE = 10  # Context size for optimization
    NUM_WALKS = 10  # Number of walks per source
    WALK_LENGTH = 80  # Length of walk per source
    DIMENSIONS = 128  # Embedding dimension
    DIRECTED = False  # Graph directed/undirected
    WORKERS = 8  # Num. parallel workers
    ITER = 1  # SGD epochs

    # Preprocessing, generate walks
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)  # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH)
    # Materialise each walk as a list of strings. On Python 3 `map(str, walk)`
    # is a one-shot iterator, so Word2Vec (which passes over the corpus more
    # than once) would see empty sentences after the first pass.
    walks = [[str(node) for node in walk] for walk in walks]

    # Train skip-gram model
    model = Word2Vec(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0,
                     sg=1, workers=WORKERS, iter=ITER)

    # Store embeddings mapping
    emb_mappings = model.wv

    # ## 4. Create edge embeddings
    # Node embeddings matrix (rows = nodes, columns = embedding features)
    emb_matrix = np.vstack(
        [emb_mappings[str(node_index)] for node_index in range(adj_sparse.shape[0])])

    def get_edge_embeddings(edge_list):
        """Return one Hadamard-product embedding per (v1, v2) edge."""
        return np.array(
            [np.multiply(emb_matrix[edge[0]], emb_matrix[edge[1]]) for edge in edge_list])

    def embed_with_labels(pos_edges, neg_edges):
        """Stack positive then negative edge embeddings with 1/0 labels."""
        embs = np.concatenate(
            [get_edge_embeddings(pos_edges), get_edge_embeddings(neg_edges)])
        labels = np.concatenate([np.ones(len(pos_edges)), np.zeros(len(neg_edges))])
        return embs, labels

    if is_last_month:
        val_edge_embs, val_edge_labels = embed_with_labels(val_edges, val_edges_false)
        test_edge_embs, test_edge_labels = embed_with_labels(test_edges, test_edges_false)
        return (placeholder, placeholder, val_edge_embs, val_edge_labels,
                test_edge_embs, test_edge_labels)

    train_edge_embs, train_edge_labels = embed_with_labels(train_edges, train_edges_false)
    return (train_edge_embs, train_edge_labels,
            placeholder, placeholder, placeholder, placeholder)