def gae_scores(
        adj_sparse,
        train_test_split,
        features_matrix=None,
        LEARNING_RATE=0.01,
        EPOCHS=200,
        HIDDEN1_DIM=32,
        HIDDEN2_DIM=16,
        DROPOUT=0,
        edge_score_mode="dot-product",
        verbose=1,
        dtype=tf.float32):
    """Train a variational graph auto-encoder and score held-out edges.

    Args:
        adj_sparse: Sparse adjacency matrix of the full graph. Only its node
            count and entry sum are used here (for the class-balance terms).
        train_test_split: 7-tuple ``(adj_train, train_edges,
            train_edges_false, val_edges, val_edges_false, test_edges,
            test_edges_false)``.
        features_matrix: Optional node-feature matrix. When ``None`` an
            identity matrix (one one-hot feature per node) is used.
        LEARNING_RATE: Forwarded to ``OptimizerVAE``.
        EPOCHS: Number of full-batch weight updates.
        HIDDEN1_DIM: Forwarded to ``GCNModelVAE``.
        HIDDEN2_DIM: Forwarded to ``GCNModelVAE``.
        DROPOUT: Value fed to the dropout placeholder during weight updates
            (0 is fed whenever embeddings are read out).
        edge_score_mode: ``"dot-product"`` scores an edge by the inner product
            of its two node embeddings; ``"edge-emb"`` fits a logistic
            regression on the element-wise product of the two embeddings.
        verbose: ``>= 1`` prints progress messages, ``== 2`` additionally
            prints per-epoch metrics.
        dtype: Forwarded to ``GCNModelVAE``.

    Returns:
        dict with keys ``'test_roc'``, ``'test_ap'``, ``'val_roc'``,
        ``'val_ap'``, ``'val_roc_per_epoch'`` and ``'runtime'``. In
        ``"edge-emb"`` mode the two validation scores are ``None`` when either
        validation edge list is empty.

    Raises:
        ValueError: If ``edge_score_mode`` is not one of the two supported
            modes (checked before any training is done).
    """
    # Fail fast: an unknown mode would otherwise only surface as a NameError
    # after the whole training run.
    if edge_score_mode not in ("dot-product", "edge-emb"):
        raise ValueError(
            "edge_score_mode must be 'dot-product' or 'edge-emb', got "
            "{!r}".format(edge_score_mode))

    # Unpack train-test split
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split

    if verbose >= 1:
        print('GAE preprocessing...')

    start_time = time.time()

    # Train on CPU (hide GPU) due to memory constraints
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    # Convert features --> sparse matrix --> tuple.
    # features_tuple contains:
    # (list of matrix coordinates, list of values, matrix dimensions)
    if features_matrix is None:
        # Built directly as a sparse identity to avoid a dense n x n array.
        x = sp.identity(adj_sparse.shape[0], format='lil')
    else:
        x = sp.lil_matrix(features_matrix)
    features_tuple = sparse_to_tuple(x)
    features_shape = features_tuple[2]

    # Graph attributes (to feed into the model)
    num_nodes = adj_sparse.shape[0]
    num_features = features_shape[1]  # columns of the features matrix
    features_nonzero = features_tuple[1].shape[0]  # length of the values list

    # Normalize adjacency matrix
    adj_norm = preprocess_graph(adj_train)

    # Reconstruction target: training adjacency with the diagonal added in
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Define placeholders
    # NOTE(review): the placeholders and the optimizer are hard-wired to
    # float16 while the model receives `dtype` (float32 by default) --
    # confirm this mix is intended.
    placeholders = {  # TODO: try making these dense from the get-go
        'features': tf.sparse_placeholder(tf.float16),
        'adj': tf.sparse_placeholder(tf.float16),
        'adj_orig': tf.sparse_placeholder(tf.float16),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # How much to weigh positive examples (true edges) in the cost function.
    # Less-frequent classes are weighted higher to prevent output bias:
    # pos_weight = (num. negative entries) / (num. positive entries)
    num_entries = adj_sparse.shape[0] * adj_sparse.shape[0]
    num_positive = adj_sparse.sum()
    pos_weight = float(num_entries - num_positive) / num_positive
    # Normalize (scale) the average weighted cost
    norm = num_entries / float((num_entries - num_positive) * 2)

    if verbose >= 1:
        print('Initializing GAE model...')

    # Create VAE model
    model = GCNModelVAE(placeholders, num_features, num_nodes,
                        features_nonzero, HIDDEN1_DIM, HIDDEN2_DIM,
                        dtype=dtype, flatten_output=False)

    opt = OptimizerVAE(preds=model.reconstructions,
                       labels=tf.sparse_tensor_to_dense(
                           placeholders['adj_orig'], validate_indices=False),
                       model=model, num_nodes=num_nodes,
                       pos_weight=pos_weight,
                       norm=norm,
                       learning_rate=LEARNING_RATE,
                       dtype=tf.float16)

    val_roc_score = []

    # The session is only needed until the final embeddings are read out;
    # the context manager guarantees it is closed even if training fails.
    with tf.Session() as sess:
        if verbose >= 1:
            # Print total number of trainable parameters
            total_parameters = 0
            for variable in tf.trainable_variables():
                # shape is an array of tf.Dimension
                shape = variable.get_shape()
                print('Variable shape:  {}'.format(shape))
                variable_parameters = 1
                for dim in shape:
                    print('Current dimension:  {}'.format(dim))
                    variable_parameters *= dim.value
                print('Variable params:  {}'.format(variable_parameters))
                total_parameters += variable_parameters
                print('')
            print('TOTAL TRAINABLE PARAMS:  {}'.format(total_parameters))

            print('Initializing TF variables...')

        sess.run(tf.global_variables_initializer())

        if verbose >= 1:
            print('Starting GAE training!')

        # The fed values never change between epochs, so the feed dictionary
        # is built once; only the dropout entry is toggled. Building it here
        # also keeps it defined when EPOCHS == 0.
        feed_dict = construct_feed_dict(adj_norm, adj_label, features_tuple,
                                        placeholders)

        # Train model
        for epoch in range(EPOCHS):
            t = time.time()

            # Run single weight update (with training dropout)
            feed_dict.update({placeholders['dropout']: DROPOUT})
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                            feed_dict=feed_dict)
            avg_cost = outs[1]
            avg_accuracy = outs[2]

            # Evaluate predictions (dropout disabled)
            feed_dict.update({placeholders['dropout']: 0})
            gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)
            gae_score_matrix = np.dot(gae_emb, gae_emb.T)

            roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false,
                                              gae_score_matrix,
                                              apply_sigmoid=True)
            val_roc_score.append(roc_curr)

            # Print results for this epoch
            if verbose == 2:
                print("Epoch:", '%04d' % (epoch + 1),
                      "train_loss=", "{:.5f}".format(avg_cost),
                      "train_acc=", "{:.5f}".format(avg_accuracy),
                      "val_roc=", "{:.5f}".format(val_roc_score[-1]),
                      "val_ap=", "{:.5f}".format(ap_curr),
                      "time=", "{:.5f}".format(time.time() - t))

        if verbose == 2:
            print("Optimization Finished!")

        # Final embeddings (dropout disabled)
        feed_dict.update({placeholders['dropout']: 0})
        gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)

    if edge_score_mode == "dot-product":
        # Dot product edge scores (default)
        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        runtime = time.time() - start_time

        # Calculate final scores
        gae_val_roc, gae_val_ap = get_roc_score(
            val_edges, val_edges_false, gae_score_matrix)
        gae_test_roc, gae_test_ap = get_roc_score(
            test_edges, test_edges_false, gae_score_matrix)

    else:
        # "edge-emb": edge embeddings via Hadamard product of node embeddings
        def get_edge_embeddings(edge_list):
            return np.array([np.multiply(gae_emb[edge[0]], gae_emb[edge[1]])
                             for edge in edge_list])

        has_val = len(val_edges) > 0 and len(val_edges_false) > 0

        # Train-set edge embeddings and labels: 1 = real edge, 0 = false edge
        train_edge_embs = np.concatenate([
            get_edge_embeddings(train_edges),
            get_edge_embeddings(train_edges_false)])
        train_edge_labels = np.concatenate([
            np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

        # Val-set edge embeddings, labels
        if has_val:
            val_edge_embs = np.concatenate([
                get_edge_embeddings(val_edges),
                get_edge_embeddings(val_edges_false)])
            val_edge_labels = np.concatenate([
                np.ones(len(val_edges)), np.zeros(len(val_edges_false))])

        # Test-set edge embeddings, labels
        test_edge_embs = np.concatenate([
            get_edge_embeddings(test_edges),
            get_edge_embeddings(test_edges_false)])
        test_edge_labels = np.concatenate([
            np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of class "1" (real edge)
        if has_val:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        runtime = time.time() - start_time

        # Calculate scores
        if has_val:
            gae_val_roc = roc_auc_score(val_edge_labels, val_preds)
            gae_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            gae_val_roc = None
            gae_val_ap = None

        gae_test_roc = roc_auc_score(test_edge_labels, test_preds)
        gae_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Record scores
    gae_scores = {}
    gae_scores['test_roc'] = gae_test_roc
    gae_scores['test_ap'] = gae_test_ap
    gae_scores['val_roc'] = gae_val_roc
    gae_scores['val_ap'] = gae_val_ap
    gae_scores['val_roc_per_epoch'] = val_roc_score
    gae_scores['runtime'] = runtime
    return gae_scores
model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero) pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum() #负样本与正样本的比例 print('adj.shape', adj.shape[0], adj.sum()) norm = adj.shape[0] * adj.shape[0] / float( (adj.shape[0] * adj.shape[0] - adj.sum()) * 2) # shape[0] = 913, adj.sum() = 10734 # Optimizer opt = OptimizerVAE(preds=model.reconstructions, labels=tf.reshape( tf.sparse_tensor_to_dense( placeholders['adj_orig'], validate_indices=False), [-1]), model=model, num_nodes=num_nodes, pos_weight=pos_weight, norm=norm) # Initialize session sess = tf.Session() sess.run(tf.global_variables_initializer()) adj_label = adj_train + sp.eye(adj_train.shape[0]) adj_label = sparse_to_tuple(adj_label) # Train model for epoch in range(FLAGS.epochs): t = time.time()
def gae(filename, output_dir):
    """Train a graph auto-encoder on a network file and save its outputs.

    The network is loaded with ``load_network_data(filename)``, a GCN
    (variational) auto-encoder is trained on the training adjacency returned
    by ``mask_test_edges``, and the embedding inner-product matrix is
    thresholded into a binary graph. Side effects:

    * changes the working directory to its parent (``os.chdir('../')``);
    * writes ``<output_dir>/GAE_network.npy`` (thresholded reconstruction
      without its first row and column);
    * writes ``<output_dir>/gae_network_statistics.pickle`` with the result of
      ``compute_graph_statistics`` and prints it.

    Args:
        filename: Path handed to ``load_network_data``.
        output_dir: Directory for the two output files; it is resolved after
            the working-directory change.

    Raises:
        ValueError: If the ``model`` flag is neither ``'gcn_ae'`` nor
            ``'gcn_vae'``, or if the ``features`` flag is non-zero (no node
            features are loaded by this function).
    """
    # Settings
    # NOTE(review): the flags are defined on every call; presumably a second
    # call in the same process fails with a duplicate-flag error -- confirm.
    flags = tf.app.flags
    FLAGS = flags.FLAGS
    flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
    flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
    flags.DEFINE_integer('hidden1', 32, 'Number of units in hidden layer 1.')
    flags.DEFINE_integer('hidden2', 16, 'Number of units in hidden layer 2.')
    flags.DEFINE_float('weight_decay', 0.,
                       'Weight for L2 loss on embedding matrix.')
    flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).')
    flags.DEFINE_string('filename', 'email-Eu-core.mat', 'dataset')
    flags.DEFINE_string('model', 'gcn_vae', 'Model string.')
    flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
    flags.DEFINE_integer('features', 0,
                         'Whether to use features (1) or not (0).')

    model_str = FLAGS.model
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError(
            "model must be 'gcn_ae' or 'gcn_vae', got {!r}".format(model_str))
    if FLAGS.features != 0:
        # Only the featureless (identity) setting has a feature matrix here.
        raise ValueError('no node features are available; use features=0')

    # Load data
    adj, R, edges = load_network_data(filename)
    num_edges = np.sum(adj)
    length = adj.shape[0]
    adj = sp.csr_matrix(adj)

    adj_train, train_edges = mask_test_edges(adj)
    adj = adj_train

    features = sp.identity(adj.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    else:
        model = GCNModelVAE(placeholders, num_features, num_nodes,
                            features_nonzero)

    # Class-balance terms: negatives / positives, and the cost scale factor
    num_entries = adj.shape[0] * adj.shape[0]
    pos_weight = float(num_entries - adj.sum()) / adj.sum()
    norm = num_entries / float((num_entries - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                      validate_indices=False), [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Initialize session; closed in `finally` so it is not leaked on errors.
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer())

        # The fed values are identical every epoch, so build the dictionary
        # once (this also keeps it defined when epochs == 0).
        feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                        placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})

        # Train model: one full-batch weight update per epoch
        for epoch in range(FLAGS.epochs):
            sess.run([opt.opt_op, opt.cost, opt.accuracy],
                     feed_dict=feed_dict)

        print("GAE Optimization Finished!")

        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)

        # Reconstructed edge scores: inner products of node embeddings
        adj_rec = np.array(np.dot(emb, emb.T))

        # Keep the entries at or above the num_edges-th largest score.
        DD = np.sort(adj_rec.flatten())
        threshold = DD[int(-1 * num_edges)]
        # Vectorized form of the original nested loops, which filled
        # network_C[j][i] from adj_rec[i, j] -- hence the transpose.
        network_C = np.where(adj_rec < threshold, 0, 1).astype(np.int8).T

        # NOTE(review): the working directory is changed before output_dir is
        # resolved -- confirm callers rely on this.
        os.chdir('../')
        # The first row and column are dropped before saving.
        np.save('{}/GAE_network.npy'.format(output_dir),
                network_C[1:length, :][:, 1:length])

        # Project the score matrix through R[0..3]: A <- R_i * A * R_i^T.
        # Only final_network[0] is consumed below; the other levels were used
        # by a draw_graph call that is currently disabled.
        A_copy = adj_rec
        final_network = [A_copy]
        for i in range(1, 5):
            adjacent_matrix = tf.placeholder(tf.float32, shape=A_copy.shape)
            R_matrix = tf.placeholder(tf.float32, shape=R[i - 1, 0].shape)
            A_copy = sess.run(
                tf.matmul(tf.matmul(R_matrix, adjacent_matrix),
                          tf.transpose(R_matrix)),
                feed_dict={
                    R_matrix: R[i - 1, 0].todense(),
                    adjacent_matrix: A_copy
                })
            final_network.append(np.array(A_copy))
    finally:
        sess.close()

    network_B = final_network[0]
    print('Generating graph by GAE algorithm.')
    # Threshold at the score with rank edges[0, 0] in descending order.
    DD = np.sort(network_B.flatten())[::-1]
    threshold = DD[edges[0, 0]]
    network_C = np.where(network_B < threshold, 0, 1).T
    # Symmetrize and clip back to a 0/1 matrix.
    _A_obs = np.minimum(network_C + network_C.T, 1)

    print('Computing metrics for graph generated by GAE')
    c = compute_graph_statistics(_A_obs)
    with open('{}/gae_network_statistics.pickle'.format(output_dir),
              'wb') as handle:
        pickle.dump(c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(c)
def __init__(self, graph_edgelist, num_actions, dimension,
             learning_rate=0.01, epochs=300, hidden1=32, hidden2=16,
             dropout=0., model_str='gcn_vae', use_features=0):
    """Train a graph auto-encoder on the graph and store node embeddings.

    The graph is read with ``self.read_graph``, a GCN (variational)
    auto-encoder is trained for ``epochs`` full-batch updates, and the
    resulting ``z_mean`` values are stored in ``self.embeddings``.

    Args:
        graph_edgelist: Graph description handed to ``self.read_graph``.
        num_actions: Validated by ``BasisFunction._validate_num_actions``.
        dimension: Embedding size passed to the model; must be >= 1.
        learning_rate: Forwarded to the optimizer.
        epochs: Number of weight updates.
        hidden1: Forwarded to the model.
        hidden2: Forwarded to the ``'gcn_ae'`` model only.
        dropout: Dropout value fed during training (0 when the embeddings
            are read out).
        model_str: ``'gcn_ae'`` or ``'gcn_vae'``.
        use_features: When 0 the features returned by ``read_graph`` are
            replaced by an identity matrix.

    Raises:
        ValueError: If ``graph_edgelist`` is None, ``dimension`` is below 1,
            or ``model_str`` is not a supported model name.
    """
    if graph_edgelist is None:
        raise ValueError('graph cannot be None')

    if dimension < 1:
        raise ValueError('dimension must be >= 1')

    self.__num_actions = BasisFunction._validate_num_actions(num_actions)

    self._dimension = dimension

    # Reject unknown models before any graph processing is done; otherwise
    # the failure only appears later as an attribute error on None.
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError(
            "model_str must be 'gcn_ae' or 'gcn_vae', got "
            "{!r}".format(model_str))

    adj, features = self.read_graph(graph_edgelist)

    # Only the training adjacency of the split is needed (for the labels).
    adj_train = mask_test_edges(adj)[0]

    if use_features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing. The model input is normalized from the full graph
    # (`adj` is deliberately not replaced by `adj_train` here).
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero,
                           hidden1, hidden2, dimension)
    else:
        model = GCNModelVAE(placeholders, num_features, num_nodes,
                            features_nonzero, hidden1, dimension)

    # Class-balance terms: negatives / positives, and the cost scale factor
    num_entries = adj.shape[0] * adj.shape[0]
    pos_weight = float(num_entries - adj.sum()) / adj.sum()
    norm = num_entries / float((num_entries - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                      validate_indices=False), [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm,
                              learning_rate=learning_rate)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=labels,
                               model=model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm,
                               learning_rate=learning_rate)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # The session is local to the constructor, so close it once the
    # embeddings have been fetched.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Identical inputs every epoch: build the feed dictionary once (this
        # also keeps it defined when epochs == 0).
        feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                        placeholders)
        feed_dict.update({placeholders['dropout']: dropout})

        # Train model: one full-batch weight update per epoch
        for epoch in range(epochs):
            sess.run([opt.opt_op, opt.cost, opt.accuracy],
                     feed_dict=feed_dict)

        print("GCN Optimization Finished!")

        feed_dict.update({placeholders['dropout']: 0})
        self.embeddings = sess.run(model.z_mean, feed_dict=feed_dict)
def gcn_multilayer(self):
    """Neural embedding of a multilayer network.

    For every ``(name, graph)`` pair in ``self.nets`` a GCN (variational)
    auto-encoder is trained for ``self.n_iter`` epochs, validation and test
    ROC/AP scores are printed, and ``model.embeddings`` is written as text to
    ``self.out_dir + name + 'vectors.txt'``.

    Raises:
        ValueError: If ``self.model_str`` is neither ``'gcn_ae'`` nor
            ``'gcn_vae'``.
    """
    all_nodes = self.get_all_nodes()
    tmp_fname = pjoin(self.out_dir, 'tmp.emb')

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    for net_name, net in self.nets.items():
        self.log.info('Run GCN For Net: %s' % net_name)

        # Keep the adjacency sparse instead of going through a dense n x n
        # matrix; explicit zeros are dropped, as the dense round trip did.
        adj = sp.csr_matrix(nx.adjacency_matrix(net))
        adj.eliminate_zeros()
        nodes_count = adj.shape[0]
        features = sp.identity(nodes_count)  # featureless

        adj_train, train_edges, val_edges, val_edges_false, test_edges, \
            test_edges_false = mask_test_edges(adj)
        adj = adj_train

        # Some pre processing
        adj_norm = preprocess_graph(adj)

        # Define placeholders
        # NOTE(review): every network adds a new model to the same default
        # TensorFlow graph, so the graph grows with each iteration -- confirm
        # whether each network should get its own graph.
        placeholders = {
            'features': tf.sparse_placeholder(tf.float32),
            'adj': tf.sparse_placeholder(tf.float32),
            'adj_orig': tf.sparse_placeholder(tf.float32),
            'dropout': tf.placeholder_with_default(0., shape=())
        }

        num_nodes = adj.shape[0]

        features = sparse_to_tuple(features.tocoo())
        num_features = features[2][1]
        features_nonzero = features[1].shape[0]

        # Create model
        if self.model_str == 'gcn_ae':
            model = GCNModelAE(placeholders, num_features, features_nonzero,
                               self.hidden1, self.hidden2)
        elif self.model_str == 'gcn_vae':
            model = GCNModelVAE(placeholders, num_features, num_nodes,
                                features_nonzero, self.hidden1, self.hidden2)
        else:
            raise ValueError(
                "model_str must be 'gcn_ae' or 'gcn_vae', got "
                "{!r}".format(self.model_str))

        # Class-balance terms: negatives / positives, and cost scale factor
        num_entries = adj.shape[0] * adj.shape[0]
        pos_weight = float(num_entries - adj.sum()) / adj.sum()
        norm = num_entries / float((num_entries - adj.sum()) * 2)

        # Optimizer
        with tf.name_scope('optimizer'):
            labels = tf.reshape(
                tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                          validate_indices=False), [-1])
            if self.model_str == 'gcn_ae':
                opt = OptimizerAE(preds=model.reconstructions,
                                  labels=labels,
                                  pos_weight=pos_weight,
                                  norm=norm)
            else:
                opt = OptimizerVAE(preds=model.reconstructions,
                                   labels=labels,
                                   model=model, num_nodes=num_nodes,
                                   pos_weight=pos_weight,
                                   norm=norm)

        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = sparse_to_tuple(adj_label)

        # Identical inputs every epoch: build the feed dictionary once (this
        # also keeps it defined when self.n_iter == 0).
        feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                        placeholders)

        val_roc_score = []
        dropout = 0

        # One session per network, closed as soon as its vectors are saved.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            def get_roc_score(edges_pos, edges_neg, emb=None):
                """Return (ROC AUC, average precision) for the edge sets."""
                if emb is None:
                    feed_dict.update({placeholders['dropout']: 0})
                    emb = sess.run(model.z_mean, feed_dict=feed_dict)

                # Predict on the given edges: sigmoid of embedding products
                adj_rec = np.dot(emb, emb.T)
                preds = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_pos]
                preds_neg = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_neg]

                preds_all = np.hstack([preds, preds_neg])
                labels_all = np.hstack([np.ones(len(preds)),
                                        np.zeros(len(preds_neg))])
                roc_score = roc_auc_score(labels_all, preds_all)
                ap_score = average_precision_score(labels_all, preds_all)

                return roc_score, ap_score

            # Train model
            for epoch in range(self.n_iter):
                self.log.info('Iteration: %d' % epoch)
                t = time.time()

                # get_roc_score switches dropout to 0, so restore the
                # training value before every update.
                feed_dict.update({placeholders['dropout']: dropout})

                # Run single weight update
                outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                                feed_dict=feed_dict)

                # Compute average loss
                avg_cost = outs[1]
                avg_accuracy = outs[2]

                roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false)
                val_roc_score.append(roc_curr)

                print("Epoch:", '%04d' % (epoch + 1),
                      "train_loss=", "{:.5f}".format(avg_cost),
                      "train_acc=", "{:.5f}".format(avg_accuracy),
                      "val_roc=", "{:.5f}".format(val_roc_score[-1]),
                      "val_ap=", "{:.5f}".format(ap_curr),
                      "time=", "{:.5f}".format(time.time() - t))

            print("Optimization Finished!")

            roc_score, ap_score = get_roc_score(test_edges, test_edges_false)
            print('Test ROC score: ' + str(roc_score))
            print('Test AP score: ' + str(ap_score))

            # ------ vector generation ------
            vectors = sess.run(model.embeddings, feed_dict=feed_dict)

        # NOTE(review): the path is built by plain concatenation, so
        # self.out_dir presumably ends with a path separator -- confirm.
        fname = self.out_dir + net_name + 'vectors.txt'
        np.savetxt(fname, np.array(vectors), fmt="%s", delimiter='  ')
        self.log.info('Saving vectors: %s' % fname)
        self.log.info('after exec gcn : %s' % net_name)

    self.log.info('Done!')
def run(self):
    """Load the graph, train the auto-encoder and collect supernode data.

    Data comes from ``graph_generator.load_combo`` when ``self.file_expr`` is
    empty and from ``graph_generator.load_data`` otherwise. After training
    for ``self.epochs`` epochs the method prints test ROC/AP scores and
    stores the result of ``self.get_embeddings`` in ``self.supernodes``.
    It also sets ``self.idx_supernodes``, ``self.adj_orig``, ``self.model``,
    ``self.sess`` and ``self.feed_dict``.

    Raises:
        ValueError: If ``model_str`` is neither ``'gcn_ae'`` nor
            ``'gcn_vae'``.
    """
    if self.file_expr == '':
        # text-image-code combination
        loaded = graph_generator.load_combo(self.labels_dict)
    else:
        loaded = graph_generator.load_data(
            self.labels_dict,
            self.file_expr,
            min_valid_triples=self.min_valid_triples,
            sep=self.file_sep,
            select_rels=self.select_rels)
    # The three masks in the middle of the tuple are not used here.
    n_by_n, x_train, y_train, _, _, _, idx_supernodes, label_encoder = loaded

    self.idx_supernodes = idx_supernodes
    adj = nx.adjacency_matrix(nx.from_scipy_sparse_matrix(n_by_n))
    # Use the public constructor via the `sp` alias already used below,
    # instead of the private `scipy.sparse.csr` module path.
    features = sp.csr_matrix(x_train)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    self.adj_orig = adj_orig

    adj_train, train_edges, val_edges, val_edges_false, test_edges, \
        test_edges_false = mask_test_edges2(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    if not self.use_features:
        features = sp.identity(features.shape[0])  # featureless

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    # NOTE(review): `model_str` is not defined in this method; presumably a
    # module-level name -- confirm.
    if model_str == 'gcn_ae':
        self.model = GCNModelAE(self.placeholders, num_features,
                                features_nonzero)
    elif model_str == 'gcn_vae':
        self.model = GCNModelVAE(self.placeholders, num_features, num_nodes,
                                 features_nonzero)
    else:
        # Without this the optimizer below would never be created and the
        # failure would surface as an unbound `opt` in the training loop.
        raise ValueError(
            "model_str must be 'gcn_ae' or 'gcn_vae', got "
            "{!r}".format(model_str))

    # Class-balance terms: negatives / positives, and the cost scale factor
    num_entries = adj.shape[0] * adj.shape[0]
    pos_weight = float(num_entries - adj.sum()) / adj.sum()
    norm = num_entries / float((num_entries - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        labels = tf.reshape(
            tf.sparse_tensor_to_dense(self.placeholders['adj_orig'],
                                      validate_indices=False), [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=self.model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=self.model.reconstructions,
                               labels=labels,
                               model=self.model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session (kept on the instance for use after training)
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())

    val_roc_score = []

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model
    for epoch in range(self.epochs):
        t = time.time()

        # A fresh feed dictionary is built every epoch because
        # self.get_roc_score may read or alter self.feed_dict.
        self.feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                             self.placeholders)
        self.feed_dict.update(
            {self.placeholders['dropout']: self.dropout_rate})

        # Run single weight update
        outs = self.sess.run([opt.opt_op, opt.cost, opt.accuracy],
                             feed_dict=self.feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        roc_curr, ap_curr = self.get_roc_score(val_edges, val_edges_false)
        val_roc_score.append(roc_curr)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(avg_cost), "train_acc=",
              "{:.5f}".format(avg_accuracy), "val_roc=",
              "{:.5f}".format(val_roc_score[-1]), "val_ap=",
              "{:.5f}".format(ap_curr), "time=",
              "{:.5f}".format(time.time() - t))

    print("Optimization Finished!")

    roc_score, ap_score = self.get_roc_score(test_edges, test_edges_false)
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))

    supernodes, supernodes_embeddings, supernodes_labels = \
        self.get_embeddings(y_train, label_encoder)
    self.supernodes = [
        supernodes, supernodes_embeddings, supernodes_labels
    ]
def main(args):
    """Compute embeddings using GAE/VGAE.

    Reads an edge list from ``args.inputgraph``, trains the model selected by
    ``args.model`` for ``FLAGS.epochs`` epochs and then either

    * writes sigmoid edge scores for the edges in ``args.tr_e`` to
      ``args.tr_pred`` (and, when given, for ``args.te_e`` to
      ``args.te_pred``), or
    * when ``args.tr_e`` is None, writes the embedding matrix to
      ``args.output``.

    Node ids are used directly as row indices: row ``i`` of the adjacency
    matrix, of the embeddings and of the score matrix belongs to node ``i``
    (after shifting 1-based ids down by one).

    Args:
        args: Namespace with ``inputgraph``, ``delimiter``, ``model``,
            ``tr_e``, ``tr_pred``, ``te_e``, ``te_pred`` and ``output``.

    Raises:
        ValueError: If ``args.model`` is not ``'gcn_ae'``/``'gcn_vae'`` or the
            edge list contains negative node ids.
    """
    if args.model not in ('gcn_ae', 'gcn_vae'):
        raise ValueError(
            "model must be 'gcn_ae' or 'gcn_vae', got "
            "{!r}".format(args.model))

    # Load edgelist; ndmin=2 keeps a single-edge file two-dimensional.
    E = np.loadtxt(args.inputgraph, delimiter=args.delimiter, dtype=int,
                   ndmin=2)
    # Only the first two columns are node ids (further columns are ignored).
    node_pairs = E[:, :2]
    oneIndx = bool(np.min(node_pairs) == 1)
    if oneIndx:
        node_pairs = node_pairs - 1
    if np.min(node_pairs) < 0:
        raise ValueError('node ids must be non-negative')

    # Create an unweighted graph. All ids 0..max are inserted first and an
    # explicit nodelist is used so that matrix row i is node i; otherwise
    # rows follow edge insertion order and id-based lookups hit wrong rows.
    num_nodes = int(np.max(node_pairs)) + 1
    G = nx.Graph()
    G.add_nodes_from(range(num_nodes))
    G.add_edges_from(node_pairs.tolist())

    # Get adj matrix of the graph
    tr_A = nx.adjacency_matrix(G, nodelist=list(range(num_nodes)),
                               weight=None)

    # Set main diag to 1s and normalize (algorithm requirement)
    adj_norm = preprocess_graph(tr_A)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # Featureless setting: identity feature matrix
    features = sp.identity(num_nodes)
    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    if args.model == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    else:
        model = GCNModelVAE(placeholders, num_features, num_nodes,
                            features_nonzero)

    # Class-balance terms: negatives / positives, and the cost scale factor
    num_entries = tr_A.shape[0] * tr_A.shape[0]
    pos_weight = float(num_entries - tr_A.sum()) / tr_A.sum()
    norm = num_entries / float((num_entries - tr_A.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                      validate_indices=False), [-1])
        if args.model == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    adj_label = tr_A + sp.eye(tr_A.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # The session is only needed until the embeddings are fetched.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Identical inputs every epoch: build the feed dictionary once (this
        # also keeps it defined when FLAGS.epochs == 0).
        feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                        placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})

        # Train model
        for epoch in range(FLAGS.epochs):
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                            feed_dict=feed_dict)
            print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
                  "{:.5f}".format(outs[1]), "train_acc=",
                  "{:.5f}".format(outs[2]))

        # Compute predictions (dropout disabled)
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Node similarities
    adj_rec = np.dot(emb, emb.T)

    def _save_edge_scores(edge_path, pred_path):
        """Score the edges listed in edge_path and write them to pred_path."""
        edge_arr = np.loadtxt(edge_path, delimiter=args.delimiter, dtype=int,
                              ndmin=2)
        if edge_arr.size == 0:
            scores = np.empty(0)
        else:
            if oneIndx:
                edge_arr = edge_arr - 1
            scores = sigmoid(adj_rec[edge_arr[:, 0], edge_arr[:, 1]])
        np.savetxt(pred_path, scores, delimiter=args.delimiter)

    start = time.time()
    if args.tr_e is not None:
        # Read the train edges and compute similarity
        _save_edge_scores(args.tr_e, args.tr_pred)

        # Read the test edges and run predictions
        if args.te_e is not None:
            _save_edge_scores(args.te_e, args.te_pred)
    else:
        # No edge lists provided to predict links: just store the embeddings
        np.savetxt(args.output, emb, delimiter=args.delimiter)
    print('Prediction time: {}'.format(time.time() - start))
def fit(self, adj, features, labels):
    """Build the GAE/VGAE model selected by ``self.model_type`` and train it.

    Training edges are derived from ``adj`` with ``gen_train_edges``; the model
    is then optimized for ``FLAGS.epochs`` full-batch updates with dropout
    ``FLAGS.dropout``. The placeholders, model, session and feed dictionary
    are stored on ``self`` (``placeholders``, ``model``, ``sess``,
    ``feed_dict``).

    Args:
        adj: Adjacency matrix of the graph (sparse; presumably scipy.sparse --
            confirm against ``gen_train_edges``).
        features: 2-D node feature matrix; ``features.shape[1]`` is the input
            feature dimension.
        labels: Not used by this method; kept for interface compatibility.

    Raises:
        ValueError: If ``self.model_type`` is neither 'gcn_ae' nor 'gcn_vae'.
    """
    if self.model_type not in ('gcn_ae', 'gcn_vae'):
        raise ValueError(
            "Unknown model_type '{}': expected 'gcn_ae' or 'gcn_vae'".format(
                self.model_type))

    adj_train = gen_train_edges(adj)

    # Some preprocessing
    adj_norm = preprocess_graph(adj_train)
    num_nodes = adj_train.shape[0]
    input_feature_dim = features.shape[1]
    features = normalize_vectors(features)

    # Define placeholders (features are fed as a dense matrix)
    self.placeholders = {
        'features': tf.compat.v1.placeholder(
            tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.compat.v1.sparse_placeholder(tf.float32),
        'adj_orig': tf.compat.v1.sparse_placeholder(tf.float32),
        'dropout': tf.compat.v1.placeholder_with_default(0., shape=())
    }

    if self.model_type == 'gcn_ae':
        self.model = GCNModelAE(self.placeholders, input_feature_dim)
    else:
        self.model = GCNModelVAE(self.placeholders, input_feature_dim,
                                 num_nodes)

    # pos_weight = negative edges / positive edges
    num_entries = num_nodes * num_nodes
    num_pos = adj_train.sum()
    pos_weight = float(num_entries - num_pos) / num_pos
    # NOTE(review): norm counts stored entries (nnz) while pos_weight uses the
    # sum; the two agree only for a binary matrix without explicit zeros.
    norm = num_entries / float((num_entries - adj_train.nnz) * 2)

    # Optimizer
    with tf.compat.v1.name_scope('optimizer'):
        labels_flat = tf.reshape(
            tf.sparse.to_dense(self.placeholders['adj_orig'],
                               validate_indices=False),
            [-1])
        if self.model_type == 'gcn_ae':
            opt = OptimizerAE(preds=self.model.reconstructions,
                              labels=labels_flat,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=self.model.reconstructions,
                               labels=labels_flat,
                               model=self.model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    self.sess = tf.compat.v1.Session()
    self.sess.run(tf.compat.v1.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # The feed dictionary is identical for every epoch, so it is built once;
    # this also guarantees self.feed_dict exists when FLAGS.epochs is 0.
    self.feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                         self.placeholders)
    self.feed_dict.update({self.placeholders['dropout']: FLAGS.dropout})

    # Train model: one full-batch weight update per epoch
    for _ in range(FLAGS.epochs):
        self.sess.run(opt.opt_op, feed_dict=self.feed_dict)
def gae_scores(
        adj_sparse,
        train_test_split,
        features_matrix=None,
        LEARNING_RATE=0.01,
        EPOCHS=250,
        HIDDEN1_DIM=32,
        HIDDEN2_DIM=16,
        DROPOUT=0,
        edge_score_mode="dot-product",
        verbose=1,
        dtype=tf.float32):
    """Train a variational graph auto-encoder and score it on link prediction.

    Args:
        adj_sparse: Sparse adjacency matrix of the full graph.
        train_test_split: 7-tuple ``(adj_train, train_edges, train_edges_false,
            val_edges, val_edges_false, test_edges, test_edges_false)``.
        features_matrix: Node feature matrix; if None an identity matrix is
            used (featureless setting).
        LEARNING_RATE: Learning rate passed to the optimizer.
        EPOCHS: Number of full-batch training updates.
        HIDDEN1_DIM: First hidden dimension passed to the model.
        HIDDEN2_DIM: Second hidden dimension passed to the model.
        DROPOUT: Dropout value fed during the training updates.
        edge_score_mode: "dot-product" scores a pair by the dot product of its
            node embeddings; "edge-emb" trains a logistic regression on the
            element-wise (Hadamard) product of the node embeddings.
        verbose: ``>= 1`` prints progress and the trainable variables; ``2``
            additionally prints a message when optimization finishes.
        dtype: dtype passed to the model (placeholders and optimizer are
            created as float32).

    Returns:
        dict with keys 'test_roc', 'test_ap', 'val_roc', 'val_ap',
        'val_roc_per_epoch' and 'runtime' (training time in seconds); in
        "edge-emb" mode also 'test_roc_curve', 'val_roc_curve' and
        'test_pr_curve'.

    Raises:
        ValueError: If ``edge_score_mode`` is not a supported mode.
    """
    # Fail before training rather than after it when the mode is unsupported.
    if edge_score_mode not in ("dot-product", "edge-emb"):
        raise ValueError(
            "Unknown edge_score_mode '{}': expected 'dot-product' or "
            "'edge-emb'".format(edge_score_mode))

    # Unpack train-test split
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split

    if verbose >= 1:
        print('GAE preprocessing...')

    # Train on CPU (hide GPU) due to memory constraints
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    # Convert features: matrix --> sparse matrix --> tuple
    # features_tuple contains: (coordinates, values, matrix dimensions)
    if features_matrix is None:
        # Built directly in sparse form to avoid a dense n x n identity
        x = sp.identity(adj_sparse.shape[0], format='lil')
    else:
        x = sp.lil_matrix(features_matrix)
    features_tuple = sparse_to_tuple(x)
    features_shape = features_tuple[2]

    # Graph attributes fed into the model
    num_nodes = adj_sparse.shape[0]  # number of nodes in the adjacency matrix
    num_features = features_shape[1]  # columns of the feature matrix
    features_nonzero = features_tuple[1].shape[0]  # length of the values list

    # Normalize adjacency matrix
    adj_norm = preprocess_graph(adj_train)

    # Add in diagonals
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # How much to weigh positive examples (true edges) in the cost function.
    # Less frequent classes are weighted higher to prevent model output bias:
    # pos_weight = (num. negative samples) / (num. positive samples)
    num_entries = adj_sparse.shape[0] * adj_sparse.shape[0]
    num_pos = adj_sparse.sum()
    pos_weight = float(num_entries - num_pos) / num_pos

    # Normalize (scale) the average weighted cost
    norm = num_entries / float((num_entries - num_pos) * 2)

    if verbose >= 1:
        print('Initializing GAE model...')

    # Create VAE model
    model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero,
                        HIDDEN1_DIM, HIDDEN2_DIM,
                        dtype=dtype, flatten_output=False)

    opt = OptimizerVAE(preds=model.reconstructions,
                       labels=tf.sparse_tensor_to_dense(
                           placeholders['adj_orig'], validate_indices=False),
                       model=model, num_nodes=num_nodes,
                       pos_weight=pos_weight,
                       norm=norm,
                       learning_rate=LEARNING_RATE,
                       dtype=tf.float32)

    val_roc_score = []

    # The feed dictionary only differs in its dropout entry, so build it once.
    feed_dict = construct_feed_dict(adj_norm, adj_label, features_tuple,
                                    placeholders)

    # The context manager closes the session once the embeddings are extracted.
    with tf.Session() as sess:
        if verbose >= 1:
            # Print all trainable variables
            total_parameters = 0
            for variable in tf.trainable_variables():
                # shape is an array of tf.Dimension
                shape = variable.get_shape()
                print("Variable shape: ", shape)
                variable_parameters = 1
                for dim in shape:
                    print("Current dimension: ", dim)
                    variable_parameters *= dim.value
                print("Variable params: ", variable_parameters)
                total_parameters += variable_parameters
                print('')
            print("TOTAL TRAINABLE PARAMS: ", total_parameters)

            print('Initializing TF variables...')

        sess.run(tf.global_variables_initializer())

        if verbose >= 1:
            print('Starting GAE training!')

        start_time = time.time()

        # Train model
        for _ in range(EPOCHS):
            # Single weight update with training dropout
            feed_dict.update({placeholders['dropout']: DROPOUT})
            sess.run(opt.opt_op, feed_dict=feed_dict)

            # Evaluate predictions on the validation edges without dropout
            feed_dict.update({placeholders['dropout']: 0})
            epoch_emb = sess.run(model.z_mean, feed_dict=feed_dict)
            epoch_score_matrix = np.dot(epoch_emb, epoch_emb.T)

            roc_curr, _ = get_roc_score(val_edges, val_edges_false,
                                        epoch_score_matrix, apply_sigmoid=True)
            val_roc_score.append(roc_curr)

        runtime = time.time() - start_time

        if verbose == 2:
            print("Optimization Finished!")

        # Final embeddings (dropout disabled)
        feed_dict.update({placeholders['dropout']: 0})
        gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)

    if edge_score_mode == "dot-product":
        # Dot-product edge scores
        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        # Calculate final scores
        gae_val_roc, gae_val_ap = get_roc_score(
            val_edges, val_edges_false, gae_score_matrix)
        gae_test_roc, gae_test_ap = get_roc_score(
            test_edges, test_edges_false, gae_score_matrix)

    else:
        # Bootstrapped edge embeddings (via Hadamard product)
        def get_edge_embeddings(edge_list):
            return np.array([np.multiply(gae_emb[edge[0]], gae_emb[edge[1]])
                             for edge in edge_list])

        has_val = len(val_edges) > 0 and len(val_edges_false) > 0

        # Train-set edge embeddings and labels: 1 = real edge, 0 = false edge
        train_edge_embs = np.concatenate([
            get_edge_embeddings(train_edges),
            get_edge_embeddings(train_edges_false)])
        train_edge_labels = np.concatenate([
            np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

        # Test-set edge embeddings and labels
        test_edge_embs = np.concatenate([
            get_edge_embeddings(test_edges),
            get_edge_embeddings(test_edges_false)])
        test_edge_labels = np.concatenate([
            np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train a logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0, solver='liblinear')
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if has_val:
            val_edge_embs = np.concatenate([
                get_edge_embeddings(val_edges),
                get_edge_embeddings(val_edges_false)])
            val_edge_labels = np.concatenate([
                np.ones(len(val_edges)), np.zeros(len(val_edges_false))])
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]

            gae_val_roc = roc_auc_score(val_edge_labels, val_preds)
            gae_val_roc_curve = roc_curve(val_edge_labels, val_preds)
            gae_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            gae_val_roc = None
            gae_val_roc_curve = None
            gae_val_ap = None

        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]
        gae_test_roc = roc_auc_score(test_edge_labels, test_preds)
        gae_test_roc_curve = roc_curve(test_edge_labels, test_preds)
        gae_test_pr_curve = precision_recall_curve(test_edge_labels, test_preds)
        gae_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Record scores (named `scores` so the function name is not shadowed)
    scores = {}

    scores['test_roc'] = gae_test_roc
    scores['test_ap'] = gae_test_ap

    scores['val_roc'] = gae_val_roc
    scores['val_ap'] = gae_val_ap

    if edge_score_mode == "edge-emb":
        scores['test_roc_curve'] = gae_test_roc_curve
        scores['val_roc_curve'] = gae_val_roc_curve
        scores['test_pr_curve'] = gae_test_pr_curve

    scores['val_roc_per_epoch'] = val_roc_score
    scores['runtime'] = runtime
    return scores