Example no. 1

# Step 3: build the full block adjacency matrix and collect all the train edges
# adj_MD is the adjacency used for drawing the samples
d = np.zeros([N_d, N_d])
m = np.zeros([N_m, N_m])
adj111 = np.hstack((m, M_D))
adj222 = np.hstack((M_D.transpose(), d))
adj_MD = np.vstack((adj111, adj222))
label_matrix = adj_MD
#k=47, 56,59,61,85,129,144,187,204,221,270,272,289,321,330
#pos_sample_n = sum(adj_MD[0:577,k+576])
#print("number of samples", pos_sample_n)
adj_MD = sp.coo_matrix(adj_MD)

edges_all, edges_pos, edges_false = mask_test_edges(
    adj_MD)  # split the data into a new (symmetric) adj matrix, train edges (without symmetric duplicates), val edges, val negative edges, test edges, and test negative edges
#print(edges_pos.shape)
#np.save('edges_all.npy',edges_all)
#np.save('edges_pos',edges_pos)
#np.save('edges_false',edges_false)
#edges_all = np.load('edges_all.npy')
#edges_pos = np.load('edges_pos.npy')
#edges_false = np.load('edges_false.npy')
X_sample = np.vstack(
    (edges_pos, edges_false)
)  # all the samples: every positive sample plus the same number of negative samples
print("edges_pos", len(edges_pos), "edges_neg", len(edges_false))
print("number of samples in the miRNA-disease association part",
      len(find_mi_D(X_sample)))
Y_sample = np.hstack((np.ones(len(edges_pos)), np.zeros(len(edges_false))))
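# find_mi_D is not defined in this snippet; it evidently selects the samples
# that fall in the miRNA-disease block of adj_MD. A hypothetical minimal
# sketch, assuming miRNA nodes occupy indices 0..N_m-1 and disease nodes
# indices N_m..N_m+N_d-1 in the stacked matrix:
# def find_mi_D(samples, n_m=N_m):
#     # keep edges with exactly one endpoint in the miRNA block
#     return [e for e in samples if (e[0] < n_m) != (e[1] < n_m)]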
###########################################################################################################################################################
TRAIN_TEST_SPLITS_FOLDER = './train-test-splits/'

# Iterate over the fractions of edges to hide and split each dataset (train/validation/test)
for frac_hidden in FRAC_EDGES_HIDDEN:
    val_frac = 0.05
    test_frac = frac_hidden - val_frac

    # Iterate over each network and split it
    for g_name, graph_tuple in fb_graphs.items():
        adj = graph_tuple[0]
        feat = graph_tuple[1]

        current_graph = 'fb-{}-{}-hidden'.format(g_name, frac_hidden)

        # Print the current graph
        print("Current graph: ", current_graph)

        np.random.seed(RANDOM_SEED)

        # Call the split function
        train_test_split = mask_test_edges(adj,
                                           test_frac=test_frac,
                                           val_frac=val_frac,
                                           verbose=True)

        file_name = TRAIN_TEST_SPLITS_FOLDER + current_graph + '.pkl'

        # Save the split
        with open(file_name, 'wb') as f:
            pickle.dump(train_test_split, f, protocol=2)
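
# A minimal sketch of reading one of these saved splits back in (the graph
# name below is hypothetical; any key of fb_graphs combined with any value
# from FRAC_EDGES_HIDDEN works):
# with open(TRAIN_TEST_SPLITS_FOLDER + 'fb-combined-0.25-hidden.pkl', 'rb') as f:
#     train_test_split = pickle.load(f)
# adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
#     test_edges, test_edges_false = train_test_split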
def calculate_all_scores(adj_sparse, features_matrix=None, directed=False, \
        test_frac=.3, val_frac=.1, random_state=0, verbose=1, \
        train_test_split_file=None,
        tf_dtype=tf.float32):
    np.random.seed(random_state) # Guarantee consistent train/test split
    tf.set_random_seed(random_state) # Consistent GAE training

    # Prepare LP scores dictionary
    lp_scores = {}

    ### ---------- PREPROCESSING ---------- ###
    train_test_split = None
    try: # If found existing train-test split, use that file
        with open(train_test_split_file, 'rb') as f:
            train_test_split = pickle.load(f)
            print('Found existing train-test split!')
    except: # Else, generate train-test split on the fly
        print('Generating train-test split...')
        if directed == False:
            train_test_split = mask_test_edges(adj_sparse, test_frac=test_frac, val_frac=val_frac)
        else:
            train_test_split = mask_test_edges_directed(adj_sparse, test_frac=test_frac, val_frac=val_frac)
    
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split # Unpack tuple

    # g_train: new graph object with only non-hidden edges
    if directed == True:
        g_train = nx.DiGraph(adj_train)
    else:
        g_train = nx.Graph(adj_train)

    # Inspect train/test split
    if verbose >= 1:
        print "Total nodes:", adj_sparse.shape[0]
        print "Total edges:", int(adj_sparse.nnz/2) # adj is symmetric, so nnz (num non-zero) = 2*num_edges
        print "Training edges (positive):", len(train_edges)
        print "Training edges (negative):", len(train_edges_false)
        print "Validation edges (positive):", len(val_edges)
        print "Validation edges (negative):", len(val_edges_false)
        print "Test edges (positive):", len(test_edges)
        print "Test edges (negative):", len(test_edges_false)
        print ''
        print "------------------------------------------------------"


    ### ---------- LINK PREDICTION BASELINES ---------- ###
    # Adamic-Adar
    aa_scores = adamic_adar_scores(g_train, train_test_split)
    lp_scores['aa'] = aa_scores
    if verbose >= 1:
        print('')
        print('Adamic-Adar Test ROC score: ', str(aa_scores['test_roc']))
        print('Adamic-Adar Test AP score: ', str(aa_scores['test_ap']))

    # Jaccard Coefficient
    jc_scores = jaccard_coefficient_scores(g_train, train_test_split)
    lp_scores['jc'] = jc_scores
    if verbose >= 1:
        print('')
        print('Jaccard Coefficient Test ROC score: ', str(jc_scores['test_roc']))
        print('Jaccard Coefficient Test AP score: ', str(jc_scores['test_ap']))

    # Preferential Attachment
    pa_scores = preferential_attachment_scores(g_train, train_test_split)
    lp_scores['pa'] = pa_scores
    if verbose >= 1:
        print('')
        print('Preferential Attachment Test ROC score: ', str(pa_scores['test_roc']))
        print('Preferential Attachment Test AP score: ', str(pa_scores['test_ap']))


    ### ---------- SPECTRAL CLUSTERING ---------- ###
    sc_scores = spectral_clustering_scores(train_test_split)
    lp_scores['sc'] = sc_scores
    if verbose >= 1:
        print('')
        print('Spectral Clustering Validation ROC score: ', str(sc_scores['val_roc']))
        print('Spectral Clustering Validation AP score: ', str(sc_scores['val_ap']))
        print('Spectral Clustering Test ROC score: ', str(sc_scores['test_roc']))
        print('Spectral Clustering Test AP score: ', str(sc_scores['test_ap']))


    ## ---------- NODE2VEC ---------- ###
    # node2vec settings
    # NOTE: When p = q = 1, this is equivalent to DeepWalk
    P = 1 # Return hyperparameter
    Q = 1 # In-out hyperparameter
    WINDOW_SIZE = 10 # Context size for optimization
    NUM_WALKS = 10 # Number of walks per source
    WALK_LENGTH = 80 # Length of walk per source
    DIMENSIONS = 128 # Embedding dimension
    DIRECTED = False # Graph directed/undirected
    WORKERS = 8 # Num. parallel workers
    ITER = 1 # SGD epochs

    # Using bootstrapped edge embeddings + logistic regression
    n2v_edge_emb_scores = node2vec_scores(g_train, train_test_split,
        P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS, DIRECTED, WORKERS, ITER,
        "edge-emb",
        verbose)
    lp_scores['n2v_edge_emb'] = n2v_edge_emb_scores

    if verbose >= 1:
        print('')
        print('node2vec (Edge Embeddings) Validation ROC score: ', str(n2v_edge_emb_scores['val_roc']))
        print('node2vec (Edge Embeddings) Validation AP score: ', str(n2v_edge_emb_scores['val_ap']))
        print('node2vec (Edge Embeddings) Test ROC score: ', str(n2v_edge_emb_scores['test_roc']))
        print('node2vec (Edge Embeddings) Test AP score: ', str(n2v_edge_emb_scores['test_ap']))

    # Using dot products to calculate edge scores
    n2v_dot_prod_scores = node2vec_scores(g_train, train_test_split,
        P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS, DIRECTED, WORKERS, ITER,
        "dot-product",
        verbose)
    lp_scores['n2v_dot_prod'] = n2v_dot_prod_scores

    if verbose >= 1:
        print('')
        print('node2vec (Dot Product) Validation ROC score: ', str(n2v_dot_prod_scores['val_roc']))
        print('node2vec (Dot Product) Validation AP score: ', str(n2v_dot_prod_scores['val_ap']))
        print('node2vec (Dot Product) Test ROC score: ', str(n2v_dot_prod_scores['test_roc']))
        print('node2vec (Dot Product) Test AP score: ', str(n2v_dot_prod_scores['test_ap']))


    ### ---------- (VARIATIONAL) GRAPH AUTOENCODER ---------- ###
    # # GAE hyperparameters
    # LEARNING_RATE = 0.001 # Default: 0.01
    # EPOCHS = 200
    # HIDDEN1_DIM = 32
    # HIDDEN2_DIM = 16
    # DROPOUT = 0

    # # Use dot product
    # tf.set_random_seed(random_state) # Consistent GAE training
    # gae_results = gae_scores(adj_sparse, train_test_split, features_matrix,
    #     LEARNING_RATE, EPOCHS, HIDDEN1_DIM, HIDDEN2_DIM, DROPOUT,
    #     "dot-product",
    #     verbose,
    #     dtype=tf.float16)
    # lp_scores['gae'] = gae_results

    # if verbose >= 1:
    #     print ''
    #     print 'GAE (Dot Product) Validation ROC score: ', str(gae_results['val_roc'])
    #     print 'GAE (Dot Product) Validation AP score: ', str(gae_results['val_ap'])
    #     print 'GAE (Dot Product) Test ROC score: ', str(gae_results['test_roc'])
    #     print 'GAE (Dot Product) Test AP score: ', str(gae_results['test_ap'])


    # # Use edge embeddings
    # tf.set_random_seed(random_state) # Consistent GAE training
    # gae_edge_emb_results = gae_scores(adj_sparse, train_test_split, features_matrix,
    #     LEARNING_RATE, EPOCHS, HIDDEN1_DIM, HIDDEN2_DIM, DROPOUT,
    #     "edge-emb",
    #     verbose)
    # lp_scores['gae_edge_emb'] = gae_edge_emb_results

    # if verbose >= 1:
    #     print ''
    #     print 'GAE (Edge Embeddings) Validation ROC score: ', str(gae_edge_emb_results['val_roc'])
    #     print 'GAE (Edge Embeddings) Validation AP score: ', str(gae_edge_emb_results['val_ap'])
    #     print 'GAE (Edge Embeddings) Test ROC score: ', str(gae_edge_emb_results['test_roc'])
    #     print 'GAE (Edge Embeddings) Test AP score: ', str(gae_edge_emb_results['test_ap'])


    ### ---------- RETURN RESULTS ---------- ###
    return lp_scores
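# Example call (a sketch; adj_sparse stands for any scipy sparse adjacency
# matrix, e.g. nx.to_scipy_sparse_matrix(g) as in the snippets below):
# lp_scores = calculate_all_scores(adj_sparse, features_matrix=None,
#                                  test_frac=.3, val_frac=.1, verbose=1)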
# draw network
# nx.draw_networkx(Gr, with_labels=False, node_size=50, node_color='r')
# plt.show()

# ## 2. Preprocessing/Train-Test Split

# In[ ]:

from gae.preprocessing import mask_test_edges
np.random.seed(0)  # make sure train-test split is consistent between notebooks
adj_sparse = nx.to_scipy_sparse_matrix(Gr)

# In[ ]:

# Perform train-test split
adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
    adj_sparse, test_frac=.2, val_frac=.1)
g_train = nx.from_scipy_sparse_matrix(
    adj_train)  # new graph object with only non-hidden edges

# In[67]:

# Inspect train/test split
print "Total nodes:", adj_sparse.shape[0]
print "Total edges:", int(
    adj_sparse.nnz /
    2)  # adj is symmetric, so nnz (num non-zero) = 2*num_edges
print "Training edges (positive):", len(train_edges)
print "Training edges (negative):", len(train_edges_false)
print "Validation edges (positive):", len(val_edges)
print "Validation edges (negative):", len(val_edges_false)
print "Test edges (positive):", len(test_edges)
Example no. 5
flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
flags.DEFINE_integer('features', 1, 'Whether to use features (1) or not (0).')
flags.DEFINE_string("checkpoint_dir", "checkpoints", "checkpoint directory")

model_str = FLAGS.model
dataset_str = FLAGS.dataset

# Load data
adj, features = load_data(dataset_str)

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
adj = adj_train

if FLAGS.features == 0:
    features = sp.identity(features.shape[0])  # featureless

# Some preprocessing
adj_norm = preprocess_graph(adj)

# Define placeholders
placeholders = {
    'features': tf.sparse_placeholder(tf.float32),
    'adj': tf.sparse_placeholder(tf.float32),
    'adj_orig': tf.sparse_placeholder(tf.float32),
    'dropout': tf.placeholder_with_default(0., shape=())
}
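
# For reference, preprocess_graph and sparse_to_tuple in the GAE reference
# implementation (github.com/tkipf/gae) look, up to minor details, like the
# sketch below (assuming scipy.sparse as sp and numpy as np); treat this as a
# sketch of what the calls above compute, not a drop-in replacement:
# def preprocess_graph(adj):
#     # Symmetrically normalize A + I: D^{-1/2} (A + I) D^{-1/2}
#     adj = sp.coo_matrix(adj)
#     adj_ = adj + sp.eye(adj.shape[0])
#     rowsum = np.array(adj_.sum(1))
#     degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
#     adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
#     return sparse_to_tuple(adj_normalized)
#
# def sparse_to_tuple(sparse_mx):
#     # Convert a scipy sparse matrix to the (coords, values, shape) triple
#     # that tf.sparse_placeholder feeds expect
#     if not sp.isspmatrix_coo(sparse_mx):
#         sparse_mx = sparse_mx.tocoo()
#     coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
#     values = sparse_mx.data
#     shape = sparse_mx.shape
#     return coords, values, shape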
Example no. 6
for i in MasterNodes:
    if i not in G6.nodes():
        G6.add_node(i)
        
adj_sparse = nx.to_scipy_sparse_matrix(G6)


# In[7]:


from gae.preprocessing import mask_test_edges
np.random.seed(0) # make sure train-test split is consistent between notebooks

adj_sparse = nx.to_scipy_sparse_matrix(G6)

adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = mask_test_edges(adj_sparse, test_frac=.3, val_frac=.0, prevent_disconnect=True)
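# prevent_disconnect=True asks this variant of mask_test_edges (by the look of
# the signature, the lucashu1/link-prediction version rather than the original
# gae one) not to hide any edge whose removal would disconnect the training graph.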


# In[6]:


# Inspect train/test split
print "Total nodes:", adj_sparse.shape[0]
print "Total edges:", int(adj_sparse.nnz/2) # adj is symmetric, so nnz (num non-zero) = 2*num_edges
print "Training edges (positive):", len(train_edges)
print "Training edges (negative):", len(train_edges_false)
print "Validation edges (positive):", len(val_edges)
print "Validation edges (negative):", len(val_edges_false)
print "Test edges (positive):", len(test_edges)
print "Test edges (negative):", len(test_edges_false)
Example no. 7
g = nx.Graph(adj)  # re-create graph using node indices (0 to num_nodes-1)

# draw network
nx.draw_networkx(g, with_labels=False, node_size=50, node_color='r')
plt.show()

#############################################
# 2. Preprocessing/Train-Test Split

np.random.seed(0)  # make sure train-test split is consistent between notebooks
adj_sparse = nx.to_scipy_sparse_matrix(g)

# Perform train-test split
adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = mask_test_edges(
        adj_sparse, test_frac=.3, val_frac=.1)
# new graph object with only non-hidden edges
g_train = nx.from_scipy_sparse_matrix(adj_train)

# Inspect train/test split
print("Total nodes:", adj_sparse.shape[0])
# adj is symmetric, so nnz (num non-zero) = 2*num_edges
print("Total edges:", int(adj_sparse.nnz / 2))
print("Training edges (positive):", len(train_edges))
print("Training edges (negative):", len(train_edges_false))
print("Validation edges (positive):", len(val_edges))
print("Validation edges (negative):", len(val_edges_false))
print("Test edges (positive):", len(test_edges))
print("Test edges (negative):", len(test_edges_false))

#############################################
Example no. 8
def gae(filename, output_dir):

    # Settings
    flags = tf.app.flags
    FLAGS = flags.FLAGS
    flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
    flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
    flags.DEFINE_integer('hidden1', 32, 'Number of units in hidden layer 1.')
    flags.DEFINE_integer('hidden2', 16, 'Number of units in hidden layer 2.')
    flags.DEFINE_float('weight_decay', 0.,
                       'Weight for L2 loss on embedding matrix.')
    flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).')
    flags.DEFINE_string('filename', 'email-Eu-core.mat', 'dataset')
    flags.DEFINE_string('model', 'gcn_vae', 'Model string.')
    flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
    flags.DEFINE_integer('features', 0,
                         'Whether to use features (1) or not (0).')

    model_str = FLAGS.model
    # dataset_str = FLAGS.dataset

    # Load data
    # adj, features = load_data(dataset_str)
    adj, R, edges = load_network_data(filename)

    num_edges = np.sum(adj)
    length = adj.shape[0]
    A = np.array(adj, copy=True)
    adj = sp.csr_matrix(adj)
    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges = mask_test_edges(adj)
    adj = adj_train

    if FLAGS.features == 0:
        features = sp.identity(adj.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    elif model_str == 'gcn_vae':
        model = GCNModelVAE(placeholders, num_features, num_nodes,
                            features_nonzero)

    # pos_weight = (#zero entries) / (#nonzero entries): reweights the positive
    # class in the reconstruction loss, since real edges are vastly outnumbered;
    # norm rescales the weighted cross-entropy to an average over all N^2 entries.
    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(
                                  tf.sparse_tensor_to_dense(
                                      placeholders['adj_orig'],
                                      validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif model_str == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(
                                   tf.sparse_tensor_to_dense(
                                       placeholders['adj_orig'],
                                       validate_indices=False), [-1]),
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model
    for epoch in range(FLAGS.epochs):

        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                        placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                        feed_dict=feed_dict)
        # Compute average loss
        # avg_cost = outs[1]
        # avg_accuracy = outs[2]
        #
        # if (epoch + 1) % 10 == 0:
        #     print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
        #           "train_acc=", "{:.5f}".format(avg_accuracy), "time=", "{:.5f}".format(time.time() - t))

    print("GAE Optimization Finished!")

    feed_dict.update({placeholders['dropout']: 0})
    emb = sess.run(model.z_mean, feed_dict=feed_dict)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Predict on test set of edges
    adj_rec = np.dot(emb, emb.T)
    adj_rec = np.array(adj_rec)
    # adj_rec = adj_rec[1:length, :][:, 1:length]
    DD = np.sort(adj_rec.flatten())
    threshold = DD[int(-1 * num_edges)]
    network_C = np.array([[
        0 if adj_rec[i, j] < threshold else 1 for i in range(adj_rec.shape[0])
    ] for j in range(adj_rec.shape[1])],
                         dtype=np.int8)
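    # Note: the nested comprehension iterates j over the outer list and i over
    # the inner one, so it builds the transpose of the thresholded matrix;
    # adj_rec = emb.dot(emb.T) is symmetric, so the result is unaffected.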
    # np.save('../data/GAE_network.npy', network_C[1:length, :][:, 1:length])
    os.chdir('../')
    np.save('{}/GAE_network.npy'.format(output_dir),
            network_C[1:length, :][:, 1:length])

    A_copy = adj_rec
    final_network = [A_copy]
    # orinal_network = [A]
    for i in range(1, 5):
        adjacent_matrix = tf.placeholder(tf.float32, shape=A_copy.shape)
        R_matrix = tf.placeholder(tf.float32, shape=R[i - 1, 0].shape)
        A_copy = sess.run(tf.matmul(tf.matmul(R_matrix, adjacent_matrix),
                                    tf.transpose(R_matrix)),
                          feed_dict={
                              R_matrix: R[i - 1, 0].todense(),
                              adjacent_matrix: A_copy
                          })
        final_network.append(np.array(A_copy))

        # adjacent_matrix = tf.placeholder(tf.float32, shape=A.shape)
        # R_matrix = tf.placeholder(tf.float32, shape=R[i - 1, 0].shape)
        # A = sess.run(tf.matmul(tf.matmul(R_matrix, adjacent_matrix), tf.transpose(R_matrix)),
        #                        feed_dict={R_matrix: R[i - 1, 0].todense(), adjacent_matrix: A})
        # orinal_network.append(A)
    # draw_graph(final_network, edges, output_dir)
    network_B = final_network[0]
    print('Generating graph by GAE algorithm.')
    DD = np.sort(network_B.flatten())[::-1]
    threshold = DD[edges[0, 0]]
    network_C = np.array([[
        0 if network_B[i, j] < threshold else 1
        for i in range(network_B.shape[0])
    ] for j in range(network_B.shape[1])])
    _A_obs = network_C + network_C.T
    _A_obs[_A_obs > 1] = 1
    _A_obs = np.array(_A_obs)
    print('Computing metrics for graph generated by GAE')
    c = compute_graph_statistics(_A_obs)
    with open('{}/gae_network_statistics.pickle'.format(output_dir),
              'wb') as handle:
        pickle.dump(c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(c)
Example no. 9
    def __init__(self, graph_edgelist, num_actions, dimension, learning_rate=0.01, epochs=300, hidden1=32, hidden2=16,
                 dropout=0., model_str='gcn_vae', use_features=0):

        """Initialize ExactBasis."""
        if graph_edgelist is None:
            raise ValueError('graph cannot be None')

        if dimension < 1:
            raise ValueError('dimension must be >= 1')

        self.__num_actions = BasisFunction._validate_num_actions(num_actions)

        self._dimension = dimension

        adj, features = self.read_graph(graph_edgelist)

        # Store original adjacency matrix (without diagonal entries) for later
        adj_orig = adj
        adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
        adj_orig.eliminate_zeros()

        adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
        # adj = adj_train

        if use_features == 0:
            features = sp.identity(features.shape[0])  # featureless

        # Some preprocessing
        adj_norm = preprocess_graph(adj)

        # Define placeholders
        placeholders = {
            'features': tf.sparse_placeholder(tf.float32),
            'adj': tf.sparse_placeholder(tf.float32),
            'adj_orig': tf.sparse_placeholder(tf.float32),
            'dropout': tf.placeholder_with_default(0., shape=())
        }

        num_nodes = adj.shape[0]

        features = sparse_to_tuple(features.tocoo())
        num_features = features[2][1]
        features_nonzero = features[1].shape[0]

        # Create model
        model = None
        if model_str == 'gcn_ae':
            model = GCNModelAE(placeholders, num_features, features_nonzero, hidden1, hidden2, dimension)
        elif model_str == 'gcn_vae':
            model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, hidden1, dimension)

        pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
        norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

        # Optimizer
        with tf.name_scope('optimizer'):
            if model_str == 'gcn_ae':
                opt = OptimizerAE(preds=model.reconstructions,
                                  labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                              validate_indices=False), [-1]),
                                  pos_weight=pos_weight,
                                  norm=norm, learning_rate=learning_rate)
            elif model_str == 'gcn_vae':
                opt = OptimizerVAE(preds=model.reconstructions,
                                   labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                               validate_indices=False), [-1]),
                                   model=model, num_nodes=num_nodes,
                                   pos_weight=pos_weight,
                                   norm=norm, learning_rate=learning_rate)

        # Initialize session
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = sparse_to_tuple(adj_label)

        # Train model
        for epoch in range(epochs):
            t = time.time()
            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
            feed_dict.update({placeholders['dropout']: dropout})
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

        print("GCN Optimization Finished!")

        feed_dict.update({placeholders['dropout']: 0})
        self.embeddings = sess.run(model.z_mean, feed_dict=feed_dict)
Example no. 10
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

degrees = []
for i in range(adj.shape[0]):
    r = adj.getrow(i).toarray().flatten()
    nz = np.nonzero(r)
    degrees.append(len(nz[0]))
deg_matrix = sp.diags([degrees], [0])
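# Equivalent vectorized form of the degree loop above (for a binary adjacency):
# degrees = np.asarray(adj.sum(axis=1)).ravel()
# deg_matrix = sp.diags([degrees], [0])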

logging.debug('Some preprocessing done')

np.random.seed(0) # IMPORTANT: guarantees consistent train/test splits
adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
    test_edges, test_edges_false = mask_test_edges(adj, test_frac=.3, val_frac=.1, verbose=True)

logging.debug('Splitting done')

# Normalize adjacency matrix
# adj_norm = normalize(adj_train, axis=1)
adj_norm = deg_matrix.tocsr() * adj_train.tocsr() * deg_matrix.tocsr()
logging.debug('Preprocessed graph')

# Add in diagonals
adj_label_mat = adj_orig + sp.eye(adj_orig.shape[0])
adj_label = sparse_to_tuple(adj_label_mat)

# Inspect train/test split
print("Total nodes:", adj.shape[0])
print("Total edges:", int(adj.nnz/2)) # adj is symmetric, so nnz (num non-zero) = 2*num_edges
Example no. 11
flags.DEFINE_string('model', 'gcn_ae', 'Model string.')
flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
flags.DEFINE_integer('features', 1, 'Whether to use features (1) or not (0).')

model_str = FLAGS.model
dataset_str = FLAGS.dataset

# Load data
adj, features = load_data(dataset_str)

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
adj = adj_train

if FLAGS.features == 0:
    features = sp.identity(features.shape[0])  # featureless

# Some preprocessing
adj_norm = preprocess_graph(adj)

# Define placeholders
placeholders = {
    'features': tf.sparse_placeholder(tf.float32),
    'adj': tf.sparse_placeholder(tf.float32),
    'adj_orig': tf.sparse_placeholder(tf.float32),
    'dropout': tf.placeholder_with_default(0., shape=())
}
Example no. 12
    def gcn_multilayer(self):
        """Neural embedding of a multilayer network"""
        all_nodes = self.get_all_nodes()
        tmp_fname = pjoin(self.out_dir, 'tmp.emb')
        for net_name, net in self.nets.items():
            self.log.info('Run GCN For Net: %s' % net_name)
            # =============================================================
            adjacency_matrix = nx.adjacency_matrix(net)
            adjacency_matrix = adjacency_matrix.todense()
            nodes_count = adjacency_matrix.shape[0]
            adj = adjacency_matrix
            features = sp.identity(nodes_count)
            adj = sp.csr_matrix(adj)
            # ----------------myCode-----------------------------------
            # Store original adjacency matrix (without diagonal entries) for later
            adj_orig = adj
            adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
            adj_orig.eliminate_zeros()
            # tst_actual_matrix = adj.toarray()
            adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
            adj = adj_train
            # -----------------------------myCode-------------------------
            # if FLAGS.features == 0:
            #    features = sp.identity(features.shape[0])  # featureless
            # -----------------------------myCode-------------------------
            # Some pre processing
            adj_norm = preprocess_graph(adj)
            # Define placeholders
            placeholders = {
                'features': tf.sparse_placeholder(tf.float32),
                'adj': tf.sparse_placeholder(tf.float32),
                'adj_orig': tf.sparse_placeholder(tf.float32),
                'dropout': tf.placeholder_with_default(0., shape=())
            }
            num_nodes = adj.shape[0]
            features = sparse_to_tuple(features.tocoo())
            num_features = features[2][1]
            features_nonzero = features[1].shape[0]
            # Create model
            model = None
            if self.model_str == 'gcn_ae':
                model = GCNModelAE(placeholders, num_features, features_nonzero, self.hidden1, self.hidden2)
            elif self.model_str == 'gcn_vae':
                model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, self.hidden1, self.hidden2)

            pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
            norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

            # Optimizer
            with tf.name_scope('optimizer'):
                if self.model_str == 'gcn_ae':
                    opt = OptimizerAE(preds=model.reconstructions,
                                      labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                                  validate_indices=False), [-1]),
                                      pos_weight=pos_weight,
                                      norm=norm)
                elif self.model_str == 'gcn_vae':
                    opt = OptimizerVAE(preds=model.reconstructions,
                                       labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                                   validate_indices=False), [-1]),
                                       model=model, num_nodes=num_nodes,
                                       pos_weight=pos_weight,
                                       norm=norm)

            # Initialize session
            sess = tf.Session()
            sess.run(tf.global_variables_initializer())

            cost_val = []
            acc_val = []

            def get_roc_score(edges_pos, edges_neg, emb=None):
                if emb is None:
                    feed_dict.update({placeholders['dropout']: 0})
                    emb = sess.run(model.z_mean, feed_dict=feed_dict)

                def sigmoid(x):
                    return 1 / (1 + np.exp(-x))

                # Predict on test set of edges
                adj_rec = np.dot(emb, emb.T)
                preds = []
                pos = []
                for e in edges_pos:
                    preds.append(sigmoid(adj_rec[e[0], e[1]]))
                    pos.append(adj_orig[e[0], e[1]])

                preds_neg = []
                neg = []
                for e in edges_neg:
                    preds_neg.append(sigmoid(adj_rec[e[0], e[1]]))
                    neg.append(adj_orig[e[0], e[1]])

                preds_all = np.hstack([preds, preds_neg])
                labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
                roc_score = roc_auc_score(labels_all, preds_all)
                ap_score = average_precision_score(labels_all, preds_all)

                return roc_score, ap_score

            cost_val = []
            acc_val = []
            val_roc_score = []
            adj_label = adj_train + sp.eye(adj_train.shape[0])
            adj_label = sparse_to_tuple(adj_label)
            # Train model
            # for epoch in range(FLAGS.epochs):
            # epochs = 10
            dropout = 0
            for epoch in range(self.n_iter):
                self.log.info('Iteration: %d' % epoch)
                t = time.time()
                # Construct feed dictionary
                feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
                # feed_dict.update({placeholders['dropout']: FLAGS.dropout})
                # -----------myCode------------
                feed_dict.update({placeholders['dropout']: dropout})
                # -----------myCode------------
                # Run single weight update
                outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

                # Compute average loss
                avg_cost = outs[1]
                avg_accuracy = outs[2]

                roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false)
                val_roc_score.append(roc_curr)

                print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
                      "train_acc=", "{:.5f}".format(avg_accuracy), "val_roc=", "{:.5f}".format(val_roc_score[-1]),
                      "val_ap=", "{:.5f}".format(ap_curr),
                      "time=", "{:.5f}".format(time.time() - t))

            print("Optimization Finished!")
            roc_score, ap_score = get_roc_score(test_edges, test_edges_false)
            print('Test ROC score: ' + str(roc_score))
            print('Test AP score: ' + str(ap_score))

            # ------vector generation -----------------------------
            vectors = sess.run(model.embeddings, feed_dict=feed_dict)
            fname = self.out_dir + net_name +'vectors.txt'
            # with open(fname, 'a+') as fout:
            #     for line in np.array(vectors):
            #         fout.write(line + "\n")
            np.savetxt(fname, np.array(vectors), fmt="%s", delimiter='  ')
            self.log.info('Saving vectors: %s' % fname)
            # ==============================================================
            self.log.info('after exec gcn : %s' % net_name)

        self.log.info('Done!')
# TODO = ['fb-combined-0.75-hidden']

# Iterate over fractions of edges to hide
for frac_hidden in FRAC_EDGES_HIDDEN:
    val_frac = 0.1
    test_frac = frac_hidden - val_frac
    
    # Iterate over each graph
    for g_name, graph_tuple in fb_graphs.items():
        adj = graph_tuple[0]
        feat = graph_tuple[1]
        
        current_graph = 'fb-{}-{}-hidden'.format(g_name, frac_hidden)
        
        # if current_graph in TODO:
        print "Current graph: ", current_graph

        np.random.seed(RANDOM_SEED)
        
        # Run all link prediction methods on current graph, store results
        train_test_split = mask_test_edges(adj, test_frac=test_frac, val_frac=val_frac,
            verbose=True)

        file_name = TRAIN_TEST_SPLITS_FOLDER + current_graph + '.pkl'

        # Save split
        with open(file_name, 'wb') as f:
            pickle.dump(train_test_split, f, protocol=2)


Example no. 14
    print(f"adj type, {type(adj)}")
    print(f"adj.shape, {adj.shape}")
    print(f"adj[:10,:10], {adj[:10, :10]}")
    print(f"adj {adj}")
    print(f"feature.shape, {features.shape}")
# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix(
    (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
if DEBUG:
    print(f"ad_orig type, {type(adj_orig)}")
    print(f"adj_orig.shape, {adj_orig.shape}")

adj_orig.eliminate_zeros()

adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
    adj, random_seed=42)
if DEBUG:
    print(f"adj_train type, {type(adj_train)}")
    print(f"adj_train shape, {type(adj_train.shape)}")
    print(f"train_edges type, {type(train_edges)}")
    print('*' * 20)
    print(f"train_edges shape, {train_edges.shape}")
    print(f"val_edges shape, {val_edges.shape}")
    print(f"test_edges shape, {test_edges.shape}")
    print('*' * 20)
    print(f"val_edges_false type, {type(val_edges_false)}")
    print(f"test_edges_false type, {type(test_edges_false)}")
    print(f"len val edges false, {len(val_edges_false)}")
    print(f"len test edges false, {len(test_edges_false)}")
    print('*' * 20)
    print(f"val_edges[:20], {val_edges[:20]}")
Example no. 15
def calculate_all_scores(adj_sparse, features_matrix=None, directed=False, \
        test_frac=.1, val_frac=.05, random_state=0, verbose=1, \
        train_test_split_file=None,
        tf_dtype=tf.float32):
    np.random.seed(random_state) # Guarantee consistent train/test split
    tf.set_random_seed(random_state) # Consistent GAE training

    # Link prediction scores dictionary
    lp_scores = {}

    ### ---------- PREPROCESSING ---------- ###
    train_test_split = None
    try: # If an existing train-test split is found, use that file
        with open(train_test_split_file, 'rb') as f:
            train_test_split = pickle.load(f)
            print('Found existing train-test split!')
    except: # Otherwise, generate the train-test split on the fly
        print('Generating train-test split...')
        if directed == False:
            train_test_split = mask_test_edges(adj_sparse, test_frac=test_frac, val_frac=val_frac)
        else:
            train_test_split = mask_test_edges_directed(adj_sparse, test_frac=test_frac, val_frac=val_frac)
    
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split  # Unpack tuple

    # g_train: new graph object with only the non-hidden edges
    if directed == True:
        g_train = nx.DiGraph(adj_train)
    else:
        g_train = nx.Graph(adj_train)

    # Inspect the train/test split
    if verbose >= 1:
        print("Total nodes:", adj_sparse.shape[0])
        print("Total edges:", int(adj_sparse.nnz/2)) # adj is symmetric, so nnz (num non-zero) = 2*num_edges
        print("Training edges (positive):", len(train_edges))
        print("Training edges (negative):", len(train_edges_false))
        print("Validation edges (positive):", len(val_edges))
        print("Validation edges (negative):", len(val_edges_false))
        print("Test edges (positive):", len(test_edges))
        print("Test edges (negative):", len(test_edges_false))
        print('')
        print("------------------------------------------------------")


    ### ---------- LINK PREDICTION BASELINES ---------- ###
    # Adamic-Adar
    aa_scores = adamic_adar_scores(g_train, train_test_split)
    lp_scores['aa'] = aa_scores
    if verbose >= 1:
        print('')
        print('Adamic-Adar Test ROC score: ', str(aa_scores['test_roc']))
        print('Adamic-Adar Test AP score: ', str(aa_scores['test_ap']))

    # Jaccard Coefficient
    jc_scores = jaccard_coefficient_scores(g_train, train_test_split)
    lp_scores['jc'] = jc_scores
    if verbose >= 1:
        print('')
        print('Jaccard Coefficient Test ROC score: ', str(jc_scores['test_roc']))
        print('Jaccard Coefficient Test AP score: ', str(jc_scores['test_ap']))

    # Preferential Attachment
    pa_scores = preferential_attachment_scores(g_train, train_test_split)
    lp_scores['pa'] = pa_scores
    if verbose >= 1:
        print('')
        print('Preferential Attachment Test ROC score: ', str(pa_scores['test_roc']))
        print('Preferential Attachment Test AP score: ', str(pa_scores['test_ap']))


    ### ---------- SPECTRAL CLUSTERING ---------- ###
    sc_scores = spectral_clustering_scores(train_test_split)
    lp_scores['sc'] = sc_scores
    if verbose >= 1:
        print('')
        print('Spectral Clustering Validation ROC score: ', str(sc_scores['val_roc']))
        print('Spectral Clustering Validation AP score: ', str(sc_scores['val_ap']))
        print('Spectral Clustering Test ROC score: ', str(sc_scores['test_roc']))
        print('Spectral Clustering Test AP score: ', str(sc_scores['test_ap']))
        print('')

    ## ---------- NODE2VEC ---------- ###
    # node2vec settings
    # NOTE: when p = q = 1, node2vec is equivalent to DeepWalk
    P = 1 # Return hyperparameter
    Q = 1 # In-out hyperparameter
    WINDOW_SIZE = 10 # Context size for optimization
    NUM_WALKS = 10 # Number of walks per source
    WALK_LENGTH = 80 # Length of walk per source
    DIMENSIONS = 128 # Embedding dimension
    DIRECTED = False # Graph directed/undirected
    WORKERS = 8 # Num. parallel workers
    ITER = 1 # SGD epochs

    # Using bootstrapped edge embeddings + logistic regression
    n2v_edge_emb_scores = node2vec_scores(g_train, train_test_split,
        P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS, DIRECTED, WORKERS, ITER,
        "edge-emb",
        verbose)
    lp_scores['n2v_edge_emb'] = n2v_edge_emb_scores

    if verbose >= 1:
        print('')
        print('node2vec (Edge Embeddings) Validation ROC score: ', str(n2v_edge_emb_scores['val_roc']))
        print('node2vec (Edge Embeddings) Validation AP score: ', str(n2v_edge_emb_scores['val_ap']))
        print('node2vec (Edge Embeddings) Test ROC score: ', str(n2v_edge_emb_scores['test_roc']))
        print('node2vec (Edge Embeddings) Test AP score: ', str(n2v_edge_emb_scores['test_ap']))
        print('')

    # Using dot products to calculate edge scores
    n2v_dot_prod_scores = node2vec_scores(g_train, train_test_split,
        P, Q, WINDOW_SIZE, NUM_WALKS, WALK_LENGTH, DIMENSIONS, DIRECTED, WORKERS, ITER,
        "dot-product",
        verbose)
    lp_scores['n2v_dot_prod'] = n2v_dot_prod_scores

    if verbose >= 1:
        print('')
        print('node2vec (Dot Product) Validation ROC score: ', str(n2v_dot_prod_scores['val_roc']))
        print('node2vec (Dot Product) Validation AP score: ', str(n2v_dot_prod_scores['val_ap']))
        print('node2vec (Dot Product) Test ROC score: ', str(n2v_dot_prod_scores['test_roc']))
        print('node2vec (Dot Product) Test AP score: ', str(n2v_dot_prod_scores['test_ap']))
        print('')


    ### ---------- (VARIATIONAL) GRAPH AUTOENCODER ---------- ###
    # GAE hyperparameters
    LEARNING_RATE = 0.01  # Default: 0.01
    EPOCHS = 250
    HIDDEN1_DIM = 32
    HIDDEN2_DIM = 16
    DROPOUT = 0

    # Use dot product
    tf.set_random_seed(random_state)  # Consistent GAE training
    gae_results = gae_scores(adj_sparse, train_test_split, features_matrix,
                             LEARNING_RATE, EPOCHS, HIDDEN1_DIM, HIDDEN2_DIM, DROPOUT,
                             "dot-product",
                             verbose,
                             dtype=tf.float32)
    lp_scores['gae'] = gae_results

    if verbose >= 1:
        print('')
        print('GAE (Dot Product) Validation ROC score: ', str(gae_results['val_roc']))
        print('GAE (Dot Product) Validation AP score: ', str(gae_results['val_ap']))
        print('GAE (Dot Product) Test ROC score: ', str(gae_results['test_roc']))
        print('GAE (Dot Product) Test AP score: ', str(gae_results['test_ap']))
        print("------------------------------------------------------")
        print("------------------------------------------------------")
        print('')


    # Use edge embeddings
    tf.set_random_seed(random_state) # Consistent GAE training
    gae_edge_emb_results = gae_scores(adj_sparse, train_test_split, features_matrix,
        LEARNING_RATE, EPOCHS, HIDDEN1_DIM, HIDDEN2_DIM, DROPOUT,
        "edge-emb",
        verbose)
    lp_scores['gae_edge_emb'] = gae_edge_emb_results

    if verbose >= 1:
        print('')
        print('GAE (Edge Embeddings) Validation ROC score: ', str(gae_edge_emb_results['val_roc']))
        print('GAE (Edge Embeddings) Validation AP score: ', str(gae_edge_emb_results['val_ap']))
        #print('GAE (Edge Embeddings) Validation ROC_CURVE score: ', str(gae_edge_emb_results['val_roc_curve']))
        print('GAE (Edge Embeddings) Test ROC score: ', str(gae_edge_emb_results['test_roc']))
        print('GAE (Edge Embeddings) Test AP score: ', str(gae_edge_emb_results['test_ap']))
        #print('GAE (Edge Embeddings) Test ROC_CURVE score: ', str(gae_edge_emb_results['test_roc_curve']))


    ### ---------- RETURN RESULTS ---------- ###
    return lp_scores
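# Example call reusing a saved split (a sketch; the .pkl path is hypothetical
# and should point at a file written by a split-generation loop like the one above):
# lp_scores = calculate_all_scores(adj_sparse,
#                                  train_test_split_file='./train-test-splits/fb-combined-0.25-hidden.pkl',
#                                  verbose=1)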
def main(filename):
    Gr = nx.read_edgelist(filename, nodetype=int, delimiter=",")
    for edge in Gr.edges():
        Gr[edge[0]][edge[1]]['weight'] = 1    
    
    # In[92]:
    
    
    
    
    # In[ ]:
    
    # print(Gr.number_of_edges())
    # print(Gr.number_of_nodes())
    
    
    # In[52]:
    
    # draw network
    # nx.draw_networkx(Gr, with_labels=False, node_size=50, node_color='r')
    # plt.show()
    
    
    # ## 2. Preprocessing/Train-Test Split
    
    # In[ ]:    
    
    from gae.preprocessing import mask_test_edges
    np.random.seed(0) # make sure train-test split is consistent between notebooks
    adj_sparse = nx.to_scipy_sparse_matrix(Gr)
    
    
    
    # In[ ]:
    
    # Perform train-test split
    if not filename == 'm4.csv':  # define the last month here
        adj_train, train_edges, train_edges_false, val_edges_x, val_edges_false_x, test_edges_x, test_edges_false_x = mask_test_edges(adj_sparse, test_frac=0, val_frac=0)    
        g_train = nx.from_scipy_sparse_matrix(adj_train) # new graph object with only non-hidden edges
    else:
        Gr = nx.read_edgelist(filename, nodetype=int, delimiter=",")
        for edge in Gr.edges():
            Gr[edge[0]][edge[1]]['weight'] = 1    
        adj_train, train_edges_x, train_edges_false_x, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj_sparse, test_frac=0.67, val_frac=0.33)
        g_train = nx.from_scipy_sparse_matrix(adj_train)
    # In[67]:
    
    # Inspect train/test split
    #print "Total nodes:", adj_sparse.shape[0]
    #print "Total edges:", int(adj_sparse.nnz/2) # adj is symmetric, so nnz (num non-zero) = 2*num_edges
    #print "Training edges (positive):", len(train_edges)
    #print "Training edges (negative):", len(train_edges_false)
    #print "Validation edges (positive):", len(val_edges)
    #print "Validation edges (negative):", len(val_edges_false)
    #print "Test edges (positive):", len(test_edges)
    #print "Test edges (negative):", len(test_edges_false)
    '''
    output_training_test = open("output_split.txt", "w")
    output_training_test.write(("\n Total nodes:" + str(adj_sparse.shape[0])))
    output_training_test.write("\n Total edges:" + str(int(adj_sparse.nnz/2)))
    output_training_test.write("\n Training edges (positive): " + str(len(train_edges))) 
    output_training_test.write("\n Training edges (negative): " + str(len(train_edges_false)))
    output_training_test.write("\n Validation edges (positive): " + str(len(val_edges)))
    output_training_test.write("\n Validation edges (negative): " + str(len(val_edges_false)))
    output_training_test.write("\n Test edges (positive): " + str(len(test_edges)))
    output_training_test.write("\n Test edges (negative): " + str(len(test_edges_false)))
    
    output_training_test.close()
    '''
    #  3. Train node2vec (Learn Node Embeddings)
    
    # In[68]:
    
    import node2vec
    from gensim.models import Word2Vec
    
    
    # In[69]:
    
    # node2vec settings
    # NOTE: When p = q = 1, this is equivalent to DeepWalk
    
    P = 1 # Return hyperparameter
    Q = 1 # In-out hyperparameter
    WINDOW_SIZE = 10 # Context size for optimization
    NUM_WALKS = 10 # Number of walks per source
    WALK_LENGTH = 80 # Length of walk per source
    DIMENSIONS = 128 # Embedding dimension
    DIRECTED = False # Graph directed/undirected
    WORKERS = 8 # Num. parallel workers
    ITER = 1 # SGD epochs
    
    
    # In[70]:
    
    # Preprocessing, generate walks

    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q) # create node2vec graph instance
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH)
    walks = [list(map(str, walk)) for walk in walks]  # list() needed in Python 3, where map() returns an iterator
    
    # Train skip-gram model
    model = Word2Vec(walks, size=DIMENSIONS, window=WINDOW_SIZE, min_count=0, sg=1, workers=WORKERS, iter=ITER)
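    # NOTE: size= and iter= are gensim 3.x argument names; gensim 4.x renames
    # them to vector_size= and epochs=.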
    
    # Store embeddings mapping
    emb_mappings = model.wv
    
    
    # ## 4. Create Edge Embeddings
    
    # In[71]:
    
    # Create node embeddings matrix (rows = nodes, columns = embedding features)
    emb_list = []
    for node_index in range(0, adj_sparse.shape[0]):
        node_str = str(node_index)
        node_emb = emb_mappings[node_str]
        emb_list.append(node_emb)
    emb_matrix = np.vstack(emb_list)
    
    
    # In[72]:
    
    # Generate bootstrapped edge embeddings (as is done in node2vec paper)
        # Edge embedding for (v1, v2) = hadamard product of node embeddings for v1, v2
    def get_edge_embeddings(edge_list):
        embs = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)
            embs.append(edge_emb)
        embs = np.array(embs)
        return embs
    
    
    # In[73]:
    '''
    # Train-set edge embeddings
    pos_train_edge_embs = get_edge_embeddings(train_edges)
    neg_train_edge_embs = get_edge_embeddings(train_edges_false)
    train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])
    #
    # Create train-set edge labels: 1 = real edge, 0 = false edge
    train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])
    '''#
    # Val-set edge embeddings, labels
    if filename=='m4.csv':
        
        pos_val_edge_embs = get_edge_embeddings(val_edges)
        neg_val_edge_embs = get_edge_embeddings(val_edges_false)
        val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
        val_edge_labels = np.concatenate([np.ones(len(val_edges)), np.zeros(len(val_edges_false))])
        #
        # Test-set edge embeddings, labels
        pos_test_edge_embs = get_edge_embeddings(test_edges)
        neg_test_edge_embs = get_edge_embeddings(test_edges_false)
        test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])
        #
        # Create val-set edge labels: 1 = real edge, 0 = false edge
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])
        return 69, 69, val_edge_embs, val_edge_labels, test_edge_embs, test_edge_labels  # 69s are placeholders for the unused train slots
    else:
        pos_train_edge_embs = get_edge_embeddings(train_edges)
        neg_train_edge_embs = get_edge_embeddings(train_edges_false)
        train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])
        #
        # Create train-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])
        return train_edge_embs, train_edge_labels, 69, 69, 69, 69  # 69s are placeholders for the unused val/test slots
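
# Hypothetical driver (a sketch; m1.csv is an assumed earlier-month edge list,
# while m4.csv is the held-out month handled specially in main() above):
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score
#
# train_embs, train_labels, _, _, _, _ = main('m1.csv')
# _, _, _, _, test_embs, test_labels = main('m4.csv')
# clf = LogisticRegression().fit(train_embs, train_labels)
# print('Test ROC:', roc_auc_score(test_labels, clf.predict_proba(test_embs)[:, 1]))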