def node2vec_embedding(graph, name):
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    print(f"Number of random walks for '{name}': {len(walks)}")
    model = Word2Vec(
        walks,
        size=dimensions,
        window=window_size,
        min_count=0,
        sg=1,
        workers=workers,
        iter=num_iter,
    )

    def get_embedding(u):
        # These node IDs are missing from the word2vec vocabulary (presumably
        # never visited by any walk), so fall back to a zero vector of the
        # embedding dimension instead of raising a KeyError.
        if u in {'16039', '24601', '21450', '12492', '6506', '1545'}:
            return np.zeros(128)
        return model.wv[u]

    return get_embedding
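# node2vec_embedding() above reads its hyperparameters and libraries from
# module-level names. A minimal sketch of the globals and imports it assumes;
# the values here are illustrative, not taken from the original source.
import multiprocessing

import numpy as np
from gensim.models import Word2Vec  # gensim < 4.0: `size` / `iter` keywords
from stellargraph.data import BiasedRandomWalk

p = 1.0            # return parameter
q = 1.0            # in-out parameter
dimensions = 128   # embedding dimensionality (matches the zero-vector fallback)
num_walks = 10     # walks per root node
walk_length = 80   # maximum walk length
window_size = 10   # word2vec context window
num_iter = 1       # word2vec training epochs
workers = multiprocessing.cpu_count()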
def walks(self, walklen, n1):
    G = nx.read_weighted_edgelist(self.dataset + "/krnmdata1/CQAG1.txt")
    rw = BiasedRandomWalk(StellarGraph(G))
    weighted_walks = rw.run(
        nodes=G.nodes(),  # root nodes
        length=walklen,   # maximum length of a random walk
        n=n1,             # number of random walks per root node
        p=0.5,            # defines (unnormalised) probability, 1/p, of returning to the source node
        q=2.0,            # defines (unnormalised) probability, 1/q, of moving away from the source node
        weighted=True,    # for weighted random walks
        seed=42,          # random seed fixed for reproducibility
    )
    print("Number of random walks: {}".format(len(weighted_walks)))

    # Remove answer nodes: drop IDs in the answer range [qnum, qnum + anum]
    # from each walk, and shift the IDs above that range down by anum.
    walks = []
    for walk in weighted_walks:
        w = []
        for node in walk:
            if int(node) < self.qnum:
                w.append(node)
            elif int(node) > (self.qnum + self.anum):
                w.append(str(int(node) - self.anum))
        walks.append(w)
    print(walks[0:10])
    return walks
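# A hedged usage sketch for walks() above: the filtered walks form a plain
# corpus of node-ID strings that can be fed straight to gensim. `builder` is a
# hypothetical instance of the class defining walks(); the hyperparameters and
# the node ID '0' are illustrative.
from gensim.models import Word2Vec

corpus = builder.walks(walklen=60, n1=10)
w2v = Word2Vec(corpus, size=100, window=5, min_count=0, sg=1, workers=4, iter=1)
question_vec = w2v.wv['0']  # vector for question node '0', assuming it was visited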
def learn_embeddings(self, embedding_dim=100, window_size=5, max_rw_len=50,
                     walks_per_node=20, p=0.5, q=2.0):
    print('Running node2vec...')
    rw = BiasedRandomWalk(StellarGraph(self.graph))
    walks = rw.run(nodes=list(self.graph), length=max_rw_len,
                   n=walks_per_node, p=p, q=q)
    print(f'Number of random walks: {len(walks)}')
    print('Running word2vec...')
    # gensim < 4.0 API: `size`, `iter`, and init_sims() were renamed or
    # removed in gensim 4.
    model = Word2Vec(walks, size=embedding_dim, window=window_size,
                     min_count=0, sg=1, workers=2, iter=1)
    model.init_sims(replace=True)  # L2-normalise the stored vectors in place
    return model.wv
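# Usage sketch for learn_embeddings(): it returns a gensim KeyedVectors
# object, so node vectors and nearest neighbours come straight off it.
# `embedder` is a hypothetical instance of the class defining the method;
# the node ID '42' is illustrative.
wv = embedder.learn_embeddings(embedding_dim=64, walks_per_node=10)
vec = wv['42']                      # 64-dimensional vector for node '42'
neighbours = wv.most_similar('42')  # nearest nodes by cosine similarity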
def node2vec_embedding(graph, name, weighted=False):
    p = 1.0
    q = 1.0
    dimensions = 128
    num_walks = 10
    walk_length = 80
    window_size = 10
    num_iter = 1
    workers = multiprocessing.cpu_count()

    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length,
                   p=p, q=q, weighted=weighted)
    print(f"Number of random walks for '{name}': {len(walks)}")

    model = Word2Vec(walks, size=dimensions, window=window_size, min_count=0,
                     sg=1, workers=workers, iter=num_iter)

    def get_embedding(u):
        return model.wv[u]

    return get_embedding
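# A hedged usage sketch for node2vec_embedding() above: build a small
# StellarGraph (the karate-club graph is purely illustrative) and look one
# node up through the returned closure. Node IDs are relabelled to strings
# because word2vec treats walk entries as string tokens.
import multiprocessing

import networkx as nx
from gensim.models import Word2Vec
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk

nx_g = nx.relabel_nodes(nx.karate_club_graph(), str)
sg_g = StellarGraph.from_networkx(nx_g)
embedding_fn = node2vec_embedding(sg_g, name="karate", weighted=False)
v = embedding_fn("0")  # 128-dimensional vector for node "0"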
def node2vec_embedding(graph):
    p = 1.0
    q = 1.0
    dimensions = 128
    num_walks = 10
    walk_length = 80
    window_size = 10
    num_iter = 1
    workers = multiprocessing.cpu_count()

    graph = StellarGraph(graph)
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    print(f"Number of random walks: {len(walks)}")

    model = Word2Vec(walks, size=dimensions, window=window_size, min_count=0,
                     sg=1, workers=workers, iter=num_iter)

    # Rows of model.wv.vectors follow word2vec's vocabulary order
    # (model.wv.index2word), not graph.nodes() order, so index by the former.
    features = pd.DataFrame(data=model.wv.vectors, index=model.wv.index2word)
    features.index = features.index.map(str)
    return features
def graph_embed():
    combine = get_combine()
    li = ['bank', 'acquirer', 'coin', 'mcc', 'shop', 'nation', 'city']
    d = {
        'bank': 'b',
        'mcc': 'm',
        'acquirer': 'a',
        'coin': 'c',
        'shop': 's',
        'nation': 'n',
        'city': 'z',
    }
    have_df = False
    df_all = None

    # Suffix every ID with a one-letter type tag so node IDs from different
    # columns cannot collide in the graph.
    for col_a in li:
        combine[col_a] = combine[col_a].astype(str) + d[col_a]

    for col_a in li[1:]:
        print(f'{col_a} started..')
        walk_all = []
        for day in np.linspace(1, 120, 120):
            print(day, end=',', flush=True)
            df = combine[combine['date'] == day]
            G = construct_graph('bank', col_a, df)
            rw = BiasedRandomWalk(StellarGraph(G))
            walk = rw.run(
                nodes=list(G.nodes()),  # root nodes
                length=80,              # maximum length of a random walk
                n=1,                    # number of random walks per root node
                p=1,                    # defines (unnormalised) probability, 1/p, of returning to the source node
                q=1,                    # defines (unnormalised) probability, 1/q, of moving away from the source node
            )
            walk_all.extend(walk)
            del df, G, rw, walk
            gc.collect()

        model = Word2Vec(walk_all, size=5, window=3, min_count=1, sg=0,
                         workers=16, iter=10)
        temp_d = {w: model.wv[w] for w in model.wv.vocab}
        temp_df = pd.DataFrame(
            data=combine[col_a].map(temp_d).tolist(),
            columns=['embed_bank_' + col_a + str(x + 1) for x in range(5)])
        if have_df:
            df_all = pd.concat([df_all, temp_df], axis=1)
        else:
            df_all = temp_df
            have_df = True
        del temp_d, model
        gc.collect()
    return df_all
def _fit_node2vec(train_graph, params, edge_weight=None):
    rw = BiasedRandomWalk(train_graph)
    walks = rw.run(
        nodes=list(train_graph.nodes()),  # root nodes
        length=params["length"],
        n=params["number_of_walks"],
        p=params["random_walk_p"],
        q=params["random_walk_q"],
        weighted=edge_weight is not None,
    )
    model = Word2Vec(walks, size=params["embedding_dimension"])
    return model.wv[train_graph.nodes()]
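# A hedged sketch of calling _fit_node2vec(): the params keys mirror the
# lookups in the function body; the values are illustrative defaults, and
# train_graph is any StellarGraph with string node IDs.
params = {
    "length": 80,
    "number_of_walks": 10,
    "random_walk_p": 1.0,
    "random_walk_q": 1.0,
    "embedding_dimension": 128,
}
embeddings = _fit_node2vec(train_graph, params)  # array of shape (n_nodes, 128)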
def walks(self, walklen):
    G = nx.read_weighted_edgelist(self.dataset + "/krnmdata1/teamsG.txt")
    rw = BiasedRandomWalk(StellarGraph(G))
    weighted_walks = rw.run(
        nodes=G.nodes(),  # root nodes
        length=walklen,   # maximum length of a random walk
        n=5,              # number of random walks per root node
        p=0.1,            # defines (unnormalised) probability, 1/p, of returning to the source node
        q=2.0,            # defines (unnormalised) probability, 1/q, of moving away from the source node
        weighted=True,    # for weighted random walks
        seed=42,          # random seed fixed for reproducibility
    )
    print("Number of random walks: {}".format(len(weighted_walks)))
    print(weighted_walks[0:10])
    return weighted_walks
def node2vec_embedding(graph, name):
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    print(f"Number of random walks for '{name}': {len(walks)}")
    model = Word2Vec(
        walks,
        size=dimensions,
        window=window_size,
        min_count=0,
        sg=1,
        workers=workers,
        iter=num_iter,
    )

    def get_embedding(u):
        return model.wv[u]

    return get_embedding
def node2vec_walk(G, params):
    """Performs biased random walks using StellarGraph to generate the corpus
    used in node2vec and writes the corpus to a txt file.

    :param G: StellarGraph graph whose nodes consist of apps, API calls,
        packages, and invoke methods
    :param params: dict; params["key"] where params is the global parameter
        dictionary and "key" returns the node2vec parameter sub-dictionary
    """
    start_walks = time.time()
    print("Starting Random Walks")
    rw = BiasedRandomWalk(G)
    fp = os.path.join(params["save_dir"], params["filename"])
    os.makedirs(params["save_dir"], exist_ok=True)
    walks = rw.run(
        nodes=list(G.nodes(node_type="app_nodes")),  # root nodes
        length=params["length"],  # maximum length of a random walk
        n=params["n"],            # number of random walks per root node
        p=params["p"],            # defines probability, 1/p, of returning to the source node
        q=params["q"],            # defines probability, 1/q, of moving away from the source node
    )
    print("--- Done Walking in " + str(int(time.time() - start_walks)) + " Seconds ---")
    print()
    print("Number of random walks: {}".format(len(walks)))

    # Save walks to file, one walk per line with space-separated node IDs.
    with open(fp, 'w') as f:
        for walk in walks:
            for node in walk:
                f.write(str(node) + ' ')
            f.write('\n')
    if params["verbose"]:
        print("Saved %s to %s" % (params["filename"], params["save_dir"]))
    return
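# The walk file written by node2vec_walk() holds one walk per line with
# space-separated node IDs, so it can be streamed back into gensim without
# loading the whole corpus into memory. A sketch, assuming the same
# save_dir/filename params as above; the word2vec hyperparameters are
# illustrative.
import os

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

fp = os.path.join(params["save_dir"], params["filename"])
corpus = LineSentence(fp)
model = Word2Vec(corpus, size=128, window=10, min_count=0, sg=1, workers=4)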
def get_node_feats(adj):  # input is cur_adj
    # Note: this function references `self`, so it is presumably defined
    # inside a method (or belongs to a class) in the original code.
    edgelist = adj['idx'].cpu().data.numpy()
    source = edgelist[:, 0]
    target = edgelist[:, 1]
    weight = np.ones(len(source))
    G = pd.DataFrame({'source': source, 'target': target, 'weight': weight})
    G = StellarGraph(edges=G)
    rw = BiasedRandomWalk(G)
    weighted_walks = rw.run(
        nodes=list(G.nodes()),  # root nodes
        length=2,       # maximum length of a random walk
        n=5,            # number of random walks per root node
        p=1,            # defines (unnormalised) probability, 1/p, of returning to the source node
        q=0.5,          # defines (unnormalised) probability, 1/q, of moving away from the source node
        weighted=True,  # for weighted random walks
        seed=42,        # random seed fixed for reproducibility
    )
    str_walks = [[str(n) for n in walk] for walk in weighted_walks]
    weighted_model = Word2Vec(str_walks, size=self.feats_per_node, window=5,
                              min_count=0, sg=1, workers=1, iter=1)

    # Retrieve node embeddings and the corresponding integer node IDs.
    node_ids = [int(n) for n in weighted_model.wv.index2word]
    # numpy.ndarray of shape (number of nodes, embedding dimensionality)
    weighted_node_embeddings = weighted_model.wv.vectors

    # Build an ID -> embedding dict in ascending node-ID order.
    dic = dict(zip(node_ids, weighted_node_embeddings.tolist()))
    dic = dict(sorted(dic.items()))

    # Scatter the embeddings into a sparse feature matrix.
    adj_mat = sp.lil_matrix((self.data.num_nodes, self.feats_per_node))
    for row_idx in node_ids:
        adj_mat[row_idx, :] = dic[row_idx]
    adj_mat = adj_mat.tocsr().tocoo()
    coords = np.vstack((adj_mat.row, adj_mat.col)).transpose()
    values = adj_mat.data

    # Repackage as a torch sparse tensor in the {'idx', 'vals'} format.
    row = list(coords[:, 0])
    col = list(coords[:, 1])
    indexx = torch.LongTensor([row, col])
    tensor_size = torch.Size([self.data.num_nodes, self.feats_per_node])
    degs_out = torch.sparse.FloatTensor(indexx, torch.FloatTensor(values),
                                        tensor_size)
    hot_1 = {'idx': degs_out._indices().t(), 'vals': degs_out._values()}
    return hot_1
def node2vec():
    print('Training Node2Vec mode!')

    # Initialize results arrays.
    total_mse = np.zeros(args.exp_number)
    total_pcc = np.zeros(args.exp_number)
    total_mae = np.zeros(args.exp_number)
    mse_datasets = {}
    std_datasets = {}
    pcc_datasets = {}
    pcc_std_datasets = {}
    mae_datasets = {}
    mae_std_datasets = {}
    t_total = time.time()

    if args.dataset == 'all':
        datasets = ['airport', 'collaboration', 'congress', 'forum', 'geom', 'astro']
    else:
        datasets = [args.dataset]

    for dataset in datasets:
        for exp_number in range(args.exp_number):
            print("%s: experiment number %d" % (dataset, exp_number + 1))
            data = preprocess_dataset.clean_data(dataset)
            if dataset != 'usair':
                data['weights'] = preprocessing.normalize([data['weights']])[0]

            # Random split of the data.
            data_train, data_test = train_test_split(data, test_size=0.2)
            data_train, data_val = train_test_split(data_train, test_size=0.08)
            data = data.reset_index()
            data_train = data_train.reset_index()
            data_val = data_val.reset_index()
            data_test = data_test.reset_index()

            G = preprocess_dataset.create_graph_gcn(dataset, data, data_train)
            val_G = preprocess_dataset.create_graph_gcn(dataset, data, data_val)
            test_G = preprocess_dataset.create_graph_gcn(dataset, data, data_test)

            nodes_len = len(G.nodes)
            node_ids_to_index = {}
            for i, node_id in enumerate(G.nodes):
                node_ids_to_index[node_id] = i

            train_A = nx.adjacency_matrix(G)
            val_A = nx.adjacency_matrix(val_G)
            test_A = nx.adjacency_matrix(test_G)

            train_labels = torch.FloatTensor(data_train['weights'].values).cuda()
            val_labels = torch.FloatTensor(data_val['weights'].values).cuda()
            test_labels = torch.FloatTensor(data_test['weights'].values).cuda()

            train_A = sparse_mx_to_torch_sparse_tensor(train_A).cuda()
            val_A = sparse_mx_to_torch_sparse_tensor(val_A).cuda()
            test_A = sparse_mx_to_torch_sparse_tensor(test_A).cuda()

            G = sg.from_networkx(G)
            rw = BiasedRandomWalk(G)
            weighted_walks = rw.run(
                nodes=G.nodes(),     # root nodes
                length=args.length,  # maximum length of a random walk
                n=args.n_size,       # number of random walks per root node
                p=args.p,            # defines (unnormalised) probability, 1/p, of returning to the source node
                q=args.q,            # defines (unnormalised) probability, 1/q, of moving away from the source node
                weighted=True,       # for weighted random walks
                seed=42,             # random seed fixed for reproducibility
            )
            print("Number of random walks: {}".format(len(weighted_walks)))

            weighted_model = Word2Vec(weighted_walks, vector_size=args.vector_size,
                                      window=5, min_count=0, sg=1, workers=4)
            weights = torch.FloatTensor(weighted_model.wv.vectors).cuda()

            # Map endpoint node IDs to embedding-row indices for each split.
            train_n1 = torch.tensor(data_train['A'].values).cuda()
            train_n2 = torch.tensor(data_train['B'].values).cuda()
            train_n1_indices = torch.ones(train_n1.shape[0])
            for i, value in enumerate(train_n1):
                train_n1_indices[i] = node_ids_to_index[value.item()]
            train_n1_indices = train_n1_indices.cuda().long()
            train_n2_indices = torch.ones(train_n1.shape[0])
            for i, value in enumerate(train_n2):
                train_n2_indices[i] = node_ids_to_index[value.item()]
            train_n2_indices = train_n2_indices.cuda().long()

            val_n1 = torch.tensor(data_val['A'].values).cuda()
            val_n2 = torch.tensor(data_val['B'].values).cuda()
            val_n1_indices = torch.ones(val_n1.shape[0])
            for i, value in enumerate(val_n1):
                val_n1_indices[i] = node_ids_to_index[value.item()]
            val_n1_indices = val_n1_indices.cuda().long()
            val_n2_indices = torch.ones(val_n1.shape[0])
            for i, value in enumerate(val_n2):
                val_n2_indices[i] = node_ids_to_index[value.item()]
            val_n2_indices = val_n2_indices.cuda().long()

            test_n1 = torch.tensor(data_test['A'].values).cuda()
            test_n2 = torch.tensor(data_test['B'].values).cuda()
            test_n1_indices = torch.ones(test_n1.shape[0])
            for i, value in enumerate(test_n1):
                test_n1_indices[i] = node_ids_to_index[value.item()]
            test_n1_indices = test_n1_indices.cuda().long()
            test_n2_indices = torch.ones(test_n1.shape[0])
            for i, value in enumerate(test_n2):
                test_n2_indices[i] = node_ids_to_index[value.item()]
            test_n2_indices = test_n2_indices.cuda().long()

            model = Node2Vec(weights, 0.5)
            optimizer = optim.Adam(model.parameters(), lr=args.lr)
            model.train()
            model = model.to(args.device)

            # Train.
            for epoch in range(args.epochs):
                t = time.time()
                model.train()
                optimizer.zero_grad()
                output = model(train_n1_indices, train_n2_indices)
                loss_train = F.mse_loss(output, train_labels)
                loss_train.backward()
                optimizer.step()

                # Validation.
                model.eval()
                output = model(val_n1_indices, val_n2_indices)
                loss_val = F.mse_loss(output, val_labels)
                if args.verbose:
                    print('Epoch: {:04d}'.format(epoch + 1),
                          'loss_train: {:.4f}'.format(loss_train.item()),
                          'loss_val: {:.4f}'.format(loss_val.item()),
                          'time: {:.4f}s'.format(time.time() - t))

            # Test.
            model.eval()
            with torch.no_grad():
                output = model(test_n1_indices, test_n2_indices)
                loss_test = F.mse_loss(torch.flatten(output), test_labels)
                pcc_test = pearson_correlation(test_labels, output)
                mae_test = F.l1_loss(output, test_labels)
                print("Test set results:",
                      "loss= {:.10f}".format(loss_test.item()),
                      "pcc= {:.10f}".format(pcc_test),
                      "mae= {:.10f}".format(mae_test.item()))
                total_mse[exp_number] = loss_test
                total_pcc[exp_number] = pcc_test
                total_mae[exp_number] = mae_test

        # Aggregate results per dataset, then reset the per-experiment arrays.
        mse_datasets[dataset] = np.mean(total_mse)
        std_datasets[dataset] = np.std(total_mse)
        total_mse = np.zeros(args.exp_number)
        pcc_datasets[dataset] = np.mean(total_pcc[~np.isnan(total_pcc)])
        pcc_std_datasets[dataset] = np.std(total_pcc[~np.isnan(total_pcc)])
        total_pcc = np.zeros(args.exp_number)
        mae_datasets[dataset] = np.mean(total_mae)
        mae_std_datasets[dataset] = np.std(total_mae)
        total_mae = np.zeros(args.exp_number)

    for dataset in datasets:
        print("MSE %s: {:,f}".format(mse_datasets[dataset]) % dataset)
        print("MSE_STD %s: {:,f}".format(std_datasets[dataset]) % dataset)
        print("PCC %s: {:,f}".format(pcc_datasets[dataset]) % dataset)
        print("PCC_STD %s: {:,f}".format(pcc_std_datasets[dataset]) % dataset)
        print("MAE %s: {:,f}".format(mae_datasets[dataset]) % dataset)
        print("MAE_STD %s: {:,f}".format(mae_std_datasets[dataset]) % dataset)
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))
    exit()
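# The Node2Vec module trained above is not shown in this snippet. A minimal
# hypothetical sketch, consistent only with how it is called -- constructed as
# Node2Vec(weights, 0.5) and invoked as model(n1_indices, n2_indices) to
# predict an edge weight -- might look like this; the architecture is an
# assumption, not the original implementation.
import torch
import torch.nn as nn

class Node2Vec(nn.Module):
    """Hypothetical edge-weight regressor over pretrained node2vec vectors."""

    def __init__(self, weights, dropout):
        super().__init__()
        # weights: (n_nodes, dim) tensor of node2vec embeddings.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(2 * weights.shape[1], 1)

    def forward(self, n1_indices, n2_indices):
        # Look up both endpoints and score the concatenated pair.
        h = torch.cat([self.embedding(n1_indices),
                       self.embedding(n2_indices)], dim=1)
        return self.out(self.dropout(h)).squeeze(-1)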
save_path = project_folder + "/" + use_model_type + "_" + uni_name + ".csv"
G_graphml = nx.read_graphml(load_path)

# Get the node features as a dataframe; these would then be added to the
# StellarGraph. This seems to work better than trying to put them in directly.
# nodefeatures = pd.DataFrame.from_dict(dict(G_graphml.nodes(data=True)), orient='index')
# print(nodefeatures)

# Convert the networkx graph to a StellarGraph.
G = StellarGraph.from_networkx(G_graphml)
rw = BiasedRandomWalk(G)
walks = rw.run(
    nodes=list(G.nodes()),  # root nodes
    length=30,  # maximum length of a random walk
    n=100,      # number of random walks per root node
    p=0.5,      # defines (unnormalised) probability, 1/p, of returning to the source node
    q=2.0,      # defines (unnormalised) probability, 1/q, of moving away from the source node
)
print("Number of random walks: {}".format(len(walks)))
str_walks = [[str(n) for n in walk] for walk in walks]
model = Word2Vec(str_walks, size=dims, window=10, min_count=0, sg=1,
                 workers=1, iter=1)
model.wv.save_word2vec_format(save_path)
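# The embeddings saved above are in word2vec text format, so they can be
# loaded back for downstream use without retraining. A sketch, assuming the
# same save_path and a gensim < 4.0 install (index2word was renamed in
# gensim 4):
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(save_path)
node_vec = wv[wv.index2word[0]]  # vector for the first node in the vocabulary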