def get_additional_features(config, edge_index, edge_attr, args):
    # Sparse adjacency tensor built from the edge list; only used by the
    # commented-out dense-feature variant below.
    data = torch.sparse.FloatTensor(edge_index, edge_attr.squeeze(1))
    features = [[] for i in range(config['n_vertex'])]
    # for i in range(data.size(0)):
    #     features.append(data[i].to_dense().cpu().tolist())
    if args.node2vec:
        cache_path = os.path.join(DATASET_DIR, args.data, 'embedding.pt')
        if not os.path.exists(cache_path) or args.overwrite_cache:
            # Build the graph and learn node2vec embeddings (custom node2vec module),
            # then cache them so later runs can skip the random walks.
            nx_G = node2vec.read_graph(config, edge_index, edge_attr)
            G = node2vec.Node2Vec(nx_G, True, 1., 1., args.verbose)
            G.preprocess_transition_probs()
            walks = G.simulate_walks(40, 10)
            embedding = node2vec.learn_embeddings(walks)
            embeddings = []
            for i in range(config['n_vertex']):
                embeddings.append(embedding.wv[str(i)].tolist())
            torch.save(embeddings, cache_path)
        else:
            embeddings = torch.load(cache_path)
        for i in range(config['n_vertex']):
            features[i] += embeddings[i]
    return torch.tensor(features).float()
def get_walks(self, walk_length, num_walks_per_node, p, q, workers, precomputed):
    if precomputed:
        self.load_walks()
    else:
        self.walks = node2vec.Node2Vec(self.graph,
                                       walk_length=walk_length,
                                       num_walks=num_walks_per_node,
                                       p=p, q=q,
                                       workers=workers).walks
        self.save_walks()
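# A minimal sketch of the save_walks / load_walks helpers referenced above; their
# actual implementation is not shown here, and self.walks_path is an assumed
# attribute pointing at the cache file.
import pickle

def save_walks(self):
    # Persist the precomputed walks so a later call can pass precomputed=True.
    with open(self.walks_path, 'wb') as f:
        pickle.dump(self.walks, f)

def load_walks(self):
    # Restore walks written by save_walks.
    with open(self.walks_path, 'rb') as f:
        self.walks = pickle.load(f)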
def generate_cluster_map(self):
    n2v = node2vec.Node2Vec(self.graph, dimensions=64, walk_length=30,
                            num_walks=200, workers=5)
    model = n2v.fit(window=10, min_count=1, batch_words=4)
    X = []
    for i in range(len(self.graph)):
        X.append(model.wv[str(i)])
    kmeans = KMeans(n_clusters=self.action_space, random_state=0).fit(X)
    self.cluster_map = {}
    for node in range(len(self.graph)):
        self.cluster_map[node] = kmeans.labels_[node]
def run_node2vec(graph, save_path):
    """
    Runs the node2vec method from Node2Vec on a given graph and saves the fitted
    model as a pickle.

    Parameters:
        graph (networkx.Graph): NetworkX graph object
        save_path (filepath): Filepath for where to save the pickled model
    """
    # Parameter p is the return parameter: a high value makes the walk less
    # likely to backtrack to the node it has just visited.
    # Parameter q is the in-out parameter: a high value biases the walk towards
    # nodes close to the start of the step, keeping it near the start node.
    graphn2v = n2v.Node2Vec(graph, dimensions=50, walk_length=40, num_walks=50,
                            p=1, q=2, workers=1)
    n2vmodel = graphn2v.fit(window=10, min_count=5)
    pickle.dump(n2vmodel, open(save_path, "wb"))
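# A minimal usage sketch for run_node2vec, assuming this module's imports
# (n2v, pickle) are in place; the karate-club graph and the "karate_n2v.pkl"
# path are illustrative choices, not part of the original code.
import pickle
import networkx as nx

graph = nx.karate_club_graph()
run_node2vec(graph, "karate_n2v.pkl")

# Reload the fitted gensim model and look up the embedding of node 0
# (the node2vec package keys the vocabulary by the string form of each node id).
with open("karate_n2v.pkl", "rb") as f:
    model = pickle.load(f)
print(model.wv[str(0)])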
def main():
    # 24-node, 38-link test network (Sioux Falls).
    nod = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24]
    edg = [(1, 2), (1, 3), (2, 6), (3, 4), (3, 12), (4, 5), (4, 11), (5, 6),
           (5, 9), (6, 8), (7, 8), (7, 18), (8, 9), (8, 16), (9, 10), (10, 11),
           (10, 15), (10, 16), (10, 17), (11, 12), (11, 14), (12, 13), (13, 24),
           (14, 15), (14, 23), (15, 19), (15, 22), (16, 17), (16, 18), (17, 19),
           (18, 20), (19, 20), (20, 21), (20, 22), (21, 22), (21, 24), (22, 23),
           (23, 24)]
    G1 = nx.Graph()
    G1.add_nodes_from(nod)
    G1.add_edges_from(edg)
    for k in range(1000, 2000):
        for m in range(22, 1, -1):
            print(time.time())
            initil = node2vec.Node2Vec(G1, dimensions=m, walk_length=50,
                                       num_walks=60, p=2, q=0.5)
            print(time.time())
            model = initil.fit()
            print(model.wv.vectors)
            # Build the feature matrix phi: row i holds the embedding of node i
            # with a constant 1 appended; row 0 is an empty placeholder so that
            # indices line up with the 1-based node labels.
            phi = []
            phi.append([])
            for i in range(24):
                phi_i = model.wv.get_vector(str(i + 1)).tolist()
                phi_i.append(1)
                phi.append(phi_i)
            np.save(file="/Users/pqh/Desktop/route/Sioux/Sioux_d" + str(m) +
                         "_" + str(k) + "_phi.npy", arr=phi)
            print(time.time())
def enclosed_subgraph(g, hop=1, max_hop_nodes=None, link_percent=1., embedding_dim=0,
                      has_feature=False, inject_neg_links=True, multi_process=False):
    """
    Extract enclosing subgraphs for link prediction.

    Args:
        g: networkx Graph
        hop: maximum number of hops
        max_hop_nodes: maximum number of nodes kept per hop
        link_percent: fraction of the edges of g used for prediction
        has_feature: whether to use node features; if True every node is expected
            to carry a "feature" attribute
        embedding_dim: if > 0, generate a node2vec embedding of this dimension
            for every node
        inject_neg_links: whether to temporarily add the negative links to g
            before running node2vec
        multi_process: whether to use multiprocessing (single process: slow but
            debuggable; multiprocess: fast, not debuggable, unavailable on Windows)

    Return:
        list of positive subgraphs, list of negative subgraphs
    """
    # Positive links.
    pos_links = list(g.edges)
    pos_links = rand.sample(pos_links, int(g.number_of_edges() * link_percent))
    num_pos_links = len(pos_links)
    # Negative links: sample random non-adjacent node pairs.
    neg_links = []
    nodes = list(g.nodes)
    i = 0
    while True:
        node1 = rand.choice(nodes)
        node2 = rand.choice(nodes)
        if node1 == node2 or g.has_edge(node1, node2):
            continue
        neg_links.append((node1, node2))
        i += 1
        if i >= num_pos_links:
            break
    # Add node2vec embeddings as node features.
    if embedding_dim > 0:
        print("node2vec embedding ... ...")
        if inject_neg_links:  # optionally include neg_links while embedding
            g.add_edges_from(neg_links)
        n2v_model = nv.Node2Vec(g, dimensions=embedding_dim, walk_length=30,
                                num_walks=10, workers=4)
        n2v_wv = n2v_model.fit().wv
        nv_dict = {
            int(n): v for n, v in zip(n2v_wv.index2word, n2v_wv.vectors)
        }
        if not has_feature:
            nx.set_node_attributes(g, nv_dict, 'feature')
        else:
            feat_dict = nx.get_node_attributes(g, 'feature')
            features = {
                n: np.concatenate([feat_dict[n], nv_dict[n]])
                for n in feat_dict.keys()
            }
            nx.set_node_attributes(g, features, 'feature')
        if inject_neg_links:
            g.remove_edges_from(neg_links)
    # Extract the enclosing subgraphs.
    pos_sub_gs = extract_subgraph_from_links(g, pos_links, hop, max_hop_nodes,
                                             multi_process, info='pos')
    neg_sub_gs = extract_subgraph_from_links(g, neg_links, hop, max_hop_nodes,
                                             multi_process, info='neg')
    # Collect the Double-Radius Node Labels of the subgraphs.
    dr_label_set = set()
    for sg in pos_sub_gs:
        sg.label = [1, 0]
        dr_label_set = dr_label_set.union(
            set(nx.get_node_attributes(sg, 'dr_label').values()))
    for sg in neg_sub_gs:
        sg.label = [0, 1]
        dr_label_set = dr_label_set.union(
            set(nx.get_node_attributes(sg, 'dr_label').values()))
    dr_label_dict = {v: i for i, v in enumerate(list(dr_label_set))}
    dr_label_dim = len(dr_label_set)
    # Append the one-hot Double-Radius Node Label to the node features.
    for gs in pos_sub_gs:
        for n in gs.nodes:
            dr_l = gs.nodes[n]['dr_label']
            if has_feature or embedding_dim > 0:
                feat = gs.nodes[n]['feature']
                gs.nodes[n]['feature'] = np.concatenate(
                    [feat, onehot(dr_label_dict[dr_l], dr_label_dim)]).astype(np.float32)
            else:
                gs.nodes[n]['feature'] = np.array(
                    onehot(dr_label_dict[dr_l], dr_label_dim)).astype(np.float32)
    for gs in neg_sub_gs:
        for n in gs.nodes:
            dr_l = gs.nodes[n]['dr_label']
            if has_feature or embedding_dim > 0:
                feat = gs.nodes[n]['feature']
                gs.nodes[n]['feature'] = np.concatenate(
                    [feat, onehot(dr_label_dict[dr_l], dr_label_dim)]).astype(np.float32)
            else:
                gs.nodes[n]['feature'] = np.array(
                    onehot(dr_label_dict[dr_l], dr_label_dim)).astype(np.float32)
    return pos_sub_gs, neg_sub_gs
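# A minimal usage sketch for enclosed_subgraph; the graph choice is illustrative,
# and the helpers used above (extract_subgraph_from_links, onehot) plus the
# module imports (nv, rand, nx, np) are assumed to be defined elsewhere in this module.
import networkx as nx

g = nx.karate_club_graph()
pos_gs, neg_gs = enclosed_subgraph(g, hop=1, link_percent=0.5,
                                   embedding_dim=16, multi_process=False)
print(len(pos_gs), len(neg_gs))  # equal numbers of positive and negative subgraphs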
def get_feature_extractor(network, features):
    """
    Get function that extracts specified features from a pair of nodes.

    Args:
        network (object): Networkx representation of the network.
        features (list): List of names of features to extract.

    Returns:
        (function): Function that takes a network and two nodes (node pair) and
            computes the specified features in the form of a numpy array.
    """

    def get_feature(network, n1, n2, feature):
        """
        Get specified feature for pair of nodes n1 and n2. This function is used
        by the get_feature_extractor function.

        Args:
            network (object): Networkx representation of the network.
            n1 (str): First node in pair.
            n2 (str): Second node in pair.
            feature (str): Name of feature to extract.

        Returns:
            (float): The extracted feature.
        """

        # Extract specified feature.
        if feature == 'common-neighbors':
            # Return number of common neighbors.
            return len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
        elif feature == 'jaccard-coefficient':
            # Return Jaccard coefficient for the node pair.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            size_un = len(
                set(network.neighbors(n1)).union(network.neighbors(n2)))
            return size_int / size_un if size_un > 0.0 else 0.0
        elif feature == 'hub-promoted':
            # Return Hub-promoted index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = min(len(set(network.neighbors(n1))),
                        len(set(network.neighbors(n2))))
            if denom > 0:
                return size_int / denom
            else:
                return 0
        elif feature == 'adamic-adar':
            # Compute and return Adamic-Adar index.
            return np.sum([
                1 / np.log(len(set(network.neighbors(n)))) for n in set(
                    network.neighbors(n1)).intersection(network.neighbors(n2))
                if len(set(network.neighbors(n))) > 1
            ])
        elif feature == 'resource-allocation':
            # Compute and return resource-allocation index.
            return np.sum([
                1 / len(set(network.neighbors(n))) for n in set(
                    network.neighbors(n1)).intersection(network.neighbors(n2))
                if len(set(network.neighbors(n))) > 0
            ])
        elif feature == 'sorenson':
            # Compute and return Sorenson index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = len(set(network.neighbors(n1))) + len(
                set(network.neighbors(n2)))
            return size_int / denom if denom > 0.0 else 0.0
        elif feature == 'hub-depressed':
            # Return Hub-depressed index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = max(len(set(network.neighbors(n1))),
                        len(set(network.neighbors(n2))))
            if denom > 0:
                return size_int / denom
            else:
                return 0
        elif feature == 'salton':
            # Compute and return Salton index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = np.sqrt(
                len(set(network.neighbors(n1))) *
                len(set(network.neighbors(n2))))
            return size_int / denom if denom > 0.0 else 0.0
        elif feature == 'leicht-holme-nerman':
            # Compute and return Leicht-Holme-Nerman index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = len(set(network.neighbors(n1))) * len(
                set(network.neighbors(n2)))
            return size_int / denom if denom > 0.0 else 0.0
        elif feature == 'preferential-attachment':
            # Compute and return preferential-attachment index.
            return len(set(network.neighbors(n1))) * len(
                set(network.neighbors(n2)))
        elif feature == 'local-random-walk':
            # Compute local random walk score.
            return local_random_walk(network, n1, n2, p_tran)
        elif feature == 'superposed-random-walk':
            # Compute superposed random walk score.
            return sum(
                [local_random_walk(network, n1, n2, p_tran) for _ in range(5)])
        elif feature == 'simrank':
            # Return Simrank score.
            return simrank_scores[n1][n2]
        elif feature == 'same-community':
            # Return flag specifying whether the two nodes are part of
            # the same community or not.
            return int(communities[n1] == communities[n2])
        elif feature == 'community-index':
            # If nodes not part of same community, return 0.
            if communities[n1] != communities[n2]:
                return 0
            else:
                # Get community index of both nodes.
                community_idx = communities[n1]
                # Compute community index.
                return m_counts[community_idx] / comb(
                    n_counts[community_idx], 2)
        elif feature == 'page-rank':
            # Compare PageRank scores of the nodes.
            return abs(page_rank[n1] - page_rank[n2])
        elif feature == 'node2vec':
            # Return concatenated embeddings (or cosine distance between them).
            return np.hstack((n2v_model.wv[str(n1)], n2v_model.wv[str(n2)]))
            # return spatial.distance.cosine(n2v_model.wv[str(n1)], n2v_model.wv[str(n2)])
        elif feature == 'random':
            # Return random value as feature.
            return np.random.rand()
        else:
            raise ValueError('Unknown feature ' + feature)

    def feature_extractor(network, n1, n2, features):
        """
        The feature extractor function. This function is partially applied with
        the list of features and returned by the get_feature_extractor function.

        Args:
            network (object): Networkx representation of the network.
            n1 (str): First node in pair.
            n2 (str): Second node in pair.
            features (list): List of names of features to extract.
        """
        return np.hstack(
            [get_feature(network, n1, n2, feature) for feature in features])

    ### PRECOMPUTED DATA FOR WHOLE NETWORK (NEEDED FOR SOME MEASURES) ###
    if 'simrank' in features:
        # Compute simrank scores.
        simrank_scores = nx.algorithms.similarity.simrank_similarity(network)
    if 'local-random-walk' in features or 'superposed-random-walk' in features:
        # Get adjacency matrix and compute probabilities of transitions.
        adj = nx.to_scipy_sparse_matrix(network)
        p_tran = sklearn.preprocessing.normalize(adj, norm='l1', axis=0)
    if 'same-community' in features or 'community-index' in features:
        # Get communities.
        communities = community.best_partition(network, randomize=True)
        # Initialize dictionary mapping community indices to counts of links
        # contained within them.
        m_counts = dict.fromkeys(set(communities.values()), 0)
        # Count number of nodes in each community.
        n_counts = Counter(communities.values())
        # Go over links in network.
        for edge in network.edges():
            # If link within community, add to accumulator for that community.
            if communities[edge[0]] == communities[edge[1]]:
                m_counts[communities[edge[0]]] += 1
    if 'page-rank' in features:
        # Compute PageRank of nodes.
        page_rank = nx.pagerank(network)
    if 'node2vec' in features:
        import node2vec
        n2v = node2vec.Node2Vec(network, dimensions=64, walk_length=30,
                                num_walks=20, workers=8)
        n2v_model = n2v.fit(window=10, min_count=1, batch_words=4)
    #####################################################################

    return (
        lambda network, n1, n2: feature_extractor(network, n1, n2, features))
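# A minimal usage sketch for get_feature_extractor; the graph and the two feature
# names are illustrative choices from the list handled above, and neither requires
# any of the precomputed whole-network data.
import networkx as nx

network = nx.karate_club_graph()
extract = get_feature_extractor(network, ['common-neighbors', 'jaccard-coefficient'])
print(extract(network, 0, 1))  # feature vector for the node pair (0, 1)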
    parser.add_argument('--min-count', type=int, default=0,
                        help='Number of min count of word. Default is 0.')
    parser.add_argument('--sg', type=int, default=1,
                        help='Skip-gram/CBOW, 0 is CBOW and 1 is skip-gram. Default is 1.')
    parser.add_argument('--hs', type=int, default=1,
                        help='Use hierarchical softmax or not, 1 yes, 0 no. Default is 1.')
    parser.set_defaults(directed=False)
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    cora_edge = pd.read_table(cora_address + cora_cite, sep='\t',
                              names=['src', 'dst'])
    G = nx.Graph()
    # Map the raw Cora paper ids to consecutive string node ids.
    node_dict = {i: v for v, i in enumerate(
        set(np.append(cora_edge.src.values, cora_edge.dst.values)))}
    cora_edge['src'] = cora_edge['src'].apply(lambda x: str(node_dict[x]))
    cora_edge['dst'] = cora_edge['dst'].apply(lambda x: str(node_dict[x]))
    cora_edge_list = cora_edge.values.tolist()

    def map_func(x):
        return (x[0], x[1], {'weight': 1})

    cora_edge_list = list(map(map_func, cora_edge_list))
    G.add_edges_from(cora_edge_list)
    startTime = time.perf_counter()
    n2v = node2vec.Node2Vec(G, args.p, args.q)
    n2v.train(num_walks=args.num_walks, walk_length=args.walk_length,
              embed_size=args.embed_size, window_size=args.window_size,
              workers=args.workers, iter_num=args.iter, min_count=args.min_count,
              sg=args.sg, hs=args.hs)
    endTime = time.perf_counter()
    print('epoch {}: dimension from {} to {} took {} s'.format(
        args.iter, len(node_dict), args.embed_size, endTime - startTime))
def node2vec_cora():
    print("NODE2VEC")
    X, A, y = data.load_data(dataset='cora')
    node2vec = nv.Node2Vec(A)
    return node2vec.train(y)