def Node2Vec_model(g):
    """Generate node2vec walks over ``g`` and fit an embedding on them.

    All hyper-parameters are fixed inside the function: 128 dimensions,
    walks of length 30, 200 walks per node, 4 workers and a context window
    of 10.

    :param g: graph handed unchanged to ``Node2Vec``.
    :return: whatever ``Node2Vec.fit`` returns for these settings.
    """
    # Flip to True when the precomputed graph data is too big for memory;
    # the walker is then given a scratch directory with enough disk space.
    spill_to_disk = False
    scratch_dir = "/tmp/node2vec"

    # Embedding parameters.
    embedding_dim = 128
    context_window = 10

    # Performance parameters.
    worker_count = 4
    walks_per_node = 200

    print("Generating walks on the network ...")
    walker_options = {
        "dimensions": embedding_dim,
        "walk_length": 30,
        "num_walks": walks_per_node,
        "workers": worker_count,
    }
    if spill_to_disk:
        # Note: this triggers "sharedmem" in Parallel, which is slow on
        # smaller graphs.
        walker_options["temp_folder"] = scratch_dir
    walker = Node2Vec(g, **walker_options)

    print("Building the embedding (%d dimensions) ..." % embedding_dim)
    return walker.fit(window=context_window, min_count=1, batch_words=4)
def __init__(self, dataset, p=1, q=4, walk_length=100, num_walks=50,
             dimensions=200, window_size=30, workers=8, iterations=5):
    """Configure the walker for ``dataset`` and load its embedding file.

    The embedding file lives under ``datasets/<dataset>/node2vec/`` and is
    named after the hyper-parameters.  When it is missing, ``self.run`` is
    invoked on ``altogether.edgelist`` to produce it; the file is then
    loaded as binary word2vec vectors into ``self.node2vec_model``.
    """
    # The three positional flags are presumably is_directed=False,
    # preprocessing=True, is_weighted=False -- confirm against the base class.
    Node2Vec.__init__(self, False, True, False, p, q, walk_length, num_walks,
                      dimensions, window_size, workers, iterations)
    self.dataset = dataset

    embedding_dir = 'datasets/%s/node2vec/' % self.dataset
    # NOTE(review): %d truncates non-integer p/q, so distinct fractional
    # values map to the same cache file name.
    embedding_name = 'num%d_p%d_q%d_l%d_d%d_iter%d_winsize%d.emd' % (
        num_walks, p, q, walk_length, dimensions, iterations, window_size)
    self.path = embedding_dir + embedding_name

    already_trained = embedding_name in os.listdir(embedding_dir)
    if not already_trained:
        self.run(embedding_dir + 'altogether.edgelist', self.path)

    self.node2vec_model = KeyedVectors.load_word2vec_format(self.path,
                                                            binary=True)
def __init__(self, is_directed, preprocessing, is_weighted, p, q,
             walk_length, num_walks, dimensions, window_size, workers,
             iterations, feedback_file):
    """Initialise the base ``Node2Vec`` and remember the feedback file.

    Every argument except ``feedback_file`` is forwarded, in order, to
    ``Node2Vec.__init__``; ``feedback_file`` is stored on the instance.
    """
    base_arguments = (
        is_directed, preprocessing, is_weighted,
        p, q,
        walk_length, num_walks,
        dimensions, window_size,
        workers, iterations,
    )
    Node2Vec.__init__(self, *base_arguments)
    self.feedback_file = feedback_file
def fit(self, dataframe):
    """Learn node2vec features for the values of the categorical columns.

    Rows of ``dataframe`` are grouped by ``self.categorical_columns`` and the
    group sizes become the weights of a directed graph.  A node2vec model is
    fitted on that graph and its vectors are stored as a dataframe in
    ``self.feature`` (also wrapped in the one-element list ``self.features``).

    :param dataframe: frame containing ``self.categorical_columns``.
    :return: ``self``.
    """
    # One row per observed combination, with its occurrence count as the
    # last column.
    group_sizes = dataframe.groupby(self.categorical_columns).size()
    weighted_edges = group_sizes.reset_index().dropna()

    # add_weighted_edges_from expects (u, v, weight) triples -- assumes
    # exactly two categorical columns; TODO confirm.
    digraph = nx.DiGraph()
    digraph.add_weighted_edges_from(weighted_edges.values)

    walker = Node2Vec(
        digraph,
        dimensions=self.n_components,
        walk_length=self.walk_length,
        num_walks=self.num_walks,
        workers=self.workers,
    )
    self.model = walker.fit(
        window=self.window,
        min_count=self.min_count,
        batch_words=self.batch_words,
    )

    keyed_vectors = self.model.wv
    vector_by_key = {}
    for key in keyed_vectors.vocab:
        vector_by_key[key] = keyed_vectors[key]

    # Keys become rows; the former index (the key) becomes the first column.
    self.feature = pd.DataFrame(vector_by_key).T.reset_index()
    component_columns = [
        f'{self.name}_{i:03}' for i in range(self.n_components)
    ]
    self.feature.columns = self.categorical_columns[:1] + component_columns
    self.features = [self.feature]
    return self
def naiveGraphEmbeddingAllStudentsInAWeek(
        transitionDataMatrix_directFollow_week, activityCodeList, w):
    """Build one 64-dimensional graph embedding per student for week ``w``.

    For every index entry of ``transitionDataMatrix_directFollow_week`` a
    graph is created with ``graphCreationForSingleStudent``; node2vec is
    fitted on it and the node vectors are summed into a single vector.
    Students whose graph has no nodes get a zero vector.

    :return: dataframe with one row per student, indexed like the input.
    """
    dimensions = 64
    student_ids = transitionDataMatrix_directFollow_week.index

    rows = []
    for student in student_ids:
        print(f'Week {w} - student: {student}')
        transitions = transitionDataMatrix_directFollow_week.loc[student, :]
        graph = graphCreationForSingleStudent(transitions, activityCodeList)

        if len(graph._node) == 0:
            # Nothing to embed for this student.
            rows.append(np.zeros(dimensions))
            continue

        walker = Node2Vec(graph, dimensions=dimensions, walk_length=10,
                          num_walks=100)
        fitted = walker.fit(window=10, min_count=1)
        # model.wv.vectors is a (nodes x dimensions) array; summing over
        # axis 0 collapses it to one vector for the whole graph.
        rows.append(fitted.wv.vectors.sum(axis=0))

    return pd.DataFrame(rows, index=student_ids)
def node2vec_classification(G, clusters, dim=128, walk_length=80,
                            num_walks=10, return_=1, inout=1):
    """Cluster the nodes of ``G`` with k-means on their node2vec vectors.

    :param G: graph to embed.
    :param clusters: number of k-means clusters.
    :param dim: embedding dimensionality.
    :param walk_length: length of each random walk.
    :param num_walks: number of walks per node.
    :param return_: node2vec ``p`` parameter.
    :param inout: node2vec ``q`` parameter.
    :return: ``(members, matrix)`` where ``members`` maps each k-means label
        to the list of nodes assigned to it and ``matrix`` stacks the node
        vectors in the iteration order of ``G``.
    """
    node2vec = Node2Vec(G, dimensions=dim, walk_length=walk_length,
                        num_walks=num_walks, p=return_, q=inout)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)

    node_order = list(G)
    # node2vec always stores node names as strings in the fitted model, so
    # look vectors up by str(node); a raw integer label would miss the key
    # (or be taken as a positional index by newer gensim releases).
    word_vector_matrix = np.vstack(
        [model.wv[str(node)] for node in node_order])

    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(
        word_vector_matrix)

    members = {}
    for node, label in zip(node_order, kmeans.labels_):
        members.setdefault(label, []).append(node)
    return members, word_vector_matrix
def create_node2vec_embeddings(graph):
    """Return node embeddings for ``graph``, cached under ``pkl/``.

    Lookup order:

    1. ``pkl/Node2Vec_embedding_new.pickle`` -- loaded and returned as is.
    2. ``pkl/Node2Vec_embedding_new.csv`` -- read into a column -> list dict,
       which is then written to the pickle cache.
    3. Otherwise node2vec (16 dimensions, walks of length 30, 200 walks per
       node, 1 worker) is trained on ``graph`` and the result is pickled.

    :param graph: graph passed to ``Node2Vec``; only used when no cache file
        exists.
    :return: dict of embeddings (node -> vector when freshly trained).
    """
    pickle_path = 'pkl/Node2Vec_embedding_new.pickle'
    csv_path = 'pkl/Node2Vec_embedding_new.csv'

    if os.path.exists(pickle_path):
        # NOTE(review): unpickling executes arbitrary code; this cache file
        # must only ever come from a trusted source.
        with open(pickle_path, 'rb') as handle:
            return pickle.load(handle)

    if os.path.exists(csv_path):
        dict_embeddings = pd.read_csv(csv_path).to_dict(orient='list')
    else:
        node2vec = Node2Vec(graph, dimensions=16, walk_length=30,
                            num_walks=200, workers=1)
        model = node2vec.fit()
        # The fitted model keys nodes by their string form, so convert the
        # label for the lookup while keeping the original label as dict key.
        dict_embeddings = {
            node: np.asarray(model.wv.get_vector(str(node)))
            for node in graph.nodes()
        }

    # Make sure the cache directory exists so a long training run is not
    # lost to a missing folder.
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
    with open(pickle_path, 'wb') as handle:
        pickle.dump(dict_embeddings, handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
    return dict_embeddings
def test_precompute_probs(self):
    """Test the pre-compute_probs function.

    The probabilities precomputed by the reference ``Node2Vec`` on the
    weighted test network must match, vertex by vertex, what
    ``Node2VecModel`` stores on its graph: neighbours, ``first_travel_key``
    and every entry of ``probabilities``.
    """
    reference_graph = nx.read_weighted_edgelist(path=WEIGHTED_NETWORK_PATH,
                                                nodetype=int)
    reference = Node2Vec(reference_graph)
    expected_by_node = reference._precompute_probabilities()

    candidate_graph = get_test_network(WEIGHTED_NETWORK_PATH)
    walk_parameters = WalkerParameters(number_paths=5, max_path_length=10)
    embedding_parameters = Word2VecParameters()
    candidate = Node2VecModel(candidate_graph, walk_parameters,
                              embedding_parameters)

    for node_id, expected in expected_by_node.items():
        # The candidate graph names its vertices with strings.
        actual = candidate.graph.vs.find(name=str(node_id))

        actual_neighbors = [int(nbr['name']) for nbr in actual.neighbors()]
        self.assertListEqual(sorted(expected['neighbors']),
                             sorted(actual_neighbors))

        self.assertListEqual(list(expected['first_travel_key']),
                             list(actual['first_travel_key']))

        for source, expected_probs in expected['probabilities'].items():
            self.assertListEqual(
                list(expected_probs),
                list(actual['probabilities'][str(source)]))
def node_embeddings(G, f, dim=20, walk_length=16, num_walks=100, workers=2):
    """Add the node2vec embeddings of the nodes to the dataframe ``f``.

    Reference: Grover, A., & Leskovec, J. (2016, August). node2vec: Scalable
    feature learning for networks. In Proceedings of the 22nd ACM SIGKDD
    international conference on Knowledge discovery and data mining
    (pp. 855-864). ACM.

    :param G: a networkx graph.
    :param f: a pandas dataframe with a ``name`` column listing exactly the
        nodes of ``G``.
    :param dim: the dimension of the embedding.
    :param walk_length: length of each random walk.
    :param num_walks: number of walks per node.
    :param workers: number of parallel workers used by node2vec.
    :return: ``f`` merged on ``name`` with ``dim`` extra columns named
        ``node_embeddings_0`` .. ``node_embeddings_<dim-1>``.
    :raises ValueError: if the names in ``f`` and the nodes of ``G`` differ.
    """
    if set(f.name) != set(G.nodes()) or len(f.name) != len(G.nodes()):
        raise ValueError(
            'The number of nodes and the length of the datadrame should be the same.'
        )

    from node2vec import Node2Vec

    node2vec = Node2Vec(G, dimensions=dim, walk_length=walk_length,
                        num_walks=num_walks, workers=workers)
    model = node2vec.fit(window=10, min_count=1)

    embedding_columns = ['node_embeddings_' + str(i) for i in range(dim)]
    # Vectors live on model.wv (indexing the Word2Vec model directly is
    # deprecated/removed in gensim) and are keyed by the node's string form.
    # Building all rows at once avoids a full-frame boolean mask per node.
    rows = [[name] + list(model.wv[str(name)]) for name in f['name']]
    embeddings_df = pd.DataFrame(rows, columns=['name'] + embedding_columns)

    return pd.merge(f, embeddings_df, on='name')
def main():
    """Run the ``workflow_one`` Ros workflow and embed the resulting graph.

    The workflow is executed against the service at ``localhost:5002`` for
    the disease "type 2 diabetes mellitus".  The JSON result and every node
    (with its data) of the derived networkx graph are printed, then a
    node2vec model is fitted on the graph.  Nothing is returned.
    """
    # General setup.
    ros = Client(url="http://localhost:5002")
    response = ros.run(
        workflow='workflows/workflow_one.ros',
        args={"disease_name": "type 2 diabetes mellitus"},
        library_path=['workflows'],
    )
    print(json.dumps(response.result, indent=2))

    graph = response.to_nx()
    for node_entry in graph.nodes(data=True):
        print(node_entry)

    walker = Node2Vec(
        graph,
        dimensions=128,
        walk_length=80,
        num_walks=10,
        p=1,
        q=1,
        weight_key='weight',
        workers=1,
        sampling_strategy=None,
        quiet=False,
    )
    walker.fit()
def train_embeddings(edgelist_path, embedding_path):
    """Train node2vec on a weighted edge list and save the vectors.

    :param edgelist_path: weighted edge list readable by
        ``nx.read_weighted_edgelist``.
    :param embedding_path: destination of the word2vec-format embeddings.
    :raises AssertionError: if the edge ``0000013714`` -- ``0005064295`` does
        not carry weight 3.2 (a sanity check tied to one specific dataset).
    """
    graph = nx.read_weighted_edgelist(edgelist_path)
    logger.info('Graph created!')

    probe_edge = graph.get_edge_data('0000013714', '0005064295')
    assert probe_edge['weight'] == 3.2, 'Expected edge weight of 3.2'

    # Precompute probabilities and generate walks.
    walker = Node2Vec(graph, dimensions=128, walk_length=30, num_walks=10,
                      workers=10, temp_folder=DATA_PATH)
    logger.info('Computed probabilities and generated walks')

    # Drop our reference: the graph is no longer needed once the walks exist.
    graph = None

    embedding_model = walker.fit(window=5, min_count=1, batch_words=128)
    logger.info('Nodes embedded')

    embedding_model.wv.save_word2vec_format(embedding_path)
    logger.info('Embedding saved')
def train(filename):
    """Fit node2vec on the graph built from ``filename`` and persist it.

    The graph comes from ``make_graph``.  After fitting (64 dimensions,
    walks of length 30, 200 walks per node, 4 workers) the vectors are saved
    to ``embeddings2.bin`` and the full model to ``ex_model``.
    """
    graph = make_graph(filename)
    print('number of nodes: ', nx.number_of_nodes(graph))

    # Precompute probabilities and generate walks.
    walker = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200,
                      workers=4)

    # Any keyword accepted by gensim.Word2Vec can be given to fit();
    # `dimensions` and `workers` are taken from the Node2Vec constructor.
    model = walker.fit(window=10, min_count=1, batch_words=4)
    vectors = model.wv

    # Similarity query for node '2' (output node names are always strings).
    # The result is discarded; the call only succeeds if '2' is in the model.
    vectors.most_similar('2')

    # Keep both the bare vectors and the trainable model for later use.
    vectors.save('embeddings2.bin')
    model.save('ex_model')
def adjacency_matrix_to_train_set(g: nx.Graph, depth: int = 3) -> pd.DataFrame:
    """
    Transforms adjacency matrix of a graph into a training set for ML model

    For each power ``A**(i+1)`` of the adjacency matrix (``i`` in
    ``range(depth)``) every entry is recorded, damped by ``1 / 10**i``, and
    the damped values are summed per ``(x, y)`` index pair.

    :param g: input graph
    :param depth: max length of paths considered when generating training set
    :return dataframe with columns ``x``, ``y``, ``val`` plus the node2vec
        vectors ``emb_x`` and ``emb_y`` of both endpoints
    """
    alpha = 10
    model = Node2Vec(g).fit()

    adjacency = nx.adjacency_matrix(g).todense()
    path_counts = adjacency
    entries = []
    for hop in range(depth):
        damping = 1 / alpha**hop
        entries.extend(
            (row, col, count * damping)
            for (row, col), count in np.ndenumerate(path_counts)
        )
        path_counts = path_counts @ adjacency

    frame = pd.DataFrame(np.array(entries), columns=['x', 'y', 'val'])
    grouped = frame.groupby(['x', 'y'], as_index=False).sum()

    def lookup(index_value):
        # Matrix indices are used as node names -- assumes the nodes of g
        # are labelled 0..n-1 in adjacency order; TODO confirm.
        return model.wv[str(int(index_value))]

    grouped['emb_x'] = grouped['x'].apply(lookup)
    grouped['emb_y'] = grouped['y'].apply(lookup)
    return grouped
def train(input_file, output_file):
    """Embed the largest connected component of a weighted edge list.

    Each line of ``input_file`` must hold ``<node> <node> <weight>``
    separated by single spaces.  node2vec (256 dimensions, walks of length
    100, 20 walks per node, 8 workers, p=0.25, q=1) is trained on the largest
    connected component and the vectors are written to ``output_file`` in
    word2vec format.

    The paths are taken from the function arguments rather than from
    ``sys.argv``, so the function behaves the same from the command line and
    when called as a library function.

    :param input_file: path of the edge list to read.
    :param output_file: path of the embedding file to write.
    :raises SystemExit: if either path is missing/empty.
    """
    if not input_file or not output_file:
        print('insufficient arguments')
        sys.exit()
    print('input file:', input_file, '\noutput file:', output_file)

    # Create a graph
    print('reading edges...')
    g = nx.Graph()
    with open(input_file, 'r') as f:
        for line in f:
            fields = line.split(' ')
            g.add_edge(fields[0], fields[1], weight=float(fields[2]))
            # Stop after a line containing a standalone 'str' token --
            # purpose not evident from this function; kept as is.
            if 'str' in fields:
                break

    sub = g.subgraph(max(nx.connected_components(g), key=len))
    print('number of nodes: ', nx.number_of_nodes(sub))
    print('number of edges: ', nx.number_of_edges(sub))

    # Precompute probabilities and generate walks
    node2vec = Node2Vec(sub, dimensions=256, walk_length=100, num_walks=20,
                        workers=8, p=0.25, q=1)

    # Any keyword accepted by gensim.Word2Vec can be passed; `dimensions` and
    # `workers` are taken from the Node2Vec constructor.
    model = node2vec.fit(window=15, min_count=1, batch_words=4)
    model.wv.save_word2vec_format(output_file)
def emb_node2vec(g, s, dimension=32, walk_length=15, num_walks=100,
                 window=10, save=False):
    """
    Compute the node embedding using Node2Vec

    :param g: a graph
    :param s: protected attribute (vector)
    :param dimension: dimension of the embedding
    :param walk_length: length of the random walk
    :param num_walks: number of walks
    :param window: window
    :param save: if true save the node2vec model (to ``node2vec_model``)
    :return: the embedding matrix, the protected attribute reordered to match
        the rows of that matrix, and the fitted model
    """
    walker = Node2Vec(g, dimensions=dimension, walk_length=walk_length,
                      num_walks=num_walks)
    model = walker.fit(window=window, min_count=1)

    keyed_vectors = model.wv
    # Row i of the matrix belongs to node index_to_key[i]; node names are
    # converted to int so they can index into s.
    row_nodes = [int(key) for key in keyed_vectors.index_to_key]
    emb_x = keyed_vectors.vectors
    new_s = s[row_nodes]

    if save:
        model.save('node2vec_model')
    return emb_x, new_s, model
def main(args):
    """Learn label embeddings and hand them to ``Compare``.

    :param args: namespace with ``filepath`` (input for ``get_x_y_v_e``),
        ``embedding`` (``'deepwalk'`` or ``'node2vec'``) and ``labelfile``.
    :raises NotImplementedError: for any other ``args.embedding`` value.
    """
    # Only the last element returned by get_x_y_v_e is used here.
    _, _, _, _, _, _, E_tr = get_x_y_v_e(args.filepath)

    print('Learning embeddings with {}...'.format(args.embedding))
    if args.embedding == 'deepwalk':
        # DeepWalk default values: number_walks=10, representation_size=64,
        # seed=0, walk_length=40, window_size=5, workers=1
        label_emb = DeepWalk().transform(E_tr, 'edgedict')
    elif args.embedding == 'node2vec':
        # Node2Vec default values: num_walks=10, dimensions=64,
        # walk_length=40, window_size=5, workers=1, p=1, q=1,
        # weighted=False, directed=False, iter=1
        label_emb = Node2Vec().transform(E_tr, 'edgedict')
    else:
        # `NotImplemented` is a comparison sentinel, not an exception;
        # raising it produces a confusing TypeError.
        raise NotImplementedError(
            'unsupported embedding: {!r}'.format(args.embedding))

    label_emb_wv = label_emb.wv
    print('Calling compare...')
    compare = Compare(args.labelfile, label_emb_wv)
    compare.invoke()

    # NOTE(review): leftover interactive breakpoint -- blocks unattended
    # runs; remove once the comparison output no longer needs inspecting.
    import pdb
    pdb.set_trace()
    print('end')
def get_node2vec_embeddings(features: List[UserFeatures], hyper_params: dict):
    """Compute node2vec embeddings for the relation graph of ``features``.

    An edge list is written to a temporary file by ``create_edge_list_file``,
    turned into an undirected graph, embedded with node2vec and round-tripped
    through a word2vec text file so the vectors can be loaded as one numeric
    array.  Both temporary files are removed even when a step fails.

    :param features: user features the edge list is built from.
    :param hyper_params: extra keyword arguments for ``Node2Vec``.
    :return: array with the first ``len(features)`` rows (after sorting by
        the numeric node id in column 0), without the id column.
    """
    input_edge_list_file = '{}/node2vec_relations_edge_list_{}.txt'.format(
        TEMP_DIR, get_random_id())
    output_embeddings_file = '{}/node2vec_relations_node2vec_embeddings_{}.txt'.format(
        TEMP_DIR, get_random_id())
    os.makedirs(TEMP_DIR, exist_ok=True)

    try:
        create_edge_list_file(input_edge_list_file, features)

        graph = nx.Graph()
        with open(input_edge_list_file, 'r') as file_handler:
            for line in file_handler:
                # Lines read from a file keep their newline and are therefore
                # always truthy; strip before deciding a line is blank.
                if not line.strip():
                    continue
                node1, node2 = line.split()
                graph.add_edge(node1, node2)

        node2vec = Node2Vec(graph, workers=4, temp_folder=TEMP_DIR,
                            **hyper_params)
        model = node2vec.fit(window=5, min_count=1, batch_words=4)
        model.wv.save_word2vec_format(output_embeddings_file)

        # skip_header=1 drops the "<count> <dimensions>" header line.
        embeddings = np.genfromtxt(output_embeddings_file, delimiter=' ',
                                   skip_header=1)
    finally:
        for scratch_file in (input_edge_list_file, output_embeddings_file):
            if os.path.exists(scratch_file):
                os.remove(scratch_file)

    embeddings_sorted = embeddings[embeddings[:, 0].argsort()]
    return embeddings_sorted[:len(features), 1:]
def run_node2vec(self):
    """Fit a node2vec model on ``self.graph`` unless one already exists.

    :return: ``True`` when ``self.model`` is available afterwards (reused or
        freshly trained), ``False`` when there is no graph to train on.
    """
    # If we already have a model, skip rerunning.  Identity checks are used
    # so objects with a custom __eq__ cannot confuse the test.
    if self.model is not None:
        print("Reusing existing model")
        return True

    # Verify that we have a graph.
    if self.graph is None:
        print(
            "Must build graph and compute pagerank before inferring parameters"
        )
        return False

    # Precompute probabilities and generate walks.  The upstream example uses
    # 64 dimensions and walk_length 10; a smaller embedding is used here.
    print("Running node2vec...")
    node2vec = Node2Vec(
        self.graph,
        dimensions=16,
        walk_length=10,
        num_walks=200,
        workers=4,
        quiet=True,
    )

    # Compute embeddings -- dimensions and workers are passed on
    # automatically from the Node2Vec constructor.
    self.model = node2vec.fit(window=10, min_count=1, batch_words=4)
    print("Done")
    return True
def run_node2vec_emb(data, G, embedding_model_file_path, enforce_end2end,
                     add_qualified_edges, use_weighted_edges, edges_percent,
                     edges_number, dim, walk_len, num_walks, window,
                     added_edges_percent_of, emb_type, save_path):
    """Train weighted node2vec on ``G`` and save the vectors to ``save_path``.

    Only ``G``, ``dim``, ``walk_len``, ``num_walks``, ``window`` and
    ``save_path`` are read; the remaining parameters are accepted but not
    used by this function.
    """
    walker_options = {
        'weight_key': 'weight',
        'dimensions': dim,
        'walk_length': walk_len,
        'num_walks': num_walks,
        'workers': 4,
    }
    walker = Node2Vec(G, **walker_options)
    embedding_model = walker.fit(window=window, min_count=1, batch_words=4)

    # Persist the vectors in word2vec format.
    embedding_model.wv.save_word2vec_format(save_path)
def main2():
    """Merge the graphs of several workflow runs and embed the result.

    One ``m2m_models_v1`` workflow run is executed per entry of
    ``CSVArgs('test.csv').vals``; each response graph is composed into a
    single ``MultiDiGraph`` on which node2vec is finally fitted.  Nothing is
    returned.
    """
    args_list = CSVArgs('test.csv')
    workflow = 'workflows/m2m_models_v1.ros'
    libpath = ['workflows']

    # Build graph.
    g = nx.MultiDiGraph()
    for args in args_list.vals:
        ros = Client(url="http://localhost:5002")
        response = ros.run(workflow=workflow, args=args, library_path=libpath)
        print(json.dumps(response.result, indent=2))

        response_nx = response.to_nx()
        print(
            f"read {len(response_nx.nodes())} nodes and {len(response_nx.edges())} edges."
        )
        # Reuse the graph converted above instead of converting the
        # response a second time.
        g = nx.compose(g, response_nx)

    # Calculate node embeddings.
    n2v = Node2Vec(g,
                   dimensions=128,
                   walk_length=80,
                   num_walks=10,
                   p=1,
                   q=1,
                   weight_key='weight',
                   workers=1,
                   sampling_strategy=None,
                   quiet=False)
    n2v.fit()
def cluster(self, nodes, k=4):
    """Cluster ``nodes`` by their node2vec vectors and dump the result.

    A sub-graph is obtained from ``self.sub_graph(nodes)``, embedded with
    node2vec (64 dimensions, walks of length 80, ``2 * len(graph)`` walks per
    node) and clustered with k-means.  ``[cluster_id, title]`` pairs, ordered
    by cluster id, are printed line by line into ``result.txt``.

    :param nodes: node identifiers; each must be convertible with ``int`` to
        index ``self.titles``.
    :param k: number of clusters.
    """
    print('ori_node', len(nodes))
    graph = self.sub_graph(nodes)
    print('graph_nodes', len(graph))
    print('graph_edges', graph.size())

    walker = Node2Vec(graph, dimensions=64, walk_length=80,
                      num_walks=len(graph) * 2, workers=4)

    print('fitting')
    fit_started = time.time()
    model = walker.fit(window=5, min_count=1, batch_words=4)
    print('fitting done, time:', time.time() - fit_started)

    vectors = [model.wv[node] for node in nodes]
    assignments = KMeans(n_clusters=k).fit_predict(vectors)

    labelled_titles = [
        [label, self.titles[int(node)]]
        for label, node in zip(assignments, nodes)
    ]
    labelled_titles.sort(key=lambda entry: entry[0])

    with open('result.txt', 'w') as out:
        for entry in labelled_titles:
            print(entry, file=out)
def save_node2vec_emb(G, save_path = f'data/gene_disease/{args.time_stamp}/processed/embedding/node2vec/', EMBEDDING_FILENAME = 'node2vec_emb.txt', log=True):
    """Train node2vec on ``G`` and write the vectors in word2vec format.

    :param G: graph to embed.
    :param save_path: output directory (string prefix, expected to end with
        a path separator); created when missing.  The default is evaluated
        once, at definition time, from the module-level ``args``.
    :param EMBEDDING_FILENAME: file name appended to ``save_path``.
    :param log: when true, the output location and the fit duration are
        written to ``./log/gene_disease/<EMBEDDING_FILENAME>``.
    """
    output_path = save_path + EMBEDDING_FILENAME
    print(f"save node2vec emb to {output_path}")

    # Create the target directory up front instead of probing with a
    # throw-away open() guarded by a bare except.  An empty save_path means
    # the current directory, which needs no creation.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # Precompute probabilities and generate walks.
    # **ON WINDOWS ONLY WORKS WITH workers=1**; use temp_folder for big graphs.
    node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200,
                        workers=4)

    # todo undirected_edges
    fit_started = time.time()
    # Any keyword accepted by gensim.Word2Vec can be passed; `dimensions` and
    # `workers` are taken from the Node2Vec constructor.
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    total = time.time() - fit_started
    print(f'total = {total}')

    # Save embeddings for later use.
    model.wv.save_word2vec_format(output_path)

    if log:
        # Ensure the log directory exists so a finished run is not lost to a
        # missing folder.
        os.makedirs('./log/gene_disease/', exist_ok=True)
        with open(f'./log/gene_disease/{EMBEDDING_FILENAME}', 'w') as log_file:
            log_file.write(f' --{save_path}{EMBEDDING_FILENAME}\n')
            log_file.write(f'total running time {total}')
def get_node_random_walk(x_list, adj_list, node2vec_hidden, walk_length,
                         num_walks, p, q, workers):
    """Generate node2vec random walks per graph, encoded via node features.

    For every adjacency matrix in ``adj_list`` the walks produced by
    ``Node2Vec`` are grouped by their start node.  Each node id in a walk is
    replaced by the column index of the 1.0 entry in that node's row of the
    matching ``x_list`` item, and walks are left-padded with 0.0 up to
    ``walk_length``.

    NOTE(review): ``num_walks`` is accepted but not used -- the walker is
    always created with ``num_walks=2``.  Confirm whether this is intended.

    :param x_list: per-graph feature matrices (numpy arrays, or objects with
        ``.to('cpu').numpy()``); presumably one-hot rows -- TODO confirm.
    :param adj_list: per-graph adjacency matrices (same two kinds of object).
    :param node2vec_hidden: ``dimensions`` argument of ``Node2Vec``.
    :param walk_length: number of nodes in each walk.
    :param p: node2vec return parameter.
    :param q: node2vec in-out parameter.
    :param workers: number of parallel workers.
    :return: list with one array per graph, built from one array of encoded
        walks per node.
    """

    def as_numpy(value):
        # Objects whose type lives in the numpy module are used directly;
        # anything else is moved to the CPU and converted.
        if type(value).__module__ == np.__name__:
            return value
        return value.to('cpu').numpy()

    encoded_per_graph = []
    for graph_idx, adj in enumerate(adj_list):
        if graph_idx % 15 == 0:
            print('node random walk ...', graph_idx, '/', len(adj_list))

        G = nx.Graph(as_numpy(adj))
        walker = Node2Vec(
            graph=G,  # must be a networkx graph; node names come back as strings
            dimensions=node2vec_hidden,  # embedding dimensions (default: 128)
            walk_length=walk_length,  # number of nodes in each walk
            num_walks=2,  # number of walks per node (default: 10)
            p=p,  # likelihood of returning to the previous vertex (local exploration)
            q=q,  # likelihood of moving away from the previous vertex (outward exploration)
            weight_key=None,  # weight attribute key on weighted graphs (default: 'weight')
            workers=workers,  # number of workers for parallel execution (default: 1)
            quiet=True)

        # Key: start node number; value: the walks starting at that node.
        walks_by_start = {}
        for random_walk in walker.walks:
            walks_by_start.setdefault(int(random_walk[0]), []).append(
                random_walk)

        # Column index of each entry equal to 1.0 in the feature matrix.
        hot_index = np.where(as_numpy(x_list[graph_idx]) == 1.0)[1]
        known_nodes = len(hot_index)

        # Translate node ids into feature indices, node by node.
        encoded_per_node = []
        for start_node in range(len(adj)):
            padded_walks = []
            for walk in walks_by_start[start_node]:
                encoded = [
                    float(hot_index[int(node)])
                    for node in walk
                    if int(node) < known_nodes
                ]
                # Left-pad so every walk has walk_length entries.
                padded_walks.append(
                    [0.0] * (walk_length - len(encoded)) + encoded)
            encoded_per_node.append(np.array(padded_walks))
        encoded_per_graph.append(np.array(encoded_per_node))

    return encoded_per_graph
def node_embeddings(G, f, dim=20, walk_length=16, num_walks=100, workers=2):
    """Add the node2vec embeddings of the nodes to the dataframe ``f``.

    Background:
    https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef
    https://github.com/eliorc/Medium/blob/master/Nod2Vec-FIFA17-Example.ipynb
    (works with the ``node2vec`` package).

    :param G: a networkx graph.
    :param f: a pandas dataframe with a ``name`` column listing exactly the
        nodes of ``G``.
    :param dim: the dimension of the embedding.
    :param walk_length: length of each random walk.
    :param num_walks: number of walks per node.
    :param workers: number of parallel workers used by node2vec.
    :return: ``f`` merged on ``name`` with ``dim`` extra columns named
        ``node_embeddings_0`` .. ``node_embeddings_<dim-1>``.
    :raises ValueError: if the names in ``f`` and the nodes of ``G`` differ.
    """
    if set(f.name) != set(G.nodes()) or len(f.name) != len(G.nodes()):
        raise ValueError(
            'Los tamaños del grafo y del dataframe no son inguales')

    from node2vec import Node2Vec

    node2vec = Node2Vec(G, dimensions=dim, walk_length=walk_length,
                        num_walks=num_walks, workers=workers)
    model = node2vec.fit(window=10, min_count=1)

    embedding_columns = ['node_embeddings_' + str(i) for i in range(dim)]
    # Vectors live on model.wv (indexing the Word2Vec model directly is
    # deprecated/removed in gensim) and are keyed by the node's string form.
    # Building all rows at once avoids a full-frame boolean mask per node.
    rows = [[name] + list(model.wv[str(name)]) for name in f['name']]
    embeddings_df = pd.DataFrame(rows, columns=['name'] + embedding_columns)

    return pd.merge(f, embeddings_df, on='name')
def create_node2vec_embeddings(self):
    """Train node2vec on ``self.graph`` and return node -> vector.

    The embedding has ``self.dim`` dimensions and is built from 16 walks of
    length 80 per node using 2 workers.  Nothing is cached on disk.

    :return: dict mapping every node of ``self.graph`` to its vector as a
        numpy array.
    """
    walker = Node2Vec(self.graph, dimensions=self.dim, walk_length=80,
                      num_walks=16, workers=2)
    keyed_vectors = walker.fit().wv

    # The model keys nodes by their string form; the returned dict keeps the
    # original node objects as keys.
    return {
        node: np.asarray(keyed_vectors.get_vector(str(node)))
        for node in self.graph.nodes()
    }
def node2vec(self):
    """Fit node2vec on ``self.nodeGraph`` and store it in ``self.model``.

    Does nothing when ``self.nodeGraph`` equals the empty string.  The time
    spent building the walker and the time spent fitting are printed
    separately.
    """
    if self.nodeGraph == '':
        return

    print_label_walks = 'Time of Node2Vec'
    print_label_fit = 'Time of FIT'

    walks_started = time.time()
    print('1: node2vec Begin')
    # Precompute probabilities and generate walks.
    walker = Node2Vec(
        self.nodeGraph,
        dimensions=16,
        walk_length=30,
        num_walks=200,
        p=self.P,
        q=self.Q,
        workers=4,
    )
    print(print_label_walks, time.time() - walks_started)

    fit_started = time.time()
    # Any keyword accepted by gensim.Word2Vec can be passed; `dimensions`
    # and `workers` are taken from the Node2Vec constructor.
    self.model = walker.fit(window=10, min_count=1, batch_words=4)
    print(print_label_fit, time.time() - fit_started)
def main(argv):
    """Embed the graph of ``inputfile`` (or reuse ``outputfile``) and plot it.

    When ``outputfile`` already exists the vectors and labels are read from
    it.  Otherwise a ``WAN`` graph is built from ``inputfile``, pruned,
    reverse-weighted and embedded with node2vec, and the vectors are saved
    to ``outputfile``.  Either way the vectors are projected to 2-D with
    t-SNE and drawn as a labelled scatter plot.
    """
    inputfile, outputfile = read_args(argv)

    if path.exists(outputfile):
        word_embeddings, word_embeddings_labels = read_word2vec(outputfile)
    else:
        wan = WAN(inputfile)
        # The edge count is printed before and after each pruning step.
        print(wan.graph.number_of_edges())
        wan.prune_edge(min_weight=1)
        print(wan.graph.number_of_edges())
        wan.prune_node(min_freq=2)
        print(wan.graph.number_of_edges())
        wan.reverse_weight()

        walker = Node2Vec(wan.graph, dimensions=100, walk_length=80,
                          num_walks=200, workers=4)
        model = walker.fit(window=10, min_count=1, batch_words=4)

        keyed_vectors = model.wv
        keyed_vectors.save_word2vec_format(outputfile)
        word_embeddings = keyed_vectors.vectors
        word_embeddings_labels = keyed_vectors.index2word

    projected = TSNE(n_components=2).fit_transform(word_embeddings)
    plotter = Visualizer()
    plotter.scatter_plot(projected[:, 0], projected[:, 1],
                         word_embeddings_labels)
def emb_graph_2vec(inputpath, dim):
    """Embed the directed edge list at ``inputpath`` and save ``*.emb``.

    The output name is ``inputpath`` with the suffix
    ``weighted_edglist_filytypeTxt.edgelist`` replaced by ``.emb``.  If that
    file already exists the process exits with status 1 instead of
    overwriting it.

    :param inputpath: path of the edge list file.
    :param dim: embedding dimensionality.
    """
    edgelist_suffix = "weighted_edglist_filytypeTxt.edgelist"

    print("input name will be ", inputpath)
    emb_name = inputpath.replace(edgelist_suffix, "")
    print("emb_name will be ", emb_name)
    savename = inputpath.replace(edgelist_suffix, ".emb")
    print("emb outfile name will be ", savename)

    output_already_there = os.path.exists(savename)
    if output_already_there:
        print("file alread exists in cache, please rename")
        sys.exit(1)

    directed_graph = nx.read_edgelist(inputpath, create_using=nx.DiGraph())

    # Precompute probabilities and generate walks.
    # **ON WINDOWS ONLY WORKS WITH workers=1**
    walker = Node2Vec(directed_graph, dimensions=dim, walk_length=30,
                      num_walks=200, workers=10)

    print("training .... ")
    model = walker.fit(window=10, min_count=1, batch_words=4)
    print("training finished saving result... ")

    print("saving %s file to disk " % savename)
    # Save embeddings for later use.
    model.wv.save_word2vec_format(savename)
    print("done")
def train(self, graph, dimensions=64, walk_length=30, num_walks=200,
          workers=1, window=10, min_count=1, batch_words=4):
    """Train node2vec on ``graph``, save it and publish it on ``Embedding``.

    The walks are handed to ``self.saveWalk`` before fitting.  The fitted
    vectors are written to ``../Result/EMBEDDING_MODEL`` in word2vec format
    and the model is stored in ``Embedding.model``.

    :return: the fitted model.
    """
    model_path = '../Result/EMBEDDING_MODEL'

    # NOTE(review): a previously saved model is loaded here, but training
    # below always runs and replaces it -- this looks like a missing early
    # return.  Left unchanged because the loaded object (KeyedVectors) is not
    # the same type as the value returned after training; confirm with the
    # callers before short-circuiting.
    if os.path.exists(model_path):
        Embedding.model = KeyedVectors.load_word2vec_format(model_path)

    walker = Node2Vec(graph, dimensions=dimensions, walk_length=walk_length,
                      num_walks=num_walks, workers=workers)
    self.saveWalk(walker)

    fitted = walker.fit(window=window, min_count=min_count,
                        batch_words=batch_words)
    fitted.wv.save_word2vec_format(model_path)
    Embedding.model = fitted
    return fitted
def create_node2vec_embeddings(self):
    """Return node embeddings for ``self.graph``, cached in ``self.data_name``.

    Lookup order inside the ``self.data_name`` directory:

    1. ``Node2Vec_embedding_old.pickle`` -- loaded and returned as is.
    2. ``Node2Vec_embedding_old.csv`` -- read into a column -> list dict,
       which is then written to the pickle cache (the CSV is left intact).
    3. Otherwise node2vec (16 dimensions, walks of length 30, 200 walks per
       node, 1 worker) is trained on ``self.graph`` and the result pickled.

    :return: dict of embeddings (node -> vector when freshly trained).
    """
    path1 = os.path.join(self.data_name, 'Node2Vec_embedding_old.pickle')
    path2 = os.path.join(self.data_name, 'Node2Vec_embedding_old.csv')

    if os.path.exists(path1):
        # NOTE(review): unpickling executes arbitrary code; this cache file
        # must only ever come from a trusted source.
        with open(path1, 'rb') as handle:
            return pickle.load(handle)

    if os.path.exists(path2):
        dict_embeddings = pd.read_csv(path2).to_dict(orient='list')
    else:
        node2vec = Node2Vec(self.graph, dimensions=16, walk_length=30,
                            num_walks=200, workers=1)
        model = node2vec.fit()
        # The fitted model keys nodes by their string form, so convert the
        # label for the lookup while keeping the original label as dict key.
        dict_embeddings = {
            node: np.asarray(model.wv.get_vector(str(node)))
            for node in self.graph.nodes()
        }

    # Always write the cache to the pickle path; writing it to path2 would
    # overwrite the CSV source with binary data.
    with open(path1, 'wb') as handle:
        pickle.dump(dict_embeddings, handle, protocol=3)
    return dict_embeddings
def __init__(self, is_directed, preprocessing, is_weighted, p, q,
             walk_length, num_walks, dimensions, window_size, workers,
             iterations, config, sparql, dataset, entities, default_graph,
             entity_class, feedback_file):
    """Initialise the base ``Node2Vec`` and store the data-source settings.

    The first eleven arguments are forwarded, in order, to
    ``Node2Vec.__init__``.  The remaining ones are kept as attributes
    (``config`` under the name ``config_file``), after which
    ``self._define_properties()`` is called.
    """
    Node2Vec.__init__(self, is_directed, preprocessing, is_weighted, p, q,
                      walk_length, num_walks, dimensions, window_size,
                      workers, iterations)

    own_settings = (
        ('config_file', config),
        ('sparql', sparql),
        ('default_graph', default_graph),
        ('dataset', dataset),
        ('entities', entities),
        ('entity_class', entity_class),
        ('feedback_file', feedback_file),
    )
    for attribute, value in own_settings:
        setattr(self, attribute, value)

    self._define_properties()