import pandas as pd
from sklearn.decomposition import PCA

def get_embeddings(G, organizations):
    # Fit graph (NVVV is assumed to be an alias for nodevectors.Node2Vec defined elsewhere)
    g2v = NVVV()
    g2v.fit(G)
    # Reduce the learned embeddings to 3 principal components
    embeddings = g2v.model.wv.vectors
    pca = PCA(n_components=3)
    principalComponents = pca.fit_transform(embeddings)
    d_e = pd.DataFrame(principalComponents)
    d_e["company"] = organizations
    return d_e, g2v
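# A minimal sketch (not from the original source) of how the 3-component frame
# returned by get_embeddings() could be visualized. Assumes matplotlib >= 3.2
# is installed; plot_embeddings is a hypothetical helper, and d_e is expected
# to have integer columns 0, 1, 2 plus "company".
import matplotlib.pyplot as plt

def plot_embeddings(d_e):
    fig = plt.figure()
    ax = fig.add_subplot(projection="3d")
    # one point per organization, labeled with its company name
    ax.scatter(d_e[0], d_e[1], d_e[2])
    for _, r in d_e.iterrows():
        ax.text(r[0], r[1], r[2], r["company"], fontsize=6)
    plt.show()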
def train(self):
    graph = self.__get_csr_graph()
    Logger.info("Training node2vec embeddings...")
    g2v = Node2Vec(n_components=32, walklen=8, epochs=25)
    g2v.fit(graph)
    Logger.info(f"Training done. Saving embeddings to {self.model_path}")
    g2v.save_vectors(self.model_path)
def main(args):
    print(args)
    edgelists = [qf for qf in os.listdir(args.input)
                 if qf.endswith('.edgelist') and not qf.startswith('_')]
    g = None
    exclude = args.exclude or []
    print('loading edgelists...')
    for eg in edgelists:
        if eg in exclude or eg.rsplit('.', 1)[0] in exclude:
            continue
        print('- ' + eg)
        h = nx.read_edgelist(os.path.join(args.input, eg), nodetype=str,
                             create_using=nx.DiGraph(), delimiter=' ')
        for edge in h.edges():
            h[edge[0]][edge[1]]['weight'] = 1
        g = h if g is None else nx.compose(g, h)
    g = g.to_undirected()
    print('Nodes: %d' % nx.number_of_nodes(g))
    print('Edges: %d' % nx.number_of_edges(g))
    print('Start learning at %s' % time.asctime())
    g2v = Node2Vec(
        walklen=args.walk_length,
        epochs=args.num_walks,
        n_components=args.dimensions,
        return_weight=1 / args.p,    # nodevectors expresses node2vec's return parameter p as 1/p
        neighbor_weight=1 / args.q,  # and the in-out parameter q as 1/q
        threads=args.workers,
        w2vparams={
            'window': args.window_size,
            'iter': args.iter,  # gensim < 4.0 name; renamed to 'epochs' in gensim >= 4.0
            'batch_words': 128,
            'min_count': 0,
            'negative': 25,
            'sg': 1
        },
        verbose=True
    )
    g2v.fit(g)
    print('End learning at %s' % time.asctime())
    # Save model to gensim.KeyedVector format
    g2v.save_vectors(args.output)
def learn_embeddings(graph):
    """
    input:
        graph: nx.Graph()
    output:
        model: nodevectors.Node2Vec
    """
    # An earlier version used the `node2vec` package API instead:
    # n2v = Node2Vec(graph, dimensions=30, walk_length=5, num_walks=200, workers=2)
    # model = n2v.fit(window=10, min_count=1)
    # return model
    n2v = Node2Vec()
    n2v.fit(graph)
    return n2v
def nodevec(graph: str, output_dir: str, directed: bool, tag: str, params: dict) -> None:
    # Ensure directories exist
    directory_check(output_dir)
    directory_check(output_dir + "/models")
    directory_check(output_dir + "/embeddings")
    temp_dir = output_dir + "/temp"
    directory_check(temp_dir)

    w2vparams = get_w2vparams(**params)
    node2vec_init = get_n2vparams(w2vparams=w2vparams, **params)

    print("Beginning node2vec script")
    print("File: %s" % graph)
    for key, value in node2vec_init.items():
        print("%s: %s" % (key, value))
    for key, value in w2vparams.items():
        print("%s: %s" % (key, value))

    G = nx.read_gpickle(graph)
    G = uri_to_str(G)
    if not directed:
        G = G.to_undirected()

    n2v_model = Node2Vec(**node2vec_init)
    n2v_model.fit(G)

    embedding_file = generate_out_file("embeddings.pkl", output_dir + "/embeddings/", tag)
    model_file = generate_out_file("model.pkl", output_dir + "/models/", tag)

    # Save embeddings
    n2v_model.model.wv.save_word2vec_format(embedding_file)
    print("Embeddings saved to %s" % embedding_file)

    # Save model
    n2v_model.model.save(model_file)
    print("Model saved to %s" % model_file)

    print("Completed nodevectors.py")
def learn_embeddings(df, cids, show_graph=False):
    """
    input:
        df: pd.DataFrame
        cids: list
        show_graph: bool
    output:
        graph: nx.Graph()
        n2v: node2vec.Node2Vec
    """
    df = df[["cid", "pbdid", "min"]]
    cid_nodes = []
    pbdid = set(df["pbdid"].values)
    for each_id in pbdid:
        cid = df[df["pbdid"] == each_id]["cid"].iloc[0]
        cid_nodes.append(cid)

    graph = nx.Graph()
    print("Building Graph\n", "=" * 32, "\n")
    print("Adding CID to Target PBDIDs...")
    for row in df.values:
        graph.add_edge(row[0], row[1])

    cid_pairs = [[node, cid_] for cid_ in cids for node in cid_nodes]
    print("Generated structurally related pairs")
    for node1, node2 in cid_pairs:
        graph.add_edge(node1, node2)
    print("Added structurally related CIDs")

    if show_graph:
        draw_graph(graph)

    n2v = Node2Vec(graph, dimensions=20, walk_length=5, num_walks=200, workers=2)
    model = n2v.fit(window=10, min_count=1)
    return graph, n2v, model
pickle.dump(n2v, open("./graphs/n2v_sub_small1.pkl", "wb"))

train_comp_expensive = False
if train_comp_expensive:
    graphs_subset = [x for x in graphs if len(related[graphs.index(x)]) > 50]
    for i, graph in tqdm(enumerate(graphs_subset),
                         total=len(graphs_subset), leave=False):
        n2v = learn_embeddings(graph)
        n2v.save(f"./graphs/n2v_sub_huge-{i+1}.pckl")
        # pickle.dump(n2v, open(f"./graphs/n2v_sub_huge-{i+1}.pkl", "wb"))

save_huge_vecs = False
if save_huge_vecs:
    graphs = [Node2Vec.load(f"./graphs/huge_graphs/n2v_sub_huge-{i}.pckl.zip")
              for i in range(1, 6)]
    for i, graph in enumerate(graphs):
        graph.save_vectors(f"./vectors/wheel_mode_graph-{i}.bin")

save_small_vecs = False
if save_small_vecs:
    if not save_huge_vecs:
        i = 5
    small_graphs = pickle.load(open("./graphs/n2v_sub_small1.pkl", "rb"))
    for j, graph in enumerate(small_graphs):
        graph.save_vectors(f"./vectors/wheel_mode_graph-{i+j}.bin")
import argparse
import os
import networkx as nx
from nodevectors import Node2Vec

parser = argparse.ArgumentParser()
# the snippet began mid-call; an --input argument is assumed to match args["input"] below
parser.add_argument('--input', help='The input edgelist', required=True)
parser.add_argument('--output', help='The output folder', required=True)
args = vars(parser.parse_args())

G = nx.read_edgelist(args["input"], delimiter='\t')
embedding_size = 64

# Fit embedding model to graph
g2v = Node2Vec(walklen=5, epochs=10, threads=4,
               n_components=embedding_size, keep_walks=False,
               w2vparams={"window": 3, "negative": 3, "iter": 3,
                          "batch_words": 64, "workers": 2})
# way faster than other node2vec implementations
# Graph edge weights are handled automatically
g2v.fit(G)

# query embeddings for node 42 (read_edgelist loads node IDs as strings)
print(g2v.predict("42"))

g2v.save(os.path.join(args["output"], 'node2vec.pckl'))
# Save model to gensim.KeyedVector format
g2v.save_vectors(os.path.join(args["output"], "wheel_model.bin"))
import networkx as nx
from nodevectors import Node2Vec
import time

start = time.time()  # 20,000 nodes: about 1 minute
G = nx.read_gml("C:/Users/DI_Lab/Desktop/연구실 자료/국보연/전지원/Full_G.gml")
print("read time:", time.time() - start)  # current time - start time = elapsed time

start = time.time()
node2vec_model = Node2Vec(n_components=32, walklen=10)
node2vec_model.fit(G)
print("model training time:", time.time() - start)  # current time - start time = elapsed time

start = time.time()
node2vec_model.save("C:/Users/DI_Lab/Desktop/연구실 자료/국보연/전지원/word2vec_Full.model")
print("model save time:", time.time() - start)  # current time - start time = elapsed time
import networkx as nx
from nodevectors import Node2Vec

# Test Graph
model_name = "structwords"
graph_file = "data/keywords.edgelist"
G = nx.read_weighted_edgelist(graph_file)

# Fit embedding model to graph
g2v = Node2Vec(neighbor_weight=3)
# way faster than other node2vec implementations
# Graph edge weights are handled automatically
g2v.fit(G)

# Save and load whole node2vec model
# Uses a smart pickling method to avoid serialization errors
# Don't put a file extension after the `.save()` filename; `.zip` is added automatically
g2v.save(model_name)
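# Reloading the model saved above; since save() appends ".zip", load() takes
# the full archive name. A sketch: "graph" stands in for whatever node ID
# actually appears in data/keywords.edgelist.
g2v = Node2Vec.load(model_name + ".zip")
print(g2v.predict("graph"))  # embedding vector for the (assumed) node "graph"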
def train(self, data):
    # index data
    self._setup_db()
    copy_input = io.StringIO()
    error_counter = 0
    for i, r in enumerate(tqdm(data.iterrows(), total=len(data.index),
                               desc="Building index")):
        row = []
        key, x, y = get_key_x_y(r[1])
        row.append(key)
        if isnan(x) or isnan(y):
            error_counter += 1
            continue
        row.append("SRID=4326;POINT(" + str(x) + " " + str(y) + ")")
        copy_input.write(",".join(row) + "\n")
    if error_counter > 0:
        print("Warning: could not encode " + str(error_counter) + " instances")
    print("Executing copy to db")
    copy_input.seek(0)
    with self.db.get_connection() as conn:
        with conn.cursor() as cur:
            cur.copy_from(copy_input, self._table_name(), sep=",", null="\"\"")
            print("Creating index")
            idx_query = ("create index " + self._table_name() + "_loc_idx on "
                         + self._table_name() + " using gist(location);")
            cur.execute(idx_query)
    print("Indexing done")
    # NOTE: as written, everything below this return is unreachable
    return None

    rows = []
    data.sort_values("lat", inplace=True)
    apply_sliding_window(data, 1, self.njobs, rows)
    data.sort_values("lon", inplace=True)
    apply_sliding_window(data, 2, self.njobs, rows)
    print("Created " + str(len(rows)) + " edges")
    elist = pd.DataFrame(rows, columns=["src", "dst", "weight"])
    elist.weight = pd.to_numeric(elist.weight)
    # Create name mapping to normalize node IDs
    allnodes = list(set(elist.src.unique()).union(set(elist.dst.unique())))
    # This factors all the unique nodes to unique IDs
    names = np.array(pd.Series(allnodes).astype('category').cat.categories)
    name_dict = dict(zip(names, np.arange(names.shape[0])))
    elist.src = elist.src.map(name_dict).astype(np.uint32)
    elist.dst = elist.dst.map(name_dict).astype(np.uint32)
    elist.sort_values(by='src', inplace=True, ignore_index=True)
    nnodes = names.shape[0]
    G = _edgelist_to_wdw_graph(elist, nnodes, nodenames=names)
    elist = None
    rows = None
    gc.collect()
    # train node2vec
    print("Training node2vec")
    wdw = Node2Vec(threads=self.njobs, walklen=10, n_components=100)
    wdw.fit(G)
    print("Training complete")
    self.wdw = wdw
import networkx as nx
from nodevectors import Node2Vec
import time
from gensim.models import KeyedVectors

g2v = Node2Vec.load(
    'C:/Users/DI_Lab/Desktop/연구실 자료/국보연/전지원/word2vec_20000test.model.zip')

# Save model to gensim.KeyedVector format
g2v.save_vectors("wheel_model.bin")

# load in gensim
print(g2v)
model = KeyedVectors.load_word2vec_format("wheel_model.bin")
print(model)
print(model["cve-2019-1020019"])  # embedding vector for one node
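# Since wheel_model.bin is a standard gensim KeyedVectors file, the usual
# gensim similarity queries also work on it; the CVE ID below is just the
# node queried above, not a new assumption.
print(model.most_similar("cve-2019-1020019", topn=5))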
# nodevectors and csrgraph must be installed: run 'pip install nodevectors csrgraph'
# The nodevectors module works on sparse matrices, so it is much faster to fit.
# GitHub link to this repository: https://github.com/VHRanger/nodevectors
import pandas as pd
import networkx as nx
import numpy as np
from nodevectors import Node2Vec

edges = pd.read_csv('finnet_data/edges.csv')
# from_pandas_dataframe was removed in networkx 2.0; from_pandas_edgelist replaces it
G = nx.from_pandas_edgelist(edges, 'id_1', 'id_2')

g2v = Node2Vec(n_components=100, walklen=4)
g2v.fit(G)

# File size is about 2GB
g2v.save_vectors("n2v.bin")
import pandas as pd
import numpy as np
import networkx as nx
from nodevectors import Node2Vec, ProNE  # import Node2Vec model to encode the graph

# load the graph
G = nx.read_edgelist('data/collaboration_network.edgelist', delimiter=' ', nodetype=int)
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

# Create an embedding of the graph
g2v = Node2Vec(n_components=32, walklen=10)
# g2v = ProNE(step=6, n_components=32)  # other model that we tried

# Fit the model
g2v.fit(G)

# Get the embedding of each node
Embeddings = {}
for u in G.nodes:
    Embeddings[u] = g2v.predict(u)

# transform the embeddings to a pandas DataFrame (one column per node ID) and save
df = pd.DataFrame.from_dict(Embeddings)
df.to_csv('data/graph_embedding.csv', index=False)
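# Sketch of reading the saved embeddings back: with from_dict's default
# orientation each CSV column header is a node ID, so a node's embedding is
# one column. Column names come back as strings after the CSV round-trip;
# node 1 is assumed to exist in the graph.
emb = pd.read_csv('data/graph_embedding.csv')
vec = emb['1'].values  # embedding vector of (assumed) node 1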
import pickle

import pandas as pd
from nodevectors import Node2Vec

model_name = "keywords_deep"
idx_file = "data/word_index.pickle"
keywords_file = "data/mag_cs_keywords.csv"

# Load in relevant data and modules
keywords_full_data = pd.read_csv(keywords_file)
keywords_full_data['normalizedName'] = keywords_full_data['normalizedName'].fillna('nan')
keywords_data = keywords_full_data['normalizedName']
with open(idx_file, 'rb') as f:
    word_to_idx = pickle.load(f)
keyword_embs = Node2Vec.load(model_name + ".zip")

# Process word queries
while True:
    print("Please enter a word to search: ")
    query_word = input()
    query_node_idx = -1
    query_node = None
    while query_node_idx < 0:
        try:
            query_node_idx = word_to_idx[query_word.lower()]
            query_node = keyword_embs.predict(query_node_idx)
        except KeyError:
            # assumed completion (the snippet was truncated here): re-prompt on a missed lookup
            print("Word not found; please enter another word: ")
            query_word = input()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Date    : 2021-01-24
# @Contact : [email protected]
import csrgraph as cg
from joblib import dump
from nodevectors import Node2Vec

G = cg.read_edgelist("data/graph_data.csv", directed=False, sep=',')
node2vec = Node2Vec(threads=6, n_components=100, w2vparams=dict(workers=12))
node2vec.fit(G)
print(node2vec)
dump(node2vec, "data/node2vec.pkl")
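# The dumped model can be restored with joblib and queried directly; a sketch,
# assuming a node with ID "0" exists in data/graph_data.csv (whether IDs are
# strings or ints depends on how csrgraph parsed the edgelist).
from joblib import load
node2vec = load("data/node2vec.pkl")
print(node2vec.predict("0"))  # embedding vector for the (assumed) node "0"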