def test_unfactored_edgelist_undirected(self):
    """Undirected edgelist reading works, even on disconnected graphs.

    Compares the csrgraph reader against a networkx round trip.
    """
    fname = "./data/unfactored_edgelist.csv"
    G = cg.read_edgelist(fname, directed=False, sep=',')
    nxG = cg.csrgraph(
        nx.read_edgelist(fname, delimiter=',', create_using=nx.Graph())
    )
    # Edge-array sizes and total weight must agree between both readers.
    self.assertEqual(G.src.size, nxG.src.size)
    self.assertEqual(G.dst.size, nxG.dst.size)
    self.assertEqual(G.weights.size, nxG.weights.size)
    self.assertEqual(G.weights.sum(), nxG.weights.sum())
    # The per-node edge counts (diffs of the src index array) should have
    # identical summary statistics, even if node ordering differs.
    diff_cg = np.diff(G.src)
    diff_nx = np.diff(nxG.src)
    self.assertEqual(int(diff_cg.mean() * 1000), int(diff_nx.mean() * 1000))
    self.assertEqual(int(diff_cg.std() * 1000), int(diff_nx.std() * 1000))
    self.assertEqual(diff_cg.min(), diff_nx.min())
    self.assertEqual(diff_cg.max(), diff_nx.max())
    for step in range(1, 10):
        q = step / 10
        self.assertEqual(np.quantile(diff_cg, q), np.quantile(diff_nx, q))
def load_graph_csrgraph(edge_path: str, has_weights: bool, **kwargs: Dict) -> cg.csrgraph:
    """Load a graph object using CSRgraph.

    Parameters
    -----------------------
    edge_path: str,
        Path from where to load the edgelist.
        The file is expected to be headerless and tab-separated, with
        sources in the first column, destinations in the second, and
        (optionally) weights in the third.
    has_weights: bool,
        Accepted for API compatibility with sibling loaders;
        not used by this implementation.
    **kwargs: Dict,
        Additional parameters that are used in other libraries
        but not this one.

    Returns
    -------------------------
    The loaded graph.
    """
    # NOTE(review): despite the directed-sounding helper name, the path is
    # built with directed=False here — confirm against build_directed_path.
    resolved_path = build_directed_path(edge_path, directed=False)
    return cg.read_edgelist(resolved_path, sep="\t")
def get_drugbank_ddi(dir_name="../data/bioNEV/DrugBank_DDI"):
    """DrugBank DDI dataset from BioNEV.

    Reads the edgelist with csrgraph and wraps the adjacency
    matrix into a networkx Graph.
    """
    edgelist_path = dir_name + "/DrugBank_DDI.edgelist"
    graph = csrgraph.read_edgelist(edgelist_path, sep=' ')
    return nx.Graph(graph.mat)
def test_string_karate(self):
    """Reading an edgelist with string node names preserves the edge set."""
    N_NODES = 35
    STR_LEN = 10
    fname = "./data/karate_edges.txt"
    df = pd.read_csv(fname, sep="\t", header=None)
    # Generate a random fixed-length uppercase name for each node ID.
    new_names = [
        ''.join(random.choice(string.ascii_uppercase) for _ in range(STR_LEN))
        for _ in range(N_NODES)
    ]
    # Map node ID -> new node name
    name_dict = dict(zip(np.arange(N_NODES), new_names))
    for c in df.columns:
        df[c] = df[c].map(name_dict)
    # Feed the renamed edgelist to read_edgelist via an in-memory buffer.
    data = io.StringIO(df.to_csv(index=False, header=False))
    G = cg.read_edgelist(data, sep=',')
    # Re-read the original graph and apply the same renaming.
    df2 = pd.read_csv(fname, sep="\t", header=None)
    for c in df2.columns:
        df2[c] = df2[c].map(name_dict)
    df2.columns = ['src', 'dst']
    for idx in range(len(df2)):
        row = df2.iloc[idx]
        # Addressing the graph by __getitem__ with a str node name
        # should return a list of str neighbor names.
        self.assertTrue(row.dst in G[row.src])
    # Only those edges are present.
    dense = G.mat.todense()
    self.assertTrue(dense.sum() == 154)
def test_node2vec_factored_names(self):
    """Node2Vec trains, predicts, and survives a save/load round trip."""
    tt = cg.read_edgelist("./tests/unfactored_edgelist.csv", sep=",")
    ndim = 3
    w2v_settings = {
        "window": 3,
        "negative": 3,
        "iter": 3,
        "batch_words": 32,
        "workers": 2,
    }
    model = nodevectors.Node2Vec(
        walklen=5,
        epochs=5,
        threads=1,
        n_components=ndim,
        keep_walks=True,
        verbose=False,
        w2vparams=w2v_settings,
    )
    model.fit(tt)
    res_v = model.predict(9)
    self.assertTrue(len(res_v) == ndim)
    # Test save/load: the reloaded model must reproduce predictions.
    fname = 'test_saving'
    try:
        model.save(fname)
        reloaded = nodevectors.SKLearnEmbedder.load(fname + '.zip')
        res_l = reloaded.predict(9)
        self.assertTrue(len(res_l) == ndim)
        np.testing.assert_array_almost_equal(res_l, res_v)
    finally:
        os.remove(fname + '.zip')
def get_mashup_ppi(dir_name="../data/bioNEV/Mashup_PPI"):
    """Mashup PPI dataset from BioNEV.

    Returns the graph (as a networkx Graph) together with its node labels.
    """
    edgelist_path = dir_name + "/Mashup_PPI.edgelist"
    graph = csrgraph.read_edgelist(edgelist_path, sep=' ')
    labels = read_bionev_labels(dir_name + "/Mashup_PPI_labels.txt")
    return nx.Graph(graph.mat), labels
def get_n2v_ppi(dir_name="../data/bioNEV/node2vec_PPI"):
    """Node2vec PPI dataset from BioNEV.

    Returns the graph (as a networkx Graph) together with its node labels.
    """
    edgelist_path = dir_name + "/node2vec_PPI.edgelist"
    graph = csrgraph.read_edgelist(edgelist_path, sep=' ')
    labels = read_bionev_labels(dir_name + "/node2vec_PPI_labels.txt")
    return nx.Graph(graph.mat), labels
def test_float_weights_reading(self):
    """Float edge weights in (0, 1) survive a read_edgelist round trip."""
    fname = "./data/karate_edges.txt"
    df = pd.read_csv(fname, sep="\t", header=None)
    # Attach a random float weight to every edge.
    df['weights'] = np.random.rand(df.shape[0])
    buffer = io.StringIO(df.to_csv(index=False, header=False))
    G = cg.read_edgelist(buffer, sep=',')
    # All parsed weights stay strictly inside (0, 1).
    self.assertTrue((G.weights < 1).all())
    self.assertTrue((G.weights > 0).all())
def test_n2v_bounds(self):
    """Regression test for node2vec random walks going out of bounds.

    Walks once produced node IDs outside the graph (segfault /
    out-of-bounds); this should be fixed forever.
    """
    G = cg.read_edgelist("./data/wiki_edgelist.txt")
    walks = G.random_walks(return_weight=0.2)
    nodes = G.nodes()
    # Every walked node ID must lie within the graph's node range.
    self.assertEqual(int(nodes.max()), walks.max())
    self.assertEqual(int(nodes.min()), walks.min())
def test_int_weights_reading(self):
    """Integer edge weights survive a read_edgelist round trip.

    Writes a constant integer weight onto every edge of the karate graph
    and checks that every parsed weight equals that constant.
    """
    WEIGHT_VALUE = 5
    fname = "./data/karate_edges.txt"
    df = pd.read_csv(fname, sep="\t", header=None)
    # Constant integer weight on every edge.
    df['weights'] = np.ones(df.shape[0]) * WEIGHT_VALUE
    data = io.StringIO(df.to_csv(index=False, header=False))
    G = cg.read_edgelist(data, sep=',')
    # Fixed: the original asserted this identical condition twice in a row
    # (copy-paste duplicate); one assertion carries the same coverage.
    self.assertTrue((G.weights == WEIGHT_VALUE).all())
def test_largenumbererror(self):
    """Very large (64-bit) node IDs are read without truncation."""
    fname = "./data/largenumbererror.csv"
    G = cg.read_edgelist(fname, sep=',')
    self.assertTrue(len(G.nodes()) == 4)
    # These two nodes have no edges.
    for isolated in (44444444444444, 222222222222):
        self.assertTrue(len(G[isolated]) == 0)
    # These two nodes have exactly one edge.
    for connected in (333333333333333, 1111111111111):
        self.assertTrue(len(G[connected]) == 1)
def compute(self, file):
    """Read an edgelist, fit a first-order GGVec model, and return embeddings.

    Returns a tuple of (embeddings, node names) and prints timing
    information along the way.
    """
    graph = cg.read_edgelist(f=file, header=0)
    print("File read")
    model = nodevectors.GGVec(order=1)
    print("Model Created")
    start = time.time()
    embeddings = model.fit_transform(graph)
    elapsed = time.time() - start
    print("Embedding obtained:", elapsed)
    print(embeddings[1])
    return embeddings, graph.names
def test_karate(self):
    """Every edge in the karate file appears in the read adjacency matrix."""
    fname = "./data/karate_edges.txt"
    G = cg.read_edgelist(fname)
    dense = G.mat.todense()
    edges = pd.read_csv(fname, sep="\t", header=None)
    edges.columns = ['src', 'dst']
    for idx in range(len(edges)):
        s = edges.iloc[idx].src
        d = edges.iloc[idx].dst
        # File IDs are 1-based; the matrix is 0-based. The explicit raise
        # gives a richer error message than the assertion alone.
        if dense[s - 1, d - 1] != 1:
            raise ValueError(f"For src {s}, dst {d}, error {dense}")
        self.assertEqual(dense[s - 1, d - 1], 1)
    # Only those edges are present.
    self.assertTrue(dense.sum() == 154)
def test_node2vec_fit_transform(self):
    """Node2Vec fit_transform runs end to end on a small edgelist."""
    tt = cg.read_edgelist("./tests/unfactored_edgelist.csv", sep=",")
    ndim = 3
    w2v_settings = {
        "window": 3,
        "negative": 3,
        "iter": 3,
        "batch_words": 32,
        "workers": 2,
    }
    model = nodevectors.Node2Vec(
        walklen=5,
        epochs=5,
        threads=1,
        n_components=ndim,
        keep_walks=True,
        verbose=False,
        w2vparams=w2v_settings,
    )
    model.fit_transform(tt)
def __get_csr_graph(self) -> csrgraph.csrgraph:
    """Return an undirected CSR graph loaded from the citations CSV.

    If the CSV does not exist yet, it is generated from the database first.
    """
    csv_missing = not os.path.exists(self.csv_path)
    if csv_missing:
        Logger.info("Couldn't find citation CSV, generating from database...")
        create_citations_csv()
    Logger.info("Initializing CSR graph for embedding training...")
    return csrgraph.read_edgelist(self.csv_path, directed=False, sep=",")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Date    : 2021-01-24
# @Contact : [email protected]
import csrgraph as cg
from joblib import dump
from nodevectors import Node2Vec

# Load the undirected graph and fit a 100-dimensional node2vec embedding.
graph = cg.read_edgelist("data/graph_data.csv", directed=False, sep=',')
embedder = Node2Vec(threads=6, n_components=100, w2vparams=dict(workers=12))
embedder.fit(graph)
print(embedder)
# Persist the fitted model for downstream use.
dump(embedder, "data/node2vec.pkl")
def main():
    """CLI wrapper: parse arguments, fit Node2Vec on an edge list, save outputs."""
    parser = argparse.ArgumentParser(
        description='A wrapper for running Node2Vec on Very Large Graphs')
    parser.add_argument('-e', '--edgelist',
                        help='Name/path to text file containing graph edge list',
                        required=True)
    parser.add_argument('-d', '--dim', help='Embedding dimensions', required=True)
    parser.add_argument('-l', '--walklen', help='Random walk length', required=True)
    parser.add_argument('-r', '--walknum', help='Number of walks', required=True)
    parser.add_argument('-t', '--threads', help='# threads to use', default=0)
    parser.add_argument('-p', '--return_weight', help='Return node probability',
                        default=1.)
    parser.add_argument('-q', '--explore_weight', help='Node visit probability',
                        default=1.)
    parser.add_argument('-k', '--window', help='Context window size', required=True)
    parser.add_argument('-w', '--keep_walks', help='Save the random walks',
                        default=False)
    parser.add_argument('-m', '--save_model', help='Save Gensim node2vec model',
                        default=False)
    args = parser.parse_args()

    banner = '\n#######################################################################\n'
    output_prefix = args.edgelist.split('.')[0]

    # Echo the run configuration to the console.
    print(banner)
    print('NODE2VEC Parameters:')
    print(f"Edge List: {args.edgelist.split('/')[-1]}")
    print(f'Embedding Dimensions: {args.dim}')
    print(f'Random walk Length: {args.walklen}')
    print(f'Number of random walks: {args.walknum}')
    print(f'Threads: {args.threads}')
    print(f'Return Weight (p): {args.return_weight}')
    print(f'Explore Weight (q): {args.explore_weight}')
    print(f'Context Window Size: {args.window}')
    print(f'Save Random Walks with Node2Vec Model: {args.keep_walks}')
    print(f'Save Gensim Node2Vec Model: {args.save_model}')
    print(f"Embedding output: {output_prefix + '_node2vec_Embeddings.emb'}")
    print(banner)

    print('\n#### STEP 1: Convert Edge List to CSR Graph ####')
    graph = cg.read_edgelist(args.edgelist, sep=' ', header=None)

    print('\n#### STEP 2: Fit Embedding Model to Graph ####')
    g2v = nodevectors.Node2Vec(
        n_components=int(args.dim),
        walklen=int(args.walklen),
        epochs=int(args.walknum),
        return_weight=float(args.return_weight),
        neighbor_weight=float(args.explore_weight),
        threads=int(args.threads),
        keep_walks=args.keep_walks,
        verbose=True,
        w2vparams={'window': int(args.window), 'iter': 10})
    g2v.fit(graph)

    print('\n#### STEP 3: Save Model Output and Embeddings ####')
    # Save embeddings (gensim.KeyedVector format).
    g2v.save_vectors(output_prefix + '_node2vec_Embeddings.emb')
    if args.save_model:
        # Saving the full node2vec model uses a lot of memory and takes a
        # very long time to run on large graphs.
        g2v.save(output_prefix + '_node2vec_Model.pckl')