Example #1
0
 def test_unfactored_edgelist_undirected(self):
     """
     Reading an undirected edgelist should agree with networkx,
     even when the graph is disconnected.
     """
     fname = "./data/unfactored_edgelist.csv"
     G = cg.read_edgelist(fname, directed=False, sep=',')
     nxG = cg.csrgraph(
         nx.read_edgelist(
             fname,
             delimiter=',',
             create_using=nx.Graph(),
         ))
     # Both readers should yield identically-sized arrays and equal total weight
     self.assertEqual(G.src.size, nxG.src.size)
     self.assertEqual(G.dst.size, nxG.dst.size)
     self.assertEqual(G.weights.size, nxG.weights.size)
     self.assertEqual(G.weights.sum(), nxG.weights.sum())
     # Per-source edge counts should share the same distribution statistics
     g_gaps, nx_gaps = np.diff(G.src), np.diff(nxG.src)
     self.assertEqual(int(g_gaps.mean() * 1000), int(nx_gaps.mean() * 1000))
     self.assertEqual(int(g_gaps.std() * 1000), int(nx_gaps.std() * 1000))
     self.assertEqual(g_gaps.min(), nx_gaps.min())
     self.assertEqual(g_gaps.max(), nx_gaps.max())
     for q in (step / 10 for step in range(1, 10)):
         self.assertEqual(np.quantile(g_gaps, q), np.quantile(nx_gaps, q))
def load_graph_csrgraph(edge_path: str, has_weights: bool,
                        **kwargs: Dict) -> cg.csrgraph:
    """Load graph object using CSRgraph.

    Parameters
    -----------------------
    edge_path: str,
        Path from where to load the edgelist.
        File is expected to be in directed fashion and sorted.
        The node IDs will be extracted from the numeric node IDs of the graph.
        The file is expected to be without header and the first column
        is expected to be the sources, while the second is expected to be
        the destinations. The third column, optionally, is expected to
        contain the weights if they are present in the considered graph.
    has_weights: bool,
        Whether the edgelist carries a weight column. Accepted for API
        parity with the other loaders but NOT used by this implementation.
    **kwargs: Dict,
        Additional parameters that are used in other libraries but not this one.

    Returns
    -------------------------
    The loaded graph.
    """
    # NOTE(review): the docstring above says the file is "in directed
    # fashion", yet build_directed_path is called with directed=False —
    # confirm which orientation is actually intended.
    return cg.read_edgelist(
        build_directed_path(edge_path, directed=False),
        sep="\t",
    )
Example #3
0
def get_drugbank_ddi(dir_name="../data/bioNEV/DrugBank_DDI"):
    """
    DrugBank DDI dataset from BioNEV
    """
    # Read the edgelist into a CSR graph, then hand the sparse adjacency
    # matrix to networkx.
    edgelist_path = dir_name + "/DrugBank_DDI.edgelist"
    csr = csrgraph.read_edgelist(edgelist_path, sep=' ')
    return nx.Graph(csr.mat)
Example #4
0
 def test_string_karate(self):
     """Graphs with string node names should round-trip through read_edgelist."""
     N_NODES = 35
     STR_LEN = 10
     fname = "./data/karate_edges.txt"
     edges = pd.read_csv(fname, sep="\t", header=None)
     # One random uppercase string name per node
     new_names = [''.join(random.choice(string.ascii_uppercase)
                          for _ in range(STR_LEN))
                  for _ in range(N_NODES)]
     # Map node ID -> new node name
     name_dict = dict(zip(np.arange(N_NODES), new_names))
     for col in edges.columns:
         edges[col] = edges[col].map(name_dict)
     # Feed the renamed edgelist to read_edgelist via an in-memory buffer
     buf = io.StringIO(edges.to_csv(index=False, header=False))
     G = cg.read_edgelist(buf, sep=',')
     # Re-read the original file and apply the same renaming
     expected = pd.read_csv(fname, sep="\t", header=None)
     for col in expected.columns:
         expected[col] = expected[col].map(name_dict)
     expected.columns = ['src', 'dst']
     # Addressing the graph by __getitem__ with a string node name
     # should return that node's string neighbors
     for i in range(len(expected)):
         src_name = expected.iloc[i].src
         dst_name = expected.iloc[i].dst
         self.assertTrue(dst_name in G[src_name])
     # Only those edges are present
     self.assertTrue(G.mat.todense().sum() == 154)
Example #5
0
 def test_node2vec_factored_names(self):
     """Node2Vec should fit, predict, and survive a save/load round trip."""
     graph = cg.read_edgelist("./tests/unfactored_edgelist.csv", sep=",")
     ndim = 3
     w2v_settings = {
         "window": 3,
         "negative": 3,
         "iter": 3,
         "batch_words": 32,
         "workers": 2,
     }
     model = nodevectors.Node2Vec(walklen=5,
                                  epochs=5,
                                  threads=1,
                                  n_components=ndim,
                                  keep_walks=True,
                                  verbose=False,
                                  w2vparams=w2v_settings)
     model.fit(graph)
     vec = model.predict(9)
     self.assertTrue(len(vec) == ndim)
     # Save, reload, and check predictions are preserved
     fname = 'test_saving'
     try:
         model.save(fname)
         loaded = nodevectors.SKLearnEmbedder.load(fname + '.zip')
         loaded_vec = loaded.predict(9)
         self.assertTrue(len(loaded_vec) == ndim)
         np.testing.assert_array_almost_equal(loaded_vec, vec)
     finally:
         os.remove(fname + '.zip')
Example #6
0
def get_mashup_ppi(dir_name="../data/bioNEV/Mashup_PPI"):
    """
    Mashup PPI dataset from BioNEV

    Returns the graph as a networkx Graph together with its node labels.
    (Docstring previously said "DrugBank DDI" — a copy-paste error.)
    """
    G = csrgraph.read_edgelist(dir_name + "/Mashup_PPI.edgelist", sep=' ')
    labels = read_bionev_labels(dir_name + "/Mashup_PPI_labels.txt")
    # Convert the CSR adjacency matrix into a networkx Graph
    G = nx.Graph(G.mat)
    return G, labels
Example #7
0
def get_n2v_ppi(dir_name="../data/bioNEV/node2vec_PPI"):
    """
    Node2vec PPI dataset from BioNEV
    """
    # Load the edgelist and labels, then wrap the adjacency matrix in networkx
    csr = csrgraph.read_edgelist(dir_name + "/node2vec_PPI.edgelist", sep=' ')
    node_labels = read_bionev_labels(dir_name + "/node2vec_PPI_labels.txt")
    return nx.Graph(csr.mat), node_labels
Example #8
0
 def test_float_weights_reading(self):
     """Float edge weights appended to an edgelist should be read back in (0, 1)."""
     fname = "./data/karate_edges.txt"
     edges = pd.read_csv(fname, sep="\t", header=None)
     # Attach a random float weight column, then re-read via a buffer
     edges['weights'] = np.random.rand(edges.shape[0])
     buf = io.StringIO(edges.to_csv(index=False, header=False))
     G = cg.read_edgelist(buf, sep=',')
     self.assertTrue((G.weights < 1).all())
     self.assertTrue((G.weights > 0).all())
Example #9
0
 def test_n2v_bounds(self):
     """
     Regression test for node2vec random walks going
     out-of-bounds/segfaulting. Walk IDs must stay within the node range.
     """
     G = cg.read_edgelist("./data/wiki_edgelist.txt")
     walks = G.random_walks(return_weight=0.2)
     node_ids = G.nodes()
     self.assertEqual(int(node_ids.max()), walks.max())
     self.assertEqual(int(node_ids.min()), walks.min())
Example #10
0
 def test_int_weights_reading(self):
     """Integer edge weights appended to an edgelist should be read back exactly."""
     WEIGHT_VALUE = 5
     fname = "./data/karate_edges.txt"
     df = pd.read_csv(fname, sep="\t", header=None)
     # Attach a constant integer weight column, then re-read via a buffer
     df['weights'] = np.ones(df.shape[0]) * WEIGHT_VALUE
     data = io.StringIO(df.to_csv(index=False, header=False))
     G = cg.read_edgelist(data, sep=',')
     # Original code asserted this twice verbatim; once is sufficient.
     self.assertTrue((G.weights == WEIGHT_VALUE).all())
Example #11
0
 def test_largenumbererror(self):
     """Very large integer node IDs should load without truncation/overflow."""
     G = cg.read_edgelist("./data/largenumbererror.csv", sep=',')
     self.assertTrue(len(G.nodes()) == 4)
     # These two have no edges
     for node in (44444444444444, 222222222222):
         self.assertTrue(len(G[node]) == 0)
     # These two have one edge
     for node in (333333333333333, 1111111111111):
         self.assertTrue(len(G[node]) == 1)
Example #12
0
 def compute(self, file):
     """Read an edgelist and embed it with a first-order GGVec model.

     Returns the embedding matrix and the graph's node names.
     """
     graph = cg.read_edgelist(f=file, header=0)
     print("File read")
     model = nodevectors.GGVec(order=1)
     print("Model Created")
     # Time just the fit_transform call
     start = time.time()
     embeddings = model.fit_transform(graph)
     elapsed = time.time() - start
     print("Embedding obtained:", elapsed)
     print(embeddings[1])
     return embeddings, graph.names
Example #13
0
 def test_karate(self):
     """Every edge in the karate edgelist appears in the adjacency matrix."""
     fname = "./data/karate_edges.txt"
     G = cg.read_edgelist(fname)
     m = G.mat.todense()
     df = pd.read_csv(fname, sep="\t", header=None)
     df.columns = ['src', 'dst']
     for i in range(len(df)):
         s = df.iloc[i].src
         d = df.iloc[i].dst
         # Node IDs in the file are 1-based; the matrix is 0-indexed.
         # (Original code raised ValueError and then had an unreachable
         # assertEqual after the raise — folded into one assertion.)
         self.assertEqual(m[s - 1, d - 1], 1,
                          f"For src {s}, dst {d}, error {m}")
     # Only those edges are present
     self.assertTrue(m.sum() == 154)
Example #14
0
 def test_node2vec_fit_transform(self):
     """Node2Vec.fit_transform should run end-to-end on a factored edgelist."""
     graph = cg.read_edgelist("./tests/unfactored_edgelist.csv", sep=",")
     ndim = 3
     w2v_settings = {
         "window": 3,
         "negative": 3,
         "iter": 3,
         "batch_words": 32,
         "workers": 2,
     }
     model = nodevectors.Node2Vec(walklen=5,
                                  epochs=5,
                                  threads=1,
                                  n_components=ndim,
                                  keep_walks=True,
                                  verbose=False,
                                  w2vparams=w2v_settings)
     model.fit_transform(graph)
 def __get_csr_graph(self) -> csrgraph.csrgraph:
     """Return the citation graph as an undirected csrgraph, generating
     the citation CSV from the database first if it is missing."""
     graph_path = self.csv_path
     if not os.path.exists(graph_path):
         Logger.info("Couldn't find citation CSV, generating from database...")
         create_citations_csv()
     Logger.info("Initializing CSR graph for embedding training...")
     return csrgraph.read_edgelist(graph_path, directed=False, sep=",")
Example #16
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Date    : 2021-01-24
# @Contact    : [email protected]
import csrgraph as cg
from joblib import dump
from nodevectors import Node2Vec

# Load the edgelist as an undirected graph and fit a node2vec embedding.
G = cg.read_edgelist("data/graph_data.csv", directed=False, sep=',')
model = Node2Vec(threads=6, n_components=100, w2vparams={"workers": 12})
model.fit(G)
print(model)
# Persist the fitted model for later reuse.
dump(model, "data/node2vec.pkl")
Example #17
0
def main():
    """CLI wrapper: read an edge list, fit a Node2Vec embedding, and save
    the embeddings (and optionally the Gensim model) next to the input file.
    """
    parser = argparse.ArgumentParser(
        description='A wrapper for running Node2Vec on Very Large Graphs')
    parser.add_argument(
        '-e',
        '--edgelist',
        help='Name/path to text file containing graph edge list',
        required=True)
    parser.add_argument('-d',
                        '--dim',
                        help='Embedding dimensions',
                        required=True)
    parser.add_argument('-l',
                        '--walklen',
                        help='Random walk length',
                        required=True)
    parser.add_argument('-r',
                        '--walknum',
                        help='Number of walks',
                        required=True)
    parser.add_argument('-t', '--threads', help='# threads to use', default=0)
    parser.add_argument('-p',
                        '--return_weight',
                        help='Return node probability',
                        default=1.)
    parser.add_argument('-q',
                        '--explore_weight',
                        help='Node visit probability',
                        default=1.)
    parser.add_argument('-k',
                        '--window',
                        help='Context window size',
                        required=True)
    # NOTE(review): values supplied on the command line arrive as strings,
    # so "--keep_walks False" yields the truthy string "False"; consider
    # action='store_true' for these two flags.
    parser.add_argument('-w',
                        '--keep_walks',
                        help='Save the random walks',
                        default=False)
    parser.add_argument('-m',
                        '--save_model',
                        help='Save Gensim node2vec model',
                        default=False)
    args = parser.parse_args()

    # print user parameters to console
    print(
        '\n#######################################################################\n'
    )
    print('NODE2VEC Parameters:')
    print('Edge List: {input_file}'.format(
        input_file=args.edgelist.split('/')[-1]))
    print('Embedding Dimensions: {dim}'.format(dim=args.dim))
    print('Random walk Length: {walk_len}'.format(walk_len=args.walklen))
    print('Number of random walks: {walk_num}'.format(walk_num=args.walknum))
    print('Threads: {threads}'.format(threads=args.threads))
    print('Return Weight (p): {p}'.format(p=args.return_weight))
    print('Explore Weight (q): {q}'.format(q=args.explore_weight))
    print('Context Window Size: {window_size}'.format(window_size=args.window))
    print('Save Random Walks with Node2Vec Model: {keep_walks}'.format(
        keep_walks=args.keep_walks))
    print('Save Gensim Node2Vec Model: {save_model}'.format(
        save_model=args.save_model))
    print('Embedding output: {write_loc}'.format(
        write_loc=args.edgelist.split('.')[0] + '_node2vec_Embeddings.emb'))
    print(
        '\n#######################################################################\n'
    )

    print('\n#### STEP 1: Convert Edge List to CSR Graph ####')
    # Space-separated, headerless edge list -> csrgraph
    graph = cg.read_edgelist(args.edgelist, sep=' ', header=None)

    print('\n#### STEP 2: Fit Embedding Model to Graph ####')
    # All numeric CLI values are strings and must be converted explicitly
    g2v = nodevectors.Node2Vec(n_components=int(args.dim),
                               walklen=int(args.walklen),
                               epochs=int(args.walknum),
                               return_weight=float(args.return_weight),
                               neighbor_weight=float(args.explore_weight),
                               threads=int(args.threads),
                               keep_walks=args.keep_walks,
                               verbose=True,
                               w2vparams={
                                   'window': int(args.window),
                                   'iter': 10
                               })
    g2v.fit(graph)

    print('\n#### STEP 3: Save Model Output and Embeddings ####')
    # save embeddings (gensim.KeyedVector format)
    g2v.save_vectors(args.edgelist.split('.')[0] + '_node2vec_Embeddings.emb')

    # NOTE(review): any non-empty string passed to --save_model (including
    # "False") is truthy and will trigger this branch.
    if args.save_model:
        # save node2vec model -- uses a lot of memory and takes a very long time to run on large graphs
        g2v.save(args.edgelist.split('.')[0] + '_node2vec_Model.pckl')