Example #1
0
def test_jaccard_multi_column(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)

    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(M["0"])
    cu_M["dst_0"] = cudf.Series(M["1"])
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000
    G1 = cugraph.Graph()
    G1.from_cudf_edgelist(cu_M,
                          source=["src_0", "src_1"],
                          destination=["dst_0", "dst_1"])

    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]]
    vertex_pair = vertex_pair[:5]

    df_res = cugraph.jaccard(G1, vertex_pair)

    G2 = cugraph.Graph()
    G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0")
    df_exp = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]])

    # Calculating mismatch
    assert df_res["jaccard_coeff"].equals(df_exp["jaccard_coeff"])
Example #2
0
def test_jaccard_multi_column(read_csv):

    M, _ = read_csv

    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(M["0"])
    cu_M["dst_0"] = cudf.Series(M["1"])
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000
    G1 = cugraph.Graph()
    G1.from_cudf_edgelist(cu_M,
                          source=["src_0", "src_1"],
                          destination=["dst_0", "dst_1"])

    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]]
    vertex_pair = vertex_pair[:5]

    df_res = cugraph.jaccard(G1, vertex_pair)

    G2 = cugraph.Graph()
    G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0")
    df_exp = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]])

    # Calculating mismatch
    actual = df_res.sort_values("0_source").reset_index()
    expected = df_exp.sort_values("source").reset_index()
    assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"])
Example #3
0
def test_jaccard_two_hop_edge_vals(managed, pool, graph_file):
    gc.collect()

    rmm.reinitialize(managed_memory=managed,
                     pool_allocator=pool,
                     initial_pool_size=2 << 27)

    assert (rmm.is_initialized())

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    Gnx = nx.from_pandas_edgelist(M,
                                  source='0',
                                  target='1',
                                  edge_attr='weight',
                                  create_using=nx.Graph())
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')
    pairs = G.get_two_hop_neighbors()
    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs['first'][i], pairs['second'][i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=['source', 'destination'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df['jaccard_coeff'][i])
        assert diff < 1.0e-6
Example #4
0
 def predict(self, first, second):
     return (
         cugraph.jaccard(
             self._G, first.astype("int32"), second.astype("int32")
         )
         > self._threshold
     )
Example #5
0
def test_jaccard_two_hop_edge_vals(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    Gnx = nx.from_pandas_edgelist(M,
                                  source="0",
                                  target="1",
                                  edge_attr="weight",
                                  create_using=nx.Graph())
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")

    pairs = (G.get_two_hop_neighbors().sort_values(["first", "second"
                                                    ]).reset_index(drop=True))

    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs["first"].iloc[i], pairs["second"].iloc[i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=["source", "destination"]).reset_index(drop=True)
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i])
        assert diff < 1.0e-6
Example #6
0
def test_jaccard_two_hop(managed, pool, graph_file):
    gc.collect()

    rmm.finalize()
    rmm_cfg.use_managed_memory = managed
    rmm_cfg.use_pool_allocator = pool
    rmm.initialize()

    assert (rmm.is_initialized())

    M = read_mtx_file(graph_file)
    M = M.tocsr()
    Gnx = nx.DiGraph(M).to_undirected()
    G = cugraph.Graph()
    row_offsets = cudf.Series(M.indptr)
    col_indices = cudf.Series(M.indices)
    G.add_adj_list(row_offsets, col_indices, None)
    pairs = G.get_two_hop_neighbors()
    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs['first'][i], pairs['second'][i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs['first'], pairs['second'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df['jaccard_coeff'][i])
        assert diff < 1.0e-6
Example #7
0
def test_jaccard_two_hop(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    Gnx = nx.from_pandas_edgelist(M,
                                  source='0',
                                  target='1',
                                  create_using=nx.Graph())
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1')
    pairs = G.get_two_hop_neighbors()
    print(pairs)
    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs['first'].iloc[i], pairs['second'].iloc[i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=['source', 'destination'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df['jaccard_coeff'].iloc[i])
        assert diff < 1.0e-6
Example #8
0
def test_jaccard_two_hop_edge_vals(managed, pool, graph_file):
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27
    )

    assert(rmm.is_initialized())

    M = utils.read_csv_for_nx(graph_file)
    M = M.tocsr()
    Gnx = nx.DiGraph(M).to_undirected()
    G = cugraph.Graph()
    row_offsets = cudf.Series(M.indptr)
    col_indices = cudf.Series(M.indices)
    values = cudf.Series(M.data)
    G.from_cudf_adjlist(row_offsets, col_indices, values)
    pairs = G.get_two_hop_neighbors()
    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs['first'][i], pairs['second'][i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs['first'], pairs['second'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df['jaccard_coeff'][i])
        assert diff < 1.0e-6
Example #9
0
def cugraph_call(cu_M, edgevals=False):
    '''M = M.tocsr()
    if M is None:
        raise TypeError('Could not read the input graph')
    if M.shape[0] != M.shape[1]:
        raise TypeError('Shape is not square')
    '''
    # Device data
    sources = cu_M['0']
    destinations = cu_M['1']
    if edgevals is False:
        values = None
    else:
        values = cu_M['2']

    G = cugraph.Graph()
    G.add_edge_list(sources, destinations, values)

    # cugraph Jaccard Call
    t1 = time.time()
    df = cugraph.jaccard(G)
    t2 = time.time() - t1
    print('Time : ' + str(t2))

    return df['source'].to_array(), df['destination'].to_array(),\
        df['jaccard_coeff'].to_array()
Example #10
0
def cugraph_call(cu_M, edgevals=False):
    G = cugraph.Graph()
    if edgevals is True:
        G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')
    else:
        G.from_cudf_edgelist(cu_M, source='0', destination='1')

    # cugraph Jaccard Call
    t1 = time.time()
    df = cugraph.jaccard(G)
    t2 = time.time() - t1
    print('Time : ' + str(t2))
    print(df)
    return df['source'].to_array(), df['destination'].to_array(),\
        df['jaccard_coeff'].to_array()
Example #11
0
    def train(self, val_edges, val_edges_false):
        validation_set = np.array(
            list(tuple(e) for e in val_edges)
            + list(tuple(e) for e in val_edges_false)
        )
        surface = cugraph.jaccard(
            self._G,
            first=cudf.Series(validation_set[:, 0]).astype("int32"),
            second=cudf.Series(validation_set[:, 1]).astype("int32"),
        )
        actual = np.array([1] * len(val_edges) + [0] * len(val_edges_false))

        def _func(threshold):
            pred = surface.iloc[:, 2] > threshold
            return roc_auc_score(actual, pred)

        self._threshold = max(np.arange(0.0, 1.0, 0.01), key=_func)
        return self
Example #12
0
def compare_jaccard_two_hop(G, Gnx):
    """
    Compute both cugraph and nx jaccard after extracting the two hop neighbors
    from G and compare both results
    """
    pairs = (G.get_two_hop_neighbors().sort_values(["first", "second"
                                                    ]).reset_index(drop=True))

    nx_pairs = list(pairs.to_records(index=False))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=["source", "destination"]).reset_index(drop=True)
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i])
        assert diff < 1.0e-6
Example #13
0
def cugraph_call(cu_M, edgevals=False):
    G = cugraph.Graph()
    if edgevals is True:
        G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")
    else:
        G.from_cudf_edgelist(cu_M, source="0", destination="1")

    # cugraph Jaccard Call
    t1 = time.time()
    df = cugraph.jaccard(G)
    t2 = time.time() - t1
    print("Time : " + str(t2))

    df = df.sort_values(["source", "destination"]).reset_index(drop=True)

    return (
        df["source"].to_array(),
        df["destination"].to_array(),
        df["jaccard_coeff"].to_array(),
    )
Example #14
0
def test_jaccard_two_hop(graph_file):
    M = read_mtx_file(graph_file)
    M = M.tocsr()
    Gnx = nx.DiGraph(M).to_undirected()
    G = cugraph.Graph()
    row_offsets = cudf.Series(M.indptr)
    col_indices = cudf.Series(M.indices)
    G.add_adj_list(row_offsets, col_indices, None)
    pairs = G.get_two_hop_neighbors()
    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs['first'][i], pairs['second'][i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs['first'], pairs['second'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df['jaccard_coeff'][i])
        assert diff < 1.0e-6
Example #15
0
def cugraph_call(cu_M, edgevals=False):
    '''M = M.tocsr()
    if M is None:
        raise TypeError('Could not read the input graph')
    if M.shape[0] != M.shape[1]:
        raise TypeError('Shape is not square')
    '''
    G = cugraph.DiGraph()
    if edgevals is True:
        G.from_cudf_edgelist(cu_M, source='0', target='1', edge_attr='2')
    else:
        G.from_cudf_edgelist(cu_M, source='0', target='1')

    # cugraph Jaccard Call
    t1 = time.time()
    df = cugraph.jaccard(G)
    t2 = time.time() - t1
    print('Time : '+str(t2))

    return df['source'].to_array(), df['destination'].to_array(),\
        df['jaccard_coeff'].to_array()
Example #16
0
def cluster(
    X,
    n_neighbors=30,
    community="louvain",
    metric="euclidean",
    algorithm="brute",
    similarity="jaccard",
    min_size=10,
    distributed=False,
):
    """
    Clusters

    Parameters
    ----------
    X : cudf.DataFrame
        Input cell-by-feature dataframe.
    n_neighbors : int
        Number of neighbors for kNN.
    community: string
        Community detection algorithm to use.
        Deault is 'louvain'.
    metric: string
        Distance metric to use for kNN.
        Currently, only 'euclidean' is supported.
    algorithm: string
        The query algorithm to use.
        Currently, only 'brute' is supported.
    similarity: string
        Similarity metric to use for neighbor edge refinement.
        Default is 'jaccard'.
    min_size: int
        Minimum cluster size.
    distributed: bool
        If True, use a multi-GPU dask cluster for kNN search.
    Returns
    -------
    communities: cudf.DataFrame
        Community labels.
    G: cugraph.Graph
        k-neighbors graph.
    Q: float
        Modularity score for detected communities.
        Q is not returned if community='ecg' is used.
    """

    tic = time.time()
    # Go!

    idx = find_neighbors(X, n_neighbors, metric, algorithm, distributed)

    print(f"Neighbors computed in {time.time() - tic} seconds...")

    subtic = time.time()

    G = kneighbors_graph(idx, n_neighbors, X.shape[0])

    if similarity == "overlap":
        print("Computing overlap similarity...", flush=True)
        G = cugraph.overlap(G)

    else:
        similarity = "jaccard"
        print("Computing Jaccard similarity...", flush=True)
        G = cugraph.jaccard(G)

    print(
        f"{similarity} graph constructed in {time.time() - subtic} seconds...",
        flush=True,
    )

    g = cugraph.symmetrize_df(G, "source", "destination")
    G = cugraph.Graph()
    G.from_cudf_edgelist(g, edge_attr=f"{similarity}_coeff")
    del g

    if community == "louvain":

        print("Running Louvain modularity optimization...", flush=True)

        parts, Q = cugraph.louvain(G, max_iter=1000)

        communities = sort_by_size(
            cp.asarray(parts.sort_values(by="vertex").partition), min_size)

        n_parts = cp.unique(communities).shape[0]

        print(f"grapheno completed in {time.time() - tic} seconds...",
              flush=True)
        print(f"Communities detected: {n_parts}", flush=True)
        print(f"Modularity: {Q}", flush=True)

        return communities, G, Q

    elif community == "leiden":

        print("Running Leiden modularity optimization...", flush=True)

        parts, Q = cugraph.leiden(G, max_iter=1000)

        communities = sort_by_size(
            cp.asarray(parts.sort_values(by="vertex").partition), min_size)

        n_parts = cp.unique(communities).shape[0]

        print(f"grapheno completed in {time.time() - tic} seconds...",
              flush=True)
        print(f"Communities detected: {n_parts}", flush=True)
        print(f"Modularity: {Q}", flush=True)

        return communities, G, Q

    elif community == "ecg":

        print("Running ECG...", flush=True)
        parts = cugraph.ecg(G)
        communities = sort_by_size(
            cp.asarray(parts.sort_values(by="vertex").partition), min_size)

        n_parts = cp.unique(communities).shape[0]

        print(f"grapheno completed in {time.time() - tic} seconds...",
              flush=True)
        print(f"Communities detected: {n_parts}", flush=True)

        return communities, G, None

    # Insert any community/clustering method...
    elif community == "your favorite method":
        pass
def jaccard_baseline_test(G, sources, destinations, labels, num_positive):
    results = cugraph.jaccard(G, first=sources,
                              second=destinations).to_pandas().dropna()
    top_n = results.nlargest(num_positive, "jaccard_coeff")
    print("Jaccard", calculate_f1_score(top_n, sources, destinations, labels))