Exemple #1
0
def test_overlap_multi_column(graph_file):
    """Compare overlap on a two-column-vertex graph with the one-column case.

    The edge list read from ``graph_file`` is loaded into a cudf DataFrame
    and a second (source, destination) column pair is derived by adding a
    constant to the first. Overlap coefficients computed on the graph built
    from both column pairs must equal those computed on the graph built
    from the first pair alone, for the same five vertex pairs.

    Parameters
    ----------
    graph_file
        Passed straight to ``utils.read_csv_for_nx``; presumably a path to
        an edge-list CSV (confirm against the fixture that supplies it).
    """
    gc.collect()

    edges = utils.read_csv_for_nx(graph_file)

    # The "_1" columns are the "_0" columns shifted by a fixed offset.
    offset = 1000
    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(edges["0"])
    cu_M["dst_0"] = cudf.Series(edges["1"])
    cu_M["src_1"] = cu_M["src_0"] + offset
    cu_M["dst_1"] = cu_M["dst_0"] + offset

    # Graph whose vertices are identified by two columns each.
    multi_col_graph = cugraph.Graph()
    multi_col_graph.from_cudf_edgelist(
        cu_M,
        source=["src_0", "src_1"],
        destination=["dst_0", "dst_1"],
    )

    # Only the first five edges are used as query pairs.
    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]][:5]
    df_res = cugraph.overlap(multi_col_graph, vertex_pair)

    # Reference graph built from the first column pair only.
    single_col_graph = cugraph.Graph()
    single_col_graph.from_cudf_edgelist(
        cu_M, source="src_0", destination="dst_0"
    )
    df_exp = cugraph.overlap(
        single_col_graph, vertex_pair[["src_0", "dst_0"]]
    )

    # Both runs must yield identical coefficient columns.
    assert df_res["overlap_coeff"].equals(df_exp["overlap_coeff"])
Exemple #2
0
def test_overlap_multi_column(graph_file):
    """Compare overlap on a two-column-vertex graph with the one-column case.

    A second (source, destination) column pair is derived from the first by
    adding a constant. Overlap coefficients from the graph built on both
    column pairs are compared, after sorting each result by its source
    column, with those from the graph built on the first pair alone.

    Parameters
    ----------
    graph_file
        Passed straight to ``utils.read_csv_for_nx``; presumably a path to
        an edge-list CSV (confirm against the fixture that supplies it).
    """
    edges = utils.read_csv_for_nx(graph_file)

    # The "_1" columns are the "_0" columns shifted by a fixed offset.
    offset = 1000
    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(edges["0"])
    cu_M["dst_0"] = cudf.Series(edges["1"])
    cu_M["src_1"] = cu_M["src_0"] + offset
    cu_M["dst_1"] = cu_M["dst_0"] + offset

    # Graph whose vertices are identified by two columns each.
    multi_col_graph = cugraph.Graph()
    multi_col_graph.from_cudf_edgelist(
        cu_M,
        source=["src_0", "src_1"],
        destination=["dst_0", "dst_1"],
    )

    # Only the first five edges are used as query pairs.
    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]][:5]
    df_res = cugraph.overlap(multi_col_graph, vertex_pair)

    # Reference graph built from the first column pair only.
    single_col_graph = cugraph.Graph()
    single_col_graph.from_cudf_edgelist(
        cu_M, source="src_0", destination="dst_0"
    )
    df_exp = cugraph.overlap(
        single_col_graph, vertex_pair[["src_0", "dst_0"]]
    )

    # Sort both results by their source column and reset the index so the
    # coefficient series are compared row-for-row.
    actual = df_res.sort_values("0_source").reset_index()
    expected = df_exp.sort_values("source").reset_index()
    assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"])
Exemple #3
0
def cugraph_call(cu_M, pairs, edgevals=False):
    """Run cugraph overlap on a directed graph and return the coefficients.

    Parameters
    ----------
    cu_M
        Edge list with columns "0" (source) and "1" (destination), plus
        column "2" (edge attribute) when ``edgevals`` is True.
    pairs
        Vertex pairs forwarded unchanged to ``cugraph.overlap``.
    edgevals : bool
        When exactly ``True``, column "2" is attached as the edge attribute.

    Returns
    -------
    The "overlap_coeff" column, after sorting the result by
    ("source", "destination"), converted with ``to_array()``.
    """
    # Device data: only add the edge attribute when explicitly requested.
    edgelist_kwargs = {"source": "0", "destination": "1"}
    if edgevals is True:
        edgelist_kwargs["edge_attr"] = "2"

    graph = cugraph.DiGraph()
    graph.from_cudf_edgelist(cu_M, **edgelist_kwargs)

    # Time only the overlap call itself.
    started = time.time()
    result = cugraph.overlap(graph, pairs)
    elapsed = time.time() - started
    print("Time : " + str(elapsed))

    ordered = result.sort_values(by=["source", "destination"])
    return ordered["overlap_coeff"].to_array()
Exemple #4
0
def cugraph_call(cu_M, pairs, edgevals=False):
    """Compute overlap coefficients for ``pairs`` on a directed graph.

    The graph is built from columns '0' and '1' of ``cu_M``; column '2' is
    used as the edge attribute only when ``edgevals`` is exactly ``True``.
    The elapsed time of the overlap call is printed.

    Returns the 'overlap_coeff' column, ordered by ('source',
    'destination'), converted with ``to_array()``.
    """
    digraph = cugraph.DiGraph()

    # Device data
    weighted = edgevals is True
    if not weighted:
        digraph.from_cudf_edgelist(cu_M, source='0', destination='1')
    else:
        digraph.from_cudf_edgelist(
            cu_M, source='0', destination='1', edge_attr='2'
        )

    # cugraph Overlap Call (timed)
    start = time.time()
    overlap_df = cugraph.overlap(digraph, pairs)
    duration = time.time() - start
    print('Time : ' + str(duration))

    overlap_df = overlap_df.sort_values(by=['source', 'destination'])
    coeffs = overlap_df['overlap_coeff']
    return coeffs.to_array()
Exemple #5
0
def cugraph_call(cu_M, first, second, edgevals=False):
    """Compute overlap coefficients between ``first`` and ``second``.

    A directed graph is built from columns '0' (source) and '1' (target)
    of ``cu_M``; column '2' becomes the edge attribute only when
    ``edgevals`` is exactly ``True``. The elapsed time of the overlap call
    is printed.

    Returns the 'overlap_coeff' column of the result, in the order
    ``cugraph.overlap`` produced it, converted with ``to_array()``.
    """
    # Device data: keyword arguments shared by both graph variants.
    build_args = {'source': '0', 'target': '1'}
    if edgevals is True:
        build_args['edge_attr'] = '2'

    graph = cugraph.DiGraph()
    graph.from_cudf_edgelist(cu_M, **build_args)

    # cugraph Overlap Call (timed)
    begin = time.time()
    overlap_df = cugraph.overlap(graph, first, second)
    took = time.time() - begin
    print('Time : ' + str(took))

    return overlap_df['overlap_coeff'].to_array()
Exemple #6
0
def cugraph_call(cu_M, first, second, edgevals=False):
    """Compute overlap coefficients between ``first`` and ``second``.

    The graph is populated through ``add_edge_list`` from columns '0' and
    '1' of ``cu_M``. Column '2' is supplied as edge values unless
    ``edgevals`` is exactly ``False``, in which case ``None`` is passed.
    The elapsed time of the overlap call is printed.

    Returns the 'overlap_coeff' column of the result converted with
    ``to_array()``.
    """
    # Device data
    src_col = cu_M['0']
    dst_col = cu_M['1']
    edge_values = None if edgevals is False else cu_M['2']

    graph = cugraph.Graph()
    graph.add_edge_list(src_col, dst_col, edge_values)

    # cugraph Overlap Call (timed)
    start = time.time()
    overlap_df = cugraph.overlap(graph, first, second)
    elapsed = time.time() - start
    print('Time : ' + str(elapsed))

    coeffs = overlap_df['overlap_coeff']
    return coeffs.to_array()
def overlap_baseline_test(G, sources, destinations, labels, num_positive):
    """Print the F1 score of overlap used as a ranking baseline.

    Overlap coefficients are computed for the (``sources``,
    ``destinations``) pairs on ``G``, moved to pandas, and rows containing
    missing values are dropped. The ``num_positive`` rows with the largest
    "overlap_coeff" are then scored with ``calculate_f1_score`` and the
    result is printed with the label "Overlap".
    """
    overlap_df = cugraph.overlap(G, first=sources, second=destinations)
    scored = overlap_df.to_pandas().dropna()

    # Keep only the highest-scoring pairs as the predicted positives.
    best = scored.nlargest(num_positive, "overlap_coeff")
    f1 = calculate_f1_score(best, sources, destinations, labels)
    print("Overlap", f1)
Exemple #8
0
def cluster(
    X,
    n_neighbors=30,
    community="louvain",
    metric="euclidean",
    algorithm="brute",
    similarity="jaccard",
    min_size=10,
    distributed=False,
):
    """
    Cluster the rows of X by community detection on a kNN graph.

    The k-nearest-neighbor graph of X is built, its edges are weighted by
    a neighborhood similarity coefficient, the weighted graph is
    symmetrized, and a community detection algorithm is run on it.

    Parameters
    ----------
    X : cudf.DataFrame
        Input cell-by-feature dataframe.
    n_neighbors : int
        Number of neighbors for kNN.
    community: string
        Community detection algorithm to use: 'louvain', 'leiden' or 'ecg'.
        Default is 'louvain'.
    metric: string
        Distance metric to use for kNN.
        Currently, only 'euclidean' is supported.
    algorithm: string
        The query algorithm to use.
        Currently, only 'brute' is supported.
    similarity: string
        Similarity metric to use for neighbor edge refinement.
        'overlap' selects overlap similarity; any other value falls back
        to 'jaccard'. Default is 'jaccard'.
    min_size: int
        Minimum cluster size.
    distributed: bool
        If True, use a multi-GPU dask cluster for kNN search.

    Returns
    -------
    communities: cudf.DataFrame
        Community labels.
    G: cugraph.Graph
        k-neighbors graph.
    Q: float or None
        Modularity score for detected communities.
        Q is None if community='ecg' is used.

    Raises
    ------
    ValueError
        If ``community`` is not one of the supported algorithms. The check
        runs before any computation so an unsupported name fails fast
        instead of silently returning None after the expensive steps.
    """
    # To add another community/clustering method, list its name here and
    # add a matching branch in the dispatch below.
    supported = ("louvain", "leiden", "ecg")
    if community not in supported:
        raise ValueError(
            f"Unsupported community method {community!r}; "
            f"expected one of {supported}."
        )

    tic = time.time()
    # Go!

    idx = find_neighbors(X, n_neighbors, metric, algorithm, distributed)

    print(f"Neighbors computed in {time.time() - tic} seconds...")

    subtic = time.time()

    G = kneighbors_graph(idx, n_neighbors, X.shape[0])

    if similarity == "overlap":
        print("Computing overlap similarity...", flush=True)
        G = cugraph.overlap(G)

    else:
        # Any value other than "overlap" is treated as Jaccard; the name is
        # normalized because it selects the coefficient column below.
        similarity = "jaccard"
        print("Computing Jaccard similarity...", flush=True)
        G = cugraph.jaccard(G)

    print(
        f"{similarity} graph constructed in {time.time() - subtic} seconds...",
        flush=True,
    )

    # Rebuild the graph from the symmetrized similarity edge list, using
    # the similarity coefficient as the edge attribute.
    g = cugraph.symmetrize_df(G, "source", "destination")
    G = cugraph.Graph()
    G.from_cudf_edgelist(g, edge_attr=f"{similarity}_coeff")
    del g

    # Run the selected algorithm; only Louvain and Leiden yield a
    # modularity score.
    if community == "louvain":
        print("Running Louvain modularity optimization...", flush=True)
        parts, Q = cugraph.louvain(G, max_iter=1000)
    elif community == "leiden":
        print("Running Leiden modularity optimization...", flush=True)
        parts, Q = cugraph.leiden(G, max_iter=1000)
    else:  # community == "ecg"
        print("Running ECG...", flush=True)
        parts = cugraph.ecg(G)
        Q = None

    # Post-processing and reporting shared by all algorithms.
    communities = sort_by_size(
        cp.asarray(parts.sort_values(by="vertex").partition), min_size)

    n_parts = cp.unique(communities).shape[0]

    print(f"grapheno completed in {time.time() - tic} seconds...",
          flush=True)
    print(f"Communities detected: {n_parts}", flush=True)
    if community != "ecg":
        print(f"Modularity: {Q}", flush=True)

    return communities, G, Q