Ejemplo n.º 1
0
def cugraph_call(G, min_weight, ensemble_size):
    """Run ECG on ``G`` and score the resulting partitioning.

    Parameters
    ----------
    G
        Graph forwarded unchanged to ``cugraph.ecg`` and to
        ``cugraph.analyzeClustering_modularity``.
    min_weight
        Forwarded to ``cugraph.ecg``.
    ensemble_size
        Forwarded to ``cugraph.ecg``.

    Returns
    -------
    tuple
        ``(score, num_parts)``: the value returned by
        ``cugraph.analyzeClustering_modularity`` and the largest partition
        id plus one.
    """
    ecg_result = cugraph.ecg(G, min_weight, ensemble_size)

    # Presumably partition ids are zero-based, so max + 1 is the number of
    # partitions -- TODO confirm against the cuGraph documentation.
    partition_count = ecg_result["partition"].max() + 1

    modularity = cugraph.analyzeClustering_modularity(
        G, partition_count, ecg_result, 'vertex', 'partition'
    )
    return modularity, partition_count
Ejemplo n.º 2
0
def test_ecg_clustering_nx(graph_file, min_weight, ensemble_size):
    """Check that ``cugraph.ecg`` can be called on a NetworkX graph.

    The edge list is read from ``graph_file``, turned into an ``nx.Graph``
    and handed to ``cugraph.ecg`` together with ``min_weight``,
    ``ensemble_size`` and the ``"weight"`` edge attribute name. The result of
    the call is discarded; nothing is asserted about it here.
    """
    gc.collect()

    # Load the edge list and build a NetworkX graph from columns "0" and "1",
    # carrying the "weight" column along as an edge attribute.
    edge_frame = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
    nx_graph = nx.from_pandas_edgelist(
        edge_frame,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.Graph(),
    )

    # Only exercises the call path; the returned value is intentionally unused.
    cugraph.ecg(nx_graph, min_weight, ensemble_size, "weight")
Ejemplo n.º 3
0
def cugraph_call(G, min_weight, ensemble_size):
    """Run ECG on ``G`` and score the vertex-ordered partition labels.

    Parameters
    ----------
    G
        Graph forwarded unchanged to ``cugraph.ecg`` and to
        ``cugraph.analyzeClustering_modularity``.
    min_weight
        Forwarded to ``cugraph.ecg``.
    ensemble_size
        Forwarded to ``cugraph.ecg``.

    Returns
    -------
    tuple
        ``(score, num_parts)``: the value returned by
        ``cugraph.analyzeClustering_modularity`` and the largest partition
        id plus one.
    """
    # Order rows by vertex before extracting the labels that get scored.
    ordered = cugraph.ecg(G, min_weight, ensemble_size).sort_values("vertex")
    labels = ordered["partition"]

    partition_count = labels.max() + 1
    modularity = cugraph.analyzeClustering_modularity(
        G, partition_count, labels
    )
    return modularity, partition_count
Ejemplo n.º 4
0
def cluster(
    X,
    n_neighbors=30,
    community="louvain",
    metric="euclidean",
    algorithm="brute",
    similarity="jaccard",
    min_size=10,
    distributed=False,
):
    """
    Cluster the rows of ``X`` by community detection on a kNN graph.

    The pipeline is: kNN search -> k-neighbors graph -> edge weighting by
    neighborhood similarity -> symmetrization -> community detection ->
    post-processing of the labels with ``sort_by_size``.

    Parameters
    ----------
    X : cudf.DataFrame
        Input cell-by-feature dataframe.
    n_neighbors : int
        Number of neighbors for kNN.
    community: string
        Community detection algorithm to use: 'louvain', 'leiden' or 'ecg'.
        Default is 'louvain'.
    metric: string
        Distance metric to use for kNN.
        Currently, only 'euclidean' is supported.
    algorithm: string
        The query algorithm to use.
        Currently, only 'brute' is supported.
    similarity: string
        Similarity metric to use for neighbor edge refinement.
        'overlap' selects the overlap coefficient; any other value is
        treated as 'jaccard'. Default is 'jaccard'.
    min_size: int
        Minimum cluster size (passed to ``sort_by_size``).
    distributed: bool
        If True, use a multi-GPU dask cluster for kNN search.

    Returns
    -------
    communities:
        Community labels, as returned by ``sort_by_size``.
    G: cugraph.Graph
        Symmetrized, similarity-weighted k-neighbors graph.
    Q: float or None
        Modularity score for detected communities.
        ``None`` when community='ecg' is used.

    Raises
    ------
    ValueError
        If ``community`` is not one of the supported algorithms.
    """

    # Validate before doing any work: an unknown method used to be discovered
    # only after the expensive kNN search and graph build, and the function
    # then fell through and returned None.
    # To add a new community/clustering method, list its name here and add a
    # matching branch in the dispatch below.
    supported = ("louvain", "leiden", "ecg")
    if community not in supported:
        raise ValueError(
            f"Unsupported community detection method {community!r}; "
            f"expected one of {supported}."
        )

    tic = time.time()

    idx = find_neighbors(X, n_neighbors, metric, algorithm, distributed)

    print(f"Neighbors computed in {time.time() - tic} seconds...")

    subtic = time.time()

    G = kneighbors_graph(idx, n_neighbors, X.shape[0])

    if similarity == "overlap":
        print("Computing overlap similarity...", flush=True)
        G = cugraph.overlap(G)

    else:
        # Normalize the name: it is reused below to build the
        # "<similarity>_coeff" edge attribute name.
        similarity = "jaccard"
        print("Computing Jaccard similarity...", flush=True)
        G = cugraph.jaccard(G)

    print(
        f"{similarity} graph constructed in {time.time() - subtic} seconds...",
        flush=True,
    )

    # Rebuild the graph from the symmetrized edge list, weighted by the
    # similarity coefficient computed above.
    g = cugraph.symmetrize_df(G, "source", "destination")
    G = cugraph.Graph()
    G.from_cudf_edgelist(g, edge_attr=f"{similarity}_coeff")
    del g

    # Only the modularity optimizers report a score; ECG leaves Q as None.
    Q = None
    if community == "louvain":
        print("Running Louvain modularity optimization...", flush=True)
        parts, Q = cugraph.louvain(G, max_iter=1000)
    elif community == "leiden":
        print("Running Leiden modularity optimization...", flush=True)
        parts, Q = cugraph.leiden(G, max_iter=1000)
    else:  # community == "ecg", guaranteed by the validation above
        print("Running ECG...", flush=True)
        parts = cugraph.ecg(G)

    # Shared post-processing: order labels by vertex, then hand them to
    # sort_by_size together with the minimum cluster size.
    communities = sort_by_size(
        cp.asarray(parts.sort_values(by="vertex").partition), min_size)

    n_parts = cp.unique(communities).shape[0]

    print(f"grapheno completed in {time.time() - tic} seconds...",
          flush=True)
    print(f"Communities detected: {n_parts}", flush=True)
    if community != "ecg":
        print(f"Modularity: {Q}", flush=True)

    return communities, G, Q