def cugraph_call(G, min_weight, ensemble_size):
    """
    Run ECG on ``G`` and score the resulting partitioning.

    Parameters
    ----------
    G
        Graph object handed to ``cugraph.ecg`` and to
        ``cugraph.analyzeClustering_modularity``.
    min_weight
        Forwarded unchanged to ``cugraph.ecg``.
    ensemble_size
        Forwarded unchanged to ``cugraph.ecg``.

    Returns
    -------
    tuple
        ``(score, num_parts)``: the value returned by
        ``cugraph.analyzeClustering_modularity`` and the partition count.
    """
    partition_df = cugraph.ecg(G, min_weight, ensemble_size)

    # The count is one more than the largest label in the "partition" column
    # (presumably labels are zero-based and contiguous -- TODO confirm).
    partition_count = partition_df["partition"].max() + 1

    modularity = cugraph.analyzeClustering_modularity(
        G, partition_count, partition_df, "vertex", "partition"
    )
    return modularity, partition_count
def test_ecg_clustering_nx(graph_file, min_weight, ensemble_size):
    """
    Check that ``cugraph.ecg`` can be called with a NetworkX graph.

    The edge list is loaded from ``graph_file`` and turned into an
    ``nx.Graph`` carrying a ``"weight"`` edge attribute. The ECG result is
    discarded, so the test passes as long as the call does not raise.
    """
    gc.collect()

    # Load the edge list and build the NetworkX graph from its
    # "0" (source), "1" (target) and "weight" columns.
    edge_table = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
    nx_graph = nx.from_pandas_edgelist(
        edge_table,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.Graph(),
    )

    # Only the call itself is exercised; nothing is asserted on the output.
    cugraph.ecg(nx_graph, min_weight, ensemble_size, "weight")
def cugraph_call(G, min_weight, ensemble_size):
    """
    Run ECG on ``G`` and score the resulting partitioning.

    The ECG output is sorted by its ``"vertex"`` column before the
    ``"partition"`` column alone is handed to
    ``cugraph.analyzeClustering_modularity``.

    Parameters
    ----------
    G
        Graph object handed to ``cugraph.ecg`` and to
        ``cugraph.analyzeClustering_modularity``.
    min_weight
        Forwarded unchanged to ``cugraph.ecg``.
    ensemble_size
        Forwarded unchanged to ``cugraph.ecg``.

    Returns
    -------
    tuple
        ``(score, num_parts)``: the value returned by
        ``cugraph.analyzeClustering_modularity`` and the partition count.
    """
    ecg_result = cugraph.ecg(G, min_weight, ensemble_size)
    by_vertex = ecg_result.sort_values("vertex")
    labels = by_vertex["partition"]

    # The count is one more than the largest partition label
    # (presumably labels are zero-based and contiguous -- TODO confirm).
    partition_count = labels.max() + 1

    modularity = cugraph.analyzeClustering_modularity(
        G, partition_count, labels
    )
    return modularity, partition_count
def cluster(
    X,
    n_neighbors=30,
    community="louvain",
    metric="euclidean",
    algorithm="brute",
    similarity="jaccard",
    min_size=10,
    distributed=False,
):
    """
    Cluster the rows of X by community detection on a k-neighbors graph.

    The pipeline is: kNN search (``find_neighbors``), k-neighbors graph
    construction (``kneighbors_graph``), edge weighting with a similarity
    coefficient, symmetrization, and finally community detection.

    Parameters
    ----------
    X : cudf.DataFrame
        Input cell-by-feature dataframe.
    n_neighbors : int
        Number of neighbors for kNN.
    community: string
        Community detection algorithm to use. One of 'louvain', 'leiden'
        or 'ecg'. Default is 'louvain'.
    metric: string
        Distance metric to use for kNN.
        Currently, only 'euclidean' is supported.
    algorithm: string
        The query algorithm to use.
        Currently, only 'brute' is supported.
    similarity: string
        Similarity metric to use for neighbor edge refinement.
        'overlap' selects the overlap coefficient; any other value falls
        back to 'jaccard'. Default is 'jaccard'.
    min_size: int
        Minimum cluster size (forwarded to ``sort_by_size``).
    distributed: bool
        If True, use a multi-GPU dask cluster for kNN search.

    Returns
    -------
    communities:
        Community labels as returned by ``sort_by_size``.
    G: cugraph.Graph
        k-neighbors graph.
    Q: float or None
        Modularity score for detected communities. ``None`` when
        community='ecg' is used, since ECG does not report a score here.

    Raises
    ------
    ValueError
        If ``community`` is not one of the supported methods.
    """
    # Fail fast on an unknown method: otherwise the whole kNN/graph pipeline
    # would run and the function would silently return None.
    # To add a new community/clustering method, list it here and add a
    # matching branch in the dispatch below.
    supported = ("louvain", "leiden", "ecg")
    if community not in supported:
        raise ValueError(
            f"Unsupported community method {community!r}; "
            f"expected one of {supported}."
        )

    tic = time.time()

    # Nearest-neighbor search.
    idx = find_neighbors(X, n_neighbors, metric, algorithm, distributed)
    print(f"Neighbors computed in {time.time() - tic} seconds...")

    # Build the k-neighbors graph and weight its edges by similarity.
    subtic = time.time()
    G = kneighbors_graph(idx, n_neighbors, X.shape[0])
    if similarity == "overlap":
        print("Computing overlap similarity...", flush=True)
        G = cugraph.overlap(G)
    else:
        # Normalize the name: it is reused below for the log message and
        # for the coefficient column name.
        similarity = "jaccard"
        print("Computing Jaccard similarity...", flush=True)
        G = cugraph.jaccard(G)
    print(
        f"{similarity} graph constructed in {time.time() - subtic} seconds...",
        flush=True,
    )

    # Symmetrize the edge list and rebuild the graph with the similarity
    # coefficient as edge attribute.
    g = cugraph.symmetrize_df(G, "source", "destination")
    G = cugraph.Graph()
    G.from_cudf_edgelist(g, edge_attr=f"{similarity}_coeff")
    del g  # drop the intermediate edge list before community detection

    # Community detection. Louvain and Leiden also yield a modularity score.
    if community == "louvain":
        print("Running Louvain modularity optimization...", flush=True)
        parts, Q = cugraph.louvain(G, max_iter=1000)
    elif community == "leiden":
        print("Running Leiden modularity optimization...", flush=True)
        parts, Q = cugraph.leiden(G, max_iter=1000)
    else:  # "ecg" -- guaranteed by the validation above
        print("Running ECG...", flush=True)
        parts = cugraph.ecg(G)
        Q = None

    # Shared post-processing: order labels by vertex, then let sort_by_size
    # apply the min_size handling.
    communities = sort_by_size(
        cp.asarray(parts.sort_values(by="vertex").partition), min_size
    )
    n_parts = cp.unique(communities).shape[0]

    print(f"grapheno completed in {time.time() - tic} seconds...", flush=True)
    print(f"Communities detected: {n_parts}", flush=True)
    if community != "ecg":
        print(f"Modularity: {Q}", flush=True)

    return communities, G, Q