def cugraph_louvain(G, edgevals=False):
    """Run cuGraph Louvain on ``G`` and print how long the call took.

    Parameters
    ----------
    G
        Graph object handed unchanged to ``cugraph.louvain``.
    edgevals
        Accepted for signature compatibility; this function does not read it.

    Returns
    -------
    tuple
        The ``(parts, mod)`` pair produced by ``cugraph.louvain``.
    """
    # Wall-clock timing around the Louvain call only.
    started_at = time.time()
    parts, mod = cugraph.louvain(G)
    elapsed = time.time() - started_at

    print(f"Cugraph Louvain Time : {elapsed}")
    return parts, mod
def louvain_partition(G, as_dict=False):
    """Cluster the cached ``W`` with Louvain and align labels to ``G.nodes``.

    Parameters
    ----------
    G
        Object exposing ``nodes``; its node order defines the output order.
    as_dict
        When true, return a ``{node: cluster}`` dict instead of the bare
        sequence of cluster labels.

    Returns
    -------
    The labels produced by ``ordered(...)``, or a dict mapping each entry of
    ``G.nodes`` to its label when ``as_dict`` is true.
    """
    weights = timestep_cache().W
    # The second element returned by Louvain (the score) is not used here.
    nodes_cluster, _score = cugraph.louvain(weights)

    # NOTE(review): the result is consumed through ``.keys()``/``.values()``,
    # i.e. as a node -> cluster mapping -- confirm that this matches what the
    # ``cugraph.louvain`` in scope actually returns.
    louvain_nodes = np.array(list(nodes_cluster.keys()))
    louvain_clusters = np.array(list(nodes_cluster.values()))
    graph_nodes = np.array(G.nodes)

    # Louvain does not preserve the node order, so the labels are re-ordered
    # to follow ``G.nodes``.
    aligned = ordered(graph_nodes, louvain_nodes, louvain_clusters)

    if not as_dict:
        return aligned
    return dict(zip(graph_nodes, aligned))
def cugraph_louvain(cu_M, edgevals=False):
    """Build a cuGraph graph from an edge list and run Louvain on it.

    Parameters
    ----------
    cu_M
        Edge list passed to ``Graph.from_cudf_edgelist``; column ``"0"`` is
        used as source and column ``"1"`` as destination.
    edgevals
        When true, column ``"2"`` is additionally used as the edge attribute.

    Returns
    -------
    tuple
        The ``(parts, mod)`` pair produced by ``cugraph.louvain``.
    """
    edgelist_kwargs = {"source": "0", "destination": "1"}
    if edgevals:
        edgelist_kwargs["edge_attr"] = "2"

    graph = cugraph.Graph()
    graph.from_cudf_edgelist(cu_M, **edgelist_kwargs)

    # Only the Louvain call itself is timed, not the graph construction.
    started_at = time.time()
    parts, mod = cugraph.louvain(graph)
    elapsed = time.time() - started_at

    print(f"Cugraph Time : {elapsed}")
    return parts, mod
def cugraph_call(cu_M, edgevals=False):
    """Run cuGraph Louvain on the graph described by edge list ``cu_M``.

    Parameters
    ----------
    cu_M
        Edge list given to ``Graph.from_cudf_edgelist``; columns ``'0'`` and
        ``'1'`` hold the source and destination of each edge.
    edgevals
        If true, column ``'2'`` is supplied as ``edge_attr``.

    Returns
    -------
    tuple
        ``(parts, mod)`` exactly as returned by ``cugraph.louvain``.
    """
    # The edge attribute is only forwarded when the caller asks for it.
    optional_attr = {'edge_attr': '2'} if edgevals else {}

    graph = cugraph.Graph()
    graph.from_cudf_edgelist(cu_M, source='0', destination='1', **optional_attr)

    # Time the Louvain call on its own.
    tic = time.time()
    parts, mod = cugraph.louvain(graph)
    toc = time.time() - tic

    print(f'Cugraph Time : {toc}')
    return parts, mod
def cugraph_louvain_community(graph: CuGraph) -> Tuple[CuDFNodeMap, float]:
    """Label every node of ``graph`` with a Louvain community.

    ``cugraph.louvain`` is run on ``graph.value``. Nodes found in
    ``graph.nodes.index`` but missing from the Louvain result ("orphans") are
    each given a label of their own, appended after the Louvain labels.

    Returns
    -------
    tuple
        ``(CuDFNodeMap(labels), modularity_score)`` where ``labels`` is indexed
        by vertex and ``modularity_score`` is the value reported by Louvain.
    """
    louvain_df, modularity_score = cugraph.louvain(graph.value)
    labels = louvain_df.set_index("vertex")["partition"]

    # Nodes that Louvain did not report at all.
    all_nodes = graph.nodes.index
    is_orphan: cupy.ndarray = ~all_nodes.isin(labels.index)
    n_orphans = is_orphan.astype(int).sum().item()

    # NOTE(review): the first free label is derived from the row index of the
    # Louvain frame, not from its "partition" column -- confirm this is the
    # intended starting point.
    first_free_label = louvain_df.index.max() + 1
    orphan_labels = cudf.Series(
        cupy.arange(first_free_label, first_free_label + n_orphans),
        index=all_nodes[is_orphan],
    )

    # TODO more orphan should worsen the modularity score but do not in this implementation
    combined = cudf.concat([labels, orphan_labels])
    return CuDFNodeMap(combined), modularity_score
def cugraph_call(cu_M, edgevals=False):
    """Run cuGraph Louvain using the ``Graph.add_edge_list`` API.

    Parameters
    ----------
    cu_M
        Container whose columns ``'0'`` and ``'1'`` hold the edge sources and
        destinations, and whose column ``'2'`` holds the edge values.
    edgevals
        If true, column ``'2'`` is passed as edge values; otherwise ``None``
        is passed in their place.

    Returns
    -------
    tuple
        ``(parts, mod)`` exactly as returned by ``cugraph.louvain``.
    """
    # Device data
    src = cu_M['0']
    dst = cu_M['1']
    edge_values = cu_M['2'] if edgevals else None

    graph = cugraph.Graph()
    graph.add_edge_list(src, dst, edge_values)

    # Time the Louvain call on its own.
    tic = time.time()
    parts, mod = cugraph.louvain(graph)
    toc = time.time() - tic

    print(f'Time : {toc}')
    return parts, mod
def apply(model, df, param):
    """Annotate an edge table with the Louvain partition of both endpoints.

    Parameters
    ----------
    model
        Mapping that receives the modularity under ``'louvain_modularity'``.
    df
        DataFrame holding the edges; it must contain the columns listed in
        ``param['feature_variables']``.
    param
        Configuration mapping. ``param['feature_variables']`` names the
        endpoint columns (the graph itself is built from columns ``'src'`` and
        ``'dest'``). ``param['options']['params']['max_iter']`` optionally
        overrides the iteration cap handed to Louvain (default 100).

    Returns
    -------
    DataFrame
        ``df`` with two added columns, ``<first>_partition`` and
        ``<second>_partition``, where ``<first>``/``<second>`` are the first
        two entries of ``param['feature_variables']``.
    """
    src_dest_name = param['feature_variables']
    first_col, second_col = src_dest_name[0], src_dest_name[1]

    gdf = cudf.DataFrame(df[src_dest_name])

    # create graph
    G = cugraph.Graph()
    G.from_cudf_edgelist(gdf, source='src', destination='dest', renumber=True)

    # The option tree is optional: fall back to the default when any level of
    # it is missing instead of failing with a KeyError.
    user_params = param.get('options', {}).get('params', {})
    max_iter = int(user_params['max_iter']) if 'max_iter' in user_params else 100

    # cugraph Louvain Call
    # The cap is passed positionally because the keyword for this second
    # argument differs between cuGraph releases (`max_iter` / `max_level`).
    dfr, mod = cugraph.louvain(G, max_iter)

    # Attach the partition of the first endpoint ...
    dfr = dfr.to_pandas().rename(columns={"vertex": first_col})
    df = df.join(dfr.set_index(first_col), on=first_col)
    df = df.rename(columns={"partition": first_col + "_partition"})

    # ... then reuse the same lookup table for the second endpoint.
    dfr = dfr.rename(columns={first_col: second_col})
    df = df.join(dfr.set_index(second_col), on=second_col)
    df = df.rename(columns={"partition": second_col + "_partition"})

    model['louvain_modularity'] = mod
    return df
def louvain(G):
    """Return the result of ``cugraph.louvain`` for ``G``, unchanged."""
    result = cugraph.louvain(G)
    return result
def louvain(
    adata: AnnData,
    resolution: Optional[float] = None,
    random_state: _utils.AnyRandom = 0,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    key_added: str = 'louvain',
    adjacency: Optional[spmatrix] = None,
    flavor: Literal['vtraag', 'igraph', 'rapids'] = 'vtraag',
    directed: bool = True,
    use_weights: bool = False,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    partition_kwargs: Mapping[str, Any] = MappingProxyType({}),
    neighbors_key: Optional[str] = None,
    obsp: Optional[str] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the implementation
    of [Traag17]_. The Louvain algorithm has been proposed for single-cell
    analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first,
    or explicitly passing an ``adjacency`` matrix.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        For the default flavor (``'vtraag'``) or for ``'rapids'``, you can
        provide a resolution (higher resolution means finding more and smaller
        clusters), which defaults to 1.0.
        See “Time as a resolution parameter” in [Lambiotte09]_.
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain ``(obs_key, list_of_categories)``.
    key_added
        Key under which to add the cluster labels. (default: ``'louvain'``)
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors
        connectivities.
    flavor
        Choose between packages for computing the clustering.
        ``'vtraag'`` is much more powerful, and the default.
    directed
        Interpret the ``adjacency`` matrix as directed graph?
    use_weights
        Use weights from knn graph.
    partition_type
        Type of partition to use.
        Only a valid argument if ``flavor`` is ``'vtraag'``.
    partition_kwargs
        Key word arguments to pass to partitioning,
        if ``vtraag`` method is being used.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, louvain looks .obsp['connectivities'] for
        connectivities (default storage place for pp.neighbors).
        If specified, louvain looks
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    obsp
        Use .obsp[obsp] as adjacency. You can't specify both
        `obsp` and `neighbors_key` at the same time.
    copy
        Copy adata or modify it inplace.

    Returns
    -------
    :obj:`None`
        By default (``copy=False``), updates ``adata`` with the following fields:

        ``adata.obs['louvain']`` (:class:`pandas.Series`, dtype ``category``)
            Array of dim (number of samples) that stores the subgroup id
            (``'0'``, ``'1'``, ...) for each cell.

    :class:`~anndata.AnnData`
        When ``copy=True`` is set, a copy of ``adata`` with those fields is
        returned.

    Raises
    ------
    ValueError
        If ``partition_type`` is given for a flavor other than ``'vtraag'``,
        or if ``flavor`` is not one of the handled values.
    """
    # Work on a private copy so the caller's mapping is never mutated.
    partition_kwargs = dict(partition_kwargs)
    start = logg.info('running Louvain clustering')
    if (flavor != 'vtraag') and (partition_type is not None):
        raise ValueError(
            '`partition_type` is only a valid argument '
            'when `flavor` is "vtraag"'
        )
    adata = adata.copy() if copy else adata
    if adjacency is None:
        adjacency = _choose_graph(adata, obsp, neighbors_key)
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warning(
                '`resolution` parameter has no effect for flavor "igraph"'
            )
        # The igraph flavor is always run on the undirected graph.
        if directed and flavor == 'igraph':
            directed = False
        if not directed:
            logg.debug(' using the undirected graph')
        g = _utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if use_weights:
            weights = np.array(g.es["weight"]).astype(np.float64)
        else:
            weights = None
        if flavor == 'vtraag':
            import louvain

            if partition_type is None:
                partition_type = louvain.RBConfigurationVertexPartition
            if resolution is not None:
                partition_kwargs["resolution_parameter"] = resolution
            if use_weights:
                partition_kwargs["weights"] = weights
            # Releases before 0.7.0 are seeded through a module-level call;
            # from 0.7.0 on the seed is passed to `find_partition`.
            if version.parse(louvain.__version__) < version.parse("0.7.0"):
                louvain.set_rng_seed(random_state)
            else:
                partition_kwargs["seed"] = random_state
            logg.info(' using the "louvain" package of Traag (2017)')
            part = louvain.find_partition(
                g,
                partition_type,
                **partition_kwargs,
            )
            # adata.uns['louvain_quality'] = part.quality()
        else:
            part = g.community_multilevel(weights=weights)
        groups = np.array(part.membership)
    elif flavor == 'rapids':
        # nvLouvain only works with undirected graphs,
        # and `adjacency` must have a directed edge in both directions
        import cudf
        import cugraph

        offsets = cudf.Series(adjacency.indptr)
        indices = cudf.Series(adjacency.indices)
        if use_weights:
            sources, targets = adjacency.nonzero()
            weights = adjacency[sources, targets]
            # Fancy indexing may yield a 2-D `np.matrix`; flatten it to 1-D.
            if isinstance(weights, np.matrix):
                weights = weights.A1
            weights = cudf.Series(weights)
        else:
            weights = None
        g = cugraph.Graph()

        # Use whichever adjacency-list loader this cugraph.Graph provides.
        if hasattr(g, 'add_adj_list'):
            g.add_adj_list(offsets, indices, weights)
        else:
            g.from_cudf_adjlist(offsets, indices, weights)

        logg.info(' using the "louvain" package of rapids')
        if resolution is not None:
            louvain_parts, _ = cugraph.louvain(g, resolution=resolution)
        else:
            louvain_parts, _ = cugraph.louvain(g)
        # Order the labels by vertex id so they line up with the rows.
        groups = (
            louvain_parts.to_pandas()
            .sort_values('vertex')[['partition']]
            .to_numpy()
            .ravel()
        )
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community

        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        # List every flavor handled above, including "rapids".
        raise ValueError(
            '`flavor` needs to be "vtraag", "igraph", "rapids" or "taynaud".'
        )
    if restrict_to is not None:
        if key_added == 'louvain':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )
    adata.uns['louvain'] = {}
    adata.uns['louvain']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
    )
    logg.info(
        ' finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f' {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
# Script fragment: load node positions and an edge list, build a graph,
# run Louvain on it, and order the resulting labels by vertex id.
# NOTE(review): `cd`, `cg` and `cp` are presumably aliases for cudf, cugraph
# and cupy, and `read_file_starttime` must be set before this point -- neither
# is visible here; confirm against the top of the file.

# Node positions (first CSV column used as index), converted to an array.
XYZ_C = cd.read_csv('HW_AI/HW_Final/dataset/pos_50.csv', index_col = 0).to_numpy()
# Edge list; the columns 'Source', 'Target' and 'Weight' are used below.
XYZ_Edges = cd.read_csv('HW_AI/HW_Final/dataset/edges_50.csv', index_col = 0,dtype=['int32', 'int32', 'int32','float32','str'])
read_file_endtime = datetime.datetime.now()
# Elapsed time for reading both files.
print (read_file_endtime - read_file_starttime)

graph_build_starttime = datetime.datetime.now()
# NOTE(review): this empty Graph is rebound by the next statement, so it looks
# redundant -- confirm before removing.
G = cg.Graph()
G = cg.from_cudf_edgelist(XYZ_Edges, source = 'Source', destination = 'Target', edge_attr = 'Weight')
graph_build_endtime = datetime.datetime.now()
# Elapsed time for building the graph.
print (graph_build_endtime - graph_build_starttime)

# Start timestamp for the Louvain step; it is not read again in this fragment.
louvain_starttime = datetime.datetime.now()
result, mod = cg.louvain(G)
vertex = result['vertex']
partition = result['partition']
# Largest partition label plus one, reported below as the community count.
size = result['partition'].max() + 1
print('community', size)
print('modularity', mod)

# Convert both result columns to cp arrays via DLPack.
vertex = cp.fromDlpack(vertex.to_dlpack())
partition = cp.fromDlpack(partition.to_dlpack())
# Reshape to 1-D with one entry per row of XYZ_C; this requires the Louvain
# result to have exactly XYZ_C.shape[0] rows.
vertex = cp.reshape(vertex, XYZ_C.shape[0])
labelRE = cp.reshape(partition, XYZ_C.shape[0])
# Sort the labels by vertex id so that labelRE[i] belongs to vertex[i].
index = cp.argsort(vertex)
vertex = cp.take_along_axis(vertex, index, axis=0)
labelRE = cp.take_along_axis(labelRE, index, axis=0)
print(result)
print(vertex)
def cluster(
    data: Union[np.ndarray, spmatrix],
    clustering_algo: Union["louvain", "louvain-gpu", "leiden", "leiden-igraph"] = "louvain",
    k: int = 30,
    directed: bool = False,
    prune: bool = False,
    min_cluster_size: int = 10,
    jaccard: bool = True,
    primary_metric: Union["euclidean", "manhattan", "correlation", "cosine"] = "euclidean",
    n_jobs: int = -1,
    q_tol: float = 1e-3,
    louvain_time_limit: int = 2000,
    nn_method: Union["kdtree", "brute"] = "kdtree",
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    resolution_parameter: float = 1,
    n_iterations: int = -1,
    use_weights: bool = True,
    seed: Optional[int] = None,
    use_gpu: bool = False,
    **kargs,
) -> Tuple[np.array, spmatrix, float]:
    """\
    PhenoGraph clustering

    Parameters
    ----------
    data
        Numpy ndarray of data to cluster, or sparse matrix of k-nearest neighbor
        graph. If ndarray, n-by-d array of n cells in d dimensions. If sparse
        matrix, n-by-n adjacency matrix.
    clustering_algo
        One of `'louvain'`, `'louvain-gpu'`, `'leiden'` or `'leiden-igraph'`.
        Any other value will return only the graph object.
    k
        Number of nearest neighbors to use in first step of graph construction.
    directed
        Whether to use a symmetric (default) or asymmetric ("directed") graph.
        The graph construction process produces a directed graph, which is
        symmetrized by one of two methods (see below).
    prune
        Whether to symmetrize by taking the average (prune = False) or product
        (prune = True) between the graph and its transpose.
    min_cluster_size
        Cells that end up in a cluster smaller than min_cluster_size are
        considered outliers and are assigned to -1 in the cluster labels.
    jaccard
        If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    primary_metric
        Distance metric to define nearest neighbors. Options include:
        {'euclidean', 'manhattan', 'correlation', 'cosine'}. Note that
        performance will be slower for `correlation` and `cosine`.
    n_jobs
        Nearest Neighbors and Jaccard coefficients will be computed in parallel
        using n_jobs. If 1 is given, no parallelism is used. If set to -1, all
        CPUs are used. For n_jobs below -1, `n_cpus + 1 + n_jobs` are used.
    q_tol
        Tolerance (i.e., precision) for monitoring modularity optimization
    louvain_time_limit
        Maximum number of seconds to run modularity optimization. If exceeded
        the best result so far is returned.
    nn_method
        Whether to use brute force or kdtree for nearest neighbor search. For
        very large high-dimensional data sets, brute force (with parallel
        computation) performs faster than kdtree.
    partition_type
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`. For the
        available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    resolution_parameter
        A parameter value controlling the coarseness of the clustering in
        Leiden. Higher values lead to more clusters. Set to `None` if overriding
        `partition_type` to one that doesn’t accept a `resolution_parameter`.
    n_iterations
        Number of iterations to run the Leiden algorithm. If the number of
        iterations is negative, the Leiden algorithm is run until an iteration
        in which there was no improvement. For `'louvain-gpu'` a non-negative
        value is passed to cuGraph as its iteration cap; a negative value
        leaves cuGraph's own default in place.
    use_weights
        Pass the graph's edge weights to the Leiden computation.
    seed
        Leiden initialization of the optimization.
    use_gpu
        Whether to use GPU to calculate distance. Now only support euclidean
        and inner product metrics.
    kargs
        Additional arguments passed to :func:`~leidenalg.find_partition` and the
        constructor of the `partition_type`.

    Returns
    -------
    communities
        numpy integer array of community assignments for each row in data
        (an empty string when no clustering algorithm was run).
    graph
        numpy sparse array of the graph that was used for clustering.
    Q
        the modularity score for communities on graph
        (an empty string when no clustering algorithm was run).

    Example
    -------
    >>> import phenograph
    >>> import scipy.sparse
    >>> import numpy as np

    >>> N = 5000
    >>> K = 30
    >>> RowInd = np.repeat(np.arange(N), K)
    >>> ColInd = np.tile(np.arange(N), K)
    >>> Mat = scipy.sparse.csr_matrix(
    ...     (np.ones(ColInd.shape), (RowInd, ColInd)), shape=(N, N)
    ... )

    >>> communities, graph, Q = phenograph.cluster(Mat, clustering_algo = 'leiden')
    """
    # NB if prune=True, graph must be undirected, and the prune setting takes precedence
    if prune:
        print("Setting directed=False because prune=True")
        directed = False

    kernel = jaccard_kernel if n_jobs == 1 else parallel_jaccard_kernel
    kernelargs = {}

    # faiss must use contiguous array and float32!
    data = data.astype(np.float32)
    # Sparse matrices have no `.flags`; the contiguity fix-up only applies to
    # dense input (previously this line crashed on the documented sparse input).
    if not isinstance(data, sp.spmatrix) and not data.flags.contiguous:
        data = np.ascontiguousarray(data)

    # Start timer
    tic = time.time()
    uid = uuid.uuid1().hex
    # Go!
    if isinstance(data, sp.spmatrix) and data.shape[0] == data.shape[1]:
        print(
            "Using neighbor information from provided graph, "
            "rather than computing neighbors directly",
            flush=True,
        )
        lilmatrix = data.tolil()
        d = np.vstack(lilmatrix.data).astype("float32")  # distances
        idx = np.vstack(lilmatrix.rows).astype("int32")  # neighbor indices by row
        del lilmatrix
        assert idx.shape[0] == data.shape[0]
    else:
        d, idx = find_neighbors(
            data,
            k=k,
            metric=primary_metric,
            method=nn_method,
            n_jobs=n_jobs,
            use_gpu=use_gpu,
        )
        # for debugging or manually running
        # (context managers make sure the dump files are flushed and closed)
        with open(uid + ".nneigh.d.pickle", "wb") as fh:
            pickle.dump(d, fh)
        with open(uid + ".nneigh.idx.pickle", "wb") as fh:
            pickle.dump(idx, fh)
        print("Neighbors computed in {} seconds".format(time.time() - tic),
              flush=True)

    subtic = time.time()
    kernelargs["idx"] = idx
    # if not using jaccard kernel, use gaussian
    if not jaccard:
        kernelargs["d"] = d
        kernelargs["sigma"] = 1.0
        kernel = gaussian_kernel
        graph = neighbor_graph(kernel, kernelargs)
        print(
            "Gaussian kernel graph constructed in {} seconds".format(
                time.time() - subtic),
            flush=True,
        )
    else:
        del d
        graph = neighbor_graph(kernel, kernelargs)
        print(
            "Jaccard graph constructed in {} seconds".format(time.time() -
                                                             subtic),
            flush=True,
        )
    if not directed:
        if not prune:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(0.5)
        else:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # write to file with unique id
    graph2binary(uid, graph)

    # choose between Louvain or Leiden algorithm
    communities, Q = "", ""
    if clustering_algo == "louvain":
        communities, Q = runlouvain(uid,
                                    tol=q_tol,
                                    time_limit=louvain_time_limit)

        print("Sorting communities by size, please wait ...", flush=True)
        communities = sort_by_size(communities, min_cluster_size)

        print("PhenoGraph complete in {} seconds".format(time.time() - tic),
              flush=True)

        # clean up every file in the working directory that carries this uid
        for f in os.listdir():
            if re.search(uid, f):
                os.remove(f)

    elif "leiden" in clustering_algo:
        # convert resulting graph from scipy.sparse.coo.coo_matrix to Graph object
        # get indices of vertices
        edgelist = np.vstack(graph.nonzero()).T.tolist()
        g = ig.Graph(max(graph.shape), edgelist, directed=directed)
        # store the matrix values as edge weights
        g.es["weights"] = graph.data

        if not partition_type:
            partition_type = leidenalg.RBConfigurationVertexPartition
        if resolution_parameter:
            kargs["resolution_parameter"] = resolution_parameter
        if use_weights:
            kargs["weights"] = np.array(g.es["weights"]).astype("float64")
        kargs["n_iterations"] = n_iterations

        print("Running Leiden optimization", flush=True)
        tic_ = time.time()
        if clustering_algo == "leiden":
            communities = leidenalg.find_partition(
                g,
                partition_type=partition_type,
                **kargs,
                seed=seed,
            )
        elif clustering_algo == "leiden-igraph":
            communities = g.community_leiden(
                objective_function="modularity",
                **kargs,
            )
        Q = communities.q
        print(
            "Leiden completed in {} seconds".format(time.time() - tic_),
            flush=True,
        )
        communities = np.asarray(communities.membership)

        print("Sorting communities by size, please wait ...", flush=True)
        communities = sort_by_size(communities, min_cluster_size)

        print("PhenoGraph completed in {} seconds".format(time.time() - tic),
              flush=True)

    elif clustering_algo == "louvain-gpu":
        tic_ = time.time()
        # convert resulting graph from scipy.sparse.coo.coo_matrix to Graph object
        # get indices of vertices
        edgelist = np.vstack(graph.nonzero()).T
        g = cugraph.Graph(symmetrized=True)
        g.add_edge_list(edgelist[:, 0], edgelist[:, 1], graph.data)
        # A negative `n_iterations` (the default) means "no explicit limit" for
        # Leiden; it is not a usable iteration cap, so let cuGraph apply its
        # own default in that case.
        if n_iterations >= 0:
            parts, Q = cugraph.louvain(g, n_iterations)
        else:
            parts, Q = cugraph.louvain(g)
        communities = parts.as_matrix()[:, 1]
        print(
            "Louvain-GPU completed in {} seconds".format(time.time() - tic_),
            flush=True,
        )

        print("Sorting communities by size, please wait ...", flush=True)
        communities = sort_by_size(communities, min_cluster_size)

        print("PhenoGraph complete in {} seconds".format(time.time() - tic),
              flush=True)
    else:
        # return only graph object
        pass

    # `communities` is still the "" placeholder when no algorithm ran; there
    # is nothing to tabulate then (building a DataFrame from it would raise).
    if not isinstance(communities, str):
        print(pd.DataFrame(data=communities)[0].value_counts())
    return communities, graph, Q
def cluster(
    X,
    n_neighbors=30,
    community="louvain",
    metric="euclidean",
    algorithm="brute",
    similarity="jaccard",
    min_size=10,
    distributed=False,
):
    """
    Cluster cells by community detection on a k-nearest-neighbor graph.

    Parameters
    ----------
    X : cudf.DataFrame
        Input cell-by-feature dataframe.
    n_neighbors : int
        Number of neighbors for kNN.
    community: string
        Community detection algorithm to use: 'louvain' (default),
        'leiden' or 'ecg'.
    metric: string
        Distance metric to use for kNN.
        Currently, only 'euclidean' is supported.
    algorithm: string
        The query algorithm to use.
        Currently, only 'brute' is supported.
    similarity: string
        Similarity metric to use for neighbor edge refinement.
        'overlap' selects the overlap coefficient; any other value falls
        back to the default, 'jaccard'.
    min_size: int
        Minimum cluster size.
    distributed: bool
        If True, use a multi-GPU dask cluster for kNN search.

    Returns
    -------
    communities: cudf.DataFrame
        Community labels.
    G: cugraph.Graph
        k-neighbors graph.
    Q: float
        Modularity score for detected communities.
        ``None`` is returned in its place if community='ecg' is used.

    Raises
    ------
    ValueError
        If ``community`` is not one of the supported algorithms.
    """
    # Fail fast on an unsupported algorithm instead of computing the whole
    # graph and then silently returning None.
    # Insert any additional community/clustering method here and in the
    # dispatch below.
    supported = ("louvain", "leiden", "ecg")
    if community not in supported:
        raise ValueError(
            f"`community` must be one of {supported}, got {community!r}"
        )

    tic = time.time()
    # Go!

    idx = find_neighbors(X, n_neighbors, metric, algorithm, distributed)

    print(f"Neighbors computed in {time.time() - tic} seconds...")

    subtic = time.time()

    G = kneighbors_graph(idx, n_neighbors, X.shape[0])

    if similarity == "overlap":
        print("Computing overlap similarity...", flush=True)
        G = cugraph.overlap(G)
    else:
        # Anything other than "overlap" is treated as Jaccard.
        similarity = "jaccard"
        print("Computing Jaccard similarity...", flush=True)
        G = cugraph.jaccard(G)

    print(
        f"{similarity} graph constructed in {time.time() - subtic} seconds...",
        flush=True,
    )

    # Symmetrize the similarity edge list and rebuild the graph with the
    # similarity coefficient as the edge attribute.
    g = cugraph.symmetrize_df(G, "source", "destination")
    G = cugraph.Graph()
    G.from_cudf_edgelist(g, edge_attr=f"{similarity}_coeff")
    del g

    # Only the detection call differs between algorithms; ECG reports no
    # modularity score.
    if community == "louvain":
        print("Running Louvain modularity optimization...", flush=True)
        parts, Q = cugraph.louvain(G, max_iter=1000)
    elif community == "leiden":
        print("Running Leiden modularity optimization...", flush=True)
        parts, Q = cugraph.leiden(G, max_iter=1000)
    else:  # community == "ecg"
        print("Running ECG...", flush=True)
        parts = cugraph.ecg(G)
        Q = None

    # Order labels by vertex id before sorting by size.
    communities = sort_by_size(
        cp.asarray(parts.sort_values(by="vertex").partition), min_size)

    n_parts = cp.unique(communities).shape[0]

    print(f"grapheno completed in {time.time() - tic} seconds...", flush=True)
    print(f"Communities detected: {n_parts}", flush=True)
    if community != "ecg":
        print(f"Modularity: {Q}", flush=True)

    return communities, G, Q