def run_louvain(
    graph: sp.coo_matrix, q_tol: float, louvain_time_limit: int
) -> Tuple[np.ndarray, float]:
    """
    Wrapper for Louvain community detection

    The graph is handed to ``graph2binary`` under a freshly generated unique
    id, ``runlouvain`` is run on that id, and every entry of the current
    working directory whose name contains the id is removed afterwards.

    Args:
        graph (sp.coo_matrix): See below in 'cluster()'
        q_tol (float): See below in 'cluster()'
        louvain_time_limit (int): See below in 'cluster()'

    Returns:
        communities, Q (Tuple[np.ndarray, float]): See below in 'cluster()'
    """
    # write to file with unique id
    uid = uuid.uuid1().hex
    try:
        graph2binary(uid, graph)
        communities, Q = runlouvain(uid, tol=q_tol, time_limit=louvain_time_limit)
    finally:
        # clean up, also when writing or optimization failed, so that no
        # intermediate files are left behind in the working directory.
        # uid is a plain hex string, so a substring test is equivalent to the
        # regular-expression search it replaces.
        for f in os.listdir():
            if uid in f:
                os.remove(f)

    return communities, Q
def cluster(data, k=30, directed=False, prune=False, min_cluster_size=10,
            jaccard=True, primary_metric='euclidean', n_jobs=-1, q_tol=1e-3,
            louvain_time_limit=2000, nn_method='kdtree', verbosity=2):
    """
    PhenoGraph clustering

    :param data: Numpy ndarray of data to cluster, or sparse matrix of k-nearest neighbor graph
        If ndarray, n-by-d array of n cells in d dimensions
        If sparse matrix, n-by-n adjacency matrix
    :param k: Number of nearest neighbors to use in first step of graph construction
    :param directed: Whether to use a symmetric (default) or asymmetric ("directed") graph
        The graph construction process produces a directed graph, which is symmetrized by one of
        two methods (see below)
    :param prune: Whether to symmetrize by taking the average (prune=False) or product
        (prune=True) between the graph and its transpose
    :param min_cluster_size: Cells that end up in a cluster smaller than min_cluster_size are
        considered outliers and are assigned to -1 in the cluster labels
    :param jaccard: If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    :param primary_metric: Distance metric to define nearest neighbors.
        Options include: {'euclidean', 'manhattan', 'correlation', 'cosine'}
        Note that performance will be slower for correlation and cosine.
    :param n_jobs: Nearest Neighbors and Jaccard coefficients will be computed in parallel using
        n_jobs. If n_jobs=-1, the number of jobs is determined automatically
    :param q_tol: Tolerance (i.e., precision) for monitoring modularity optimization
    :param louvain_time_limit: Maximum number of seconds to run modularity optimization.
        If exceeded the best result so far is returned
    :param nn_method: Whether to use brute force or kdtree for nearest neighbor search.
        For very large high-dimensional data sets, brute force (with parallel computation)
        performs faster than kdtree.
    :param verbosity: How much text output to produce. Higher values produce more output.
        Zero should silence all output including warnings, so use with caution.

    :return communities: numpy integer array of community assignments for each row in data
    :return graph: numpy sparse array of the graph that was used for clustering
    :return Q: the modularity score for communities on graph
    """
    # Each verbosity step lowers the logger threshold by one level (10),
    # starting at ERROR and never going below DEBUG.
    logger.setLevel(max([logging.ERROR - verbosity * 10, logging.DEBUG]))

    # NB if prune=True, graph must be undirected, and the prune setting takes precedence
    if prune and directed:
        logger.warning("Setting directed=False because prune=True")
        directed = False

    if n_jobs == 1:
        kernel = jaccard_kernel
    else:
        kernel = parallel_jaccard_kernel
    kernelargs = {}

    # Start timer
    tic = time.time()
    # Go!
    if isinstance(data, sp.spmatrix) and data.shape[0] == data.shape[1]:
        logger.info(
            "Using neighbor information from provided graph, rather than computing "
            "neighbors directly")
        lilmatrix = data.tolil()
        # Stacking the per-row lists into rectangular arrays only works when
        # every row stores the same number of entries.
        d = np.vstack(lilmatrix.data).astype('float32')  # distances
        idx = np.vstack(lilmatrix.rows).astype('int32')  # neighbor indices by row
        del lilmatrix
        assert idx.shape[0] == data.shape[0]
        k = idx.shape[1]
    else:
        d, idx = find_neighbors(data, k=k, metric=primary_metric,
                                method=nn_method, n_jobs=n_jobs)
        logger.info("Neighbors computed in {} seconds".format(time.time() - tic))

    subtic = time.time()
    kernelargs['idx'] = idx
    # if not using jaccard kernel, use gaussian
    if not jaccard:
        kernelargs['d'] = d
        kernelargs['sigma'] = 1.
        kernel = gaussian_kernel
        graph = neighbor_graph(kernel, kernelargs)
        logger.info("Gaussian kernel graph constructed in {} seconds".format(
            time.time() - subtic))
    else:
        # distances are not needed by the Jaccard kernel
        del d
        graph = neighbor_graph(kernel, kernelargs)
        logger.info(
            "Jaccard graph constructed in {} seconds".format(time.time() - subtic))

    if not directed:
        if not prune:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(.5)
        else:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # write to file with unique id
    uid = uuid.uuid1().hex
    try:
        graph2binary(uid, graph)
        communities, Q = runlouvain(uid, tol=q_tol, time_limit=louvain_time_limit)
        logger.info("PhenoGraph complete in {} seconds".format(time.time() - tic))
        communities = sort_by_size(communities, min_cluster_size)
    finally:
        # clean up: remove every file in the working directory whose name
        # contains the unique id, also when one of the steps above failed
        for f in os.listdir():
            if re.search(uid, f):
                os.remove(f)

    return communities, graph, Q
def cluster(data, k=30, directed=False, prune=False, min_cluster_size=10,
            jaccard=True, primary_metric='euclidean', n_jobs=-1, q_tol=1e-3,
            louvain_time_limit=2000, nn_method='kdtree'):
    """
    PhenoGraph clustering

    :param data: Numpy ndarray of data to cluster, or sparse matrix of k-nearest neighbor graph
        If ndarray, n-by-d array of n cells in d dimensions
        If sparse matrix, n-by-n adjacency matrix
    :param k: Number of nearest neighbors to use in first step of graph construction
    :param directed: Whether to use a symmetric (default) or asymmetric ("directed") graph
        The graph construction process produces a directed graph, which is symmetrized by one of
        two methods (see below)
    :param prune: Whether to symmetrize by taking the average (prune=False) or product
        (prune=True) between the graph and its transpose
    :param min_cluster_size: Cells that end up in a cluster smaller than min_cluster_size are
        considered outliers and are assigned to -1 in the cluster labels
    :param jaccard: If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    :param primary_metric: Distance metric to define nearest neighbors.
        Options include: {'euclidean', 'manhattan', 'correlation', 'cosine'}
        Note that performance will be slower for correlation and cosine.
    :param n_jobs: Nearest Neighbors and Jaccard coefficients will be computed in parallel using
        n_jobs. If n_jobs=-1, the number of jobs is determined automatically
    :param q_tol: Tolerance (i.e., precision) for monitoring modularity optimization
    :param louvain_time_limit: Maximum number of seconds to run modularity optimization.
        If exceeded the best result so far is returned
    :param nn_method: Whether to use brute force or kdtree for nearest neighbor search.
        For very large high-dimensional data sets, brute force (with parallel computation)
        performs faster than kdtree.

    :return communities: numpy integer array of community assignments for each row in data
    :return graph: numpy sparse array of the graph that was used for clustering
    :return Q: the modularity score for communities on graph
    """
    # NB if prune=True, graph must be undirected, and the prune setting takes precedence.
    # Only announce the override when it actually changes the caller's choice.
    if prune and directed:
        print("Setting directed=False because prune=True")
        directed = False

    if n_jobs == 1:
        kernel = jaccard_kernel
    else:
        kernel = parallel_jaccard_kernel
    kernelargs = {}

    # Start timer
    tic = time.time()
    # Go!
    if isinstance(data, sp.spmatrix) and data.shape[0] == data.shape[1]:
        print("Using neighbor information from provided graph, rather than computing neighbors directly",
              flush=True)
        lilmatrix = data.tolil()
        # Stacking the per-row lists into rectangular arrays only works when
        # every row stores the same number of entries.
        d = np.vstack(lilmatrix.data).astype('float32')  # distances
        idx = np.vstack(lilmatrix.rows).astype('int32')  # neighbor indices by row
        del lilmatrix
        assert idx.shape[0] == data.shape[0]
        k = idx.shape[1]
    else:
        d, idx = find_neighbors(data, k=k, metric=primary_metric,
                                method=nn_method, n_jobs=n_jobs)
        print("Neighbors computed in {} seconds".format(time.time() - tic), flush=True)

    subtic = time.time()
    kernelargs['idx'] = idx
    # if not using jaccard kernel, use gaussian
    if not jaccard:
        kernelargs['d'] = d
        kernelargs['sigma'] = 1.
        kernel = gaussian_kernel
        graph = neighbor_graph(kernel, kernelargs)
        print("Gaussian kernel graph constructed in {} seconds".format(time.time() - subtic),
              flush=True)
    else:
        # distances are not needed by the Jaccard kernel
        del d
        graph = neighbor_graph(kernel, kernelargs)
        print("Jaccard graph constructed in {} seconds".format(time.time() - subtic),
              flush=True)

    if not directed:
        if not prune:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(.5)
        else:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # write to file with unique id
    uid = uuid.uuid1().hex
    try:
        graph2binary(uid, graph)
        communities, Q = runlouvain(uid, tol=q_tol, time_limit=louvain_time_limit)
        print("PhenoGraph complete in {} seconds".format(time.time() - tic), flush=True)
        communities = sort_by_size(communities, min_cluster_size)
    finally:
        # clean up: remove every file in the working directory whose name
        # contains the unique id, also when one of the steps above failed
        for f in os.listdir():
            if re.search(uid, f):
                os.remove(f)

    return communities, graph, Q
def cluster(data, k=30, directed=False, prune=False, min_cluster_size=10,
            jaccard=True, primary_metric='euclidean', n_jobs=-1, q_tol=1e-3):
    """
    PhenoGraph clustering

    :param data: Numpy ndarray of data to cluster
    :param k: Number of nearest neighbors to use in first step of graph construction
    :param directed: Whether to use a symmetric (default) or asymmetric ("directed") graph
        The graph construction process produces a directed graph, which is symmetrized by one of
        two methods (see below)
    :param prune: Whether to symmetrize by taking the average (prune=False) or product
        (prune=True) between the graph and its transpose
    :param min_cluster_size: Cells that end up in a cluster smaller than min_cluster_size are
        considered outliers and are assigned to -1 in the cluster labels
    :param jaccard: If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    :param primary_metric: Distance metric to define nearest neighbors.
        Options include: {'euclidean', 'manhattan', 'correlation', 'cosine'}
        Note that performance will be slower for correlation and cosine.
    :param n_jobs: Nearest Neighbors and Jaccard coefficients will be computed in parallel using
        n_jobs. If n_jobs=-1, the number of jobs is determined automatically
    :param q_tol: Tolerance (i.e., precision) for monitoring modularity optimization

    :return communities: numpy integer array of community assignments for each row in data
    :return graph: numpy sparse array of the graph that was used for clustering
    :return Q: the modularity score for communities on graph
    """
    # NB if prune=True, graph must be undirected, and the prune setting takes precedence.
    # Only announce the override when it actually changes the caller's choice.
    if prune and directed:
        print("Setting directed=False because prune=True")
        directed = False

    if n_jobs == 1:
        kernel = jaccard_kernel
    else:
        kernel = parallel_jaccard_kernel
    kernelargs = {}

    # Start timer
    tic = time.time()
    # Go!
    d, idx = find_neighbors(data, k=k, metric=primary_metric, n_jobs=n_jobs)
    print("Neighbors computed in {} seconds".format(time.time() - tic), flush=True)

    subtic = time.time()
    kernelargs['idx'] = idx
    # if not using jaccard kernel, use gaussian
    if not jaccard:
        kernelargs['d'] = d
        kernelargs['sigma'] = 1.
        kernel = gaussian_kernel
        graph = neighbor_graph(kernel, kernelargs)
        print("Gaussian kernel graph constructed in {} seconds".format(
            time.time() - subtic), flush=True)
    else:
        # distances are not needed by the Jaccard kernel
        del d
        graph = neighbor_graph(kernel, kernelargs)
        print("Jaccard graph constructed in {} seconds".format(time.time() - subtic),
              flush=True)

    if not directed:
        if not prune:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(.5)
        else:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # write to file with unique id
    uid = uuid.uuid1().hex
    try:
        graph2binary(uid, graph)
        communities, Q = runlouvain(uid, tol=q_tol)
        print("PhenoGraph complete in {} seconds".format(time.time() - tic), flush=True)
        communities = sort_by_size(communities, min_cluster_size)
    finally:
        # clean up: remove every file in the working directory whose name
        # contains the unique id, also when one of the steps above failed
        for f in os.listdir():
            if re.search(uid, f):
                os.remove(f)

    return communities, graph, Q
def cluster(
    data: Union[np.ndarray, spmatrix],
    clustering_algo: Union["louvain", "leiden"] = "louvain",
    k: int = 30,
    directed: bool = False,
    prune: bool = False,
    min_cluster_size: int = 10,
    jaccard: bool = True,
    primary_metric: Union["euclidean", "manhattan", "correlation", "cosine"] = "euclidean",
    n_jobs: int = -1,
    q_tol: float = 1e-3,
    louvain_time_limit: int = 2000,
    nn_method: Union["kdtree", "brute"] = "kdtree",
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    resolution_parameter: float = 1,
    n_iterations: int = -1,
    use_weights: bool = True,
    seed: Optional[int] = None,
    **kargs,
) -> Tuple[np.array, spmatrix, float]:
    """\
    PhenoGraph clustering

    Parameters
    ----------
    data
        Numpy ndarray of data to cluster, or sparse matrix of k-nearest neighbor graph.
        If ndarray, n-by-d array of n cells in d dimensions.
        If sparse matrix, n-by-n adjacency matrix.
    clustering_algo
        Choose `'louvain'` or `'leiden'`. Any other value will return only graph object
        (the `communities` and `Q` slots of the result are then empty strings).
    k
        Number of nearest neighbors to use in first step of graph construction.
    directed
        Whether to use a symmetric (default) or asymmetric ("directed") graph.
        The graph construction process produces a directed graph, which is symmetrized
        by one of two methods (see below).
    prune
        Whether to symmetrize by taking the average (prune = False) or product
        (prune = True) between the graph and its transpose.
    min_cluster_size
        Cells that end up in a cluster smaller than min_cluster_size are considered
        outliers and are assigned to -1 in the cluster labels.
    jaccard
        If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    primary_metric
        Distance metric to define nearest neighbors. Options include: {'euclidean',
        'manhattan', 'correlation', 'cosine'}. Note that performance will be slower for
        `correlation` and `cosine`.
    n_jobs
        Nearest Neighbors and Jaccard coefficients will be computed in parallel using
        n_jobs. If 1 is given, no parallelism is used. If set to -1, all CPUs are used.
        For n_jobs below -1, `n_cpus + 1 + n_jobs` are used.
    q_tol
        Tolerance (i.e., precision) for monitoring modularity optimization
    louvain_time_limit
        Maximum number of seconds to run modularity optimization. If exceeded the best
        result so far is returned.
    nn_method
        Whether to use brute force or kdtree for nearest neighbor search. For very large
        high-dimensional data sets, brute force (with parallel computation) performs
        faster than kdtree.
    partition_type
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`. For the
        available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    resolution_parameter
        A parameter value controlling the coarseness of the clustering in Leiden. Higher
        values lead to more clusters. Set to `None` if overriding `partition_type` to
        one that doesn’t accept a `resolution_parameter`.
    n_iterations
        Number of iterations to run the Leiden algorithm. If the number of iterations is
        negative, the Leiden algorithm is run until an iteration in which there was no
        improvement.
    use_weights
        Use edge weights in the Leiden computation.
    seed
        Leiden initialization of the optimization.
    kargs
        Additional arguments passed to :func:`~leidenalg.find_partition` and the
        constructor of the `partition_type`.

    Returns
    -------
    communities
        numpy integer array of community assignments for each row in data.
    graph
        numpy sparse array of the graph that was used for clustering.
    Q
        the modularity score for communities on graph.

    Example
    -------
    >>> import phenograph
    >>> import scipy.sparse
    >>> import numpy as np
    >>> N = 5000
    >>> K = 30
    >>> RowInd = np.repeat(np.arange(N), K)
    >>> ColInd = np.tile(np.arange(N), K)
    >>> Mat = scipy.sparse.csr_matrix(
    ...     (np.ones(ColInd.shape), (RowInd, ColInd)), shape=(N, N)
    ... )
    >>> communities, graph, Q = phenograph.cluster(Mat, clustering_algo = 'leiden')
    """
    # NB if prune=True, graph must be undirected, and the prune setting takes precedence.
    # Only announce the override when it actually changes the caller's choice.
    if prune and directed:
        print("Setting directed=False because prune=True")
        directed = False

    if n_jobs == 1:
        kernel = jaccard_kernel
    else:
        kernel = parallel_jaccard_kernel
    kernelargs = {}

    # Start timer
    tic = time.time()
    # Go!
    if isinstance(data, sp.spmatrix) and data.shape[0] == data.shape[1]:
        print(
            "Using neighbor information from provided graph, "
            "rather than computing neighbors directly",
            flush=True,
        )
        lilmatrix = data.tolil()
        # Stacking the per-row lists into rectangular arrays only works when
        # every row stores the same number of entries.
        d = np.vstack(lilmatrix.data).astype("float32")  # distances
        idx = np.vstack(lilmatrix.rows).astype("int32")  # neighbor indices by row
        del lilmatrix
        assert idx.shape[0] == data.shape[0]
    else:
        d, idx = find_neighbors(data, k=k, metric=primary_metric,
                                method=nn_method, n_jobs=n_jobs)
        print("Neighbors computed in {} seconds".format(time.time() - tic), flush=True)

    subtic = time.time()
    kernelargs["idx"] = idx
    # if not using jaccard kernel, use gaussian
    if not jaccard:
        kernelargs["d"] = d
        kernelargs["sigma"] = 1.0
        kernel = gaussian_kernel
        graph = neighbor_graph(kernel, kernelargs)
        print(
            "Gaussian kernel graph constructed in {} seconds".format(
                time.time() - subtic),
            flush=True,
        )
    else:
        # distances are not needed by the Jaccard kernel
        del d
        graph = neighbor_graph(kernel, kernelargs)
        print(
            "Jaccard graph constructed in {} seconds".format(time.time() - subtic),
            flush=True,
        )

    if not directed:
        if not prune:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(0.5)
        else:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # choose between Louvain or Leiden algorithm
    # Placeholders returned unchanged when clustering_algo is neither option.
    communities, Q = "", ""
    if clustering_algo == "louvain":
        # write to file with unique id
        uid = uuid.uuid1().hex
        try:
            graph2binary(uid, graph)
            communities, Q = runlouvain(uid, tol=q_tol, time_limit=louvain_time_limit)
            print("Sorting communities by size, please wait ...", flush=True)
            communities = sort_by_size(communities, min_cluster_size)
            print("PhenoGraph complete in {} seconds".format(time.time() - tic),
                  flush=True)
        finally:
            # clean up: remove every file in the working directory whose name
            # contains the unique id, also when one of the steps above failed
            for f in os.listdir():
                if re.search(uid, f):
                    os.remove(f)

    elif clustering_algo == "leiden":
        # convert resulting graph from scipy sparse matrix to Graph object.
        # Edges and weights are taken from the same filtered COO view so they
        # always line up: `nonzero()` skips explicitly stored zeros while
        # `.data` keeps them, which could otherwise yield a different number
        # of edges and weights.
        coo = graph.tocoo()
        stored_nonzero = coo.data != 0
        edgelist = np.vstack(
            (coo.row[stored_nonzero], coo.col[stored_nonzero])).T.tolist()
        g = ig.Graph(max(graph.shape), edgelist, directed=directed)
        # store the matrix values as edge weights
        g.es["weights"] = coo.data[stored_nonzero]
        if not partition_type:
            partition_type = leidenalg.RBConfigurationVertexPartition
        if resolution_parameter:
            kargs["resolution_parameter"] = resolution_parameter
        if use_weights:
            kargs["weights"] = np.array(g.es["weights"]).astype("float64")
        kargs["n_iterations"] = n_iterations
        kargs["seed"] = seed

        print("Running Leiden optimization", flush=True)
        tic_ = time.time()
        communities = leidenalg.find_partition(
            g, partition_type=partition_type, **kargs,
        )
        Q = communities.q
        print(
            "Leiden completed in {} seconds".format(time.time() - tic_),
            flush=True,
        )
        communities = np.asarray(communities.membership)
        print("Sorting communities by size, please wait ...", flush=True)
        communities = sort_by_size(communities, min_cluster_size)
        print("PhenoGraph completed in {} seconds".format(time.time() - tic),
              flush=True)

    else:
        # return only graph object
        pass

    return communities, graph, Q
def cluster(data, k=30, d=None, idx=None, directed=False, prune=False,
            min_cluster_size=10, jaccard=True, primary_metric='euclidean',
            n_jobs=-1, q_tol=1e-3, louvain_time_limit=2000, nn_method='kdtree',
            use_gpu=False):
    """
    PhenoGraph clustering

    :param data: Numpy ndarray of data to cluster, or sparse matrix of k-nearest neighbor graph
        If ndarray, n-by-d array of n cells in d dimensions
        If sparse matrix, n-by-n adjacency matrix
    :param k: Number of nearest neighbors to use in first step of graph construction
    :param d: None or a Numpy ndarray with shape (data.shape[0], k-1), each data's (k-1)
        nearest neighbors' distance. If None, it would be calculated.
    :param idx: None or a Numpy ndarray with shape (data.shape[0], k-1), each data's (k-1)
        nearest neighbors' index. If None, it would be calculated.
        Precomputed neighbors are only used when both d and idx are given.
    :param directed: Whether to use a symmetric (default) or asymmetric ("directed") graph
        The graph construction process produces a directed graph, which is symmetrized by one of
        two methods (see below)
    :param prune: Whether to symmetrize by taking the average (prune=False) or product
        (prune=True) between the graph and its transpose
    :param min_cluster_size: Cells that end up in a cluster smaller than min_cluster_size are
        considered outliers and are assigned to -1 in the cluster labels
    :param jaccard: If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    :param primary_metric: Distance metric to define nearest neighbors.
        Options include: {'euclidean', 'manhattan', 'correlation', 'cosine'}
        Note that performance will be slower for correlation and cosine.
    :param n_jobs: Nearest Neighbors and Jaccard coefficients will be computed in parallel using
        n_jobs. If n_jobs=-1, the number of jobs is determined automatically
    :param q_tol: Tolerance (i.e., precision) for monitoring modularity optimization
    :param louvain_time_limit: Maximum number of seconds to run modularity optimization.
        If exceeded the best result so far is returned
    :param nn_method: Whether to use brute force or kdtree for nearest neighbor search.
        For very large high-dimensional data sets, brute force (with parallel computation)
        performs faster than kdtree.
    :param use_gpu: Whether to use GPU to calculate distance.
        Now only support euclidean and inner product metrics.

    :return communities: numpy integer array of community assignments for each row in data
    :return graph: numpy sparse array of the graph that was used for clustering
    :return Q: the modularity score for communities on graph
    """
    # NB if prune=True, graph must be undirected, and the prune setting takes precedence.
    # Only announce the override when it actually changes the caller's choice.
    if prune and directed:
        print("Setting directed=False because prune=True")
        directed = False

    if n_jobs == 1:
        kernel = jaccard_kernel
    else:
        kernel = parallel_jaccard_kernel
    kernelargs = {}

    data = data.astype(np.float32)
    # faiss must use contiguous array and float32!
    # Sparse matrices have no `.flags`, so the contiguity check is limited to
    # dense input; otherwise the sparse-graph branch below could never run.
    if not isinstance(data, sp.spmatrix) and not data.flags.contiguous:
        data = np.ascontiguousarray(data)

    # Start timer
    tic = time.time()
    # Go!
    if (d is not None) and (idx is not None):
        assert d.shape == idx.shape, "d and idx has different shapes!"
        assert idx.shape[0] == data.shape[0], "the number of rows of d is different with that of data!"
        # Precomputed neighbors must have exactly k-1 columns (the original
        # check was inverted and rejected precisely the valid shape).
        assert d.shape[1] == k - 1, "not k-1 nearest neighbors!"
    else:
        if isinstance(data, sp.spmatrix) and data.shape[0] == data.shape[1]:
            print("Using neighbor information from provided graph, rather than computing neighbors directly",
                  flush=True)
            lilmatrix = data.tolil()
            # Stacking the per-row lists into rectangular arrays only works
            # when every row stores the same number of entries.
            d = np.vstack(lilmatrix.data).astype('float32')  # distances
            idx = np.vstack(lilmatrix.rows).astype('int32')  # neighbor indices by row
            del lilmatrix
            assert idx.shape[0] == data.shape[0]
            k = idx.shape[1]
        else:
            d, idx = find_neighbors(
                data, k=k, use_gpu=use_gpu, metric=primary_metric,
                method=nn_method, n_jobs=n_jobs)
            print("Neighbors computed in {} seconds".format(
                time.time() - tic), flush=True)

    subtic = time.time()
    kernelargs['idx'] = idx
    # if not using jaccard kernel, use gaussian
    if not jaccard:
        kernelargs['d'] = d
        kernelargs['sigma'] = 1.
        kernel = gaussian_kernel
        graph = neighbor_graph(kernel, kernelargs)
        print("Gaussian kernel graph constructed in {} seconds".format(
            time.time() - subtic), flush=True)
    else:
        # distances are not needed by the Jaccard kernel
        del d
        graph = neighbor_graph(kernel, kernelargs)
        print("Jaccard graph constructed in {} seconds".format(
            time.time() - subtic), flush=True)

    if not directed:
        if not prune:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(.5)
        else:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # write to file with unique id
    uid = uuid.uuid1().hex
    try:
        graph2binary(uid, graph)
        communities, Q = runlouvain(uid, tol=q_tol, time_limit=louvain_time_limit)
        print("PhenoGraph complete in {} seconds".format(
            time.time() - tic), flush=True)
        communities = sort_by_size(communities, min_cluster_size)
    finally:
        # clean up: remove every file in the working directory whose name
        # contains the unique id, also when one of the steps above failed
        for f in os.listdir():
            if re.search(uid, f):
                os.remove(f)

    return communities, graph, Q