def consensus_clustering(datapath, stepsize, out_dir):
    """Sweep consensus thresholds and store memberships and modularities.

    For every threshold ``t`` in ``np.arange(0, 1, stepsize)`` a consensus
    partition is computed with ``nwtools.consensus.consensus_partition`` and
    its modularity is evaluated with ``leidenalg.ModularityVertexPartition``
    using the ``'weight'`` edge attribute.

    Two CSV files are written:

    * ``consensus_thresholds.csv``: one row per vertex (indexed by the
      ``'name'`` vertex attribute), one column per threshold.
    * ``thresholds_modularity.csv``: columns ``threshold`` and ``modularity``.

    Args:
        datapath: directory passed to ``read_data`` to load the graph and
            the initial partitions.
        stepsize: spacing between consecutive thresholds in ``[0, 1)``.
        out_dir: directory receiving the CSV files; it is created when it
            does not exist. If empty or None, the files are written to
            ``datapath``.

    Returns:
        None.
    """
    graph, initial_partition = read_data(datapath)

    # ``out_dir`` used to be silently ignored and everything was written to
    # ``datapath``; honour it, but keep the old location as a fallback.
    target_dir = out_dir if out_dir else datapath
    os.makedirs(target_dir, exist_ok=True)

    t_range = np.arange(0, 1, stepsize)
    cons_memberships = []
    cons_modularities = []
    for t in t_range:
        print('threshold: {}'.format(t))
        # The first element of the returned pair is not needed here.
        _, consensus_membership = nwtools.consensus.consensus_partition(
            graph,
            weights='weight',
            initial_partition=initial_partition,
            nr_partitions=len(initial_partition),
            threshold=t,
            verbose=True)
        cons_memberships.append(consensus_membership)
        cons_modularities.append(
            leidenalg.ModularityVertexPartition(
                graph,
                initial_membership=consensus_membership,
                weights='weight').quality())

    # One column per threshold, one row per vertex.
    df_cons_memberships = pd.DataFrame(cons_memberships).transpose()
    df_cons_memberships.columns = ['{:.3f}'.format(t) for t in t_range]
    df_cons_memberships.index = graph.vs['name']
    df_cons_memberships.to_csv(
        os.path.join(target_dir, 'consensus_thresholds.csv'))

    df_modularities = pd.DataFrame({'threshold': t_range,
                                    'modularity': cons_modularities})
    df_modularities.to_csv(
        os.path.join(target_dir, 'thresholds_modularity.csv'), index=False)
def fit(self):
    '''Compute communities from a matrix with fixed nodes

    The first ``len(self.annotations)`` vertices of ``self.graph`` are
    treated as fixed (atlas) nodes: they start in the community of their
    annotation and are passed to the optimiser as ``fixed_nodes``. All other
    vertices start as singletons.

    Returns:
        None, but the membership attribute is set as an array of str with
        size N - n_fixed with the community/cluster membership of all
        vertices except the first n_fixed ones. Communities that coincide
        with an annotation carry the annotation name, the others carry
        their numeric community id as a string.

    Raises:
        ValueError: if ``self.metric`` is neither 'cpm' nor 'modularity'.
    '''
    self._parse_graph()

    aa = self.annotations
    n_fixed = len(aa)
    g = self.graph
    N = g.vcount()

    opt = leidenalg.Optimiser()
    fixed_nodes = [int(i < n_fixed) for i in range(N)]

    # NOTE: initial membership is singletons except for atlas nodes, which
    # get the membership they have.
    aau = list(np.unique(aa))
    aaun = len(aau)
    # Dict lookup instead of list.index() for every fixed node.
    community_of = {ct: i for i, ct in enumerate(aau)}
    initial_membership = [
        community_of[aa[j]] if j < n_fixed else aaun + (j - n_fixed)
        for j in range(N)
    ]

    if self.metric == 'cpm':
        partition = leidenalg.CPMVertexPartition(
            g,
            resolution_parameter=self.resolution_parameter,
            initial_membership=initial_membership,
        )
    elif self.metric == 'modularity':
        # NOTE(review): confirm that the installed leidenalg accepts
        # `resolution_parameter` for ModularityVertexPartition.
        partition = leidenalg.ModularityVertexPartition(
            g,
            resolution_parameter=self.resolution_parameter,
            initial_membership=initial_membership,
        )
    else:
        raise ValueError('clustering_metric not understood: {:}'.format(
            self.metric))

    # Run modified Leiden here
    opt.optimise_partition(partition, fixed_nodes=fixed_nodes)

    # Extract result
    membership = partition.membership[n_fixed:]

    # Convert the known cell types. Labels are built directly so that the
    # array width fits both annotation names and numeric ids; forcing the
    # width of the longest annotation used to truncate long numeric ids
    # (and crashed when there were no annotations at all).
    labels = [str(aau[m]) if m < aaun else str(m) for m in membership]
    self.membership = np.array(labels, dtype=str)
def read_data(datapath):
    """Load the graph and the initial partitions stored in ``datapath``.

    Reads ``partitions.csv`` (with NA filtering disabled) and ``graph.pkl``
    from ``datapath``. Every row of the CSV is used as the initial
    membership of one ``leidenalg.ModularityVertexPartition`` built on the
    graph with the ``'weight'`` edge attribute.

    Returns:
        Tuple ``(graph, initial_partition)`` where ``initial_partition`` is
        the list of partitions, one per CSV row, in row order.
    """
    partitions_file = os.path.join(datapath, 'partitions.csv')
    graph_file = os.path.join(datapath, 'graph.pkl')

    partitions_df = pd.read_csv(partitions_file, na_filter=False)
    # NOTE: the graph is unpickled; only use this on trusted files.
    graph = ig.Graph.Read_Pickle(graph_file)

    initial_partition = []
    for _, membership_row in partitions_df.iterrows():
        initial_partition.append(
            leidenalg.ModularityVertexPartition(
                graph,
                initial_membership=membership_row.values,
                weights='weight'))
    return graph, initial_partition
def community_consensus_iterative(self, C):
    """Find the consensus of a given set of partitions.

    Refer to the paper: 'Robust detection of dynamic community structure in
    networks', Danielle S. Bassett, Mason A. Porter, Nicholas F. Wymbs,
    Scott T. Grafton, Jean M. Carlson et al.

    Args:
        C: 2-D array of shape (npart, m); row ``i`` holds the community
            label of each of the ``m`` nodes in partition ``i``. Labels are
            compared after conversion to int.

    Returns:
        The ``la.ModularityVertexPartition`` obtained by optimising
        modularity on the thresholded nodal association graph.
    """
    npart, m = C.shape
    C_rand3 = np.zeros(C.shape)  # permuted version of C
    X = np.zeros((m, m))  # nodal association matrix for C
    # Random nodal association matrix for C_rand3. This must be its own
    # array: binding it to X made both counts accumulate in one matrix.
    X_rand3 = np.zeros((m, m))

    for i in range(npart):
        # randomly permute row i of C
        C_rand3[i, :] = C[i, np.random.permutation(m)]
        labels = C[i, :].astype(int)
        labels_rand = C_rand3[i, :].astype(int)
        # (p, k) counts how often nodes p and k share a community ...
        X += labels[:, None] == labels[None, :]
        # ... and how often they do so by chance in the permuted data.
        X_rand3 += labels_rand[:, None] == labels_rand[None, :]

    # Thresholding: keep only associations that occur more often than half
    # the largest off-diagonal count of the random data.
    threshold = np.max(np.triu(X_rand3, 1)) / 2
    X_new3 = np.where(X > threshold, X, 0.0)

    # Turn the thresholded nodal association matrix into an igraph graph.
    # As before, every non-zero entry of the full matrix becomes an edge,
    # i.e. both (i, j) and (j, i) and any retained diagonal entries.
    rows, cols = np.nonzero(X_new3)
    edge_list = list(zip(rows.tolist(), cols.tolist()))
    weight_list = X_new3[rows, cols].tolist()

    G = ig.Graph()
    G.add_vertices(m)
    G.add_edges(edge_list)
    G.es['weight'] = weight_list
    G.vs['id'] = list(range(m))

    optimiser = la.Optimiser()
    partition = la.ModularityVertexPartition(G, weights='weight')
    # n_iterations=-1 is passed through to the optimiser unchanged.
    optimiser.optimise_partition(partition, n_iterations=-1)
    return partition
def compute_communities(self):
    '''Compute communities from a matrix with fixed nodes

    A graph is built from ``self.neighbors`` (one list of neighbor indices
    per node). The first ``self.n_fixed`` nodes start in the community of
    their cell type and are kept fixed, all other nodes start as singletons.

    Returns:
        None, but SemiAnnotate.membership is set as an array of str with
        size N - n_fixed with the community/cluster membership of all
        columns except the first n_fixed ones. Communities that coincide
        with a known cell type carry its name, the others carry their
        numeric community id as a string.

    Raises:
        ImportError: if the installed leidenalg does not expose a
            ``fixed_nodes`` argument in ``Optimiser.optimise_partition``.
        ValueError: if ``self.clustering_metric`` is neither 'cpm' nor
            'modularity'.
    '''
    import inspect
    import igraph as ig
    import leidenalg

    # Check whether this version of Leiden has fixed nodes support
    opt = leidenalg.Optimiser()
    sig = inspect.getfullargspec(opt.optimise_partition)
    if 'fixed_nodes' not in sig.args:
        raise ImportError(
            'This version of the leidenalg module does not support fixed nodes. Please update to a later (development) version'
        )

    matrix = self.matrix
    aa = self.cell_types
    # A list, not an ndarray: the unique types are looked up by position.
    aau = list(np.unique(aa))
    aaun = len(aau)
    n_fixed = self.n_fixed
    clustering_metric = self.clustering_metric
    resolution_parameter = self.resolution_parameter
    neighbors = self.neighbors

    _, N = matrix.shape

    # Construct graph from the lists of neighbors. Edges are stored as
    # ordered pairs so that (i, n) and (n, i) collapse into one; a node
    # listed as its own neighbor is skipped.
    edges_d = set()
    for i, neis in enumerate(neighbors):
        for n in neis:
            n = int(n)
            if n != i:
                edges_d.add((min(i, n), max(i, n)))
    edges = list(edges_d)
    g = ig.Graph(n=N, edges=edges, directed=False)

    # NOTE: initial membership is singletons except for atlas nodes, which
    # get the membership they have.
    community_of = {ct: i for i, ct in enumerate(aau)}
    initial_membership = [
        community_of[aa[j]] if j < n_fixed else aaun + (j - n_fixed)
        for j in range(N)
    ]

    # Compute communities with semi-supervised Leiden
    if clustering_metric == 'cpm':
        partition = leidenalg.CPMVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )
    elif clustering_metric == 'modularity':
        # NOTE(review): confirm that the installed leidenalg accepts
        # `resolution_parameter` for ModularityVertexPartition.
        partition = leidenalg.ModularityVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )
    else:
        raise ValueError('clustering_metric not understood: {:}'.format(
            clustering_metric))

    fixed_nodes = [int(i < n_fixed) for i in range(N)]
    opt.optimise_partition(partition, fixed_nodes=fixed_nodes)
    membership = partition.membership[n_fixed:]

    # Convert the known cell types. Community id i (< aaun) was seeded from
    # the i-th *unique* cell type, so that is what it is translated back
    # to. Labels are built directly so that neither names nor numeric ids
    # get truncated by a fixed string width.
    labels = [str(aau[m]) if m < aaun else str(m) for m in membership]
    self.membership = np.array(labels, dtype=str)
def leiden(
        self,
        axis,
        edges,
        edge_weights=None,
        metric='cpm',
        resolution_parameter=0.001,
        initial_membership=None,
        fixed_nodes=None,
        ):
    '''Graph-based Leiden clustering

    Args:
        axis (string): It must be 'samples' or 'features'.
            The Dataset.counts matrix is used and either samples or
            features are clustered.
        edges (list of pairs): list of edges to make a graph used to
            cluster. Each member of a pair is an int referring to the
            index of the sample or feature in the sample/featuresheet.
        edge_weights (list of float or None): edge weights to use for
            clustering. If None, all edge weights are 1.
        metric (str): What metric to optimize. Can be 'modularity' or
            'cpm'.
        resolution_parameter (float): a number between 0 and 1 that sets
            how easy it is to call new clusters.
        initial_membership (str or None): name of a metadata column
            containing the initial membership vector for the clustering.
            If None (default), each samples starts as a singleton
        fixed_nodes (str or None): name of a metadata column containing
            a boolean vector for which nodes are not allowed to change
            cluster membership during the Leiden algorithm. Your version
            of leidenalg must support fixed nodes for this feature to
            work.

    Returns:
        pd.Series with the labels of the clusters.

    Raises:
        ValueError: if axis is not 'samples' or 'features', or if metric
            is not 'cpm' or 'modularity'.
    '''
    # Validate cheap arguments first, before importing optional
    # dependencies or touching the dataset. An unknown axis used to fall
    # through and fail later with an unrelated UnboundLocalError.
    if axis not in ('samples', 'features'):
        raise ValueError(
            'axis must be "samples" or "features", got: {!r}'.format(axis))
    if metric not in ('cpm', 'modularity'):
        raise ValueError(
            'clustering_metric not understood: {:}'.format(metric))

    import igraph as ig
    import leidenalg

    if axis == 'samples':
        n_nodes = self.dataset.n_samples
        index = self.dataset.samplenames
        sheet = self.dataset.samplesheet
    else:
        n_nodes = self.dataset.n_features
        index = self.dataset.featurenames
        sheet = self.dataset.featuresheet

    g = ig.Graph(n=n_nodes, edges=edges, directed=False)
    if edge_weights is not None:
        g.es['weight'] = edge_weights

    if initial_membership is not None:
        im = sheet[initial_membership].values.astype(int)
    else:
        # Every node starts in its own community
        im = np.arange(n_nodes)
    im = list(im)

    if metric == 'cpm':
        partition = leidenalg.CPMVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=im,
        )
    else:
        # NOTE(review): confirm that the installed leidenalg accepts
        # `resolution_parameter` for ModularityVertexPartition.
        partition = leidenalg.ModularityVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=im,
        )

    opt = leidenalg.Optimiser()
    if fixed_nodes is not None:
        fxn = list(sheet[fixed_nodes].values.astype(int))
        opt.optimise_partition(partition, fixed_nodes=fxn)
    else:
        opt.optimise_partition(partition)

    communities = partition.membership
    labels = pd.Series(communities, index=index)
    return labels
def cluster_graph(self):
    '''Compute communities from a matrix with fixed nodes

    The first ``self.n_fixed`` columns are atlas entries. Entry ``isi``
    is expanded into ``int(self.sizes[isi])`` graph vertices that all start
    in community ``isi`` and are kept fixed; every remaining column
    contributes one vertex starting as a singleton.

    Returns:
        None, but Averages.membership is set as an array of str with the
        membership of all vertices after the fixed ones. Communities
        whose id is a valid index into ``self.cell_types`` carry that cell
        type name, the others carry their numeric id as a string.

    Raises:
        ImportError: if the installed leidenalg does not expose a
            ``fixed_nodes`` argument in ``Optimiser.optimise_partition``.
        ValueError: if the expanded initial membership does not have
            ``sum(self.sizes)`` entries, or if ``self.clustering_metric``
            is neither 'cpm' nor 'modularity'.
    '''
    import inspect
    import leidenalg

    # Check whether this version of Leiden has fixed nodes support
    opt = leidenalg.Optimiser()
    sig = inspect.getfullargspec(opt.optimise_partition)
    if 'fixed_nodes' not in sig.args:
        raise ImportError('This version of the leidenalg module does not support fixed nodes. Please update to a later (development) version')

    matrix = self.matrix
    sizes = self.sizes
    n_fixed = self.n_fixed
    clustering_metric = self.clustering_metric
    resolution_parameter = self.resolution_parameter
    g = self.graph

    _, N = matrix.shape
    # Number of fixed vertices and total number of vertices after expanding
    # each column by its size.
    n_fixede = int(np.sum(sizes[:n_fixed]))
    Ne = int(np.sum(sizes))

    # NOTE: initial membership is singletons except for atlas nodes, which
    # get the membership they have.
    initial_membership = []
    for isi in range(N):
        n_copies = int(sizes[isi]) if isi < n_fixed else 1
        initial_membership.extend([isi] * n_copies)

    if len(initial_membership) != Ne:
        raise ValueError('initial_membership list has wrong length!')

    # Compute communities with semi-supervised Leiden
    if clustering_metric == 'cpm':
        partition = leidenalg.CPMVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )
    elif clustering_metric == 'modularity':
        # NOTE(review): confirm that the installed leidenalg accepts
        # `resolution_parameter` for ModularityVertexPartition.
        partition = leidenalg.ModularityVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )
    else:
        raise ValueError(
            'clustering_metric not understood: {:}'.format(clustering_metric))

    fixed_nodes = [int(i < n_fixede) for i in range(Ne)]
    opt.optimise_partition(partition, fixed_nodes=fixed_nodes)
    membership = partition.membership[n_fixede:]

    # Convert the known cell types. Labels are built directly instead of
    # writing into an array whose width is the longest cell type name:
    # that width truncated longer numeric ids (e.g. '1523' -> '15'), which
    # could then be mistaken for an atlas community and renamed.
    cell_types = self.cell_types
    n_types = len(cell_types)
    labels = [
        str(cell_types[m]) if m < n_types else str(m) for m in membership
    ]
    self.membership = np.array(labels, dtype=str)
def leiden(
    adata: AnnData,
    resolution: float = 1,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_state: Optional[Union[int, RandomState]] = 0,
    key_added: str = 'leiden',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = True,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    copy: bool = False,
    **partition_kwargs,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Traag18]_.

    Cluster cells using the Leiden algorithm [Traag18]_,
    an improved version of the Louvain algorithm [Blondel08]_.
    It has been proposed for single-cell analysis by [Levine15]_.

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        A parameter value controlling the coarseness of the clustering.
        Higher values lead to more clusters.
        Set to `None` if overriding `partition_type`
        to one that doesn’t accept a `resolution_parameter`.
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain `(obs_key, list_of_categories)`.
    key_added
        `adata.obs` key under which to add the cluster labels.
        When `restrict_to` is given and `key_added` is left at `'leiden'`,
        `'leiden_R'` is used instead.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']`.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    n_iterations
        How many iterations of the Leiden clustering algorithm to perform.
        Positive values above 2 define the total number of iterations to
        perform, -1 has the algorithm run until it reaches its optimal
        clustering.
    partition_type
        Type of partition to use.
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`.
        For the available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    copy
        Whether to copy `adata` or modify it inplace.
    **partition_kwargs
        Any further arguments to pass to `~leidenalg.find_partition`
        (which in turn passes arguments to the `partition_type`).

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns[key_added]['params']`
        A dict with the values for the parameters `resolution`,
        `random_state`, `n_iterations`, `use_weights`, `directed` and
        the name of the `partition_type`.
    `adata.uns[key_added]['modularity']`
        Modularity of the resulting partition, computed with the same edge
        weights (if any) that were used for the clustering.
    """
    try:
        import leidenalg
    except ImportError:
        raise ImportError(
            'Please install the leiden algorithm: `pip3 install leidenalg`.'
        )
    partition_kwargs = dict(partition_kwargs)

    start = logg.info('running Leiden clustering')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if 'neighbors' not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = _utils.get_igraph_from_adjacency(adjacency, directed=directed)
    # flip to the default partition type if not overriden by the user
    if partition_type is None:
        partition_type = leidenalg.RBConfigurationVertexPartition
    # Prepare find_partition arguments as a dictionary,
    # appending to whatever the user provided. It needs to be this way
    # as this allows for the accounting of a None resolution
    # (in the case of a partition variant that doesn't take it on input)
    if use_weights:
        partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
    partition_kwargs['n_iterations'] = n_iterations
    partition_kwargs['seed'] = random_state
    if resolution is not None:
        partition_kwargs['resolution_parameter'] = resolution
    # clustering proper
    part = leidenalg.find_partition(g, partition_type, **partition_kwargs)
    # store output into adata.obs
    groups = np.array(part.membership)
    if restrict_to is not None:
        if key_added == 'leiden':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(np.unique(groups).astype('U')),
    )
    # store information on the clustering parameters
    # (partition_type can no longer be None here, it was defaulted above)
    adata.uns[key_added] = {}
    adata.uns[key_added]['params'] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
        use_weights=use_weights,
        directed=directed,
        partition_type=partition_type.__name__,
    )
    # calculate modularity with the same weights the clustering used;
    # without them the reported value would describe the unweighted graph
    modularity_part = leidenalg.ModularityVertexPartition(
        g,
        initial_membership=part.membership,
        weights=partition_kwargs.get('weights'),
    )
    q = modularity_part.quality()
    adata.uns[key_added]['modularity'] = q
    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)\n'
            f'    modularity: {q:.3f}, resolution: {resolution}\n'
            f'    added "modularity" key to adata.uns["{key_added}"]'
        ),
    )
    return adata if copy else None