def _cluster(
    data: Union[MuData, AnnData],
    resolution: Optional[Union[float, Sequence[float], Mapping[str, float]]] = None,
    mod_weights: Optional[Union[Sequence[float], Mapping[str, float]]] = None,
    random_state: int = 0,
    key_added: str = "louvain",
    neighbors_key: Optional[str] = None,
    directed: bool = True,
    partition_type: Optional[
        Union[Type[LeidenMutableVertexPartition], Type[LouvainMutableVertexPartition]]
    ] = None,
    partition_kwargs: Mapping[str, Any] = MappingProxyType({}),
    algorithm: str = "leiden",  # Literal["leiden", "louvain"]
    **kwargs,
):
    """
    Cluster cells using the Leiden or Louvain algorithm.

    See :func:`scanpy.tl.leiden` and :func:`scanpy.tl.louvain` for details.
    """
    from scanpy.tools._utils import _choose_graph
    from scanpy._utils import get_igraph_from_adjacency

    if algorithm == "louvain":
        import louvain

        alg = louvain
    elif algorithm == "leiden":
        import leidenalg

        alg = leidenalg
    else:
        raise ValueError(f"Algorithm should be either 'louvain' or 'leiden', not '{algorithm}'")

    if isinstance(data, AnnData):
        sc_tl_cluster = sc.tl.leiden if algorithm == "leiden" else sc.tl.louvain
        return sc_tl_cluster(
            data,
            resolution=resolution,
            random_state=random_state,
            key_added=key_added,
            neighbors_key=neighbors_key,
            **kwargs,
        )
    elif isinstance(data, MuData):
        mdata = data
    else:
        raise TypeError("Expected a MuData object")

    partition_kwargs = dict(partition_kwargs)

    gs = {}
    for mod in mdata.mod:
        adjacency = _choose_graph(mdata.mod[mod], None, neighbors_key)
        g = get_igraph_from_adjacency(adjacency, directed=directed)
        gs[mod] = g

    if mod_weights:
        if isinstance(mod_weights, Mapping):
            layer_weights = [mod_weights.get(mod, 1) for mod in mdata.mod]
        elif isinstance(mod_weights, Sequence) and not isinstance(mod_weights, str):
            assert len(mod_weights) == len(
                mdata.mod
            ), f"Length of mod_weights ({len(mod_weights)}) does not match the number of modalities ({len(mdata.mod)})"
            layer_weights = mod_weights
        else:
            layer_weights = [mod_weights for _ in mdata.mod]
    else:
        layer_weights = None

    if partition_type is None:
        partition_type = alg.RBConfigurationVertexPartition

    optimiser = alg.Optimiser()
    if random_state:
        optimiser.set_rng_seed(random_state)

    # The same as leidenalg.find_partition_multiplex() (louvain.find_partition_multiplex())
    # but allows a resolution to be specified for each modality
    if resolution:
        if isinstance(resolution, Mapping):
            # Specific resolution for each modality
            parts = [
                partition_type(gs[mod], resolution_parameter=resolution[mod], **partition_kwargs)
                for mod in mdata.mod
            ]
        elif isinstance(resolution, Sequence) and not isinstance(resolution, str):
            assert len(resolution) == len(
                mdata.mod
            ), f"Length of resolution ({len(resolution)}) does not match the number of modalities ({len(mdata.mod)})"
            parts = [
                partition_type(gs[mod], resolution_parameter=resolution[i], **partition_kwargs)
                for i, mod in enumerate(mdata.mod)
            ]
        else:
            # Single resolution for all modalities
            parts = [
                partition_type(gs[mod], resolution_parameter=resolution, **partition_kwargs)
                for mod in mdata.mod
            ]
    else:
        parts = [partition_type(gs[mod], **partition_kwargs) for mod in mdata.mod]

    improv = optimiser.optimise_partition_multiplex(
        partitions=parts,
        layer_weights=layer_weights,
        **kwargs,
    )

    # All partitions are the same
    groups = np.array(parts[0].membership)

    mdata.obs[key_added] = pd.Categorical(
        values=groups.astype("U"),
        categories=natsorted(map(str, np.unique(groups))),
    )
    mdata.uns[algorithm] = {}
    mdata.uns[algorithm]["params"] = dict(
        resolution=resolution,
        random_state=random_state,
        partition_improvement=improv,
    )

    return None
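# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes a
# MuData object with neighbors computed per modality; the modality names
# 'rna' and 'atac' and the key 'leiden_joint' are hypothetical.
def _example_cluster_usage(mdata):
    _cluster(
        mdata,
        resolution={"rna": 1.0, "atac": 0.8},   # per-modality resolutions
        mod_weights={"rna": 1.0, "atac": 0.5},  # weight each modality's graph
        key_added="leiden_joint",
        algorithm="leiden",
    )
    # joint labels are stored in the shared .obs of the MuData object
    return mdata.obs["leiden_joint"].value_counts()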
def state_from_blocks(
    adata: AnnData,
    state_key: Optional[str] = 'nsbm',
    neighbors_key: Optional[str] = 'neighbors',
    adjacency: Optional[spmatrix] = None,
    directed: bool = False,
    use_weights: bool = False,
    deg_corr: bool = True,
):
    """
    Returns a graph-tool state object given an AnnData.

    Parameters
    ----------
    adata
        The annotated data matrix.
    state_key
        The key under which the state has been saved.
    neighbors_key
        The key passed to `sc.pp.neighbors`.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this increases
        computation times.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem to be
        the case for KNN graphs used in scanpy.

    Returns
    -------
    The graph-tool block state rebuilt from the blocks stored in `adata.uns['schist']`.
    """
    bl_d = adata.uns['schist'][f'{state_key}']['blocks']
    params = adata.uns['schist'][f'{state_key}']['params']
    if params['model'] == 'nested' or params['model'] == 'multiome_nested':
        blocks = []
        for nl in range(len(bl_d)):
            blocks.append(bl_d[str(nl)])
    else:
        blocks = bl_d['0']

    if 'deg_corr' in params:
        deg_corr = params['deg_corr']

    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']

    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    # edge covariates can only be set once the graph has been built
    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']
    if 'recs' in params:
        recs = params['recs']
    if 'rec_types' in params:
        rec_types = params['rec_types']

    if params['model'] == 'flat':
        state = gt.BlockState(
            g, b=blocks,
            state_args=dict(deg_corr=deg_corr, recs=recs, rec_types=rec_types)
        )
    elif params['model'] == 'ppbm':
        state = gt.PPBlockState(
            g, b=blocks,
            state_args=dict(deg_corr=deg_corr, recs=recs, rec_types=rec_types)
        )
    else:
        state = gt.NestedBlockState(
            g, bs=blocks,
            state_args=dict(deg_corr=deg_corr, recs=recs, rec_types=rec_types)
        )
    return state
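# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes that
# `adata` was previously processed with a schist model so that
# adata.uns['schist']['nsbm'] holds 'blocks' and 'params'.
def _example_state_from_blocks(adata):
    state = state_from_blocks(adata, state_key="nsbm", neighbors_key="neighbors")
    # the rebuilt graph-tool state can be queried directly, e.g. for its entropy
    return state.entropy()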
def louvain(
    adata: AnnData,
    resolution: Optional[float] = None,
    random_state: Optional[Union[int, RandomState]] = 0,
    log_fname: str = '',
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    key_added: Optional[str] = 'louvain',
    adjacency: Optional[spmatrix] = None,
    flavor: str = 'vtraag',
    directed: bool = True,
    use_weights: bool = False,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    partition_kwargs: Optional[Mapping[str, Any]] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the implementation
    of [Traag17]_. The Louvain algorithm has been proposed for single-cell
    analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first, or explicitly passing an
    ``adjacency`` matrix.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        For the default flavor (``'vtraag'``), you can provide a resolution
        (higher resolution means finding more and smaller clusters),
        which defaults to 1.0.
        See “Time as a resolution parameter” in [Lambiotte09]_.
    random_state
        Change the initialization of the optimization.
    log_fname
        Name of the log file passed to ``louvain.find_partition``.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain ``(obs_key, list_of_categories)``.
    key_added
        Key under which to add the cluster labels. (default: ``'louvain'``)
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        ``adata.uns['neighbors']['connectivities']``.
    flavor : {``'vtraag'``, ``'igraph'``}
        Choose between two packages for computing the clustering.
        ``'vtraag'`` is much more powerful, and the default.
    directed
        Interpret the ``adjacency`` matrix as directed graph?
    use_weights
        Use weights from knn graph.
    partition_type
        Type of partition to use. Only a valid argument if ``flavor`` is ``'vtraag'``.
    partition_kwargs
        Key word arguments to pass to partitioning, if ``vtraag`` method is being used.
    copy
        Copy adata or modify it inplace.

    Returns
    -------
    :obj:`None`
        By default (``copy=False``), updates ``adata`` with the following fields:

        ``adata.obs['louvain']`` (:class:`pandas.Series`, dtype ``category``)
            Array of dim (number of samples) that stores the subgroup id
            (``'0'``, ``'1'``, ...) for each cell.
    :class:`~anndata.AnnData`
        When ``copy=True`` is set, a copy of ``adata`` with those fields is returned.
    """
    start = logg.info('running Louvain clustering')
    if (flavor != 'vtraag') and (partition_type is not None):
        raise ValueError(
            '`partition_type` is only a valid argument when `flavor` is "vtraag"'
        )
    adata = adata.copy() if copy else adata
    if adjacency is None and 'neighbors' not in adata.uns:
        raise ValueError(
            'You need to run `pp.neighbors` first to compute a neighborhood graph.'
        )
    if adjacency is None:
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata, restrict_key, restrict_categories, adjacency
        )
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warning('`resolution` parameter has no effect for flavor "igraph"')
        if directed and flavor == 'igraph':
            directed = False
        if not directed:
            logg.debug('    using the undirected graph')
        g = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if use_weights:
            weights = np.array(g.es["weight"]).astype(np.float64)
        else:
            weights = None
        if flavor == 'vtraag':
            import louvain

            if partition_kwargs is None:
                partition_kwargs = {}
            if partition_type is None:
                partition_type = louvain.RBConfigurationVertexPartition
            if resolution is not None:
                partition_kwargs["resolution_parameter"] = resolution
            if use_weights:
                partition_kwargs["weights"] = weights
            logg.info('    using the "louvain" package of Traag (2017)')
            louvain.set_rng_seed(random_state)
            part = louvain.find_partition(
                g,
                partition_type,
                log_fname=log_fname,
                **partition_kwargs,
            )
            # adata.uns['louvain_quality'] = part.quality()
        else:
            part = g.community_multilevel(weights=weights)
        groups = np.array(part.membership)
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community

        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        raise ValueError('`flavor` needs to be "vtraag" or "igraph" or "taynaud".')
    if restrict_to is not None:
        if key_added == 'louvain':
            key_added += '_R'
        groups = rename_groups(
            adata, key_added, restrict_key, restrict_categories, restrict_indices, groups
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(np.unique(groups).astype('U')),
    )
    adata.uns['louvain'] = {}
    adata.uns['louvain']['params'] = {
        'resolution': resolution,
        'random_state': random_state,
    }
    logg.info(
        '    finished',
        time=start,
        # deep=(
        #     f'found {len(np.unique(groups))} clusters and added\n'
        #     f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        # ),
    )
    return adata if copy else None
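# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes
# scanpy is installed; the bundled PBMC dataset and the key 'louvain_r1' are
# only examples.
def _example_louvain_usage():
    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    sc.pp.neighbors(adata, n_neighbors=15)
    louvain(adata, resolution=1.0, key_added="louvain_r1")
    return adata.obs["louvain_r1"].value_counts()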
def planted_model(
    adata: AnnData,
    n_sweep: int = 10,
    beta: float = np.inf,
    tolerance=1e-6,
    collect_marginals: bool = True,
    deg_corr: bool = True,
    samples: int = 100,
    n_jobs: int = -1,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'ppbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    save_model: Union[str, None] = None,
    # minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the Planted Partition Block Model [Peixoto14]_,
    performing Bayesian inference on node groups. This function, in particular,
    uses the Planted Block Model, which is particularly suitable for assortative
    graphs, and it returns the optimal number of communities.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    n_sweep
        Number of MCMC sweeps to get the initial guess.
    beta
        Inverse temperature for the initial MCMC sweep.
    tolerance
        Difference in description length below which MCMC sweep iterations stop.
    collect_marginals
        Whether or not to collect node probabilities of belonging to a specific
        partition.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem to be
        the case for KNN graphs used in scanpy.
    samples
        Number of initial minimizations to be performed. This also influences
        the precision of the marginals.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6.
    neighbors_key
        The key passed to `sc.pp.neighbors`.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this increases
        computation times.
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to be saved.
    random_seed
        Random number to be used as seed for graph-tool.
    n_jobs
        Number of parallel computations used during model initialization.
    dispatch_backend
        Backend for joblib parallelisation, either 'processes' or 'threads'.

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['schist'][key_added]['params']`
        A dict with the parameters used in this analysis.
    `adata.uns['schist'][key_added]['stats']`
        A dict with the entropy and modularity of the final state.
    `adata.uns['schist'][key_added]['blocks']`
        The block assignment of the final state.
    `adata.obsm[f'CM_{key_added}']`
        A `np.ndarray` with cell probability of belonging to a specific group
        (only if `collect_marginals` is `True`).
    """
    if random_seed:
        np.random.seed(random_seed)
    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires a sufficient number of samples\n'
            f'It is now set to {samples} and should be at least 100'
        )

    start = logg.info('minimizing the Planted Partition Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph and graph-tool
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if samples < 1:
        samples = 1

    # initialize the block states
    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        if seed:
            gt.seed_rng(seed)
        dS = 1
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta, niter=n_sweep)
        return state

    states = [gt.PPBlockState(g) for x in range(samples)]

    # perform a mcmc sweep on each
    # no list comprehension as I need to collect stats
    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples)
    )
    logg.info('    minimization step done', time=start)

    pmode = gt.PartitionModeState([x.get_blocks().a for x in states], converge=True)
    bs = pmode.get_max(g)
    logg.info('    consensus step done', time=start)

    if save_model:
        import pickle

        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    state = gt.PPBlockState(g, b=bs)
    logg.info('    done', time=start)

    groups = np.array(bs.get_array())
    u_groups = np.unique(groups)
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1

    if collect_marginals:
        pv_array = pmode.get_marginal(g).get_2d_array(range(last_group)).T[:, u_groups] / samples

    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    if restrict_to is not None:
        if key_added == 'ppbm':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )

    # add column names
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )

    # now add marginal probabilities.
    if collect_marginals:
        # cell marginals will be a list of arrays with probabilities
        # of belonging to a specific group
        adata.obsm[f"CM_{key_added}"] = pv_array

    # add some unstructured info
    if 'schist' not in adata.uns:
        adata.uns['schist'] = {}

    adata.uns['schist'][f'{key_added}'] = {}
    adata.uns['schist'][f'{key_added}']['stats'] = dict(
        entropy=state.entropy(),
        modularity=gt.modularity(g, state.get_blocks())
    )
    # record state as list of blocks
    # for compatibility with nested model, use a dictionary with a single key here
    # although a np.array would be ok
    adata.uns['schist'][f'{key_added}']['blocks'] = {
        '0': np.array(state.get_blocks().a)
    }

    # last step is recording some parameters used in this analysis
    adata.uns['schist'][f'{key_added}']['params'] = dict(
        model='planted',
        use_weights=use_weights,
        neighbors_key=neighbors_key,
        key_added=key_added,
        samples=samples,
        collect_marginals=collect_marginals,
        random_seed=random_seed,
        deg_corr=deg_corr,
        recs=recs,
        rec_types=rec_types
    )

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {state.get_B()} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
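# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes
# scanpy, graph-tool and joblib are installed; the dataset, `samples` and
# `random_seed` values are only examples.
def _example_planted_model_usage():
    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    sc.pp.neighbors(adata, n_neighbors=15)
    planted_model(adata, samples=20, random_seed=42, key_added="ppbm")
    # cluster labels land in .obs, summary statistics under the 'schist' slot
    return adata.obs["ppbm"].value_counts(), adata.uns["schist"]["ppbm"]["stats"]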
def to_igraph(self):
    """Generate igraph from connectivities."""
    return _utils.get_igraph_from_adjacency(self.connectivities)
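# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). Here `neigh`
# stands for any object that exposes a sparse `.connectivities` matrix and
# this `to_igraph` method (e.g. a neighbors container); the name is hypothetical.
def _example_to_igraph(neigh):
    g = neigh.to_igraph()
    # connectivity values are carried over as edge weights
    return g.vcount(), g.ecount()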
def calculate_affinity(
    adata: AnnData,
    level: int = 1,
    block_key: Optional[str] = 'nsbm',
    group_by: Optional[str] = None,
    state: Optional = None,
    neighbors_key: Optional[str] = 'neighbors',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = False,
    use_weights: bool = False,
    obsp: Optional[str] = None,
    back_prob: bool = False,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Calculate cell affinity given a partition scheme. It can be used for
    partitions calculated using schist or for any partition scheme, given
    for example by cell annotations.

    Parameters
    ----------
    adata
        The AnnData object. Should have been already processed with schist.
    level
        The level at which to calculate affinity. This parameter is effective
        only for nested partitions.
    block_key
        The prefix for partitions. This parameter is ignored if the state
        is not a gt.NestedBlockState.
    group_by
        The key for group names used for calculations. Setting this will
        override level and block_key. This is effective only for
        NestedBlockState partitions.
    state
        Optionally calculate affinities on this state.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, look in .obsp['connectivities'] for connectivities
        (default storage place for pp.neighbors).
        If specified, look in
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    obsp
        Use .obsp[obsp] as adjacency. You can't specify both `obsp` and
        `neighbors_key` at the same time.
    back_prob
        If `True`, use the probabilities returned by `get_cell_back_p` instead
        of per-cell log-likelihoods.
    copy
        Return a new object or do everything in place.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with affinity values
    in adata.obsm[f'CA_{block_key}_level_{level}'].
    """
    matrix_key = f'CA_{block_key}_level_{level}'  # the default name of the matrix
    if group_by:
        logg.info(f'Calculating cell affinity to {group_by}')
    else:
        logg.info(f'Calculating cell affinity to level {level}')

    if not state:
        # if no state is provided, use the default to retrieve graph
        if 'schist' in adata.uns and 'blocks' in adata.uns['schist'][f'{block_key}']:
            params = adata.uns['schist'][f'{block_key}']['params']
            if 'neighbors_key' in params:
                neighbors_key = params['neighbors_key']
            if 'use_weights' in params:
                use_weights = params['use_weights']
            deg_corr = True  # default, overridden by stored params if present
            if 'deg_corr' in params:
                deg_corr = params['deg_corr']
            state = state_from_blocks(
                adata,
                state_key=block_key,
                neighbors_key=neighbors_key,
                adjacency=adjacency,
                directed=directed,
                use_weights=use_weights,
                deg_corr=deg_corr,
            )
            g = state.g
        elif not neighbors_key:
            # no state and no adjacency provided, raise an error
            raise ValueError(
                "A state or an adjacency matrix should be given. "
                "Otherwise a graph cannot be computed."
            )
        else:
            # get the graph from the adjacency
            adjacency = _choose_graph(adata, obsp, neighbors_key)
            g = get_igraph_from_adjacency(adjacency, directed=directed)
            g = g.to_graph_tool()
            gt.remove_parallel_edges(g)
            state = gt.BlockState(g)
    else:
        g = state.g

    if group_by:
        matrix_key = f'CA_{group_by}'
        # if groups are given, we generate a new BlockState and work on that
        if group_by in adata.obs.columns and adata.obs[group_by].dtype.name == 'category':
            partitions = adata.obs[group_by].cat.codes.values
            state = gt.BlockState(g, b=partitions)
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
        else:
            raise ValueError(f"{group_by} should be a categorical entry in adata.obs")
    else:
        # use precomputed blocks and states
        if type(state) == gt.NestedBlockState:
            if back_prob:
                p0 = get_cell_back_p(state, level=0)
            else:
                p0 = get_cell_loglikelihood(state, level=0, as_prob=True)

            group_col = None
            if group_by and group_by in adata.obs.columns:
                group_col = group_by
            else:
                g_name = f'{block_key}_level_{level}'
                if g_name in adata.obs.columns:
                    group_col = g_name
            if not group_col:
                raise ValueError("The provided groups or level/blocks do not exist")

            g0 = pd.Categorical(state.project_partition(0, 0).a)
            cross_tab = pd.crosstab(g0, adata.obs[group_col], normalize='index')
            ca_matrix = (p0 @ cross_tab).values

        elif type(state) == gt.PPBlockState:
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
            matrix_key = 'CA_ppbm'

    adata.obsm[matrix_key] = ca_matrix
    return adata if copy else None
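# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes
# `adata` has already been clustered with schist's nested model, so that the
# column 'nsbm_level_1' and the 'schist' slot exist; keys are only examples.
def _example_calculate_affinity(adata):
    calculate_affinity(adata, level=1, block_key="nsbm")
    # per-cell affinity to each level-1 group
    return adata.obsm["CA_nsbm_level_1"]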
def nested_model_multi(
    adatas: List[AnnData],
    deg_corr: bool = True,
    tolerance: float = 1e-6,
    n_sweep: int = 10,
    beta: float = np.inf,
    samples: int = 100,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    *,
    random_seed: Optional[int] = None,
    key_added: str = 'multi_nsbm',
    adjacency: Optional[List[sparse.spmatrix]] = None,
    neighbors_key: Optional[List[str]] = ['neighbors'],
    directed: bool = False,
    use_weights: bool = False,
    save_model: Union[str, None] = None,
    copy: bool = False,
    # minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
    # equilibrate_args: Optional[Dict] = {},
) -> Optional[List[AnnData]]:
    """\
    Cluster cells into subgroups using multiple modalities.

    Cluster cells using the nested Stochastic Block Model [Peixoto14]_,
    performing Bayesian inference on node groups. This function takes multiple
    experiments, possibly across different modalities, and performs joint
    clustering.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first. It also requires cells to have the
    same names if coming from paired experiments.

    Parameters
    ----------
    adatas
        A list of processed AnnData. Neighbors must have been already calculated.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem to be
        the case for KNN graphs used in scanpy.
    tolerance
        Tolerance for fast model convergence.
    n_sweep
        Number of iterations to be performed in the fast model MCMC greedy approach.
    beta
        Inverse temperature for the MCMC greedy approach.
    samples
        Number of initial minimizations to be performed. The one with the
        smallest entropy is chosen.
    collect_marginals
        Whether or not to collect node probabilities of belonging to a specific
        partition.
    n_jobs
        Number of parallel computations used during model initialization.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6.
    neighbors_key
        The key passed to `sc.pp.neighbors`. If all AnnData share the same key,
        only one has to be specified, otherwise the full list of keys must be
        provided.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this increases
        computation times.
    save_model
        If provided, this will be the filename for the PartitionModeState to be saved.
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool.
    dispatch_backend
        Backend for joblib parallelisation, either 'processes' or 'threads'.

    Returns
    -------
    `adata.obs[f'{key_added}_level_{n}']`
        Arrays of dim (number of samples) that store the subgroup id
        (`'0'`, `'1'`, ...) for each cell at each hierarchy level `n`.
    `adata.uns['schist'][key_added]['params']`
        A dict with the parameters used in this analysis.
    `adata.uns['schist'][key_added]['stats']`
        A dict with the entropy and modularity at each hierarchy level.
    `adata.uns['schist'][key_added]['blocks']`
        The block assignments at each hierarchy level.
    `adata.obsm[f'CM_{key_added}_level_{n}']`
        A `np.ndarray` with cell probability of belonging to a specific group
        at level `n` (only if `collect_marginals` is `True`).
    """
    if random_seed:
        np.random.seed(random_seed)
    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires a sufficient number of samples\n'
            f'It is now set to {samples} and should be at least 100'
        )

    start = logg.info('minimizing the nested Stochastic Block Model')

    if copy:
        adatas = [x.copy() for x in adatas]

    n_keys = len(neighbors_key)
    n_data = len(adatas)
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = []
        if n_keys > 1 and n_keys < n_data:
            raise ValueError(
                'The number of neighbors keys does not match '
                'the number of data matrices. Either fix this '
                'or pass a neighbor key that is shared across all modalities'
            )
        if n_keys == 1:
            neighbors_key = [neighbors_key[0] for x in range(n_data)]
        for x in range(n_data):
            logg.info(f'getting adjacency for data {x}', time=start)
            if neighbors_key[x] not in adatas[x].uns:
                raise ValueError(
                    'You need to run `pp.neighbors` first '
                    'to compute a neighborhood graph for '
                    f'data entry {x}'
                )
            elif 'connectivities_key' in adatas[x].uns[neighbors_key[x]]:
                # scanpy>1.4.6 has matrix in another slot
                conn_key = adatas[x].uns[neighbors_key[x]]['connectivities_key']
                adjacency.append(adatas[x].obsp[conn_key])
            else:
                # scanpy<=1.4.6 has sparse matrix here
                adjacency.append(adatas[x].uns[neighbors_key[x]]['connectivities'])

    # convert it to igraph and graph-tool
    graph_list = []
    for x in range(n_data):
        g = get_igraph_from_adjacency(adjacency[x], directed=directed)
        g = g.to_graph_tool()
        gt.remove_parallel_edges(g)
        # add cell names to graph, this will be used to create
        # layered graph
        g_names = g.new_vertex_property('string')
        d_names = adatas[x].obs_names
        for xn in range(len(d_names)):
            g_names[xn] = d_names[xn]
        g.vp['cell'] = g_names
        graph_list.append(g)

    # skip weights for now
    # recs=[]
    # rec_types=[]
    # if use_weights:
    #     # this is not ideal to me, possibly we may need to transform
    #     # weights. More tests needed.
    #     recs=[g.ep.weight]
    #     rec_types=['real-normal']

    # get a non-redundant list of all cell names across all modalities
    all_names = set(adatas[0].obs_names)
    for x in range(1, n_data):
        all_names.update(adatas[x].obs_names)
    all_names = list(all_names)

    # create the shared graph
    union_g = gt.Graph(directed=False)
    union_g.add_vertex(len(all_names))
    u_names = union_g.new_vertex_property('string')
    for xn in range(len(all_names)):
        u_names[xn] = all_names[xn]
    union_g.vp['cell'] = u_names

    # now handle in a non elegant way the index mapping across all
    # modalities and the unified Graph
    u_cell_index = dict([(union_g.vp['cell'][x], x) for x in range(union_g.num_vertices())])
    # now create layers
    layer = union_g.new_edge_property('int')
    for ng in range(n_data):
        for e in graph_list[ng].edges():
            S, T = e.source(), e.target()
            Sn = graph_list[ng].vp['cell'][S]
            Tn = graph_list[ng].vp['cell'][T]
            Sidx = u_cell_index[Sn]
            Tidx = u_cell_index[Tn]
            ne = union_g.add_edge(Sidx, Tidx)
            layer[ne] = ng + 1  # this is the layer label

    union_g.ep['layer'] = layer
    # DONE! now proceed with standard minimization, more or less

    if samples < 1:
        samples = 1

    states = [
        gt.NestedBlockState(
            g=union_g,
            base_type=gt.LayeredBlockState,
            state_args=dict(deg_corr=deg_corr, ec=union_g.ep.layer, layers=True),
        )
        for n in range(samples)
    ]

    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        if seed:
            gt.seed_rng(seed)
        dS = 1
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta, niter=n_sweep, c=0.5)
        return state

    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples)
    )
    logg.info('    minimization step done', time=start)

    pmode = gt.PartitionModeState([x.get_bs() for x in states], converge=True, nested=True)
    bs = pmode.get_max_nested()
    logg.info('    consensus step done', time=start)

    if save_model:
        import pickle

        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    # prune redundant levels at the top
    bs = [x for x in bs if len(np.unique(x)) > 1]
    bs.append(np.array([0], dtype=np.int32))  # in case of type changes, check this

    state = gt.NestedBlockState(
        union_g,
        bs=bs,
        base_type=gt.LayeredBlockState,
        state_args=dict(deg_corr=deg_corr, ec=union_g.ep.layer, layers=True),
    )
    logg.info('    done', time=start)

    u_groups = np.unique(bs[0])
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1

    if collect_marginals:
        # note that the size of this will be equal to the number of the groups in Mode
        # but some entries won't sum to 1 as in the collection there may be differently
        # sized partitions
        pv_array = pmode.get_marginal(union_g).get_2d_array(range(last_group)).T[:, u_groups] / samples

    groups = np.zeros((union_g.num_vertices(), len(bs)), dtype=int)

    for x in range(len(bs)):
        # for each level, project labels to the vertex level
        # so that every cell has a name. Note that at this level
        # the labels are not necessarily consecutive
        groups[:, x] = state.project_partition(x, 0).get_array()

    groups = pd.DataFrame(groups).astype('category')

    # rename categories from 0 to n
    for c in groups.columns:
        ncat = len(groups[c].cat.categories)
        new_cat = [u'%s' % x for x in range(ncat)]
        groups[c] = groups[c].cat.rename_categories(new_cat)

    levels = groups.columns

    # recode block names to have consistency with group names
    i_groups = groups.astype(int)
    bs = [i_groups.iloc[:, 0].values]
    for x in range(1, groups.shape[1]):
        bs.append(np.where(pd.crosstab(i_groups.iloc[:, x - 1], i_groups.iloc[:, x]) > 0)[1])
    state = gt.NestedBlockState(union_g, bs)
    del i_groups

    groups.index = all_names

    # add column names
    groups.columns = [f"{key_added}_level_{level}" for level in range(len(bs))]

    # remove any column with the same key
    for xn in range(n_data):
        drop_columns = groups.columns.intersection(adatas[xn].obs.columns)
        adatas[xn].obs.drop(drop_columns, axis='columns', inplace=True)
        adatas[xn].obs = pd.concat(
            [adatas[xn].obs, groups.loc[adatas[xn].obs_names]], axis=1
        )

        # now add marginal probabilities.
        if collect_marginals:
            # add marginals for level 0, then sum up according to the hierarchy
            _groups = groups.loc[adatas[xn].obs_names]
            _pv_array = pd.DataFrame(pv_array, index=all_names).loc[adatas[xn].obs_names].values
            adatas[xn].obsm[f"CM_{key_added}_level_0"] = _pv_array
            for group in groups.columns[1:]:
                ct = pd.crosstab(
                    _groups[_groups.columns[0]], _groups[group],
                    normalize='index', dropna=False
                )
                adatas[xn].obsm[f'CM_{group}'] = _pv_array @ ct.values

        # add some unstructured info
        if 'schist' not in adatas[xn].uns:
            adatas[xn].uns['schist'] = {}

        adatas[xn].uns['schist'][f'{key_added}'] = {}
        adatas[xn].uns['schist'][f'{key_added}']['stats'] = dict(
            level_entropy=np.array(
                [state.level_entropy(x) for x in range(len(state.levels))]
            ),
            modularity=np.array([
                gt.modularity(union_g, state.project_partition(x, 0))
                for x in range(len(state.levels))
            ])
        )

        bl_d = {}
        levels = state.get_levels()
        for nl in range(len(levels)):
            bl_d[str(nl)] = np.array(levels[nl].get_blocks().a)
        adatas[xn].uns['schist'][f'{key_added}']['blocks'] = bl_d

        # last step is recording some parameters used in this analysis
        adatas[xn].uns['schist'][f'{key_added}']['params'] = dict(
            model='multiome_nested',
            use_weights=use_weights,
            neighbors_key=neighbors_key[xn],
            key_added=key_added,
            samples=samples,
            collect_marginals=collect_marginals,
            random_seed=random_seed,
            deg_corr=deg_corr,
            # recs=recs,
            # rec_types=rec_types
        )

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adatas if copy else None
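# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes two
# modality-specific AnnData objects from a paired experiment (shared cell
# names), each with neighbors computed; variable names are hypothetical.
def _example_nested_model_multi(adata_rna, adata_atac):
    nested_model_multi(
        [adata_rna, adata_atac],
        samples=20,
        random_seed=42,
        key_added="multi_nsbm",
    )
    # every input AnnData receives one label column per hierarchy level
    return adata_rna.obs.filter(like="multi_nsbm_level_").head()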
def leiden(
    adata: AnnData,
    resolution: float = 1,
    samples: int = 100,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_state: _utils.AnyRandom = 0,
    key_added: str = 'leiden',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = True,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    neighbors_key: Optional[str] = None,
    obsp: Optional[str] = None,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    copy: bool = False,
    save_model: Union[str, None] = None,
    dispatch_backend: Optional[str] = 'processes',
    **partition_kwargs,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Traag18]_.

    Cluster cells using the Leiden algorithm [Traag18]_, an improved version of
    the Louvain algorithm [Blondel08]_. It has been proposed for single-cell
    analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        A parameter value controlling the coarseness of the clustering.
        Higher values lead to more clusters.
        Set to `None` if overriding `partition_type`
        to one that doesn’t accept a `resolution_parameter`.
    samples
        The number of random samples to take for consensus.
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain `(obs_key, list_of_categories)`.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    n_iterations
        How many iterations of the Leiden clustering algorithm to perform.
        Positive values above 2 define the total number of iterations to perform,
        -1 has the algorithm run until it reaches its optimal clustering.
    partition_type
        Type of partition to use.
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`.
        For the available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, leiden looks at .obsp['connectivities'] for connectivities
        (default storage place for pp.neighbors).
        If specified, leiden looks at
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    obsp
        Use .obsp[obsp] as adjacency. You can't specify both
        `obsp` and `neighbors_key` at the same time.
    collect_marginals
        Whether to retrieve the marginal probability to belong to a group.
    n_jobs
        Number of parallel jobs to calculate partitions.
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to be saved.
    dispatch_backend
        Backend for joblib parallelisation, either 'processes' or 'threads'.
    **partition_kwargs
        Any further arguments to pass to `~leidenalg.find_partition`
        (which in turn passes arguments to the `partition_type`).

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['leiden']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    """
    try:
        import leidenalg
    except ImportError:
        raise ImportError(
            'Please install the leiden algorithm: `conda install -c conda-forge leidenalg` or `pip3 install leidenalg`.'
        )
    partition_kwargs = dict(partition_kwargs)

    start = logg.info('running Leiden clustering')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = _choose_graph(adata, obsp, neighbors_key)
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g_gt = g.to_graph_tool()
    gt.remove_parallel_edges(g_gt)
    # flip to the default partition type if not overridden by the user
    if partition_type is None:
        partition_type = leidenalg.RBConfigurationVertexPartition
    # Prepare find_partition arguments as a dictionary,
    # appending to whatever the user provided. It needs to be this way
    # as this allows for the accounting of a None resolution
    # (in the case of a partition variant that doesn't take it on input)
    if use_weights:
        partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
    partition_kwargs['n_iterations'] = n_iterations

    np.random.seed(random_state)
    seeds = np.random.choice(range(0, samples**2), size=samples, replace=False)

    if resolution is not None:
        partition_kwargs['resolution_parameter'] = resolution

    # clustering proper
    def membership(g, partition_type, seed, **partition_kwargs):
        return leidenalg.find_partition(
            g, partition_type, seed=seed, **partition_kwargs
        ).membership

    parts = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(membership)(g, partition_type, seeds[i], **partition_kwargs)
        for i in range(samples)
    )

    pmode = gt.PartitionModeState(parts, converge=True)

    if save_model:
        import pickle

        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    groups = np.array(pmode.get_max(g_gt).get_array())
    u_groups = np.unique(groups)
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1

    if collect_marginals:
        pv_array = pmode.get_marginal(g_gt).get_2d_array(range(last_group)).T[:, u_groups] / samples

    # rename groups to ensure they are a continuous range
    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    # store output into adata.obs
    if restrict_to is not None:
        if key_added == 'leiden':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )
    if collect_marginals:
        adata.obsm[f"CM_{key_added}"] = pv_array

    # store information on the clustering parameters
    adata.uns['leiden'] = {}
    adata.uns['leiden']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
        samples=samples,
        collect_marginals=collect_marginals,
    )
    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
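# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes
# scanpy, leidenalg, graph-tool and joblib are installed; the dataset and the
# key 'leiden_consensus' are only examples.
def _example_leiden_consensus():
    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    sc.pp.neighbors(adata, n_neighbors=15)
    leiden(adata, resolution=1.0, samples=20, key_added="leiden_consensus")
    # consensus labels plus per-cell marginal probabilities of group membership
    return (
        adata.obs["leiden_consensus"].value_counts(),
        adata.obsm["CM_leiden_consensus"],
    )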