Example #1
File: tools.py Project: gtca/muon
def _cluster(
    data: Union[MuData, AnnData],
    resolution: Optional[Union[float, Sequence[float], Mapping[str, float]]] = None,
    mod_weights: Optional[Union[Sequence[float], Mapping[str, float]]] = None,
    random_state: int = 0,
    key_added: str = "louvain",
    neighbors_key: Optional[str] = None,
    directed: bool = True,
    partition_type: Optional[
        Union[Type[LeidenMutableVertexPartition], Type[LouvainMutableVertexPartition]]
    ] = None,
    partition_kwargs: Mapping[str, Any] = MappingProxyType({}),
    algorithm: str = "leiden",  # Literal["leiden", "louvain"]
    **kwargs,
):
    """
    Cluster cells using the Leiden or Louvain algorithm.

    See :func:`scanpy.tl.leiden` and :func:`scanpy.tl.louvain` for details.
    """

    from scanpy.tools._utils import _choose_graph
    from scanpy._utils import get_igraph_from_adjacency

    if algorithm == "louvain":
        import louvain

        alg = louvain
    elif algorithm == "leiden":
        import leidenalg

        alg = leidenalg
    else:
        raise ValueError(f"Algorithms should be either 'louvain' or 'leiden', not '{algorithm}'")

    if isinstance(data, AnnData):
        sc_tl_cluster = sc.tl.leiden if algorithm == "leiden" else sc.tl.louvain
        return sc_tl_cluster(
            data,
            resolution=resolution,
            random_state=random_state,
            key_added=key_added,
            neighbors_key=neighbors_key,
            **kwargs,
        )
    elif isinstance(data, MuData):
        mdata = data
    else:
        raise TypeError("Expected a MuData object")

    partition_kwargs = dict(partition_kwargs)

    gs = {}

    for mod in mdata.mod:
        adjacency = _choose_graph(mdata.mod[mod], None, neighbors_key)
        g = get_igraph_from_adjacency(adjacency, directed=directed)

        gs[mod] = g

    if mod_weights:
        if isinstance(mod_weights, Mapping):
            layer_weights = [mod_weights.get(mod, 1) for mod in mdata.mod]
        elif isinstance(mod_weights, Sequence) and not isinstance(mod_weights, str):
            assert len(mod_weights) == len(
                mdata.mod
            ), f"Length of mod_weights ({len(mod_weights)}) does not match the number of modalities ({len(mdata.mod)})"
            layer_weights = mod_weights
        else:
            layer_weights = [mod_weights for _ in mdata.mod]
    else:
        layer_weights = None

    if partition_type is None:
        partition_type = alg.RBConfigurationVertexPartition

    optimiser = alg.Optimiser()
    if random_state:
        optimiser.set_rng_seed(random_state)

    # The same as leiden.find_partition_multiplex() (louvain.find_partition_multiplex())
    # but allows specifying the resolution for each modality
    if resolution:
        if isinstance(resolution, Mapping):
            # Specific resolution for each modality
            parts = [
                partition_type(gs[mod], resolution_parameter=resolution[mod], **partition_kwargs)
                for mod in mdata.mod
            ]
        elif isinstance(resolution, Sequence) and not isinstance(resolution, str):
            assert len(resolution) == len(
                mdata.mod
            ), f"Length of resolution ({len(resolution)}) does not match the number of modalities ({len(mdata.mod)})"
            parts = [
                partition_type(gs[mod], resolution_parameter=resolution[i], **partition_kwargs)
                for i, mod in enumerate(mdata.mod)
            ]
        else:
            # Single resolution for all modalities
            parts = [
                partition_type(gs[mod], resolution_parameter=resolution, **partition_kwargs)
                for mod in mdata.mod
            ]
    else:
        parts = [partition_type(gs[mod], **partition_kwargs) for mod in mdata.mod]

    improv = optimiser.optimise_partition_multiplex(
        partitions=parts,
        layer_weights=layer_weights,
        **kwargs,
    )

    # All partitions are the same
    groups = np.array(parts[0].membership)

    mdata.obs[key_added] = pd.Categorical(
        values=groups.astype("U"),
        categories=natsorted(map(str, np.unique(groups))),
    )
    mdata.uns[algorithm] = {}
    mdata.uns[algorithm]["params"] = dict(
        resolution=resolution,
        random_state=random_state,
        partition_improvement=improv,
    )

    return None
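
A minimal usage sketch for the helper above (hypothetical input file and modality keys 'rna'/'atac'; assumes each modality already has a kNN graph):

import muon as mu
import scanpy as sc

mdata = mu.read("pbmc_multiome.h5mu")  # hypothetical input file
for mod in mdata.mod:
    # every modality needs its own neighbors graph before multiplex clustering
    sc.pp.neighbors(mdata.mod[mod])

_cluster(
    mdata,
    resolution={"rna": 1.0, "atac": 0.8},   # per-modality resolution
    mod_weights={"rna": 2.0, "atac": 1.0},  # emphasize the RNA layer
    algorithm="leiden",
    key_added="leiden_multiplex",
)
print(mdata.obs["leiden_multiplex"].value_counts())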
Example #2
def state_from_blocks(
    adata: AnnData,
    state_key: Optional[str] = 'nsbm',
    neighbors_key: Optional[str] = 'neighbors',
    adjacency: Optional[spmatrix] = None,
    directed: bool = False,
    use_weights: bool = False,
    deg_corr: bool = True,
):
    """
    Returns a gt state object given an AnnData

    Parameters
    ----------
    adata
        The annotated data matrix.
    state_key
        The key under which the state has been saved
    neighbors_key
        The key passed to `sc.pp.neighbors`
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` for scanpy<=1.4.6 or
        `adata.obsp[adata.uns[neighbors_key]['connectivities_key']]` for scanpy>1.4.6
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real-world networks this is the case, although this does not seem to
        be the case for the KNN graphs used in scanpy.
        
    Returns
    -------

    A graph-tool block state object (`gt.BlockState`, `gt.PPBlockState` or
    `gt.NestedBlockState`, depending on the model stored in `adata.uns`)

    """
    bl_d = adata.uns['schist'][f'{state_key}']['blocks']
    params = adata.uns['schist'][f'{state_key}']['params']
    if params['model'] == 'nested' or params['model'] == 'multiome_nested':
        blocks = []
        for nl in range(len(bl_d)):
            blocks.append(bl_d[str(nl)])
    else:
        blocks = bl_d['0']
    
    if 'deg_corr' in params:
        deg_corr = params['deg_corr']

    recs = []
    rec_types = []

    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']

    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    # the graph must exist before its edge weights can be used as covariates
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    # stored parameters take precedence over the `use_weights` default
    if 'recs' in params:
        recs = params['recs']
    if 'rec_types' in params:
        rec_types = params['rec_types']

    if params['model'] == 'flat':
        state = gt.BlockState(g, b=blocks,
                              state_args=dict(deg_corr=deg_corr,
                                              recs=recs,
                                              rec_types=rec_types))
    elif params['model'] == 'ppbm':
        state = gt.PPBlockState(g, b=blocks,
                                state_args=dict(deg_corr=deg_corr,
                                                recs=recs,
                                                rec_types=rec_types))
    else:
        state = gt.NestedBlockState(g, bs=blocks,
                                    state_args=dict(deg_corr=deg_corr,
                                                    recs=recs,
                                                    rec_types=rec_types))
    return state
    
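A usage sketch, assuming `adata` was processed with schist beforehand so that `adata.uns['schist']['nsbm']` holds the saved blocks and parameters:

# rebuild the graph-tool state saved by a previous schist run
state = state_from_blocks(adata, state_key='nsbm', neighbors_key='neighbors')
print(state)            # BlockState / PPBlockState / NestedBlockState
print(state.entropy())  # description length of the stored partition
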
Example #3
def louvain(
    adata: AnnData,
    resolution: Optional[float] = None,
    random_state: Optional[Union[int, RandomState]] = 0,
    log_fname: str = '',
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    key_added: Optional[str] = 'louvain',
    adjacency: Optional[spmatrix] = None,
    flavor: str = 'vtraag',
    directed: bool = True,
    use_weights: bool = False,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    partition_kwargs: Optional[Mapping[str, Any]] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the implementation
    of [Traag17]_. The Louvain algorithm has been proposed for single-cell
    analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or :func:`~scanpy.external.pp.bbknn` first,
    or explicitly passing an ``adjacency`` matrix.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        For the default flavor (``'vtraag'``), you can provide a resolution
        (higher resolution means finding more and smaller clusters),
        which defaults to 1.0. See “Time as a resolution parameter” in [Lambiotte09]_.
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain ``(obs_key, list_of_categories)``.
    key_added
        Key under which to add the cluster labels. (default: ``'louvain'``)
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        ``adata.uns['neighbors']['connectivities']``.
    flavor : {``'vtraag'``, ``'igraph'``}
        Choose between two packages for computing the clustering.
        ``'vtraag'`` is much more powerful, and the default.
    directed
        Interpret the ``adjacency`` matrix as directed graph?
    use_weights
        Use weights from knn graph.
    partition_type
        Type of partition to use.
        Only a valid argument if ``flavor`` is ``'vtraag'``.
    partition_kwargs
        Key word arguments to pass to partitioning,
        if ``vtraag`` method is being used.
    copy
        Copy adata or modify it inplace.

    Returns
    -------
    :obj:`None`
        By default (``copy=False``), updates ``adata`` with the following fields:

        ``adata.obs['louvain']`` (:class:`pandas.Series`, dtype ``category``)
            Array of dim (number of samples) that stores the subgroup id
            (``'0'``, ``'1'``, ...) for each cell.

    :class:`~anndata.AnnData`
        When ``copy=True`` is set, a copy of ``adata`` with those fields is returned.
    """
    start = logg.info('running Louvain clustering')
    if (flavor != 'vtraag') and (partition_type is not None):
        raise ValueError(
            '`partition_type` is only a valid argument when `flavor` is "vtraag"'
        )
    adata = adata.copy() if copy else adata
    if adjacency is None and 'neighbors' not in adata.uns:
        raise ValueError(
            'You need to run `pp.neighbors` first to compute a neighborhood graph.'
        )
    if adjacency is None:
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata, restrict_key, restrict_categories, adjacency)
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warning(
                '`resolution` parameter has no effect for flavor "igraph"')
        if directed and flavor == 'igraph':
            directed = False
        if not directed: logg.debug('    using the undirected graph')
        g = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if use_weights:
            weights = np.array(g.es["weight"]).astype(np.float64)
        else:
            weights = None
        if flavor == 'vtraag':
            import louvain
            if partition_kwargs is None:
                partition_kwargs = {}
            if partition_type is None:
                partition_type = louvain.RBConfigurationVertexPartition
            if resolution is not None:
                partition_kwargs["resolution_parameter"] = resolution
            if use_weights:
                partition_kwargs["weights"] = weights
            logg.info('    using the "louvain" package of Traag (2017)')
            louvain.set_rng_seed(random_state)
            part = louvain.find_partition(
                g,
                partition_type,
                log_fname=log_fname,
                **partition_kwargs,
            )
            # adata.uns['louvain_quality'] = part.quality()
        else:
            part = g.community_multilevel(weights=weights)
        groups = np.array(part.membership)
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        raise ValueError(
            '`flavor` needs to be "vtraag" or "igraph" or "taynaud".')
    if restrict_to is not None:
        if key_added == 'louvain':
            key_added += '_R'
        groups = rename_groups(adata, key_added, restrict_key,
                               restrict_categories, restrict_indices, groups)
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(np.unique(groups).astype('U')),
    )
    adata.uns['louvain'] = {}
    adata.uns['louvain']['params'] = {
        'resolution': resolution,
        'random_state': random_state
    }
    logg.info(
        '    finished',
        time=start,
        #deep=(
        #    f'found {len(np.unique(groups))} clusters and added\n'
        #    f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        #),
    )
    return adata if copy else None
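
A usage sketch (scanpy's bundled demo dataset; note that passing `log_fname` to `louvain.find_partition` implies a patched louvain package):

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()  # small preprocessed demo dataset
sc.pp.neighbors(adata, n_neighbors=15)
louvain(adata, resolution=1.0, key_added='louvain_r1')
print(adata.obs['louvain_r1'].value_counts())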
Example #4
def planted_model(
    adata: AnnData,
    n_sweep: int = 10,
    beta: float = np.inf,
    tolerance=1e-6,
    collect_marginals: bool = True,
    deg_corr: bool = True,
    samples: int = 100,
    n_jobs: int = -1,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'ppbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    save_model: Union[str, None] = None,
    #    minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the Planted Partition Block Model [Peixoto14]_,
    performing Bayesian inference on node groups. This model is particularly
    suitable for assortative graphs and returns the optimal number of
    communities.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    n_sweep
        Number of MCMC sweeps to get the initial guess
    beta
        Inverse temperature for the initial MCMC sweep        
    tolerance
        Difference in description length to stop MCMC sweep iterations        
    collect_marginals
        Whether or not collect node probability of belonging
        to a specific partition.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real-world networks this is the case, although this does not seem to
        be the case for the KNN graphs used in scanpy.
    samples
        Number of initial minimizations to be performed. This also influences
        the precision of the marginals
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` for scanpy<=1.4.6 or
        `adata.obsp[adata.uns[neighbors_key]['connectivities_key']]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to 
        be saved    
    random_seed
        Random number to be used as seed for graph-tool
    n_jobs
        Number of parallel computations used during model initialization

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['schist'][key_added]['params']`
        A dict with the parameters used in this analysis
    `adata.uns['schist'][key_added]['stats']`
        A dict with entropy and modularity of the final state
    `adata.obsm['CM_ppbm']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['schist'][key_added]['blocks']`
        A dict with the block assignments of the final state
    """

    if random_seed:
        np.random.seed(random_seed)

    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires a sufficient number of samples\n'
            f'It is now set to {samples} and should be at least 100')

    start = logg.info('minimizing the Planted Partition Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph and graph-tool
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if samples < 1:
        samples = 1

    # initialize  the block states
    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        if seed:
            gt.seed_rng(seed)
        dS = 1
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta, niter=n_sweep)
        return state

    states = [gt.PPBlockState(g) for x in range(samples)]

    # perform a mcmc sweep on each
    # no list comprehension as I need to collect stats

    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples))
    logg.info('        minimization step done', time=start)
    pmode = gt.PartitionModeState([x.get_blocks().a for x in states],
                                  converge=True)

    bs = pmode.get_max(g)
    logg.info('        consensus step done', time=start)

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    state = gt.PPBlockState(g, b=bs)
    logg.info('    done', time=start)

    groups = np.array(bs.get_array())
    u_groups = np.unique(groups)
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1
    if collect_marginals:
        pv_array = pmode.get_marginal(g).get_2d_array(
            range(last_group)).T[:, u_groups] / samples

    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    if restrict_to is not None:
        if key_added == 'ppbm':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )

    # add column names
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )

    # now add marginal probabilities.

    if collect_marginals:
        # cell marginals will be a list of arrays with probabilities
        # of belonging to a specific group
        adata.obsm[f"CM_{key_added}"] = pv_array

    # add some unstructured info
    if 'schist' not in adata.uns:
        adata.uns['schist'] = {}

    adata.uns['schist'][f'{key_added}'] = {}
    adata.uns['schist'][f'{key_added}']['stats'] = dict(
        entropy=state.entropy(),
        modularity=gt.modularity(g, state.get_blocks()))

    # record state as list of blocks
    # for compatibility with nested model, use a dictionary with a single key here
    # although a np.array would be ok
    adata.uns['schist'][f'{key_added}']['blocks'] = {
        '0': np.array(state.get_blocks().a)
    }

    # last step is recording some parameters used in this analysis
    adata.uns['schist'][f'{key_added}']['params'] = dict(
        model='planted',
        use_weights=use_weights,
        neighbors_key=neighbors_key,
        key_added=key_added,
        samples=samples,
        collect_marginals=collect_marginals,
        random_seed=random_seed,
        deg_corr=deg_corr,
        recs=recs,
        rec_types=rec_types)

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {state.get_B()} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
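
A usage sketch (requires graph-tool and joblib; dataset helper from scanpy):

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata)
planted_model(adata, samples=100, random_seed=42)
print(adata.obs['ppbm'].value_counts())  # consensus groups
print(adata.obsm['CM_ppbm'].shape)       # cells x groups marginal probabilities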
Example #5
def to_igraph(self):
    """Generate igraph from connectivities."""
    return _utils.get_igraph_from_adjacency(self.connectivities)
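
A usage sketch, assuming the method lives on an object like scanpy's Neighbors class, i.e. anything exposing a sparse `connectivities` matrix:

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata)
neigh = sc.Neighbors(adata)    # picks up the computed connectivities
g = neigh.to_igraph()
print(g.vcount(), g.ecount())  # one vertex per cell, weighted edges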
Example #6
def calculate_affinity(adata: AnnData,
                       level: int = 1,
                       block_key: Optional[str] = 'nsbm',
                       group_by: Optional[str] = None,
                       state: Optional[Any] = None,
                       neighbors_key: Optional[str] = 'neighbors',
                       adjacency: Optional[sparse.spmatrix] = None,
                       directed: bool = False,
                       use_weights: bool = False,
                       obsp: Optional[str] = None,
                       back_prob: bool = False,
                       copy: bool = False) -> Optional[AnnData]:
    """\
    Calculate cell affinity given a partition scheme. It can be used for 
    partitions calculated using schist or for any partition scheme, given
    for example by cell annotations.
    
    Parameters
    ----------
    adata:
        The AnnData object. Should have been already processed with schist
    level:
        The level to calculate affinity. This parameter is effective
        only for Nested partitions
    block_key:
        The prefix for partitions. This parameter is ignored if the state
        is not gt.NestedBlockState
    group_by:
        The key for group names used for calculations. Setting this will override
        level and block_key. This is effective only for NestedBlockState partitions
    state:
        Optionally calculate affinities on this state.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, leiden looks .obsp['connectivities'] for connectivities
        (default storage place for pp.neighbors).
        If specified, leiden looks
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    copy:
        Return a new object or do everything in place
        

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with affinity values 
    in adata.obsm[f'CA_{block_key}_level_{level}']
        
"""

    matrix_key = f'CA_{block_key}_level_{level}'  # the default name of the matrix
    if group_by:
        logg.info(f'Calculating cell affinity to {group_by}')
    else:
        logg.info(f'Calculating cell affinity to level {level}')

    if not state:
        # if no state is provided, use the default to retrieve graph
        if 'schist' in adata.uns and 'blocks' in adata.uns['schist'][
                f'{block_key}']:
            params = adata.uns['schist'][f'{block_key}']['params']
            if 'neighbors_key' in params:
                neighbors_key = params['neighbors_key']
            if 'use_weights' in params:
                use_weights = params['use_weights']
            # fall back to the default when 'deg_corr' was not recorded
            deg_corr = params.get('deg_corr', True)
            state = state_from_blocks(adata,
                                      state_key=block_key,
                                      neighbors_key=neighbors_key,
                                      adjacency=adjacency,
                                      directed=directed,
                                      use_weights=use_weights,
                                      deg_corr=deg_corr)
            g = state.g
        elif not neighbors_key:
            # no state and no adjacency provided, raise an error
            raise ValueError("A state or an adjacency matrix should be given"
                             "Otherwise a graph cannot be computed")
        else:
            # get the graph from the adjacency
            adjacency = _choose_graph(adata, obsp, neighbors_key)
            g = get_igraph_from_adjacency(adjacency, directed=directed)
            g = g.to_graph_tool()
            gt.remove_parallel_edges(g)
            state = gt.BlockState(g)
    else:
        g = state.g

    if group_by:
        matrix_key = f'CA_{group_by}'
        # if groups are given, we generate a new BlockState and work on that
        if group_by in adata.obs.columns and adata.obs[
                group_by].dtype.name == 'category':
            partitions = adata.obs[group_by].cat.codes.values
            state = gt.BlockState(g, b=partitions)
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
        else:
            raise ValueError(
                f"{group_by} should be a categorical entry in adata.obs")
    else:
        # use precomputed blocks and states
        if isinstance(state, gt.NestedBlockState):
            if back_prob:
                p0 = get_cell_back_p(state, level=0)
            else:
                p0 = get_cell_loglikelihood(state, level=0, as_prob=True)
            group_col = None
            if group_by and group_by in adata.obs.columns:
                group_col = group_by
            else:
                g_name = f'{block_key}_level_{level}'
                if g_name in adata.obs.columns:
                    group_col = g_name
            if not group_col:
                raise ValueError(
                    "The provided groups or level/blocks do not exist")

            g0 = pd.Categorical(state.project_partition(0, 0).a)
            cross_tab = pd.crosstab(g0,
                                    adata.obs[group_col],
                                    normalize='index')
            ca_matrix = (p0 @ cross_tab).values

        elif isinstance(state, gt.PPBlockState):
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
            matrix_key = 'CA_ppbm'

    adata.obsm[matrix_key] = ca_matrix

    return adata if copy else None
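
A usage sketch, assuming `adata` was clustered with schist's nested model (so the 'nsbm' keys exist); 'cell_type' is a hypothetical categorical column:

# affinity of each cell to the groups of a given hierarchy level
calculate_affinity(adata, level=1, block_key='nsbm')
print(adata.obsm['CA_nsbm_level_1'].shape)

# affinity to an arbitrary categorical annotation instead of a block level
calculate_affinity(adata, group_by='cell_type')
print(adata.obsm['CA_cell_type'].sum(axis=1))  # rows are probabilities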
Example #7
def nested_model_multi(
    adatas: List[AnnData],
    deg_corr: bool = True,
    tolerance: float = 1e-6,
    n_sweep: int = 10,
    beta: float = np.inf,
    samples: int = 100,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    *,
    random_seed: Optional[int] = None,
    key_added: str = 'multi_nsbm',
    adjacency: Optional[List[sparse.spmatrix]] = None,
    neighbors_key: Optional[List[str]] = ['neighbors'],
    directed: bool = False,
    use_weights: bool = False,
    save_model: Union[str, None] = None,
    copy: bool = False,
    #    minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
    #    equilibrate_args: Optional[Dict] = {},
) -> Optional[List[AnnData]]:
    """\
    Cluster cells into subgroups using multiple modalities.

    Cluster cells using the nested Stochastic Block Model [Peixoto14]_,
    performing Bayesian inference on node groups. This function takes multiple
    experiments, possibly across different modalities, and performs joint
    clustering.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first. It also requires cells to have
    the same names if they come from paired experiments.

    Parameters
    ----------
    adatas
        A list of processed AnnData. Neighbors must have been already
        calculated.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real-world networks this is the case, although this does not seem to
        be the case for the KNN graphs used in scanpy.
    tolerance
        Tolerance for fast model convergence.
    n_sweep 
        Number of iterations to be performed in the fast model MCMC greedy approach
    beta
        Inverse temperature for MCMC greedy approach    
    samples
        Number of initial minimizations to be performed. The one with smaller
        entropy is chosen
    n_jobs
        Number of parallel computations used during model initialization
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` for scanpy<=1.4.6 or
        `adata.obsp[adata.uns[neighbors_key]['connectivities_key']]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`. If all AnnData objects share the
        same key, only one needs to be specified; otherwise the full list of
        keys must be provided
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    save_model
        If provided, this will be the filename for the PartitionModeState to 
        be saved    
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell. 
    `adata.uns['schist'][key_added]['params']`
        A dict with the parameters used in this analysis
    `adata.uns['schist'][key_added]['stats']`
        A dict with the level entropies and modularities of the final state
    `adata.obsm['CM_multi_nsbm_level_{n}']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['schist'][key_added]['blocks']`
        A dict with the block assignments at each level
    """

    if random_seed:
        np.random.seed(random_seed)

    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires a sufficient number of samples\n'
            f'It is now set to {samples} and should be at least 100')

    start = logg.info('minimizing the nested Stochastic Block Model')

    if copy:
        adatas = [x.copy() for x in adatas]

    n_keys = len(neighbors_key)
    n_data = len(adatas)
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = []
        if n_keys > 1 and n_keys < n_data:
            raise ValueError(
                'The number of neighbors keys does not match '
                'the number of data matrices. Either fix this '
                'or pass a neighbors key that is shared across all modalities')
        if n_keys == 1:
            neighbors_key = [neighbors_key[0] for x in range(n_data)]
        for x in range(n_data):
            logg.info(f'getting adjacency for data {x}', time=start)
            if neighbors_key[x] not in adatas[x].uns:
                raise ValueError('You need to run `pp.neighbors` first '
                                 'to compute a neighborhood graph for '
                                 f'data entry {x}')
            elif 'connectivities_key' in adatas[x].uns[neighbors_key[x]]:
                # scanpy>1.4.6 has matrix in another slot
                conn_key = adatas[x].uns[
                    neighbors_key[x]]['connectivities_key']
                adjacency.append(adatas[x].obsp[conn_key])
            else:
                # scanpy<=1.4.6 has sparse matrix here
                adjacency.append(
                    adatas[x].uns[neighbors_key[x]]['connectivities'])

    # convert it to igraph and graph-tool

    graph_list = []
    for x in range(n_data):
        g = get_igraph_from_adjacency(adjacency[x], directed=directed)
        g = g.to_graph_tool()
        gt.remove_parallel_edges(g)
        # add cell names to graph, this will be used to create
        # layered graph
        g_names = g.new_vertex_property('string')
        d_names = adatas[x].obs_names
        for xn in range(len(d_names)):
            g_names[xn] = d_names[xn]
        g.vp['cell'] = g_names
        graph_list.append(g)

    # skip weights for now
    # recs = []
    # rec_types = []
    # if use_weights:
    #     # this is not ideal to me, possibly we may need to transform
    #     # weights. More tests needed.
    #     recs = [g.ep.weight]
    #     rec_types = ['real-normal']

    # get a non-redundant list of all cell names across all modalities
    all_names = set(adatas[0].obs_names)
    for x in range(1, n_data):
        all_names.update(adatas[x].obs_names)
    all_names = list(all_names)
    # create the shared graph
    union_g = gt.Graph(directed=False)
    union_g.add_vertex(len(all_names))
    u_names = union_g.new_vertex_property('string')
    for xn in range(len(all_names)):
        u_names[xn] = all_names[xn]
    union_g.vp['cell'] = u_names

    # now handle in a non elegant way the index mapping across all
    # modalities and the unified Graph

    u_cell_index = dict([(union_g.vp['cell'][x], x)
                         for x in range(union_g.num_vertices())])
    # now create layers
    layer = union_g.new_edge_property('int')
    for ng in range(n_data):
        for e in graph_list[ng].edges():
            S, T = e.source(), e.target()
            Sn = graph_list[ng].vp['cell'][S]
            Tn = graph_list[ng].vp['cell'][T]
            Sidx = u_cell_index[Sn]
            Tidx = u_cell_index[Tn]
            ne = union_g.add_edge(Sidx, Tidx)
            layer[ne] = ng + 1  # this is the layer label

    union_g.ep['layer'] = layer
    # DONE! now proceed with standard minimization, ish

    if samples < 1:
        samples = 1

    states = [
        gt.NestedBlockState(g=union_g,
                            base_type=gt.LayeredBlockState,
                            state_args=dict(deg_corr=deg_corr,
                                            ec=union_g.ep.layer,
                                            layers=True))
        for n in range(samples)
    ]

    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        if seed:
            gt.seed_rng(seed)
        dS = 1
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta,
                                                  niter=n_sweep,
                                                  c=0.5)
        return state

    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples))
    logg.info('        minimization step done', time=start)
    pmode = gt.PartitionModeState([x.get_bs() for x in states],
                                  converge=True,
                                  nested=True)
    bs = pmode.get_max_nested()
    logg.info('        consensus step done', time=start)

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    # prune redundant levels at the top
    bs = [x for x in bs if len(np.unique(x)) > 1]
    bs.append(np.array([0], dtype=np.int32))  # in case of type changes, check this
    state = gt.NestedBlockState(union_g,
                                bs=bs,
                                base_type=gt.LayeredBlockState,
                                state_args=dict(deg_corr=deg_corr,
                                                ec=union_g.ep.layer,
                                                layers=True))

    logg.info('    done', time=start)
    u_groups = np.unique(bs[0])
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1

    if collect_marginals:
        # note that the size of this will be equal to the number of the groups in Mode
        # but some entries won't sum to 1 as in the collection there may be differently
        # sized partitions
        pv_array = pmode.get_marginal(union_g).get_2d_array(
            range(last_group)).T[:, u_groups] / samples

    groups = np.zeros((union_g.num_vertices(), len(bs)), dtype=int)

    for x in range(len(bs)):
        # for each level, project labels to the vertex level
        # so that every cell has a name. Note that at this level
        # the labels are not necessarily consecutive
        groups[:, x] = state.project_partition(x, 0).get_array()

    groups = pd.DataFrame(groups).astype('category')

    # rename categories from 0 to n
    for c in groups.columns:
        ncat = len(groups[c].cat.categories)
        new_cat = [str(x) for x in range(ncat)]
        # `inplace` was removed from rename_categories in recent pandas
        groups[c] = groups[c].cat.rename_categories(new_cat)

    levels = groups.columns

    # recode block names to have consistency with group names
    i_groups = groups.astype(int)
    bs = [i_groups.iloc[:, 0].values]
    for x in range(1, groups.shape[1]):
        bs.append(
            np.where(
                pd.crosstab(i_groups.iloc[:, x - 1], i_groups.iloc[:,
                                                                   x]) > 0)[1])
    state = gt.NestedBlockState(union_g, bs)
    del i_groups

    groups.index = all_names

    # add column names
    groups.columns = [f"{key_added}_level_{level}" for level in range(len(bs))]

    # remove any column with the same key
    for xn in range(n_data):
        drop_columns = groups.columns.intersection(adatas[xn].obs.columns)
        adatas[xn].obs.drop(columns=drop_columns, inplace=True)
        adatas[xn].obs = pd.concat(
            [adatas[xn].obs, groups.loc[adatas[xn].obs_names]], axis=1)

        # now add marginal probabilities.

        if collect_marginals:
            # add marginals for level 0, the sum up according to the hierarchy
            _groups = groups.loc[adatas[xn].obs_names]
            _pv_array = pd.DataFrame(
                pv_array, index=all_names).loc[adatas[xn].obs_names].values
            adatas[xn].obsm[f"CM_{key_added}_level_0"] = _pv_array
            for group in groups.columns[1:]:
                ct = pd.crosstab(_groups[_groups.columns[0]],
                                 _groups[group],
                                 normalize='index',
                                 dropna=False)
                adatas[xn].obsm[f'CM_{group}'] = _pv_array @ ct.values

        # add some unstructured info
        if 'schist' not in adatas[xn].uns:
            adatas[xn].uns['schist'] = {}

        adatas[xn].uns['schist'][f'{key_added}'] = {}
        adatas[xn].uns['schist'][f'{key_added}']['stats'] = dict(
            level_entropy=np.array(
                [state.level_entropy(x) for x in range(len(state.levels))]),
            modularity=np.array([
                gt.modularity(union_g, state.project_partition(x, 0))
                for x in range(len(state.levels))
            ]))

        bl_d = {}
        levels = state.get_levels()
        for nl in range(len(levels)):
            bl_d[str(nl)] = np.array(levels[nl].get_blocks().a)
        adatas[xn].uns['schist'][f'{key_added}']['blocks'] = bl_d

        # last step is recording some parameters used in this analysis
        adatas[xn].uns['schist'][f'{key_added}']['params'] = dict(
            model='multiome_nested',
            use_weights=use_weights,
            neighbors_key=neighbors_key[xn],
            key_added=key_added,
            samples=samples,
            collect_marginals=collect_marginals,
            random_seed=random_seed,
            deg_corr=deg_corr,
            #            recs=recs,
            #            rec_types=rec_types
        )

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adatas if copy else None
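
A usage sketch with hypothetical paired AnnData objects `rna` and `atac` that share cell names, each with its own neighbors graph:

import scanpy as sc

for ad in (rna, atac):
    sc.pp.neighbors(ad)

nested_model_multi([rna, atac], samples=100, random_seed=42)
# one categorical column per hierarchy level is added to each AnnData
print(rna.obs.filter(like='multi_nsbm_level').head())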
Example #8
def leiden(
    adata: AnnData,
    resolution: float = 1,
    samples: int = 100,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_state: _utils.AnyRandom = 0,
    key_added: str = 'leiden',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = True,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    neighbors_key: Optional[str] = None,
    obsp: Optional[str] = None,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    copy: bool = False,
    save_model: Union[str, None] = None,
    dispatch_backend: Optional[str] = 'processes',
    **partition_kwargs,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Traag18]_.

    Cluster cells using the Leiden algorithm [Traag18]_,
    an improved version of the Louvain algorithm [Blondel08]_.
    It has been proposed for single-cell analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.


    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        A parameter value controlling the coarseness of the clustering.
        Higher values lead to more clusters.
        Set to `None` if overriding `partition_type`
        to one that doesn’t accept a `resolution_parameter`.
    samples
        The number of random samples to take for consensus
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain `(obs_key, list_of_categories)`.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    n_iterations
        How many iterations of the Leiden clustering algorithm to perform.
        Positive values above 2 define the total number of iterations to perform,
        -1 has the algorithm run until it reaches its optimal clustering.
    partition_type
        Type of partition to use.
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`.
        For the available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, leiden looks at .obsp['connectivities'] for connectivities
        (default storage place for pp.neighbors).
        If specified, leiden looks at
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    obsp
        Use .obsp[obsp] as adjacency. You can't specify both
        `obsp` and `neighbors_key` at the same time.
    collect_marginals
        Whether to retrieve the marginal probability to belong to a group
    n_jobs
        Number of parallel jobs to calculate partitions
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to 
        be saved    
    **partition_kwargs
        Any further arguments to pass to `~leidenalg.find_partition`
        (which in turn passes arguments to the `partition_type`).


    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['leiden']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    """
    try:
        import leidenalg
    except ImportError:
        raise ImportError(
            'Please install the leiden algorithm: `conda install -c conda-forge leidenalg` or `pip3 install leidenalg`.'
        )
    partition_kwargs = dict(partition_kwargs)

    start = logg.info('running Leiden clustering')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = _choose_graph(adata, obsp, neighbors_key)
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g_gt = g.to_graph_tool()
    gt.remove_parallel_edges(g_gt)
    # flip to the default partition type if not overriden by the user
    if partition_type is None:
        partition_type = leidenalg.RBConfigurationVertexPartition
    # Prepare find_partition arguments as a dictionary,
    # appending to whatever the user provided. It needs to be this way
    # as this allows for the accounting of a None resolution
    # (in the case of a partition variant that doesn't take it on input)
    if use_weights:
        partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
    partition_kwargs['n_iterations'] = n_iterations
    np.random.seed(random_state)
    seeds = np.random.choice(range(0, samples**2), size=samples, replace=False)
    

    if resolution is not None:
        partition_kwargs['resolution_parameter'] = resolution
    # clustering proper
    def membership(g, partition_type, seed, **partition_kwargs):
        return leidenalg.find_partition(g, partition_type, 
                                        seed=seed, **partition_kwargs).membership
    
    parts = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
                    delayed(membership)(g, partition_type, 
                                        seeds[i], **partition_kwargs) 
                                        for i in range(samples))

    pmode = gt.PartitionModeState(parts, converge=True) 

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')    
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    groups = np.array(pmode.get_max(g_gt).get_array())     
    u_groups = np.unique(groups)
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1
    if collect_marginals:
        pv_array = pmode.get_marginal(g_gt).get_2d_array(range(last_group)).T[:, u_groups] / samples
    # rename groups to ensure they are a continuous range
    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    # store output into adata.obs
        
    if restrict_to is not None:
        if key_added == 'leiden':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )
    if collect_marginals:
        adata.obsm[f"CM_{key_added}"] = pv_array
    # store information on the clustering parameters
    adata.uns['leiden'] = {}
    adata.uns['leiden']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
        samples=samples,
        collect_marginals=collect_marginals
    )
    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
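
A usage sketch for the consensus Leiden wrapper above (requires leidenalg, graph-tool and joblib; dataset helper from scanpy):

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata)
leiden(adata, resolution=1.0, samples=100, random_state=0)
print(adata.obs['leiden'].value_counts())
print(adata.obsm['CM_leiden'].shape)  # per-cell marginal probabilities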