Example #1
0
def louvain(
    adata: AnnData,
    resolution: Optional[float] = None,
    random_state: Optional[Union[int, RandomState]] = 0,
    log_fname: str = '',
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    key_added: Optional[str] = 'louvain',
    adjacency: Optional[spmatrix] = None,
    flavor: str = 'vtraag',
    directed: bool = True,
    use_weights: bool = False,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    partition_kwargs: Optional[Mapping[str, Any]] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the implementation
    of [Traag17]_. The Louvain algorithm has been proposed for single-cell
    analysis by [Levine15]_.

    This requires having ran :func:`~scanpy.pp.neighbors` or :func:`~scanpy.external.pp.bbknn` first,
    or explicitly passing a ``adjacency`` matrix.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        For the default flavor (``'vtraag'``), you can provide a resolution
        (higher resolution means finding more and smaller clusters),
        which defaults to 1.0. See “Time as a resolution parameter” in [Lambiotte09]_.
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain ``(obs_key, list_of_categories)``.
    key_added
        Key under which to add the cluster labels. (default: ``'louvain'``)
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        ``adata.uns['neighbors']['connectivities']``.
    flavor : {``'vtraag'``, ``'igraph'``}
        Choose between to packages for computing the clustering.
        ``'vtraag'`` is much more powerful, and the default.
    directed
        Interpret the ``adjacency`` matrix as directed graph?
    use_weights
        Use weights from knn graph.
    partition_type
        Type of partition to use.
        Only a valid argument if ``flavor`` is ``'vtraag'``.
    partition_kwargs
        Key word arguments to pass to partitioning,
        if ``vtraag`` method is being used.
    copy
        Copy adata or modify it inplace.

    Returns
    -------
    :obj:`None`
        By default (``copy=False``), updates ``adata`` with the following fields:

        ``adata.obs['louvain']`` (:class:`pandas.Series`, dtype ``category``)
            Array of dim (number of samples) that stores the subgroup id
            (``'0'``, ``'1'``, ...) for each cell.

    :class:`~anndata.AnnData`
        When ``copy=True`` is set, a copy of ``adata`` with those fields is returned.
    """
    start = logg.info('running Louvain clustering')
    if (flavor != 'vtraag') and (partition_type is not None):
        raise ValueError(
            '`partition_type` is only a valid argument when `flavour` is "vtraag"'
        )
    adata = adata.copy() if copy else adata
    if adjacency is None and 'neighbors' not in adata.uns:
        raise ValueError(
            'You need to run `pp.neighbors` first to compute a neighborhood graph.'
        )
    if adjacency is None:
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata, restrict_key, restrict_categories, adjacency)
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warning(
                '`resolution` parameter has no effect for flavor "igraph"')
        if directed and flavor == 'igraph':
            directed = False
        if not directed: logg.debug('    using the undirected graph')
        g = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if use_weights:
            weights = np.array(g.es["weight"]).astype(np.float64)
        else:
            weights = None
        if flavor == 'vtraag':
            import louvain
            if partition_kwargs is None:
                partition_kwargs = {}
            if partition_type is None:
                partition_type = louvain.RBConfigurationVertexPartition
            if resolution is not None:
                partition_kwargs["resolution_parameter"] = resolution
            if use_weights:
                partition_kwargs["weights"] = weights
            logg.info('    using the "louvain" package of Traag (2017)')
            louvain.set_rng_seed(random_state)
            part = louvain.find_partition(
                g,
                partition_type,
                log_fname=log_fname,
                **partition_kwargs,
            )
            # adata.uns['louvain_quality'] = part.quality()
        else:
            part = g.community_multilevel(weights=weights)
        groups = np.array(part.membership)
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        raise ValueError(
            '`flavor` needs to be "vtraag" or "igraph" or "taynaud".')
    if restrict_to is not None:
        if key_added == 'louvain':
            key_added += '_R'
        groups = rename_groups(adata, key_added, restrict_key,
                               restrict_categories, restrict_indices, groups)
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(np.unique(groups).astype('U')),
    )
    adata.uns['louvain'] = {}
    adata.uns['louvain']['params'] = {
        'resolution': resolution,
        'random_state': random_state
    }
    logg.info(
        '    finished',
        time=start,
        #deep=(
        #    f'found {len(np.unique(groups))} clusters and added\n'
        #    f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        #),
    )
    return adata if copy else None
Example #2
0
def planted_model(
    adata: AnnData,
    n_sweep: int = 10,
    beta: float = np.inf,
    tolerance=1e-6,
    collect_marginals: bool = True,
    deg_corr: bool = True,
    samples: int = 100,
    n_jobs: int = -1,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'ppbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    save_model: Union[str, None] = None,
    #    minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the  Planted Partition Block Model [Peixoto14]_, performing
    Bayesian inference on node groups. This function, in particular, uses
    the Planted Block Model, which is particularly suitable in case of
    assortative graphs and it returns the optimal number of communities

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    n_sweep
        Number of MCMC sweeps to get the initial guess
    beta
        Inverse temperature for the initial MCMC sweep        
    tolerance
        Difference in description length to stop MCMC sweep iterations        
    collect_marginals
        Whether or not collect node probability of belonging
        to a specific partition.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.
    samples
        Number of initial minimizations to be performed. This influences also the 
        precision for marginals
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to 
        be saved    
    random_seed
        Random number to be used as seed for graph-tool
    n_jobs
        Number of parallel computations used during model initialization

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['schist']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    `adata.uns['schist']['stats']`
        A dict with the values returned by mcmc_sweep
    `adata.obsm['CM_ppbm']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['schist']['state']`
        The BlockModel state object
    """

    if random_seed:
        np.random.seed(random_seed)

    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires sufficient number of samples\n'
            f'It is now set to {samples} and should be at least 100')

    start = logg.info('minimizing the Planted Partition Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph and graph-tool
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if samples < 1:
        samples = 1

    # initialize  the block states
    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        if seed:
            gt.seed_rng(seed)
        dS = 1
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta, niter=n_sweep)
        return state

    states = [gt.PPBlockState(g) for x in range(samples)]

    # perform a mcmc sweep on each
    # no list comprehension as I need to collect stats

    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples))
    logg.info('        minimization step done', time=start)
    pmode = gt.PartitionModeState([x.get_blocks().a for x in states],
                                  converge=True)

    bs = pmode.get_max(g)
    logg.info('        consensus step done', time=start)

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    state = gt.PPBlockState(g, b=bs)
    logg.info('    done', time=start)

    groups = np.array(bs.get_array())
    u_groups = np.unique(groups)
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1
    if collect_marginals:
        pv_array = pmode.get_marginal(g).get_2d_array(
            range(last_group)).T[:, u_groups] / samples

    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    if restrict_to is not None:
        if key_added == 'ppbm':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )

    # add column names
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )

    # now add marginal probabilities.

    if collect_marginals:
        # cell marginals will be a list of arrays with probabilities
        # of belonging to a specific group
        adata.obsm[f"CM_{key_added}"] = pv_array

    # add some unstructured info
    if not 'schist' in adata.uns:
        adata.uns['schist'] = {}

    adata.uns['schist'][f'{key_added}'] = {}
    adata.uns['schist'][f'{key_added}']['stats'] = dict(
        entropy=state.entropy(),
        modularity=gt.modularity(g, state.get_blocks()))

    # record state as list of blocks
    # for compatibility with nested model, use a dictionary with a single key here
    # although a np.array would be ok
    adata.uns['schist'][f'{key_added}']['blocks'] = {
        '0': np.array(state.get_blocks().a)
    }

    # last step is recording some parameters used in this analysis
    adata.uns['schist'][f'{key_added}']['params'] = dict(
        model='planted',
        use_weights=use_weights,
        neighbors_key=neighbors_key,
        key_added=key_added,
        samples=samples,
        collect_marginals=collect_marginals,
        random_seed=random_seed,
        deg_corr=deg_corr,
        recs=recs,
        rec_types=rec_types)

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {state.get_B()} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
Example #3
0
def nested_model(
    adata: AnnData,
    max_iterations: int = 1000000,
    epsilon: float = 0,
    equilibrate: bool = False,
    wait: int = 1000,
    nbreaks: int = 2,
    collect_marginals: bool = False,
    niter_collect: int = 10000,
    hierarchy_length: int = 10,
    deg_corr: bool = True,
    multiflip: bool = True,
    fast_model: bool = False,
    fast_tol: float = 1e-6,
    n_sweep: int = 10,
    beta: float = np.inf,
    n_init: int = 1,
    beta_range: Tuple[float] = (1., 1000.),
    steps_anneal: int = 3,
    resume: bool = False,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'nsbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    prune: bool = False,
    return_low: bool = False,
    copy: bool = False,
    minimize_args: Optional[Dict] = {},
    equilibrate_args: Optional[Dict] = {},
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the nested Stochastic Block Model [Peixoto14]_,
    a hierarchical version of Stochastic Block Model [Holland83]_, performing
    Bayesian inference on node groups. NSBM should circumvent classical
    limitations of SBM in detecting small groups in large graphs
    replacing the noninformative priors used by a hierarchy of priors
    and hyperpriors.

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    max_iterations
        Maximal number of iterations to be performed by the equilibrate step.
    epsilon
        Relative changes in entropy smaller than epsilon will
        not be considered as record-breaking.
    equilibrate
        Whether or not perform the mcmc_equilibrate step.
        Equilibration should always be performed. Note, also, that without
        equilibration it won't be possible to collect marginals.
    collect_marginals
        Whether or not collect node probability of belonging
        to a specific partition.
    niter_collect
        Number of iterations to force when collecting marginals. This will
        increase the precision when calculating probabilites
    wait
        Number of iterations to wait for a record-breaking event.
        Higher values result in longer computations. Set it to small values
        when performing quick tests.
    nbreaks
        Number of iteration intervals (of size `wait`) without
        record-breaking events necessary to stop the algorithm.
    hierarchy_length
        Initial length of the hierarchy. When large values are
        passed, the top-most levels will be uninformative as they
        will likely contain the very same groups. Increase this valus
        if a very large number of cells is analyzed (>100.000).
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.
    multiflip
        Whether to perform MCMC sweep with multiple simultaneous moves to sample
        network partitions. It may result in slightly longer runtimes, but under
        the hood it allows for a more efficient space exploration.
    fast_model
        Whether to skip initial minization step and let the MCMC find a solution. 
        This approach tend to be faster and consume less memory, but may be
        less accurate.
    fast_tol
        Tolerance for fast model convergence.
    n_sweep 
        Number of iterations to be performed in the fast model MCMC greedy approach
    beta
        Inverse temperature for MCMC greedy approach    
    n_init
        Number of initial minimizations to be performed. The one with smaller
        entropy is chosen
    beta_range
        Inverse temperature at the beginning and the end of the equilibration
    steps_anneal
        Number of steps in which the simulated annealing is performed
    resume
        Start from a previously created model, if any, without initializing a novel
        model    
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    prune
        Some high levels in hierarchy may contain the same information in terms of 
        cell assignments, even if they apparently have different group names. When this
        option is set to `True`, the function only returns informative levels.
        Note, however, that cell affinities are still reported for all levels. Pruning
        does not rename group levels
    return_low
        Whether or not return nsbm_level_0 in adata.obs. This level usually contains
        so many groups that it cannot be plot anyway, but it may be useful for particular
        analysis. By default it is not returned
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell. 
    `adata.uns['nsbm']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    `adata.uns['nsbm']['stats']`
        A dict with the values returned by mcmc_sweep
    `adata.uns['nsbm']['cell_affinity']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['nsbm']['state']`
        The NestedBlockModel state object
    """

    if resume:
        # if the fast_model is chosen perform equilibration anyway
        # also if a model has previously created
        equilibrate = True

    if resume and ('nsbm' not in adata.uns
                   or 'state' not in adata.uns['nsbm']):
        # let the model proceed as default
        logg.warning('Resuming has been specified but a state was not found\n'
                     'Will continue with default minimization step')

        resume = False

    if random_seed:
        np.random.seed(random_seed)
        gt.seed_rng(random_seed)

    if collect_marginals:
        logg.warning('Collecting marginals has a large impact on running time')
        if not equilibrate:
            raise ValueError(
                "You can't collect marginals without MCMC equilibrate "
                "step. Either set `equlibrate` to `True` or "
                "`collect_marginals` to `False`")

    start = logg.info('minimizing the nested Stochastic Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = get_graph_tool_from_adjacency(adjacency, directed=directed)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if n_init < 1:
        n_init = 1

    if fast_model:
        # do not minimize, start with a dummy state and perform only equilibrate

        states = [
            gt.NestedBlockState(g=g,
                                state_args=dict(deg_corr=deg_corr,
                                                recs=recs,
                                                rec_types=rec_types))
            for n in range(n_init)
        ]
        for x in range(n_init):
            dS = 1
            while np.abs(dS) > fast_tol:
                # perform sweep until a tolerance is reached
                dS, _, _ = states[x].multiflip_mcmc_sweep(beta=beta,
                                                          niter=n_sweep)

        _amin = np.argmin([s.entropy() for s in states])
        state = states[_amin]

        #        dS = 1
        #        while np.abs(dS) > fast_tol:
        #            dS, nattempts, nmoves = state.multiflip_mcmc_sweep(niter=10, beta=np.inf)
        bs = state.get_bs()
        logg.info('    done', time=start)

    elif resume:
        # create the state and make sure sampling is performed
        state = adata.uns['nsbm']['state'].copy(sampling=True)
        bs = state.get_bs()
        # get the graph from state
        g = state.g
    else:

        states = [
            gt.minimize_nested_blockmodel_dl(
                g,
                deg_corr=deg_corr,
                state_args=dict(recs=recs, rec_types=rec_types),
                **minimize_args) for n in range(n_init)
        ]

        state = states[np.argmin([s.entropy() for s in states])]
        #        state = gt.minimize_nested_blockmodel_dl(g, deg_corr=deg_corr,
        #                                                 state_args=dict(recs=recs,
        #                                                 rec_types=rec_types),
        #                                                 **minimize_args)
        logg.info('    done', time=start)
        bs = state.get_bs()
        if len(bs) <= hierarchy_length:
            # increase hierarchy length up to the specified value
            # according to Tiago Peixoto 10 is reasonably large as number of
            # groups decays exponentially
            bs += [np.zeros(1)] * (hierarchy_length - len(bs))
        else:
            logg.warning(
                f'A hierarchy length of {hierarchy_length} has been specified\n'
                f'but the minimized model contains {len(bs)} levels')
            pass
        # create a new state with inferred blocks
        state = gt.NestedBlockState(g,
                                    bs,
                                    state_args=dict(recs=recs,
                                                    rec_types=rec_types),
                                    sampling=True)

    # equilibrate the Markov chain
    if equilibrate:
        logg.info('running MCMC equilibration step')
        # equlibration done by simulated annealing

        equilibrate_args['wait'] = wait
        equilibrate_args['nbreaks'] = nbreaks
        equilibrate_args['max_niter'] = max_iterations
        equilibrate_args['multiflip'] = multiflip
        equilibrate_args['mcmc_args'] = {'niter': 10}

        dS, nattempts, nmoves = gt.mcmc_anneal(
            state,
            mcmc_equilibrate_args=equilibrate_args,
            niter=steps_anneal,
            beta_range=beta_range)
    if collect_marginals and equilibrate:
        # we here only retain level_0 counts, until I can't figure out
        # how to propagate correctly counts to higher levels
        # I wonder if this should be placed after group definition or not
        logg.info('    collecting marginals')
        group_marginals = [
            np.zeros(g.num_vertices() + 1) for s in state.get_levels()
        ]

        def _collect_marginals(s):
            levels = s.get_levels()
            for l, sl in enumerate(levels):
                group_marginals[l][sl.get_nonempty_B()] += 1

        gt.mcmc_equilibrate(state,
                            wait=wait,
                            nbreaks=nbreaks,
                            epsilon=epsilon,
                            max_niter=max_iterations,
                            multiflip=True,
                            force_niter=niter_collect,
                            mcmc_args=dict(niter=10),
                            callback=_collect_marginals)
        logg.info('    done', time=start)

    # everything is in place, we need to fill all slots
    # first build an array with
    groups = np.zeros((g.num_vertices(), len(bs)), dtype=int)

    for x in range(len(bs)):
        # for each level, project labels to the vertex level
        # so that every cell has a name. Note that at this level
        # the labels are not necessarily consecutive
        groups[:, x] = state.project_partition(x, 0).get_array()

    groups = pd.DataFrame(groups).astype('category')

    # rename categories from 0 to n
    for c in groups.columns:
        new_cat_names = dict([
            (cx, u'%s' % cn)
            for cn, cx in enumerate(groups.loc[:, c].cat.categories)
        ])
        groups.loc[:, c].cat.rename_categories(new_cat_names, inplace=True)

    if restrict_to is not None:
        groups.index = adata.obs[restrict_key].index
    else:
        groups.index = adata.obs_names

    # add column names
    groups.columns = [
        "%s_level_%d" % (key_added, level) for level in range(len(bs))
    ]

    # remove any column with the same key
    keep_columns = [
        x for x in adata.obs.columns
        if not x.startswith('%s_level_' % key_added)
    ]
    adata.obs = adata.obs.loc[:, keep_columns]
    # concatenate obs with new data, skipping level_0 which is usually
    # crap. In the future it may be useful to reintegrate it
    # we need it in this function anyway, to match groups with node marginals
    if return_low:
        adata.obs = pd.concat([adata.obs, groups], axis=1)
    else:
        adata.obs = pd.concat([adata.obs, groups.iloc[:, 1:]], axis=1)

    # add some unstructured info

    adata.uns['nsbm'] = {}
    adata.uns['nsbm']['stats'] = dict(level_entropy=np.array(
        [state.level_entropy(x) for x in range(len(state.levels))]),
                                      modularity=np.array([
                                          gt.modularity(
                                              g, state.project_partition(x, 0))
                                          for x in range(len((state.levels)))
                                      ]))
    if equilibrate:
        adata.uns['nsbm']['stats']['dS'] = dS
        adata.uns['nsbm']['stats']['nattempts'] = nattempts
        adata.uns['nsbm']['stats']['nmoves'] = nmoves

    adata.uns['nsbm']['state'] = state

    # now add marginal probabilities.

    if collect_marginals:
        # refrain group marginals. We collected data in vector as long as
        # the number of cells, cut them into appropriate length data
        adata.uns['nsbm']['group_marginals'] = {}
        for nl, level_marginals in enumerate(group_marginals):
            idx = np.where(level_marginals > 0)[0] + 1
            adata.uns['nsbm']['group_marginals'][nl] = np.array(
                level_marginals[:np.max(idx)])

    # prune uninformative levels, if any
    if prune:
        to_remove = prune_groups(groups)
        logg.info(f'    Removing levels f{to_remove}')
        adata.obs.drop(to_remove, axis='columns', inplace=True)

    # calculate log-likelihood of cell moves over the remaining levels
    # we have to calculate events at level 0 and propagate to upper levels
    logg.info('    calculating cell affinity to groups')
    levels = [
        int(x.split('_')[-1]) for x in adata.obs.columns
        if x.startswith(f'{key_added}_level')
    ]
    adata.uns['nsbm']['cell_affinity'] = dict.fromkeys(
        [str(x) for x in levels])
    p0 = get_cell_loglikelihood(state, level=0, as_prob=True)

    adata.uns['nsbm']['cell_affinity'][0] = p0
    l0 = "%s_level_0" % key_added
    for nl, level in enumerate(groups.columns[1:]):
        cross_tab = pd.crosstab(groups.loc[:, l0], groups.loc[:, level])
        cl = np.zeros((p0.shape[0], cross_tab.shape[1]), dtype=p0.dtype)
        for x in range(cl.shape[1]):
            # sum counts of level_0 groups corresponding to
            # this group at current level
            cl[:, x] = p0[:, np.where(cross_tab.iloc[:, x] > 0)[0]].sum(axis=1)
        adata.uns['nsbm']['cell_affinity'][str(nl + 1)] = cl / np.sum(
            cl, axis=1)[:, None]

    # last step is recording some parameters used in this analysis
    adata.uns['nsbm']['params'] = dict(
        epsilon=epsilon,
        wait=wait,
        nbreaks=nbreaks,
        equilibrate=equilibrate,
        fast_model=fast_model,
        collect_marginals=collect_marginals,
        hierarchy_length=hierarchy_length,
        random_seed=random_seed,
        prune=prune,
    )

    logg.info(
        '    finished',
        time=start,
        deep=
        (f'found {state.get_levels()[1].get_nonempty_B()} clusters at level_1, and added\n'
         f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
Example #4
0
def planted_model(
    adata: AnnData,
    n_sweep: int = 10,
    beta: float = np.inf,
    tolerance=1e-6,
    max_iterations: int = 1000000,
    epsilon: float = 0,
    equilibrate: bool = False,
    wait: int = 1000,
    nbreaks: int = 2,
    collect_marginals: bool = False,
    niter_collect: int = 10000,
    deg_corr: bool = True,
    n_init: int = 1,
    beta_range: Tuple[float] = (1., 100.),
    steps_anneal: int = 5,
    resume: bool = False,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'ppbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    minimize_args: Optional[Dict] = {},
    equilibrate_args: Optional[Dict] = {},
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the  Stochastic Block Model [Peixoto14]_, performing
    Bayesian inference on node groups. This function, in particular, uses
    the Planted Block Model, which is particularly suitable in case of
    assortative graphs and it returns the optimal number of communities

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    n_sweep
        Number of MCMC sweeps to get the initial guess
    beta
        Inverse temperature for the initial MCMC sweep        
    tolerance
        Difference in description length to stop MCMC sweep iterations        
    max_iterations
        Maximal number of iterations to be performed by the equilibrate step.
    epsilon
        Relative changes in entropy smaller than epsilon will
        not be considered as record-breaking.
    equilibrate
        Whether or not perform the mcmc_equilibrate step.
        Equilibration should always be performed. Note, also, that without
        equilibration it won't be possible to collect marginals.
    collect_marginals
        Whether or not collect node probability of belonging
        to a specific partition.
    niter_collect
        Number of iterations to force when collecting marginals. This will
        increase the precision when calculating probabilites
    wait
        Number of iterations to wait for a record-breaking event.
        Higher values result in longer computations. Set it to small values
        when performing quick tests.
    nbreaks
        Number of iteration intervals (of size `wait`) without
        record-breaking events necessary to stop the algorithm.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.
    n_init
        Number of initial minimizations to be performed. The one with smaller
        entropy is chosen
    beta_range
        Inverse temperature at the beginning and the end of the equilibration
    steps_anneal
        Number of steps in which the simulated annealing is performed
    resume
        Start from a previously created model, if any, without initializing a novel
        model    
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['sbm']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    `adata.uns['sbm']['stats']`
        A dict with the values returned by mcmc_sweep
    `adata.uns['sbm']['cell_affinity']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['sbm']['state']`
        The BlockModel state object
    """

    # first things first
    check_gt_version()

    if resume:
        equilibrate = True

    if resume and (key_added not in adata.uns
                   or 'state' not in adata.uns[key_added]):
        # let the model proceed as default
        logg.warning('Resuming has been specified but a state was not found\n'
                     'Will continue with default minimization step')

        resume = False

    if random_seed:
        np.random.seed(random_seed)
        gt.seed_rng(random_seed)

    if collect_marginals:
        logg.warning('Collecting marginals has a large impact on running time')
        if not equilibrate:
            raise ValueError(
                "You can't collect marginals without MCMC equilibrate "
                "step. Either set `equlibrate` to `True` or "
                "`collect_marginals` to `False`")

    start = logg.info('minimizing the Planted Partition Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = get_graph_tool_from_adjacency(adjacency, directed=directed)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if resume:
        # create the state and make sure sampling is performed
        state = adata.uns[key_added]['state'].copy()
        g = state.g
    else:
        if n_init < 1:
            n_init = 1

        # initialize  the block states
        states = [gt.PPBlockState(g) for x in range(n_init)]

        # perform a mcmc sweep on each
        # no list comprehension as I need to collect stats

        _dS = np.zeros(n_init)
        _nattempts = np.zeros(n_init)
        _nmoves = np.zeros(n_init)
        for x in range(n_init):
            t_ds = 1
            while np.abs(t_ds) > tolerance:
                # perform sweep until a tolerance is reached
                t_ds, t_natt, t_nm = states[x].multiflip_mcmc_sweep(
                    beta=beta, niter=n_sweep)
                _dS[x] += t_ds
                _nattempts[x] += t_natt
                _nmoves[x] += t_nm

        _amin = np.argmin([s.entropy() for s in states])
        state = states[_amin]
        dS = _dS[_amin]
        nattempts = _nattempts[_amin]
        nmoves = _nmoves[_amin]

        logg.info('    done', time=start)

    # equilibrate the Markov chain
    if equilibrate:
        logg.info('running MCMC equilibration step')
        equilibrate_args['wait'] = wait
        equilibrate_args['nbreaks'] = nbreaks
        equilibrate_args['max_niter'] = max_iterations
        equilibrate_args['mcmc_args'] = {'niter': 10}

        dS, nattempts, nmoves = gt.mcmc_anneal(
            state,
            mcmc_equilibrate_args=equilibrate_args,
            niter=steps_anneal,
            beta_range=beta_range)

    if collect_marginals and equilibrate:
        # we here only retain level_0 counts, until I can't figure out
        # how to propagate correctly counts to higher levels
        # I wonder if this should be placed after group definition or not
        logg.info('    collecting marginals')
        group_marginals = np.zeros(g.num_vertices() + 1)

        def _collect_marginals(s):
            group_marginals[s.get_B()] += 1

        gt.mcmc_equilibrate(state,
                            wait=wait,
                            nbreaks=nbreaks,
                            epsilon=epsilon,
                            max_niter=max_iterations,
                            multiflip=True,
                            force_niter=niter_collect,
                            mcmc_args=dict(niter=10),
                            callback=_collect_marginals)
        logg.info('    done', time=start)

    # everything is in place, we need to fill all slots
    # first build an array with
    groups = pd.Series(state.get_blocks().get_array()).astype('category')
    new_cat_names = dict([(cx, u'%s' % cn)
                          for cn, cx in enumerate(groups.cat.categories)])
    groups.cat.rename_categories(new_cat_names, inplace=True)

    if restrict_to is not None:
        groups.index = adata.obs[restrict_key].index
    else:
        groups.index = adata.obs_names

    # add column names
    adata.obs.loc[:, key_added] = groups

    # add some unstructured info

    adata.uns[key_added] = {}
    adata.uns[key_added]['stats'] = dict(dS=dS,
                                         nattempts=nattempts,
                                         nmoves=nmoves,
                                         modularity=gt.modularity(
                                             g, state.get_blocks()))
    adata.uns[key_added]['state'] = state

    # now add marginal probabilities.

    if collect_marginals:
        # cell marginals will be a list of arrays with probabilities
        # of belonging to a specific group
        adata.uns[key_added]['group_marginals'] = group_marginals

    # calculate log-likelihood of cell moves over the remaining levels

    # adata.uns[key_added]['cell_affinity'] = {'1':get_cell_loglikelihood(state, as_prob=True, rescale=True)}

    # last step is recording some parameters used in this analysis
    adata.uns[key_added]['params'] = dict(epsilon=epsilon,
                                          wait=wait,
                                          nbreaks=nbreaks,
                                          equilibrate=equilibrate,
                                          collect_marginals=collect_marginals,
                                          random_seed=random_seed)

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {state.get_B()} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
Example #5
0
def leiden(
    adata: AnnData,
    resolution: float = 1,
    samples: int = 100,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_state: _utils.AnyRandom = 0,
    key_added: str = 'leiden',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = True,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    neighbors_key: Optional[str] = None,
    obsp: Optional[str] = None,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    copy: bool = False,
    save_model: Union[str, None] = None,
    dispatch_backend: Optional[str] = 'processes',
    **partition_kwargs,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Traag18]_.

    Cluster cells using the Leiden algorithm [Traag18]_,
    an improved version of the Louvain algorithm [Blondel08]_.
    It has been proposed for single-cell analysis by [Levine15]_.

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.


    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        A parameter value controlling the coarseness of the clustering.
        Higher values lead to more clusters.
        Set to `None` if overriding `partition_type`
        to one that doesn’t accept a `resolution_parameter`.
	samples
    samples
	The number of random samples to take for consensus        
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain `(obs_key, list_of_categories)`.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    n_iterations
        How many iterations of the Leiden clustering algorithm to perform.
        Positive values above 2 define the total number of iterations to perform,
        -1 has the algorithm run until it reaches its optimal clustering.
    partition_type
        Type of partition to use.
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`.
        For the available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, leiden looks .obsp['connectivities'] for connectivities
        (default storage place for pp.neighbors).
        If specified, leiden looks
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    obsp
        Use .obsp[obsp] as adjacency. You can't specify both
        `obsp` and `neighbors_key` at the same time.
    collect_marginals
    	Wheter to retrieve the marginal probability to belong to a group
    n_jobs
        Number of parallel jobs to calculate partitions
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to 
        be saved    
    **partition_kwargs
        Any further arguments to pass to `~leidenalg.find_partition`
        (which in turn passes arguments to the `partition_type`).


    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['leiden']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    """
    try:
        import leidenalg
    except ImportError:
        raise ImportError(
            'Please install the leiden algorithm: `conda install -c conda-forge leidenalg` or `pip3 install leidenalg`.'
        )
    partition_kwargs = dict(partition_kwargs)

    start = logg.info('running Leiden clustering')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = _choose_graph(adata, obsp, neighbors_key)
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g_gt = g.to_graph_tool()
    gt.remove_parallel_edges(g_gt)
    # flip to the default partition type if not overriden by the user
    if partition_type is None:
        partition_type = leidenalg.RBConfigurationVertexPartition
    # Prepare find_partition arguments as a dictionary,
    # appending to whatever the user provided. It needs to be this way
    # as this allows for the accounting of a None resolution
    # (in the case of a partition variant that doesn't take it on input)
    if use_weights:
        partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
    partition_kwargs['n_iterations'] = n_iterations
    np.random.seed(random_state)
    seeds = np.random.choice(range(0, samples**2), size=samples, replace=False)
    

    if resolution is not None:
        partition_kwargs['resolution_parameter'] = resolution
    # clustering proper
    def membership(g, partition_type, seed, **partition_kwargs):
        return leidenalg.find_partition(g, partition_type, 
                                        seed=seed, **partition_kwargs).membership
    
    parts = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
                    delayed(membership)(g, partition_type, 
                                        seeds[i], **partition_kwargs) 
                                        for i in range(samples))

    pmode = gt.PartitionModeState(parts, converge=True) 

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')    
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    groups = np.array(pmode.get_max(g_gt).get_array())     
    u_groups = np.unique(groups)
    n_groups = len(u_groups)
    last_group = np.max(u_groups) + 1
    if collect_marginals:
        pv_array = pmode.get_marginal(g_gt).get_2d_array(range(last_group)).T[:, u_groups] / samples
    # rename groups to ensure they are a continuous range
    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    # store output into adata.obs
        
    if restrict_to is not None:
        if key_added == 'leiden':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )
    if collect_marginals:
        adata.obsm[f"CM_{key_added}"] = pv_array
    # store information on the clustering parameters
    adata.uns['leiden'] = {}
    adata.uns['leiden']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
        samples=samples,
        collect_marginals=collect_marginals
    )
    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None