Example #1
import graph_tool.all as gt


def draw_lesmis():
    g = gt.collection.data["lesmis"]

    # This automatically initializes the state with a random partition into
    # B=20 nonempty groups; the user could also pass an arbitrary initial
    # partition using the 'b' parameter.
    state = gt.BlockState(g, B=20)

    # If we work with the above state object, we will be restricted to
    # partitions into at most B=20 groups. But since we want to consider
    # an arbitrary number of groups in the range [1, N], we transform it
    # into a state with B=N groups (where N-20 will be empty).

    state = state.copy(B=g.num_vertices())
    # Now we run 1,000 sweeps of the MCMC

    dS, nmoves = state.mcmc_sweep(niter=1000)

    print("Change in description length:", dS)
    print("Number of accepted vertex moves:", nmoves)

    # First equilibrate the Markov chain before collecting any marginals
    gt.mcmc_equilibrate(state, wait=1000, mcmc_args=dict(niter=10))

    # The vertex marginals are accumulated in pv, which must exist before
    # the callback first updates it
    pv = None

    def collect_marginals(s):
        nonlocal pv
        pv = s.collect_vertex_marginals(pv)

    # Now we collect the marginals for exactly 100,000 sweeps
    gt.mcmc_equilibrate(state,
                        force_niter=10000,
                        mcmc_args=dict(niter=10),
                        callback=collect_marginals)

    # Now the node marginals are stored in property map pv. We can
    # visualize them as pie charts on the nodes:
    state.draw(pos=g.vp.pos,
               vertex_shape="pie",
               vertex_pie_fractions=pv,
               edge_gradient=None,
               output="lesmis-sbm-marginals.pdf")
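For reference, a minimal standalone sketch of the same marginal-collection pattern, assuming only that graph-tool is installed; at module level the `global` declaration from the graph-tool documentation works as-is, and `pv` ends up holding per-vertex group-membership counts:

import graph_tool.all as gt

g = gt.collection.data["lesmis"]
state = gt.BlockState(g, B=g.num_vertices())

pv = None

def collect(s):
    global pv
    pv = s.collect_vertex_marginals(pv)

# a deliberately short run for illustration; real analyses need far more sweeps
gt.mcmc_equilibrate(state, force_niter=100, mcmc_args=dict(niter=10),
                    callback=collect)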
Example #2
from typing import Optional

import graph_tool.all as gt
from anndata import AnnData
from scipy.sparse import spmatrix
# get_igraph_from_adjacency is assumed to be the scanpy helper that builds
# an igraph graph from a sparse adjacency matrix.


def state_from_blocks(
    adata: AnnData,
    state_key: Optional[str] = 'nsbm',
    neighbors_key: Optional[str] = 'neighbors',
    adjacency: Optional[spmatrix] = None,
    directed: bool = False,
    use_weights: bool = False,
    deg_corr: bool = True,
):
    """
    Returns a graph-tool state object given an AnnData

    Parameters
    ----------
    adata
        The annotated data matrix.
    state_key
        The key under which the state has been saved
    neighbors_key
        The key passed to `sc.pp.neighbors`
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[adata.uns[neighbors_key]['connectivities_key']]` for scanpy>1.4.6
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    deg_corr
        Whether to use degree correction in the minimization step. Degree
        correction is appropriate for many real-world networks, although it
        does not seem to help for the kNN graphs used in scanpy.

    Returns
    -------
    A graph-tool block state object (`gt.BlockState`, `gt.PPBlockState` or
    `gt.NestedBlockState`), rebuilt from the blocks stored in
    `adata.uns['schist']`.
    """
    bl_d = adata.uns['schist'][f'{state_key}']['blocks']
    params = adata.uns['schist'][f'{state_key}']['params']
    if params['model'] in ('nested', 'multiome_nested'):
        blocks = [bl_d[str(nl)] for nl in range(len(bl_d))]
    else:
        blocks = bl_d['0']
    
    if 'deg_corr' in params:
        deg_corr = params['deg_corr']
            
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']

    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    # edge covariates can only be built once the graph exists
    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal; possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']
    # parameters stored with the state take precedence
    if 'recs' in params:
        recs = params['recs']
    if 'rec_types' in params:
        rec_types = params['rec_types']

    if params['model'] == 'flat':
        # gt.BlockState takes these arguments directly, not via state_args
        state = gt.BlockState(g, b=blocks,
                              deg_corr=deg_corr,
                              recs=recs,
                              rec_types=rec_types)
    elif params['model'] == 'ppbm':
        # gt.PPBlockState only accepts the graph and the partition
        state = gt.PPBlockState(g, b=blocks)
    else:
        state = gt.NestedBlockState(g, bs=blocks,
                                    state_args=dict(deg_corr=deg_corr,
                                                    recs=recs,
                                                    rec_types=rec_types))
    return state
    
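A hedged usage sketch: `adata` stands for an AnnData object that schist has already processed, so that `adata.uns['schist']['nsbm']` holds the stored blocks and parameters (the names below are illustrative, not prescriptive):

state = state_from_blocks(adata, state_key='nsbm', neighbors_key='neighbors')
print(state)  # e.g. a gt.NestedBlockState rebuilt from the stored blocks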
Example #3
from typing import Any, Optional

import graph_tool.all as gt
import pandas as pd
from anndata import AnnData
from scipy import sparse
from scanpy import logging as logg
# _choose_graph, get_igraph_from_adjacency, state_from_blocks,
# get_cell_back_p and get_cell_loglikelihood are assumed to be
# schist/scanpy helpers available in this module's namespace.


def calculate_affinity(adata: AnnData,
                       level: int = 1,
                       block_key: Optional[str] = 'nsbm',
                       group_by: Optional[str] = None,
                       state: Optional[Any] = None,
                       neighbors_key: Optional[str] = 'neighbors',
                       adjacency: Optional[sparse.spmatrix] = None,
                       directed: bool = False,
                       use_weights: bool = False,
                       obsp: Optional[str] = None,
                       back_prob: bool = False,
                       copy: bool = False) -> Optional[AnnData]:
    """\
    Calculate cell affinity given a partition scheme. It can be used for
    partitions calculated using schist or for any partition scheme, given
    for example by cell annotations.

    Parameters
    ----------
    adata
        The AnnData object. Should have been already processed with schist
    level
        The level to calculate affinity. This parameter is effective
        only for Nested partitions
    block_key
        The prefix for partitions. This parameter is ignored if the state
        is not gt.NestedBlockState
    group_by
        The key for group names used for calculations. Setting this will
        override level and block_key. This is effective only for
        NestedBlockState partitions
    state
        Optionally calculate affinities on this state.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, the function looks at .obsp['connectivities']
        for connectivities (default storage place for pp.neighbors).
        If specified, it looks at
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    obsp
        Use `.obsp[obsp]` as adjacency. You can't specify both `obsp` and
        `neighbors_key` at the same time.
    back_prob
        Whether to compute affinities from the posterior group probabilities
        instead of the log-likelihood of cell moves.
    copy
        Return a new object or do everything in place

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with affinity values
    in adata.obsm[f'CA_{block_key}_level_{level}']
    """

    matrix_key = f'CA_{block_key}_level_{level}'  # the default name of the matrix
    if group_by:
        logg.info(f'Calculating cell affinity to {group_by}')
    else:
        logg.info(f'Calculating cell affinity to level {level}')

    if not state:
        # if no state is provided, use the stored parameters to rebuild it
        if ('schist' in adata.uns
                and 'blocks' in adata.uns['schist'][f'{block_key}']):
            params = adata.uns['schist'][f'{block_key}']['params']
            deg_corr = True  # default, possibly overridden by stored params
            if 'neighbors_key' in params:
                neighbors_key = params['neighbors_key']
            if 'use_weights' in params:
                use_weights = params['use_weights']
            if 'deg_corr' in params:
                deg_corr = params['deg_corr']
            state = state_from_blocks(adata,
                                      state_key=block_key,
                                      neighbors_key=neighbors_key,
                                      adjacency=adjacency,
                                      directed=directed,
                                      use_weights=use_weights,
                                      deg_corr=deg_corr)
            g = state.g
        elif not neighbors_key:
            # no state and no adjacency provided, raise an error
            raise ValueError("A state or an adjacency matrix should be given. "
                             "Otherwise a graph cannot be computed")
        else:
            # get the graph from the adjacency
            adjacency = _choose_graph(adata, obsp, neighbors_key)
            g = get_igraph_from_adjacency(adjacency, directed=directed)
            g = g.to_graph_tool()
            gt.remove_parallel_edges(g)
            state = gt.BlockState(g)
    else:
        g = state.g

    if group_by:
        matrix_key = f'CA_{group_by}'
        # if groups are given, we generate a new BlockState and work on that
        if (group_by in adata.obs.columns
                and adata.obs[group_by].dtype.name == 'category'):
            partitions = adata.obs[group_by].cat.codes.values
            state = gt.BlockState(g, b=partitions)
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
        else:
            raise ValueError(
                f"{group_by} should be a categorical entry in adata.obs")
    else:
        # use precomputed blocks and states
        if isinstance(state, gt.NestedBlockState):
            if back_prob:
                p0 = get_cell_back_p(state, level=0)
            else:
                p0 = get_cell_loglikelihood(state, level=0, as_prob=True)
            # group_by is None in this branch, so the grouping column comes
            # from the block key and the requested level
            group_col = None
            g_name = f'{block_key}_level_{level}'
            if g_name in adata.obs.columns:
                group_col = g_name
            if not group_col:
                raise ValueError(
                    "The provided groups or level/blocks do not exist")

            g0 = pd.Categorical(state.project_partition(0, 0).a)
            cross_tab = pd.crosstab(g0,
                                    adata.obs[group_col],
                                    normalize='index')
            ca_matrix = (p0 @ cross_tab).values

        elif isinstance(state, gt.PPBlockState):
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
            matrix_key = 'CA_ppbm'
        else:
            # plain gt.BlockState, e.g. one built from the adjacency above
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)

    adata.obsm[matrix_key] = ca_matrix

    return adata if copy else None
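A sketch of typical calls, assuming `adata` was processed with schist's nested model under the default 'nsbm' key and carries a categorical 'cell_type' annotation (both assumptions, for illustration only):

calculate_affinity(adata, level=1, block_key='nsbm')
aff = adata.obsm['CA_nsbm_level_1']  # (n_cells, n_groups) affinity matrix

calculate_affinity(adata, group_by='cell_type')
aff_ct = adata.obsm['CA_cell_type']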
Example #4
import graph_tool.all as gt
import numpy as np
# pp_virtual_vertex_move is assumed to be the schist helper that evaluates
# a virtual vertex move on a gt.PPBlockState.


def get_cell_loglikelihood(state,
                           level: int = 0,
                           rescale: bool = False,
                           as_prob: bool = False):
    """
    Returns the matrix of log-likelihood differences
    when moving a cell into a different block
    
    Parameters
    ----------
    state
        A graph-tool BlockState or NestedBlockState object
    level
        The level in NestedBlockState to consider
    rescale
        For some models, moving a cell into a different block may result in a 
        negative log-likelihood, indicating that cells may be better assigned 
        to another group. Set this parameter to `True` if you want 
        every cell to have LL=0 for the best group and avoid negative values
    as_prob
        Return values as probabilities

    Returns
    -------
    `M`
        Array of dim (n_cells, n_blocks) that stores the entropy difference
        of moving a cell into a specific group
    """

    # get the graph from state
    g = state.g

    if isinstance(state, gt.NestedBlockState):
        if level < 0 or level >= len(state.get_levels()):
            # fall back to the lowest level if an invalid one is requested
            level = 0
        B = gt.BlockState(g, b=state.project_partition(level, 0))
    else:
        B = state

    n_cells = g.num_vertices()
    if isinstance(B, gt.BlockState):
        n_blocks = B.get_nonempty_B()
        shape = (n_cells, n_blocks)
        M = np.array([
            B.virtual_vertex_move(v, s) for v in range(n_cells)
            for s in range(n_blocks)
        ]).reshape(shape)
    elif isinstance(B, gt.PPBlockState):
        blocks = np.unique(B.get_blocks().get_array())
        n_blocks = len(blocks)
        M = np.array([
            pp_virtual_vertex_move(B, v, s) for v in range(n_cells)
            for s in blocks
        ]).reshape((n_cells, n_blocks))

    if rescale:
        # some cells may be better off in other groups, hence their LL
        # is negative when moved. Rescaling sets LL=0 for the best group
        # of each cell
        M = M - np.min(M, axis=1)[:, None]

    if as_prob:
        # Boltzmann weighting: lower entropy difference -> higher probability
        E = np.exp(-M)
        return E / np.sum(E, axis=1)[:, None]

    return M
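The `as_prob` branch is a Boltzmann weighting of the entropy differences: rows are normalized so each cell's probabilities sum to one, and the group with the smallest entropy change gets the largest probability. A toy check of that normalization, independent of graph-tool:

import numpy as np

M = np.array([[0.0, 2.0, 4.0]])      # toy entropy differences for one cell
E = np.exp(-M)
P = E / np.sum(E, axis=1)[:, None]
assert np.isclose(P.sum(), 1.0)      # the row sums to one
assert P.argmax() == M.argmin()      # best (lowest-dS) group is most probable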
Example #5
from typing import Dict, Optional, Sequence, Tuple

import graph_tool.all as gt
import numpy as np
import pandas as pd
from anndata import AnnData
from scipy import sparse
from scanpy import logging as logg
# restrict_adjacency, get_graph_tool_from_adjacency and
# get_cell_loglikelihood are assumed to be schist/scanpy helpers
# available in this module's namespace.


def flat_model(
    adata: AnnData,
    max_iterations: int = 1000000,
    epsilon: float = 0,
    equilibrate: bool = False,
    wait: int = 1000,
    nbreaks: int = 2,
    collect_marginals: bool = False,
    niter_collect: int = 10000,
    deg_corr: bool = True,
    multiflip: bool = True,
    fast_model: bool = False,
    n_init: int = 1,
    beta_range: Tuple[float, float] = (1., 100.),
    steps_anneal: int = 5,
    resume: bool = False,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'sbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    minimize_args: Optional[Dict] = {},
    equilibrate_args: Optional[Dict] = {},    
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the Stochastic Block Model [Peixoto14]_, performing
    Bayesian inference on node groups.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    max_iterations
        Maximal number of iterations to be performed by the equilibrate step.
    epsilon
        Relative changes in entropy smaller than epsilon will
        not be considered as record-breaking.
    equilibrate
        Whether or not to perform the mcmc_equilibrate step.
        Equilibration should always be performed. Note, also, that without
        equilibration it won't be possible to collect marginals.
    collect_marginals
        Whether or not to collect the node probability of belonging
        to a specific partition.
    niter_collect
        Number of iterations to force when collecting marginals. This will
        increase the precision when calculating probabilities
    wait
        Number of iterations to wait for a record-breaking event.
        Higher values result in longer computations. Set it to small values
        when performing quick tests.
    nbreaks
        Number of iteration intervals (of size `wait`) without
        record-breaking events necessary to stop the algorithm.
    deg_corr
        Whether to use degree correction in the minimization step. Degree
        correction is appropriate for many real-world networks, although it
        does not seem to help for the kNN graphs used in scanpy.
    multiflip
        Whether to perform MCMC sweep with multiple simultaneous moves to sample
        network partitions. It may result in slightly longer runtimes, but under
        the hood it allows for a more efficient space exploration.
    fast_model
        Whether to skip the initial minimization step and let the MCMC find a
        solution. This approach tends to be faster and consume less memory,
        but is less accurate.
    n_init
        Number of initial minimizations to be performed. The one with the
        smallest entropy is chosen
    beta_range
        Inverse temperature at the beginning and the end of the equilibration
    steps_anneal
        Number of steps in which the simulated annealing is performed
    resume
        Start from a previously created model, if any, without initializing a
        new model
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[adata.uns[neighbors_key]['connectivities_key']]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Integer to be used as a seed for graph-tool's random number generator

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['sbm']['params']`
        A dict with the values for the parameters `epsilon`, `wait`,
        `nbreaks`, `equilibrate`, `fast_model`, `collect_marginals` and
        `random_seed`.
    `adata.uns['sbm']['stats']`
        A dict with the values returned by mcmc_sweep
    `adata.uns['sbm']['cell_affinity']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['sbm']['state']`
        The BlockModel state object
    """

    raise DeprecationWarning(
        "This function has been deprecated since version 0.5.0; "
        "please consider using planted_model instead."
    )

    if fast_model or resume:
        # if the fast model is chosen, perform equilibration anyway
        equilibrate = True
        
    if resume and ('sbm' not in adata.uns or 'state' not in adata.uns['sbm']):
        # let the model proceed as default
        logg.warning('Resuming has been specified but a state was not found\n'
                     'Will continue with default minimization step')

        resume = False
        fast_model = False

    if random_seed is not None:
        np.random.seed(random_seed)
        gt.seed_rng(random_seed)

    if collect_marginals:
        logg.warning('Collecting marginals has a large impact on running time')
        if not equilibrate:
            raise ValueError(
                "You can't collect marginals without the MCMC equilibrate "
                "step. Either set `equilibrate` to `True` or "
                "`collect_marginals` to `False`"
            )

    start = logg.info('minimizing the Stochastic Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to a graph-tool graph
    g = get_graph_tool_from_adjacency(adjacency, directed=directed)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal; possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if fast_model:
        # do not minimize; start with a single-block state and let the
        # equilibration step find the partition
        state = gt.BlockState(g=g, B=1,
                              deg_corr=deg_corr,
                              recs=recs,
                              rec_types=rec_types)
    elif resume:
        # reuse the previously stored state
        state = adata.uns['sbm']['state'].copy()
        g = state.g
    else:
        if n_init < 1:
            n_init = 1
        
        # flat model: use the non-nested minimization
        states = [gt.minimize_blockmodel_dl(g, deg_corr=deg_corr,
                  state_args=dict(recs=recs, rec_types=rec_types),
                  **minimize_args) for n in range(n_init)]

        state = states[np.argmin([s.entropy() for s in states])]

        logg.info('    done', time=start)
        state = state.copy(B=g.num_vertices())
    
    # equilibrate the Markov chain
    if equilibrate:
        logg.info('running MCMC equilibration step')
        equilibrate_args['wait'] = wait
        equilibrate_args['nbreaks'] = nbreaks
        equilibrate_args['max_niter'] = max_iterations
        equilibrate_args['multiflip'] = multiflip
        equilibrate_args['mcmc_args'] = {'niter':10}
        
        dS, nattempts, nmoves = gt.mcmc_anneal(state, 
                                               mcmc_equilibrate_args=equilibrate_args,
                                               niter=steps_anneal,
                                               beta_range=beta_range)

    if collect_marginals and equilibrate:
        # group marginals: count how often the chain visits a partition
        # with a given number of non-empty groups
        logg.info('    collecting marginals')
        group_marginals = np.zeros(g.num_vertices() + 1)

        def _collect_marginals(s):
            group_marginals[s.get_nonempty_B()] += 1

        gt.mcmc_equilibrate(state, wait=wait, nbreaks=nbreaks, epsilon=epsilon,
                            max_niter=max_iterations, multiflip=False,
                            force_niter=niter_collect, mcmc_args=dict(niter=10),
                            callback=_collect_marginals)
        logg.info('    done', time=start)

    # everything is in place, we need to fill all slots
    # first build a categorical series with the group labels
    groups = pd.Series(state.get_blocks().get_array()).astype('category')
    new_cat_names = {cx: str(cn) for cn, cx in enumerate(groups.cat.categories)}
    groups = groups.cat.rename_categories(new_cat_names)

    if restrict_to is not None:
        groups.index = adata.obs[restrict_key].index
    else:
        groups.index = adata.obs_names

    # store the cluster labels in obs
    adata.obs.loc[:, key_added] = groups

    # add some unstructured info

    adata.uns['sbm'] = {}
    if equilibrate:
        # these statistics only exist if the annealing step has run
        adata.uns['sbm']['stats'] = dict(
            dS=dS,
            nattempts=nattempts,
            nmoves=nmoves,
            modularity=gt.modularity(g, state.get_blocks())
        )
    adata.uns['sbm']['state'] = state

    # now add marginal probabilities.

    if collect_marginals:
        # group_marginals holds how often a partition with a given
        # number of non-empty groups was visited
        adata.uns['sbm']['group_marginals'] = group_marginals

    # calculate the log-likelihood of cell moves and store it as cell affinity

    adata.uns['sbm']['cell_affinity'] = {'1': get_cell_loglikelihood(state, as_prob=True)}
    
    # last step is recording some parameters used in this analysis
    adata.uns['sbm']['params'] = dict(
        epsilon=epsilon,
        wait=wait,
        nbreaks=nbreaks,
        equilibrate=equilibrate,
        fast_model=fast_model,
        collect_marginals=collect_marginals,
        random_seed=random_seed
    )


    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {state.get_nonempty_B()} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
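Since the function now raises immediately, calling it mainly documents the migration path; a sketch, with the call to the suggested replacement left commented out because its exact signature is not shown here:

try:
    flat_model(adata, key_added='sbm', random_seed=42)
except DeprecationWarning as e:
    print(e)
# hypothetical migration, following the deprecation message:
# schist.inference.planted_model(adata, key_added='ppbm')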