import numpy as np
import pandas as pd
import graph_tool.all as gt
from anndata import AnnData
from scipy import sparse
from scipy.sparse import spmatrix
from typing import Optional, Tuple, Sequence, Dict

# Helper functions referenced below (get_igraph_from_adjacency,
# get_graph_tool_from_adjacency, _choose_graph, restrict_adjacency,
# get_cell_back_p, pp_virtual_vertex_move, logg) are assumed to be imported
# from the package's utility modules and from scanpy.

pv = None  # vertex marginals collected by draw_lesmis() below


def draw_lesmis():
    g = gt.collection.data["lesmis"]

    state = gt.BlockState(g, B=20)  # This automatically initializes the state
                                    # with a random partition into B=20
                                    # nonempty groups; the user could also
                                    # pass an arbitrary initial partition
                                    # using the 'b' parameter.

    # If we work with the above state object, we will be restricted to
    # partitions into at most B=20 groups. But since we want to consider
    # an arbitrary number of groups in the range [1, N], we transform it
    # into a state with B=N groups (where N-20 will be empty).
    print('num v:', g.num_vertices())
    state = state.copy(B=g.num_vertices())

    # Now we run 1,000 sweeps of the MCMC
    dS, nmoves = state.mcmc_sweep(niter=1000)

    print("Change in description length:", dS)
    print("Number of accepted vertex moves:", nmoves)

    gt.mcmc_equilibrate(state, wait=1000, mcmc_args=dict(niter=10))

    def collect_marginals(s):
        global pv
        pv = s.collect_vertex_marginals(pv)
        print(pv)

    # Now we collect the marginals for exactly 100,000 sweeps
    gt.mcmc_equilibrate(state, force_niter=10000, mcmc_args=dict(niter=10),
                        callback=collect_marginals)

    print(g.vp.pos)
    # Now the node marginals are stored in property map pv. We can
    # visualize them as pie charts on the nodes:
    state.draw(pos=g.vp.pos, vertex_shape="pie", vertex_pie_fractions=pv,
               edge_gradient=None, output="lesmis-sbm-marginals.pdf")
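# Hedged sketch of how the collected vertex marginals could be turned into a
# per-node probability matrix (similar in spirit to the cell-affinity matrices
# used further below). `marginals_to_matrix` is a hypothetical helper, not part
# of the original module; it assumes `pv` holds the counts collected by
# draw_lesmis() and `g` is the corresponding graph.
def marginals_to_matrix(g, pv):
    # each pv[v] is a vector of visit counts per block; pad to a common width
    counts = [np.asarray(pv[v], dtype=float) for v in g.vertices()]
    width = max(len(c) for c in counts)
    M = np.zeros((g.num_vertices(), width))
    for i, c in enumerate(counts):
        M[i, :len(c)] = c
    # normalize each row so it sums to 1 (marginal posterior per node)
    return M / M.sum(axis=1, keepdims=True)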
def state_from_blocks(
    adata: AnnData,
    state_key: Optional[str] = 'nsbm',
    neighbors_key: Optional[str] = 'neighbors',
    adjacency: Optional[spmatrix] = None,
    directed: bool = False,
    use_weights: bool = False,
    deg_corr: bool = True,
):
    """
    Returns a gt state object given an AnnData

    Parameters
    ----------
    adata
        The annotated data matrix.
    state_key
        The key under which the state has been saved
    neighbors_key
        The key passed to `sc.pp.neighbors`
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this increases
        computation times
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.

    Returns
    -------
    The graph-tool block state (`gt.BlockState`, `gt.PPBlockState` or
    `gt.NestedBlockState`) rebuilt from the blocks stored in `adata.uns`.
    """
    bl_d = adata.uns['schist'][f'{state_key}']['blocks']
    params = adata.uns['schist'][f'{state_key}']['params']
    if params['model'] == 'nested' or params['model'] == 'multiome_nested':
        blocks = []
        for nl in range(len(bl_d)):
            blocks.append(bl_d[str(nl)])
    else:
        blocks = bl_d['0']

    if 'deg_corr' in params:
        deg_corr = params['deg_corr']

    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']

    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed. Note: the graph must be built before
        # its edge weights can be used as covariates.
        recs = [g.ep.weight]
        rec_types = ['real-normal']
    if 'recs' in params:
        recs = params['recs']
    if 'rec_types' in params:
        rec_types = params['rec_types']

    if params['model'] == 'flat':
        state = gt.BlockState(g, b=blocks,
                              state_args=dict(deg_corr=deg_corr,
                                              recs=recs,
                                              rec_types=rec_types)
                              )
    elif params['model'] == 'ppbm':
        state = gt.PPBlockState(g, b=blocks,
                                state_args=dict(deg_corr=deg_corr,
                                                recs=recs,
                                                rec_types=rec_types)
                                )
    else:
        state = gt.NestedBlockState(g, bs=blocks,
                                    state_args=dict(deg_corr=deg_corr,
                                                    recs=recs,
                                                    rec_types=rec_types)
                                    )
    return state
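# Hedged usage sketch (not part of the original module): rebuild a previously
# saved state and inspect it. Assumes `adata` was already processed with schist
# so that adata.uns['schist']['nsbm'] holds 'blocks' and 'params'.
def _example_state_from_blocks(adata: AnnData):
    state = state_from_blocks(adata, state_key='nsbm')
    print(type(state))       # e.g. gt.NestedBlockState for a nested model
    print(state.entropy())   # description length of the stored partition
    return state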
def calculate_affinity(
    adata: AnnData,
    level: int = 1,
    block_key: Optional[str] = 'nsbm',
    group_by: Optional[str] = None,
    state: Optional = None,
    neighbors_key: Optional[str] = 'neighbors',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = False,
    use_weights: bool = False,
    obsp: Optional[str] = None,
    back_prob: bool = False,
    copy: bool = False
) -> Optional[AnnData]:
    """\
    Calculate cell affinity given a partition scheme. It can be used for
    partitions calculated using schist or for any partition scheme, given
    for example by cell annotations.

    Parameters
    ----------
    adata
        The AnnData object. Should have been already processed with schist.
    level
        The level to calculate affinity. This parameter is effective only
        for Nested partitions.
    block_key
        The prefix for partitions. This parameter is ignored if the state
        is not a gt.NestedBlockState.
    group_by
        The key for group names used for calculations. Setting this will
        override level and block_key. This is effective only for
        NestedBlockState partitions.
    state
        Optionally calculate affinities on this state.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, schist looks at .obsp['connectivities'] for
        connectivities (default storage place for pp.neighbors).
        If specified, schist looks at
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors
        connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    obsp
        Key in `adata.obsp` to use as adjacency instead of the neighbors
        connectivities.
    back_prob
        If `True`, use backward probabilities (`get_cell_back_p`) instead of
        the forward log-likelihoods.
    copy
        Return a new object or do everything in place.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with affinity values
    in adata.obsm[f'CA_{block_key}_level_{level}']
    """

    matrix_key = f'CA_{block_key}_level_{level}'  # the default name of the matrix
    if group_by:
        logg.info(f'Calculating cell affinity to {group_by}')
    else:
        logg.info(f'Calculating cell affinity to level {level}')

    adata = adata.copy() if copy else adata

    if not state:
        # if no state is provided, use the default to retrieve the graph
        if 'schist' in adata.uns and 'blocks' in adata.uns['schist'][f'{block_key}']:
            params = adata.uns['schist'][f'{block_key}']['params']
            deg_corr = True  # default, possibly overridden by the stored params
            if 'neighbors_key' in params:
                neighbors_key = params['neighbors_key']
            if 'use_weights' in params:
                use_weights = params['use_weights']
            if 'deg_corr' in params:
                deg_corr = params['deg_corr']
            state = state_from_blocks(adata,
                                      state_key=block_key,
                                      neighbors_key=neighbors_key,
                                      adjacency=adjacency,
                                      directed=directed,
                                      use_weights=use_weights,
                                      deg_corr=deg_corr)
            g = state.g
        elif not neighbors_key:
            # no state and no adjacency provided, raise an error
            raise ValueError("A state or an adjacency matrix should be given. "
                             "Otherwise a graph cannot be computed")
        else:
            # get the graph from the adjacency
            adjacency = _choose_graph(adata, obsp, neighbors_key)
            g = get_igraph_from_adjacency(adjacency, directed=directed)
            g = g.to_graph_tool()
            gt.remove_parallel_edges(g)
            state = gt.BlockState(g)
    else:
        g = state.g

    if group_by:
        matrix_key = f'CA_{group_by}'
        # if groups are given, we generate a new BlockState and work on that
        if group_by in adata.obs.columns and adata.obs[group_by].dtype.name == 'category':
            partitions = adata.obs[group_by].cat.codes.values
            state = gt.BlockState(g, b=partitions)
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
        else:
            raise ValueError(f"{group_by} should be a categorical entry in adata.obs")
    else:
        # use precomputed blocks and states
        if type(state) == gt.NestedBlockState:
            if back_prob:
                p0 = get_cell_back_p(state, level=0)
            else:
                p0 = get_cell_loglikelihood(state, level=0, as_prob=True)

            group_col = None
            if group_by and group_by in adata.obs.columns:
                group_col = group_by
            else:
                g_name = f'{block_key}_level_{level}'
                if g_name in adata.obs.columns:
                    group_col = g_name
            if not group_col:
                raise ValueError("The provided groups or level/blocks do not exist")

            g0 = pd.Categorical(state.project_partition(0, 0).a)
            cross_tab = pd.crosstab(g0, adata.obs[group_col], normalize='index')
            ca_matrix = (p0 @ cross_tab).values

        elif type(state) == gt.PPBlockState:
            if back_prob:
                ca_matrix = get_cell_back_p(state)
            else:
                ca_matrix = get_cell_loglikelihood(state, as_prob=True)
            matrix_key = 'CA_ppbm'

    adata.obsm[matrix_key] = ca_matrix

    return adata if copy else None
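# Hedged usage sketch (not part of the original module): compute affinities to
# an existing categorical annotation. Assumes neighbors have been computed and
# that adata.obs['cell_type'] exists and is categorical (the annotation name is
# hypothetical).
def _example_calculate_affinity(adata: AnnData):
    calculate_affinity(adata, group_by='cell_type')
    aff = adata.obsm['CA_cell_type']   # shape (n_cells, n_groups)
    print(aff.sum(axis=1)[:5])         # rows sum to ~1 (probabilities)
    return aff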
def get_cell_loglikelihood(state,
                           level: int = 0,
                           rescale: bool = False,
                           as_prob: bool = False
                           ):
    """
    Returns the matrix of log-likelihood differences when moving a cell into
    a different block

    Parameters
    ----------
    state
        A graph-tool BlockState or NestedBlockState object
    level
        The level in NestedBlockState to consider
    rescale
        For some models, moving a cell into a different block may result in a
        negative log-likelihood, indicating that the cell may be better
        assigned to another group. Set this parameter to `True` if you want
        every cell to have LL=0 for the best group and avoid negative values
    as_prob
        Return values as probabilities

    Returns
    -------
    `M`
        Array of dim (n_cells, n_blocks) that stores the entropy difference
        of moving a cell into a specific group
    """

    # get the graph from the state
    g = state.g

    if isinstance(state, gt.NestedBlockState):
        if level < 0 or level > len(state.get_levels()):
            # for now, fall back to the lowest level if an invalid one is given
            level = 0
        B = gt.BlockState(g, b=state.project_partition(level, 0))
    else:
        B = state

    n_cells = g.num_vertices()

    if isinstance(B, gt.BlockState):
        n_blocks = B.get_nonempty_B()
        shape = (n_cells, n_blocks)
        M = np.array([B.virtual_vertex_move(v, s)
                      for v in range(n_cells)
                      for s in range(n_blocks)]).reshape(shape)
    elif isinstance(B, gt.PPBlockState):
        # the number of blocks is the number of distinct labels,
        # not the number of vertices
        used_blocks = np.unique(B.get_blocks().get_array())
        n_blocks = len(used_blocks)
        M = np.array([pp_virtual_vertex_move(B, v, s)
                      for v in range(n_cells)
                      for s in used_blocks]).reshape((n_cells, n_blocks))

    if rescale:
        # some cells may be better in other groups, hence their LL
        # is negative when moved. Rescaling sets LL=0 for the best group
        M = M - np.min(M, axis=1)[:, None]

    if as_prob:
        E = np.exp(-M)
        return (E / np.sum(E, axis=1)[:, None])

    return M
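# Minimal sketch of the `as_prob` conversion used above: entropy differences M
# (lower = better fit) are mapped to probabilities with a softmax over -M.
# Toy numbers only, for illustration.
def _example_as_prob():
    M = np.array([[0.0, 2.3, 5.1],
                  [1.2, 0.0, 0.4]])   # (n_cells=2, n_blocks=3)
    E = np.exp(-M)
    P = E / E.sum(axis=1)[:, None]
    print(P.sum(axis=1))              # each row sums to 1
    return P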
def flat_model(
    adata: AnnData,
    max_iterations: int = 1000000,
    epsilon: float = 0,
    equilibrate: bool = False,
    wait: int = 1000,
    nbreaks: int = 2,
    collect_marginals: bool = False,
    niter_collect: int = 10000,
    deg_corr: bool = True,
    multiflip: bool = True,
    fast_model: bool = False,
    n_init: int = 1,
    beta_range: Tuple[float] = (1., 100.),
    steps_anneal: int = 5,
    resume: bool = False,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'sbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    minimize_args: Optional[Dict] = {},
    equilibrate_args: Optional[Dict] = {},
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the Stochastic Block Model [Peixoto14]_, performing
    Bayesian inference on node groups. This requires having run
    :func:`~scanpy.pp.neighbors` or :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    max_iterations
        Maximal number of iterations to be performed by the equilibrate step.
    epsilon
        Relative changes in entropy smaller than epsilon will not be
        considered as record-breaking.
    equilibrate
        Whether or not to perform the mcmc_equilibrate step. Equilibration
        should always be performed. Note, also, that without equilibration
        it won't be possible to collect marginals.
    collect_marginals
        Whether or not to collect the node probability of belonging to a
        specific partition.
    niter_collect
        Number of iterations to force when collecting marginals. This will
        increase the precision when calculating probabilities.
    wait
        Number of iterations to wait for a record-breaking event. Higher
        values result in longer computations. Set it to small values when
        performing quick tests.
    nbreaks
        Number of iteration intervals (of size `wait`) without record-breaking
        events necessary to stop the algorithm.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.
    multiflip
        Whether to perform MCMC sweep with multiple simultaneous moves to
        sample network partitions. It may result in slightly longer runtimes,
        but under the hood it allows for a more efficient space exploration.
    fast_model
        Whether to skip the initial minimization step and let the MCMC find a
        solution. This approach tends to be faster and consume less memory,
        but is less accurate.
    n_init
        Number of initial minimizations to be performed. The one with the
        smallest entropy is chosen.
    beta_range
        Inverse temperature at the beginning and the end of the equilibration.
    steps_anneal
        Number of steps in which the simulated annealing is performed.
    resume
        Start from a previously created model, if any, without initializing
        a novel model.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6.
    neighbors_key
        The key passed to `sc.pp.neighbors`.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this increases
        computation times.
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool.

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['sbm']['params']`
        A dict with the parameter values used for this run (`epsilon`, `wait`,
        `nbreaks`, `equilibrate`, `fast_model`, `collect_marginals`,
        `random_seed`).
    `adata.uns['sbm']['stats']`
        A dict with the values returned by mcmc_sweep.
    `adata.uns['sbm']['cell_affinity']`
        A `np.ndarray` with cell probability of belonging to a specific group.
    `adata.uns['sbm']['state']`
        The BlockModel state object.
    """

    raise DeprecationWarning("""This function has been deprecated since version
    0.5.0, please consider usage of planted_model instead.
    """)

    if fast_model or resume:
        # if the fast_model is chosen perform equilibration anyway
        equilibrate = True

    if resume and ('sbm' not in adata.uns or 'state' not in adata.uns['sbm']):
        # let the model proceed as default
        logg.warning('Resuming has been specified but a state was not found\n'
                     'Will continue with default minimization step')
        resume = False
        fast_model = False

    if random_seed:
        np.random.seed(random_seed)
        gt.seed_rng(random_seed)

    if collect_marginals:
        logg.warning('Collecting marginals has a large impact on running time')
        if not equilibrate:
            raise ValueError(
                "You can't collect marginals without the MCMC equilibrate "
                "step. Either set `equilibrate` to `True` or "
                "`collect_marginals` to `False`"
            )

    start = logg.info('minimizing the Stochastic Block Model')
    adata = adata.copy() if copy else adata

    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert the adjacency to a graph-tool graph
    g = get_graph_tool_from_adjacency(adjacency, directed=directed)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if fast_model:
        # do not minimize, start with a dummy state and perform only equilibrate
        state = gt.BlockState(g=g, B=1, sampling=True,
                              state_args=dict(deg_corr=deg_corr,
                                              recs=recs,
                                              rec_types=rec_types
                                              ))
    elif resume:
        # create the state and make sure sampling is performed
        state = adata.uns['sbm']['state'].copy(sampling=True)
        g = state.g
    else:
        if n_init < 1:
            n_init = 1
        # a flat model is minimized here, hence minimize_blockmodel_dl
        states = [gt.minimize_blockmodel_dl(g, deg_corr=deg_corr,
                                            state_args=dict(recs=recs,
                                                            rec_types=rec_types),
                                            **minimize_args)
                  for n in range(n_init)]
        state = states[np.argmin([s.entropy() for s in states])]
        logg.info('    done', time=start)
        state = state.copy(B=g.num_vertices())

    # default stats, overwritten by the equilibration step below
    dS, nattempts, nmoves = np.nan, np.nan, np.nan

    # equilibrate the Markov chain
    if equilibrate:
        logg.info('running MCMC equilibration step')
        equilibrate_args['wait'] = wait
        equilibrate_args['nbreaks'] = nbreaks
        equilibrate_args['max_niter'] = max_iterations
        equilibrate_args['multiflip'] = multiflip
        equilibrate_args['mcmc_args'] = {'niter': 10}
        dS, nattempts, nmoves = gt.mcmc_anneal(state,
                                               mcmc_equilibrate_args=equilibrate_args,
                                               niter=steps_anneal,
                                               beta_range=beta_range)

    if collect_marginals and equilibrate:
        # here we only retain the group-size counts, until I can figure out
        # how to propagate counts correctly to higher levels.
        # I wonder if this should be placed after group definition or not
        logg.info('    collecting marginals')
        group_marginals = np.zeros(g.num_vertices() + 1)

        def _collect_marginals(s):
            group_marginals[s.get_nonempty_B()] += 1

        gt.mcmc_equilibrate(state, wait=wait, nbreaks=nbreaks, epsilon=epsilon,
                            max_niter=max_iterations, multiflip=False,
                            force_niter=niter_collect, mcmc_args=dict(niter=10),
                            callback=_collect_marginals)
        logg.info('    done', time=start)

    # everything is in place, we need to fill all slots
    # first build a categorical series with the group labels
    groups = pd.Series(state.get_blocks().get_array()).astype('category')
    new_cat_names = dict([(cx, u'%s' % cn) for cn, cx in enumerate(groups.cat.categories)])
    groups.cat.rename_categories(new_cat_names, inplace=True)

    if restrict_to is not None:
        groups.index = adata.obs[restrict_key].index
    else:
        groups.index = adata.obs_names

    # add the cluster labels
    adata.obs.loc[:, key_added] = groups

    # add some unstructured info
    adata.uns['sbm'] = {}
    adata.uns['sbm']['stats'] = dict(
        dS=dS,
        nattempts=nattempts,
        nmoves=nmoves,
        modularity=gt.modularity(g, state.get_blocks())
    )
    adata.uns['sbm']['state'] = state

    # now add marginal probabilities.
    if collect_marginals:
        # group marginals: histogram of the number of occupied groups
        # visited during the collection sweeps
        adata.uns['sbm']['group_marginals'] = group_marginals

    # calculate log-likelihood of cell moves over the remaining levels
    adata.uns['sbm']['cell_affinity'] = {'1': get_cell_loglikelihood(state, as_prob=True)}

    # last step is recording some parameters used in this analysis
    adata.uns['sbm']['params'] = dict(
        epsilon=epsilon,
        wait=wait,
        nbreaks=nbreaks,
        equilibrate=equilibrate,
        fast_model=fast_model,
        collect_marginals=collect_marginals,
        random_seed=random_seed
    )

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {state.get_nonempty_B()} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
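# Hedged usage sketch (not part of the original module). flat_model() raises
# DeprecationWarning by design, so calling it only surfaces the message that
# points users to planted_model; the keyword names below come from the
# signature above.
def _example_flat_model(adata: AnnData):
    try:
        flat_model(adata, n_init=3, random_seed=42, key_added='sbm')
    except DeprecationWarning as e:
        print(e)  # suggests using planted_model instead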