Example 1
def rank_genes_groups(
    adata: AnnData,
    groupby: str,
    use_raw: bool = True,
    groups: Union[str, Iterable[str]] = 'all',
    reference: str = 'rest',
    n_genes: int = 100,
    rankby_abs: bool = False,
    key_added: Optional[str] = None,
    copy: bool = False,
    method: str = 't-test_overestim_var',
    corr_method: str = 'benjamini-hochberg',
    layer: Optional[str] = None,
    **kwds,
):
    """Rank genes for characterizing groups.

    Parameters
    ----------
    adata
        Annotated data matrix.
    groupby
        The key of the observations grouping to consider.
    use_raw
        Use `raw` attribute of `adata` if present.
    layer
        Key from `adata.layers` whose value will be used to perform tests on.
    groups
        Subset of groups, e.g. [`'g1'`, `'g2'`, `'g3'`], to which comparison
        shall be restricted, or `'all'` (default), for all groups.
    reference
        If `'rest'`, compare each group to the union of the rest of the groups.
        If a group identifier, compare with respect to this group.
    n_genes
        The number of genes that appear in the returned tables.
    method: {`'logreg'`, `'t-test'`, `'wilcoxon'`, `'t-test_overestim_var'`}
        The default 't-test_overestim_var' overestimates variance of each group,
        `'t-test'` uses t-test, `'wilcoxon'` uses Wilcoxon rank-sum,
        `'logreg'` uses logistic regression. See [Ntranos18]_,
        `here <https://github.com/theislab/scanpy/issues/95>`__ and `here
        <http://www.nxn.se/valent/2018/3/5/actionable-scrna-seq-clusters>`__,
        for why this is meaningful.
    corr_method: {`'benjamini-hochberg'`, `'bonferroni'`}
        p-value correction method.
        Used only for `'t-test'`, `'t-test_overestim_var'`, and `'wilcoxon'`.
    rankby_abs
        Rank genes by the absolute value of the score, not by the
        score. The returned scores are never the absolute values.
    key_added
        The key in `adata.uns` under which the information is saved.
    **kwds
        Are passed to test methods. Currently this affects only parameters that
        are passed to `sklearn.linear_model.LogisticRegression
        <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__.
        For instance, you can pass `penalty='l1'` to try to come up with a
        minimal set of genes that are good predictors (sparse solution meaning
        few non-zero fitted coefficients).

    Returns
    -------
    **names** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the gene
        names. Ordered according to scores.
    **scores** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the z-score
        underlying the computation of a p-value for each gene for each
        group. Ordered according to scores.
    **logfoldchanges** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the log2
        fold change for each gene for each group. Ordered according to
        scores. Only provided if method is 't-test' like.
        Note: this is an approximation calculated from mean-log values.
    **pvals** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        p-values.
    **pvals_adj** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Corrected p-values.

    Notes
    -----
    There are slight inconsistencies depending on whether sparse
    or dense data are passed. See `here <https://github.com/theislab/scanpy/blob/master/scanpy/tests/test_rank_genes_groups.py>`__.

    Examples
    --------
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon')

    >>> # to visualize the results
    >>> sc.pl.rank_genes_groups(adata)
    """
    if 'only_positive' in kwds:
        rankby_abs = not kwds.pop('only_positive')  # backwards compat

    start = logg.info('ranking genes')
    avail_methods = {'t-test', 't-test_overestim_var', 'wilcoxon', 'logreg'}
    if method not in avail_methods:
        raise ValueError('Method must be one of {}.'.format(avail_methods))

    avail_corr = {'benjamini-hochberg', 'bonferroni'}
    if corr_method not in avail_corr:
        raise ValueError(
            'Correction method must be one of {}.'.format(avail_corr))

    adata = adata.copy() if copy else adata
    utils.sanitize_anndata(adata)
    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if (reference != 'rest'
            and reference not in set(adata.obs[groupby].cat.categories)):
        cats = adata.obs[groupby].cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.')

    groups_order, groups_masks = utils.select_groups(adata, groups_order,
                                                     groupby)

    if key_added is None:
        key_added = 'rank_genes_groups'
    adata.uns[key_added] = {}
    adata.uns[key_added]['params'] = {
        'groupby': groupby,
        'reference': reference,
        'method': method,
        'use_raw': use_raw,
        'layer': layer,
        'corr_method': corr_method,
    }

    # adata_comp mocks an AnnData object if use_raw is True
    # otherwise it's just the AnnData object
    adata_comp = adata
    if layer is not None:
        if use_raw:
            raise ValueError("Cannot specify `layer` and have `use_raw=True`.")
        X = adata_comp.layers[layer]
    else:
        if use_raw and adata.raw is not None:
            adata_comp = adata.raw
        X = adata_comp.X

    # for clarity, rename variable
    n_genes_user = n_genes
    # make sure indices are not OoB in case there are fewer genes than n_genes
    if n_genes_user > X.shape[1]:
        n_genes_user = X.shape[1]
    # in the following, n_genes is simply another name for the total number of genes
    n_genes = X.shape[1]

    n_groups = groups_masks.shape[0]
    ns = np.zeros(n_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        ns[imask] = np.where(mask)[0].size
    logg.debug(f'consider {groupby!r} groups:')
    logg.debug(f'with sizes: {ns}')
    if reference != 'rest':
        ireference = np.where(groups_order == reference)[0][0]
    reference_indices = np.arange(adata_comp.n_vars, dtype=int)

    rankings_gene_scores = []
    rankings_gene_names = []
    rankings_gene_logfoldchanges = []
    rankings_gene_pvals = []
    rankings_gene_pvals_adj = []

    if method in {'t-test', 't-test_overestim_var'}:
        from scipy import stats
        from statsmodels.stats.multitest import multipletests
        # loop over all masks and compute means, variances and sample numbers
        means = np.zeros((n_groups, n_genes))
        vars = np.zeros((n_groups, n_genes))

        for imask, mask in enumerate(groups_masks):
            means[imask], vars[imask] = _get_mean_var(X[mask])

        # test each group either against the union of all other groups
        # or against a specific group
        for igroup in range(n_groups):
            if reference == 'rest':
                mask_rest = ~groups_masks[igroup]
            else:
                if igroup == ireference: continue
                else: mask_rest = groups_masks[ireference]
            mean_group, var_group = means[igroup], vars[igroup]
            mean_rest, var_rest = _get_mean_var(X[mask_rest])

            ns_group = ns[igroup]  # number of observations in group
            if method == 't-test':
                ns_rest = np.where(mask_rest)[0].size
            elif method == 't-test_overestim_var':
                # hack for overestimating the variance for small groups
                ns_rest = ns[igroup]
            else:
                raise ValueError('Method does not exist.')

            # TODO: Come up with better solution. Mask unexpressed genes?
            # See https://github.com/scipy/scipy/issues/10269
            with np.errstate(invalid="ignore"):
                scores, pvals = stats.ttest_ind_from_stats(
                    mean1=mean_group,
                    std1=np.sqrt(var_group),
                    nobs1=ns_group,
                    mean2=mean_rest,
                    std2=np.sqrt(var_rest),
                    nobs2=ns_rest,
                    equal_var=False  # Welch's
                )

            # fold change; add a small value to avoid division by zero
            foldchanges = (np.expm1(mean_group) + 1e-9) / (np.expm1(mean_rest) + 1e-9)

            # presumably only NaN when the means are equal and the variances are 0
            scores[np.isnan(scores)] = 0
            # NaN p-values also have to be set to 1 for Benjamini-Hochberg
            pvals[np.isnan(pvals)] = 1

            if corr_method == 'benjamini-hochberg':
                _, pvals_adj, _, _ = multipletests(pvals,
                                                   alpha=0.05,
                                                   method='fdr_bh')
            elif corr_method == 'bonferroni':
                pvals_adj = np.minimum(pvals * n_genes, 1.0)

            scores_sort = np.abs(scores) if rankby_abs else scores
            partition = np.argpartition(scores_sort,
                                        -n_genes_user)[-n_genes_user:]
            partial_indices = np.argsort(scores_sort[partition])[::-1]
            global_indices = reference_indices[partition][partial_indices]
            rankings_gene_scores.append(scores[global_indices])
            rankings_gene_logfoldchanges.append(
                np.log2(foldchanges[global_indices]))
            rankings_gene_names.append(adata_comp.var_names[global_indices])
            rankings_gene_pvals.append(pvals[global_indices])
            rankings_gene_pvals_adj.append(pvals_adj[global_indices])

    elif method == 'logreg':
        # if reference is not set, then the groups listed will be compared to the rest
        # if reference is set, then the groups listed will be compared only to the other groups listed
        from sklearn.linear_model import LogisticRegression
        reference = groups_order[0]
        if len(groups) == 1:
            raise Exception(
                'Cannot perform logistic regression on a single cluster.')

        grouping_mask = adata.obs[groupby].isin(groups_order)
        grouping = adata.obs.loc[grouping_mask, groupby]
        # indexing with a pandas Series causes issues (possibly a segfault),
        # so index with the underlying numpy array instead
        X = X[grouping_mask.values, :]

        clf = LogisticRegression(**kwds)
        clf.fit(X, grouping.cat.codes)
        scores_all = clf.coef_
        for igroup, group in enumerate(groups_order):
            if len(groups_order) <= 2:  # binary logistic regression
                scores = scores_all[0]
            else:
                scores = scores_all[igroup]
            partition = np.argpartition(scores, -n_genes_user)[-n_genes_user:]
            partial_indices = np.argsort(scores[partition])[::-1]
            global_indices = reference_indices[partition][partial_indices]
            rankings_gene_scores.append(scores[global_indices])
            rankings_gene_names.append(adata_comp.var_names[global_indices])
            if len(groups_order) <= 2:
                break

    elif method == 'wilcoxon':
        from scipy import stats
        from statsmodels.stats.multitest import multipletests
        CONST_MAX_SIZE = 10000000
        means = np.zeros((n_groups, n_genes))
        vars = np.zeros((n_groups, n_genes))
        # initialize space for z-scores
        scores = np.zeros(n_genes)
        # First loop: Loop over all genes
        if reference != 'rest':
            for imask, mask in enumerate(groups_masks):
                means[imask], vars[imask] = _get_mean_var(
                    X[mask])  # for fold-change only

                if imask == ireference: continue

                else: mask_rest = groups_masks[ireference]
                ns_rest = np.where(mask_rest)[0].size
                mean_rest, var_rest = _get_mean_var(
                    X[mask_rest])  # for fold-change only

                if ns_rest <= 25 or ns[imask] <= 25:
                    logg.hint(
                        'Few observations in a group for '
                        'normal approximation (<=25). Lower test accuracy.')
                n_active = ns[imask]
                m_active = ns_rest

                # Now calculate gene expression ranking in chunks:
                chunk = []
                # Calculate chunk frames
                n_genes_max_chunk = floor(CONST_MAX_SIZE /
                                          (n_active + m_active))
                if n_genes_max_chunk < n_genes:
                    chunk_index = n_genes_max_chunk
                    while chunk_index < n_genes:
                        chunk.append(chunk_index)
                        chunk_index = chunk_index + n_genes_max_chunk
                    chunk.append(n_genes)
                else:
                    chunk.append(n_genes)

                left = 0
                # Calculate rank sums for each chunk for the current mask
                for chunk_index, right in enumerate(chunk):
                    # Check if issparse is true: AnnData objects are currently sparse.csr or ndarray.
                    if issparse(X):
                        df1 = pd.DataFrame(data=X[mask, left:right].todense())
                        df2 = pd.DataFrame(
                            data=X[mask_rest, left:right].todense(),
                            index=np.arange(start=n_active,
                                            stop=n_active + m_active))
                    else:
                        df1 = pd.DataFrame(data=X[mask, left:right])
                        df2 = pd.DataFrame(data=X[mask_rest, left:right],
                                           index=np.arange(start=n_active,
                                                           stop=n_active +
                                                           m_active))
                    df1 = pd.concat([df1, df2])  # DataFrame.append is deprecated
                    ranks = df1.rank()
                    # sum up adjusted_ranks to calculate W_m,n
                    scores[left:right] = np.sum(ranks.loc[0:n_active, :])
                    left = right

                scores = (scores - (n_active *
                                    (n_active + m_active + 1) / 2)) / sqrt(
                                        (n_active * m_active *
                                         (n_active + m_active + 1) / 12))
                scores[np.isnan(scores)] = 0
                pvals = 2 * stats.distributions.norm.sf(np.abs(scores))

                if corr_method == 'benjamini-hochberg':
                    # set NaN values to 1 so Benjamini-Hochberg handles them properly
                    pvals[np.isnan(pvals)] = 1
                    _, pvals_adj, _, _ = multipletests(pvals,
                                                       alpha=0.05,
                                                       method='fdr_bh')
                elif corr_method == 'bonferroni':
                    pvals_adj = np.minimum(pvals * n_genes, 1.0)

                # Fold change
                foldchanges = (np.expm1(means[imask]) + 1e-9) / (
                    np.expm1(mean_rest) + 1e-9
                )  # add small value to remove 0's
                scores_sort = np.abs(scores) if rankby_abs else scores
                partition = np.argpartition(scores_sort,
                                            -n_genes_user)[-n_genes_user:]
                partial_indices = np.argsort(scores_sort[partition])[::-1]
                global_indices = reference_indices[partition][partial_indices]
                rankings_gene_scores.append(scores[global_indices])
                rankings_gene_names.append(
                    adata_comp.var_names[global_indices])
                rankings_gene_logfoldchanges.append(
                    np.log2(foldchanges[global_indices]))
                rankings_gene_pvals.append(pvals[global_indices])
                rankings_gene_pvals_adj.append(pvals_adj[global_indices])

        # If no reference group exists, ranking needs only to be done once (full mask)
        else:
            scores = np.zeros((n_groups, n_genes))
            chunk = []
            n_cells = X.shape[0]
            n_genes_max_chunk = floor(CONST_MAX_SIZE / n_cells)
            if n_genes_max_chunk < n_genes:
                chunk_index = n_genes_max_chunk
                while chunk_index < n_genes:
                    chunk.append(chunk_index)
                    chunk_index = chunk_index + n_genes_max_chunk
                chunk.append(n_genes)
            else:
                chunk.append(n_genes)
            left = 0
            for chunk_index, right in enumerate(chunk):
                # Check if issparse is true
                if issparse(X):
                    df1 = pd.DataFrame(data=X[:, left:right].todense())
                else:
                    df1 = pd.DataFrame(data=X[:, left:right])
                ranks = df1.rank()
                # sum up adjusted_ranks to calculate W_m,n
                for imask, mask in enumerate(groups_masks):
                    scores[imask, left:right] = np.sum(ranks.loc[mask, :])
                left = right

            for imask, mask in enumerate(groups_masks):
                mask_rest = ~groups_masks[imask]
                means[imask], vars[imask] = _get_mean_var(
                    X[mask])  #for fold-change
                mean_rest, var_rest = _get_mean_var(
                    X[mask_rest])  # for fold-change

                scores[imask, :] = (scores[imask, :] -
                                    (ns[imask] * (n_cells + 1) / 2)) / sqrt(
                                        (ns[imask] * (n_cells - ns[imask]) *
                                         (n_cells + 1) / 12))
                scores[np.isnan(scores)] = 0
                pvals = 2 * stats.distributions.norm.sf(
                    np.abs(scores[imask, :]))

                if corr_method == 'benjamini-hochberg':
                    # set NaN values to 1 so Benjamini-Hochberg handles them properly
                    pvals[np.isnan(pvals)] = 1
                    _, pvals_adj, _, _ = multipletests(pvals,
                                                       alpha=0.05,
                                                       method='fdr_bh')
                elif corr_method == 'bonferroni':
                    pvals_adj = np.minimum(pvals * n_genes, 1.0)

                # Fold change
                foldchanges = (np.expm1(means[imask]) + 1e-9) / (
                    np.expm1(mean_rest) + 1e-9
                )  # add small value to remove 0's
                scores_sort = np.abs(scores) if rankby_abs else scores
                partition = np.argpartition(scores_sort[imask, :],
                                            -n_genes_user)[-n_genes_user:]
                partial_indices = np.argsort(scores_sort[imask,
                                                         partition])[::-1]
                global_indices = reference_indices[partition][partial_indices]
                rankings_gene_scores.append(scores[imask, global_indices])
                rankings_gene_names.append(
                    adata_comp.var_names[global_indices])
                rankings_gene_logfoldchanges.append(
                    np.log2(foldchanges[global_indices]))
                rankings_gene_pvals.append(pvals[global_indices])
                rankings_gene_pvals_adj.append(pvals_adj[global_indices])

    groups_order_save = [str(g) for g in groups_order]
    if (reference != 'rest' and method != 'logreg') or (method == 'logreg'
                                                        and len(groups) == 2):
        groups_order_save = [g for g in groups_order if g != reference]
    adata.uns[key_added]['scores'] = np.rec.fromarrays(
        [n for n in rankings_gene_scores],
        dtype=[(rn, 'float32') for rn in groups_order_save])
    adata.uns[key_added]['names'] = np.rec.fromarrays(
        [n for n in rankings_gene_names],
        dtype=[(rn, 'U50') for rn in groups_order_save])

    if method in {'t-test', 't-test_overestim_var', 'wilcoxon'}:
        adata.uns[key_added]['logfoldchanges'] = np.rec.fromarrays(
            [n for n in rankings_gene_logfoldchanges],
            dtype=[(rn, 'float32') for rn in groups_order_save])
        adata.uns[key_added]['pvals'] = np.rec.fromarrays(
            [n for n in rankings_gene_pvals],
            dtype=[(rn, 'float64') for rn in groups_order_save])
        adata.uns[key_added]['pvals_adj'] = np.rec.fromarrays(
            [n for n in rankings_gene_pvals_adj],
            dtype=[(rn, 'float64') for rn in groups_order_save])
    logg.info(
        '    finished',
        time=start,
        deep=
        (f'added to `.uns[{key_added!r}]`\n'
         "    'names', sorted np.recarray to be indexed by group ids\n"
         "    'scores', sorted np.recarray to be indexed by group ids\n" +
         ("    'logfoldchanges', sorted np.recarray to be indexed by group ids\n"
          "    'pvals', sorted np.recarray to be indexed by group ids\n"
          "    'pvals_adj', sorted np.recarray to be indexed by group ids" if
          method in {'t-test', 't-test_overestim_var', 'wilcoxon'} else '')),
    )
    return adata if copy else None
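
# Usage sketch (an illustration, not part of the function above): assumes
# scanpy is installed and importable as `sc`; `pbmc68k_reduced` is scanpy's
# bundled demo dataset, as in the docstring example. Results land in
# `adata.uns['rank_genes_groups']` as structured arrays indexed by group id.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon', n_genes=20)
res = adata.uns['rank_genes_groups']
for group in res['names'].dtype.names:
    print(group, list(res['names'][group][:5]))  # top 5 marker genes per group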
Example 2
def recipe_zheng17(
    adata: AnnData,
    n_top_genes: int = 1000,
    log: bool = True,
    plot: bool = False,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Normalization and filtering as of [Zheng17]_.

    Reproduces the preprocessing of [Zheng17]_ – the Cell Ranger R Kit of 10x
    Genomics.

    Expects non-logarithmized data.
    If using logarithmized data, pass `log=False`.

    The recipe runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, min_counts=1)         # only consider genes with at least 1 count
        sc.pp.normalize_per_cell(                       # normalize with total UMI count per cell
             adata, key_n_counts='n_counts_all'
        )
        filter_result = sc.pp.filter_genes_dispersion(  # select highly-variable genes
            adata.X, flavor='cell_ranger', n_top_genes=n_top_genes, log=False
        )
        adata = adata[:, filter_result.gene_subset]     # subset the genes
        sc.pp.normalize_per_cell(adata)                 # renormalize after filtering
        if log: sc.pp.log1p(adata)                      # log transform: adata.X = log(adata.X + 1)
        sc.pp.scale(adata)                              # scale to unit variance and shift to zero mean


    Parameters
    ----------
    adata
        Annotated data matrix.
    n_top_genes
        Number of genes to keep.
    log
        Take logarithm.
    plot
        Show a plot of the gene dispersion vs. mean relation.
    copy
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    start = logg.info('running recipe zheng17')
    if copy: adata = adata.copy()
    # only consider genes with at least 1 count
    pp.filter_genes(adata, min_counts=1)
    # normalize with total UMI count per cell
    normalize_total(adata, key_added='n_counts_all')
    filter_result = filter_genes_dispersion(
        adata.X, flavor='cell_ranger', n_top_genes=n_top_genes, log=False
    )
    if plot:  # should not import at the top of the file
        from ..plotting import _preprocessing as ppp
        ppp.filter_genes_dispersion(filter_result, log=True)
    # actually filter the genes, the following is the inplace version of
    #     adata = adata[:, filter_result.gene_subset]
    adata._inplace_subset_var(filter_result.gene_subset)  # filter genes
    normalize_total(adata)  # renormalize after filtering
    if log: pp.log1p(adata)  # log transform: X = log(X + 1)
    pp.scale(adata)
    logg.info('    finished', time=start)
    return adata if copy else None
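
# Usage sketch (assumes scanpy importable as `sc`; `pbmc3k` provides the raw,
# non-logarithmized UMI counts the recipe expects).
import scanpy as sc

adata = sc.datasets.pbmc3k()
sc.pp.recipe_zheng17(adata, n_top_genes=1000)
print(adata)  # X is now scaled; only the 1000 most variable genes remain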
Example 3
def regress_out(
    adata: AnnData,
    keys: Union[str, Sequence[str]],
    n_jobs: Optional[int] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Regress out (mostly) unwanted sources of variation.

    Uses simple linear regression. This is inspired by Seurat's `regressOut`
    function in R [Satija15]_. Note that this function tends to overcorrect
    in certain circumstances as described in :issue:`526`.

    Parameters
    ----------
    adata
        The annotated data matrix.
    keys
        Keys for observation annotation on which to regress.
    n_jobs
        Number of jobs for parallel computation.
        `None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`.
    copy
        Determines whether a copy of `adata` is returned.

    Returns
    -------
    Depending on `copy` returns or updates `adata` with the corrected data matrix.
    """
    start = logg.info(f'regressing out {keys}')
    if issparse(adata.X):
        logg.info('    sparse input is densified and may '
                  'lead to high memory use')
    adata = adata.copy() if copy else adata

    sanitize_anndata(adata)

    if isinstance(keys, str):
        keys = [keys]

    if issparse(adata.X):
        adata.X = adata.X.toarray()

    n_jobs = sett.n_jobs if n_jobs is None else n_jobs

    # regress on a single categorical variable
    variable_is_categorical = False
    if keys[0] in adata.obs_keys() and is_categorical_dtype(
            adata.obs[keys[0]]):
        if len(keys) > 1:
            raise ValueError('If providing categorical variable, '
                             'only a single one is allowed. For this one '
                             'we regress on the mean for each category.')
        logg.debug('... regressing on per-gene means within categories')
        regressors = np.zeros(adata.X.shape, dtype='float32')
        for category in adata.obs[keys[0]].cat.categories:
            mask = (category == adata.obs[keys[0]]).values
            for ix, x in enumerate(adata.X.T):
                regressors[mask, ix] = x[mask].mean()
        variable_is_categorical = True
    # regress on one or several ordinal variables
    else:
        # create data frame with selected keys (if given)
        if keys:
            regressors = adata.obs[keys]
        else:
            regressors = adata.obs.copy()

        # add column of ones at index 0 (first column)
        regressors.insert(0, 'ones', 1.0)

    len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int)
    n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int)

    tasks = []
    # split the adata.X matrix by columns in chunks of size n_chunk
    # (the last chunk could be of smaller size than the others)
    chunk_list = np.array_split(adata.X, n_chunks, axis=1)
    if variable_is_categorical:
        regressors_chunk = np.array_split(regressors, n_chunks, axis=1)
    for idx, data_chunk in enumerate(chunk_list):
        # each task is a tuple of a data_chunk eg. (adata.X[:,0:100]) and
        # the regressors. This data will be passed to each of the jobs.
        if variable_is_categorical:
            regres = regressors_chunk[idx]
        else:
            regres = regressors
        tasks.append(tuple((data_chunk, regres, variable_is_categorical)))

    if n_jobs > 1 and n_chunks > 1:
        import multiprocessing
        pool = multiprocessing.Pool(n_jobs)
        res = pool.map_async(_regress_out_chunk, tasks).get(9999999)
        pool.close()

    else:
        res = list(map(_regress_out_chunk, tasks))

    # res is a list of vectors (each corresponding to a regressed gene column).
    # The transpose is needed to get the matrix in the shape needed
    adata.X = np.vstack(res).T.astype(adata.X.dtype)
    logg.info('    finished', time=start)
    return adata if copy else None
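
# Usage sketch (assumes scanpy importable as `sc`): regress out a continuous
# covariate such as the total count per cell. The covariate must already be
# a column of `adata.obs`.
import numpy as np
import scanpy as sc

adata = sc.datasets.pbmc3k()
adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.regress_out(adata, ['n_counts'], n_jobs=4)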
Example 4
def filter_genes_dispersion(
    data: AnnData,
    flavor: Literal['seurat', 'cell_ranger'] = 'seurat',
    min_disp: Optional[float] = None,
    max_disp: Optional[float] = None,
    min_mean: Optional[float] = None,
    max_mean: Optional[float] = None,
    n_bins: int = 20,
    n_top_genes: Optional[int] = None,
    log: bool = True,
    subset: bool = True,
    copy: bool = False,
):
    """\
    Extract highly variable genes [Satija15]_ [Zheng17]_.

    .. warning::
        .. deprecated:: 1.3.6
            Use :func:`~scanpy.pp.highly_variable_genes`
            instead. The new function is equivalent to the present
            function, except that

            * the new function always expects logarithmized data
            * with `subset=False` in the new function, it suffices to merely
              annotate the genes; tools like `pp.pca` will detect the
              annotation
            * you can now call: `sc.pl.highly_variable_genes(adata)`
            * `copy` is replaced by `inplace`

    If trying out parameters, pass the data matrix instead of AnnData.

    Depending on `flavor`, this reproduces the R-implementations of Seurat
    [Satija15]_ and Cell Ranger [Zheng17]_.

    The normalized dispersion is obtained by scaling with the mean and standard
    deviation of the dispersions for genes falling into a given bin for mean
    expression of genes. This means that for each bin of mean expression, highly
    variable genes are selected.

    Use `flavor='cell_ranger'` with care and in the same way as in
    :func:`~scanpy.pp.recipe_zheng17`.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    flavor
        Choose the flavor for computing normalized dispersion. If choosing
        'seurat', this expects non-logarithmized data – the logarithm of mean
        and dispersion is taken internally when `log` is at its default value
        `True`. For 'cell_ranger', this is usually called for logarithmized data
        – in this case you should set `log` to `False`. In their default
        workflows, Seurat passes the cutoffs whereas Cell Ranger passes
        `n_top_genes`.
    min_mean
    max_mean
    min_disp
    max_disp
        If `n_top_genes` is not `None`, these cutoffs for the means and the
        normalized dispersions are ignored.
    n_bins
        Number of bins for binning the mean gene expression. Normalization is
        done with respect to each bin. If just a single gene falls into a bin,
        the normalized dispersion is artificially set to 1. You'll be informed
        about this if you set `settings.verbosity = 4`.
    n_top_genes
        Number of highly-variable genes to keep.
    log
        Use the logarithm of the mean to variance ratio.
    subset
        Keep highly-variable genes only (if True) else write a bool array
        for highly-variable genes while keeping all genes.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    If an AnnData `adata` is passed, returns or updates `adata` depending on
    `copy`. It filters the `adata` and adds the annotations

    **means** : adata.var
        Means per gene. Logarithmized when `log` is `True`.
    **dispersions** : adata.var
        Dispersions per gene. Logarithmized when `log` is `True`.
    **dispersions_norm** : adata.var
        Normalized dispersions per gene. Logarithmized when `log` is `True`.

    If a data matrix `X` is passed, the annotation is returned as `np.recarray`
    with the same information stored in fields: `gene_subset`, `means`,
    `dispersions`, `dispersions_norm`.
    """
    if (n_top_genes is not None
            and not all(x is None
                        for x in [min_disp, max_disp, min_mean, max_mean])):
        logg.info('If you pass `n_top_genes`, all cutoffs are ignored.')
    if min_disp is None: min_disp = 0.5
    if min_mean is None: min_mean = 0.0125
    if max_mean is None: max_mean = 3
    if isinstance(data, AnnData):
        adata = data.copy() if copy else data
        result = filter_genes_dispersion(
            adata.X,
            log=log,
            min_disp=min_disp,
            max_disp=max_disp,
            min_mean=min_mean,
            max_mean=max_mean,
            n_top_genes=n_top_genes,
            flavor=flavor,
        )
        adata.var['means'] = result['means']
        adata.var['dispersions'] = result['dispersions']
        adata.var['dispersions_norm'] = result['dispersions_norm']
        if subset:
            adata._inplace_subset_var(result['gene_subset'])
        else:
            adata.var['highly_variable'] = result['gene_subset']
        return adata if copy else None
    start = logg.info('extracting highly variable genes')
    X = data  # no copy necessary, X remains unchanged in the following
    mean, var = materialize_as_ndarray(_get_mean_var(X))
    # now actually compute the dispersion
    mean[mean == 0] = 1e-12  # set entries equal to zero to small value
    dispersion = var / mean
    if log:  # logarithmized mean as in Seurat
        dispersion[dispersion == 0] = np.nan
        dispersion = np.log(dispersion)
        mean = np.log1p(mean)
    # all of the following quantities are "per-gene" here
    df = pd.DataFrame()
    df['mean'] = mean
    df['dispersion'] = dispersion
    if flavor == 'seurat':
        df['mean_bin'] = pd.cut(df['mean'], bins=n_bins)
        disp_grouped = df.groupby('mean_bin')['dispersion']
        disp_mean_bin = disp_grouped.mean()
        disp_std_bin = disp_grouped.std(ddof=1)
        # retrieve genes whose std is NaN: these are the bins containing only
        # a single gene, for which the normalized dispersion is implicitly
        # set to 1
        one_gene_per_bin = disp_std_bin.isnull()
        gen_indices = np.where(
            one_gene_per_bin[df['mean_bin'].values])[0].tolist()
        if len(gen_indices) > 0:
            logg.debug(
                f'Gene indices {gen_indices} fell into a single bin: their '
                'normalized dispersion was set to 1.\n    '
                'Decreasing `n_bins` will likely avoid this effect.')
        # Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32,
        # but there's still a dtype error without `.values`.
        disp_std_bin[one_gene_per_bin] = disp_mean_bin[
            one_gene_per_bin.values].values
        disp_mean_bin[one_gene_per_bin] = 0
        # actually do the normalization
        df['dispersion_norm'] = (
            df['dispersion'].values  # use values here as index differs
            - disp_mean_bin[df['mean_bin'].values].values
        ) / disp_std_bin[df['mean_bin'].values].values
    elif flavor == 'cell_ranger':
        from statsmodels import robust
        df['mean_bin'] = pd.cut(
            df['mean'], np.r_[-np.inf,
                              np.percentile(df['mean'], np.arange(10, 105, 5)),
                              np.inf])
        disp_grouped = df.groupby('mean_bin')['dispersion']
        disp_median_bin = disp_grouped.median()
        # the next line raises the warning: "Mean of empty slice"
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            disp_mad_bin = disp_grouped.apply(robust.mad)
        df['dispersion_norm'] = np.abs(
            df['dispersion'].values -
            disp_median_bin[df['mean_bin'].values].values) / disp_mad_bin[
                df['mean_bin'].values].values
    else:
        raise ValueError('`flavor` needs to be "seurat" or "cell_ranger"')
    dispersion_norm = df['dispersion_norm'].values.astype('float32')
    if n_top_genes is not None:
        dispersion_norm = dispersion_norm[~np.isnan(dispersion_norm)]
        # sort descending in place (by sorting the reversed view);
        # interestingly, np.argpartition is slightly slower
        dispersion_norm[::-1].sort()
        disp_cut_off = dispersion_norm[n_top_genes - 1]
        gene_subset = df['dispersion_norm'].values >= disp_cut_off
        logg.debug(f'the {n_top_genes} top genes correspond to a '
                   f'normalized dispersion cutoff of {disp_cut_off}')
    else:
        max_disp = np.inf if max_disp is None else max_disp
        dispersion_norm[np.isnan(dispersion_norm)] = 0  # similar to Seurat
        gene_subset = np.logical_and.reduce((
            mean > min_mean,
            mean < max_mean,
            dispersion_norm > min_disp,
            dispersion_norm < max_disp,
        ))
    logg.info('    finished', time=start)
    return np.rec.fromarrays((
        gene_subset,
        df['mean'].values,
        df['dispersion'].values,
        df['dispersion_norm'].values.astype('float32', copy=False),
    ),
                             dtype=[
                                 ('gene_subset', bool),
                                 ('means', 'float32'),
                                 ('dispersions', 'float32'),
                                 ('dispersions_norm', 'float32'),
                             ])
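
# Usage sketch of the matrix-input path described in the docstring, mirroring
# `recipe_zheng17` (assumes `adata` holds non-logarithmized data): passing
# `adata.X` returns the recarray, so parameters can be tried out without
# touching `adata`.
result = filter_genes_dispersion(adata.X, flavor='cell_ranger',
                                 n_top_genes=500, log=False)
print(result['gene_subset'].sum())              # number of genes that would be kept
adata_subset = adata[:, result['gene_subset']]  # apply the selection manually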
Example 5
def leiden(
    adata: AnnData,
    resolution: float = 1,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_state: Optional[Union[int, RandomState]] = 0,
    key_added: str = 'leiden',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = True,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    copy: bool = False,
    **partition_kwargs,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Traag18]_.

    Cluster cells using the Leiden algorithm [Traag18]_,
    an improved version of the Louvain algorithm [Blondel08]_.
    It has been proposed for single-cell analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        A parameter value controlling the coarseness of the clustering.
        Higher values lead to more clusters.
        Set to `None` if overriding `partition_type`
        to one that doesn’t accept a `resolution_parameter`.
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain `(obs_key, list_of_categories)`.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']`.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    n_iterations
        How many iterations of the Leiden clustering algorithm to perform.
        Positive values above 2 define the total number of iterations to perform,
        -1 has the algorithm run until it reaches its optimal clustering.
    partition_type
        Type of partition to use.
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`.
        For the available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    copy
        Whether to copy `adata` or modify it inplace.
    **partition_kwargs
        Any further arguments to pass to :func:`~leidenalg.find_partition`
        (which in turn passes arguments to the `partition_type`).

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['leiden']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    """
    try:
        import leidenalg
    except ImportError:
        raise ImportError(
            'Please install the leiden algorithm: `pip3 install leidenalg`.')
    partition_kwargs = dict(partition_kwargs)

    start = logg.info('running Leiden clustering')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if 'neighbors' not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = _utils.get_igraph_from_adjacency(adjacency, directed=directed)
    # flip to the default partition type if not overridden by the user
    if partition_type is None:
        partition_type = leidenalg.RBConfigurationVertexPartition
    # Prepare find_partition arguments as a dictionary,
    # appending to whatever the user provided. It needs to be this way
    # as this allows for the accounting of a None resolution
    # (in the case of a partition variant that doesn't take it on input)
    if use_weights:
        partition_kwargs['weights'] = np.array(g.es['weight']).astype(
            np.float64)
    partition_kwargs['n_iterations'] = n_iterations
    partition_kwargs['seed'] = random_state
    if resolution is not None:
        partition_kwargs['resolution_parameter'] = resolution
    # clustering proper
    part = leidenalg.find_partition(g, partition_type, **partition_kwargs)
    # store output into adata.obs
    groups = np.array(part.membership)
    if restrict_to is not None:
        if key_added == 'leiden':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(np.unique(groups).astype('U')),
    )
    # store information on the clustering parameters
    adata.uns['leiden'] = {}
    adata.uns['leiden']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
    )
    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
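
# Usage sketch (assumes scanpy importable as `sc`): a neighborhood graph must
# exist before clustering; here two resolutions are stored side by side for
# comparison.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.leiden(adata, resolution=0.5, key_added='leiden_r0.5')
sc.tl.leiden(adata, resolution=1.0, key_added='leiden_r1.0')
print(adata.obs[['leiden_r0.5', 'leiden_r1.0']].nunique())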
Example 6
def lineages(
    adata: AnnData,
    lineages: Optional[Union[str, Iterable[str]]] = None,
    final: bool = True,
    cluster_key: Optional[str] = None,
    mode: str = "embedding",
    time_key: str = "latent_time",
    cmap: Union[str, mpl.colors.ListedColormap] = cm.viridis,
    **kwargs,
) -> None:
    """
    Plot lineages that were uncovered using :func:`cellrank.tl.lineages`.

    For each lineage, we show all cells in an embedding (default is UMAP but can be any) and color them by their
    probability of belonging to this lineage. For cells that are already committed, this probability will be one for
    their respective lineage and zero otherwise. For naive cells, these probabilities will be more balanced, reflecting
    the fact that naive cells have the potential to develop towards multiple endpoints.

    .. image:: https://raw.githubusercontent.com/theislab/cellrank/master/resources/images/lineages.png
       :width: 400px
       :align: center

    Parameters
    ----------
    adata : :class:`anndata.AnnData`
        Annotated data object.
    lineages
        Only show these lineages. If `None`, plot all lineages.
    final
        Whether to consider cells going to final states or vice versa.
    cluster_key
        If given, plot cluster annotations left of the lineage probabilities.
    mode
        Can be either `'embedding'` or `'time'`.

        - If `'embedding'`, plots the embedding while coloring in the absorption probabilities.
        - If `'time'`, plots the pseudotime on x-axis and the absorption probabilities on y-axis.
    time_key
        Key from `adata.obs` to use as a pseudotime ordering of the cells.
    cmap
        Colormap to use.
    kwargs
        Keyword arguments for :func:`scvelo.pl.scatter`.

    Returns
    -------
    None
        Just plots the lineage probabilities.
    """

    adata_dummy = adata.copy()

    # create a dummy kernel object
    vk = VelocityKernel(adata_dummy, backward=not final)
    vk.transition_matrix = csr_matrix((adata_dummy.n_obs, adata_dummy.n_obs))

    # use this to initialize an MC object
    mc = MarkovChain(vk)

    # plot using the MC object
    mc.plot_lin_probs(
        lineages=lineages,
        cluster_key=cluster_key,
        mode=mode,
        time_key=time_key,
        cmap=cmap,
        **kwargs,
    )
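
# Usage sketch (assumptions: a CellRank version matching this function,
# importable as `cr`, with `cr.tl.lineages` available; `adata` and the
# lineage names below are hypothetical).
import cellrank as cr

cr.tl.lineages(adata)                                 # compute absorption probabilities
lineages(adata, ['Alpha', 'Beta'], cluster_key='clusters')  # embedding view
lineages(adata, mode='time', time_key='latent_time')  # pseudotime view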
Example 7
def score_genes(
    adata: AnnData,
    gene_list: Sequence[str],
    ctrl_size: int = 50,
    gene_pool: Optional[Sequence[str]] = None,
    n_bins: int = 25,
    score_name: str = 'score',
    random_state: AnyRandom = 0,
    copy: bool = False,
    use_raw: Optional[bool] = None,
) -> Optional[AnnData]:
    """\
    Score a set of genes [Satija15]_.

    The score is the average expression of a set of genes subtracted with the
    average expression of a reference set of genes. The reference set is
    randomly sampled from the `gene_pool` for each binned expression value.

    This reproduces the approach in Seurat [Satija15]_ and has been implemented
    for Scanpy by Davide Cittaro.

    Parameters
    ----------
    adata
        The annotated data matrix.
    gene_list
        The list of gene names used for score calculation.
    ctrl_size
        Number of reference genes to be sampled from each bin. If `len(gene_list)` is not too
        low, you can set `ctrl_size=len(gene_list)`.
    gene_pool
        Genes for sampling the reference set. Default is all genes.
    n_bins
        Number of expression level bins for sampling.
    score_name
        Name of the field to be added in `.obs`.
    random_state
        The random seed for sampling.
    copy
        Copy `adata` or modify it inplace.
    use_raw
        Whether to use `raw` attribute of `adata`. Defaults to `True` if `.raw` is present.

        .. versionchanged:: 1.4.5
           Default value changed from `False` to `None`.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with an additional field
    `score_name`.

    Examples
    --------
    See this `notebook <https://github.com/scverse/scanpy_usage/tree/master/180209_cell_cycle>`__.
    """
    start = logg.info(f'computing score {score_name!r}')
    adata = adata.copy() if copy else adata
    use_raw = _check_use_raw(adata, use_raw)

    if random_state is not None:
        np.random.seed(random_state)

    gene_list_in_var = []
    var_names = adata.raw.var_names if use_raw else adata.var_names
    genes_to_ignore = []
    for gene in gene_list:
        if gene in var_names:
            gene_list_in_var.append(gene)
        else:
            genes_to_ignore.append(gene)
    if len(genes_to_ignore) > 0:
        logg.warning(f'genes are not in var_names and ignored: {genes_to_ignore}')
    gene_list = set(gene_list_in_var[:])

    if len(gene_list) == 0:
        raise ValueError("No valid genes were passed for scoring.")

    if gene_pool is None:
        gene_pool = list(var_names)
    else:
        gene_pool = [x for x in gene_pool if x in var_names]
    if not gene_pool:
        raise ValueError("No valid genes were passed for reference set.")

    # Trying here to match the Seurat approach in scoring cells.
    # Basically we need to compare genes against random genes in a matched
    # interval of expression.

    _adata = adata.raw if use_raw else adata
    _adata_subset = (
        _adata[:, gene_pool] if len(gene_pool) < len(_adata.var_names) else _adata
    )
    if issparse(_adata_subset.X):
        obs_avg = pd.Series(
            np.array(_sparse_nanmean(_adata_subset.X, axis=0)).flatten(),
            index=gene_pool,
        )  # average expression of genes
    else:
        obs_avg = pd.Series(
            np.nanmean(_adata_subset.X, axis=0), index=gene_pool
        )  # average expression of genes

    # sometimes (and I don't know how) missing data may be there, as NaNs
    obs_avg = obs_avg[np.isfinite(obs_avg)]

    n_items = int(np.round(len(obs_avg) / (n_bins - 1)))
    obs_cut = obs_avg.rank(method='min') // n_items
    control_genes = set()

    # now pick `ctrl_size` genes from every cut
    for cut in np.unique(obs_cut.loc[gene_list]):
        r_genes = np.array(obs_cut[obs_cut == cut].index)
        np.random.shuffle(r_genes)
        # uses full r_genes if ctrl_size > len(r_genes)
        control_genes.update(set(r_genes[:ctrl_size]))

    # To index, we need a list – indexing implies an order.
    control_genes = list(control_genes - gene_list)
    gene_list = list(gene_list)

    X_list = _adata[:, gene_list].X
    if issparse(X_list):
        X_list = np.array(_sparse_nanmean(X_list, axis=1)).flatten()
    else:
        X_list = np.nanmean(X_list, axis=1, dtype='float64')

    X_control = _adata[:, control_genes].X
    if issparse(X_control):
        X_control = np.array(_sparse_nanmean(X_control, axis=1)).flatten()
    else:
        X_control = np.nanmean(X_control, axis=1, dtype='float64')

    score = X_list - X_control

    adata.obs[score_name] = pd.Series(
        np.array(score).ravel(), index=adata.obs_names, dtype='float64'
    )

    logg.info(
        '    finished',
        time=start,
        deep=(
            'added\n'
            f'    {score_name!r}, score of gene set (adata.obs).\n'
            f'    {len(control_genes)} total control genes are used.'
        ),
    )
    return adata if copy else None
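
# Usage sketch (assumes scanpy importable as `sc`; the three-gene B cell
# signature below is hypothetical and chosen only for illustration).
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.tl.score_genes(adata, ['CD79A', 'MS4A1', 'CD19'], score_name='b_cell_score')
print(adata.obs['b_cell_score'].describe())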
Example 8
def critical_transition(
    adata: AnnData,
    root_milestone,
    milestones,
    n_map=1,
    n_jobs=None,
    layer: Optional[str] = None,
    w=100,
    step=30,
    loess_span=0.4,
    gamma=1.5,
    n_points=200,
    copy: bool = False,
):
    """\
    Estimates local critical transition index along the trajectory.

    Based on the concept of pre-bifurcation structure from [Bargaje17]_.
    This study proposes the idea that a signature indicating the flattening
    of the quasi-potential landscape can be detected prior to bifurcation.

    To detect this signal, this function estimates local critical transition
    index along the trajectory, by calculating along a moving window of cell
    the following:

    .. math::
        \\frac{\\left\\langle \\left| R(g_i,g_j) \\right| \\right\\rangle}{\\left\\langle \\left| R(c_k,c_l) \\right| \\right\\rangle}

    Which is the ratio between the mean of the absolute gene by gene correlations
    and the mean of the absolute cell by cell correlations.

    Parameters
    ----------
    adata
        Annotated data matrix.
    root_milestone
        tip defining progenitor branch.
    milestones
        tips defining the progenies branches.
    n_map
        number of probabilistic cells projection to use for estimates.
    n_jobs
        number of cpu processes to perform estimates (per mapping).
    layer
        adata layer to use for estimates.
    w
        local window, in number of cells, to estimate correlations.
    step
        steps, in number of cells, between local windows.
    loess_span
        fraction of points to take into account for the loess fit.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns a copy of `adata`, else it updates `adata`
        in place, adding the following fields for a bifurcation:

        `.uns['root_milestone->milestoneA<>milestoneB']['critical transition']`
            containing local critical transition index per window of cells.
        `.obs['root_milestone->milestoneA<>milestoneB pre-fork CI lowess']`
            local critical transition index loess fitted onto cells prior to bifurcation.

    For a linear trajectory:

        `.uns['root_milestone->milestoneA']['critical transition']`
            containing local critical transition index per window of cells.
        `.obs['root_milestone->milestoneA CI lowess']`
            local critical transition index loess fitted onto cells along the path.

    """

    adata = adata.copy() if copy else adata

    logg.info("Calculating local critical transition index", reset=True)

    graph = adata.uns["graph"]

    edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple,
                                                              axis=1).values
    img = igraph.Graph()
    img.add_vertices(
        np.unique(graph["pp_seg"][["from",
                                   "to"]].values.flatten().astype(str)))
    img.add_edges(edges)

    uns_temp = adata.uns.copy()

    if "milestones_colors" in adata.uns:
        mlsc = adata.uns["milestones_colors"].copy()

    dct = graph["milestones"]
    keys = np.array(list(dct.keys()))
    vals = np.array(list(dct.values()))

    leaves = list(map(lambda leave: dct[leave], milestones))
    root = dct[root_milestone]

    name = root_milestone + "->" + "<>".join(milestones)

    def critical_map(m, gamma, loess_span):
        df = adata.uns["pseudotime_list"][str(m)]
        edges = graph["pp_seg"][["from",
                                 "to"]].astype(str).apply(tuple, axis=1).values
        img = igraph.Graph()
        img.add_vertices(
            np.unique(graph["pp_seg"][["from",
                                       "to"]].values.flatten().astype(str)))
        img.add_edges(edges)

        def critical_milestone(leave):
            cells = getpath(img, root, graph["tips"], leave, graph, df).index

            X = get_X(adata, cells, adata.var_names, layer)
            mat = pd.DataFrame(X, index=cells, columns=adata.var_names)

            mat = mat.iloc[adata.obs.t[mat.index].argsort().values, :]

            def slide_path(i):
                cls = mat.index[i:(i + w)]
                cor_gene = mat.loc[cls, :].corr(method="pearson").values
                cor_cell = mat.loc[cls, :].T.corr(method="pearson").values
                R_gene = np.nanmean(
                    np.abs(cor_gene[np.triu_indices(cor_gene.shape[0], k=1)]))
                R_cell = np.nanmean(
                    np.abs(cor_cell[np.triu_indices(cor_cell.shape[0], k=1)]))
                return [adata.obs.t[cls].mean(), R_gene / R_cell, cls]

            wins = np.arange(0, mat.shape[0] - w, step)

            stats = ProgressParallel(
                n_jobs=n_jobs,
                total=len(wins),
                use_tqdm=n_map == 1,
                file=sys.stdout,
                desc="    to " + str(keys[vals == leave][0]),
            )(delayed(slide_path)(i) for i in wins)

            cells_l = [s[2] for s in stats]
            stats = pd.DataFrame([[s[0], s[1]] for s in stats],
                                 columns=("t", "ci"))

            l = loess(stats.t, stats.ci, span=loess_span)
            l.fit()
            pred = l.predict(stats.t, stderror=True)
            conf = pred.confidence()

            stats["lowess"] = pred.values
            stats["ll"] = conf.lower
            stats["ul"] = conf.upper

            cell_stats = [
                pd.DataFrame(
                    np.repeat(stats.ci[i].reshape(-1, 1), len(cells_l[i])),
                    index=cells_l[i],
                    columns=["ci"],
                ) for i in range(stats.shape[0])
            ]

            cell_stats = pd.concat(cell_stats, axis=1)
            cell_stats = cell_stats.T.groupby(level=0).mean().T
            cell_stats["t"] = adata.obs.loc[cell_stats.index, "t"]

            l = loess(cell_stats.t, cell_stats.ci, span=loess_span)
            l.fit()
            pred = l.predict(cell_stats.t, stderror=True)

            cell_stats["fit"] = pred.values

            lspaced_stats = pd.DataFrame({
                "t":
                np.linspace(cell_stats["t"].min(), cell_stats["t"].max(),
                            n_points)
            })
            pred = l.predict(lspaced_stats.t, stderror=True)
            lspaced_stats["fit"] = pred.values

            del cell_stats["t"]
            return stats, cell_stats, lspaced_stats

        res = list(map(critical_milestone, leaves))

        cell_stats = pd.concat([r[1] for r in res]).groupby(level=0).mean()

        res_slide = dict(zip(milestones, [r[0] for r in res]))

        res_lspaced = dict(zip(milestones, [r[2] for r in res]))

        return cell_stats, res_slide, res_lspaced

    if n_map == 1:
        df, res_slide, res_lspaced = critical_map(0, gamma, loess_span)
    else:
        # TODO: adapt multimapping
        stats = Parallel(n_jobs=n_jobs)(
            delayed(critical_map)(i, gamma, loess_span) for i in tqdm(
                range(n_map), file=sys.stdout, desc="    multi mapping "))
        res_slides = pd.concat(stats)

    if name in adata.uns:
        adata.uns[name]["critical transition"] = {
            "LOESS": res_slide,
            "eLOESS": res_lspaced,
        }
    else:
        adata.uns[name] = {
            "critical transition": {
                "LOESS": res_slide,
                "eLOESS": res_lspaced
            }
        }

    adata.obs.loc[df.index, name + " CI"] = df.ci.values

    adata.obs.loc[df.index, name + " CI fitted"] = df.fit.values

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    .uns['" + name +
        "']['critical transition'], df containing local critical transition index per window of cells.\n"
        "    .obs['" + name +
        " CI'], local critical transition index projected onto cells.\n"
        "    .obs['" + name +
        " CI fitted'], GAM fit of local critical transition index projected onto cells."
    )

    return adata if copy else None
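A minimal usage sketch for this example (hedged: it assumes the function shown
above is exposed as `scf.tl.critical_transition` in scFates, and that a
principal graph, pseudotime, and milestones were already computed on `adata`;
the milestone names are placeholders):

import scFates as scf

scf.tl.critical_transition(adata, root_milestone="root", milestones=["A", "B"])
adata.obs["root->A<>B CI"]         # local critical transition index per cell
adata.obs["root->A<>B CI fitted"]  # LOESS fit projected onto cells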
Example n. 9
def criticality_drivers(
    adata: AnnData,
    root_milestone,
    milestones,
    t_span=None,
    confidence_level: float = 0.95,
    layer: Optional[str] = None,
    device="cpu",
    copy: bool = False,
):
    """\
    Calculates correlations between genes and the local critical transition index along the trajectory.

    The Fisher test for the correlations comes from the CellRank function `cr.tl.lineages_drivers`.

    Parameters
    ----------
    adata
        Annotated data matrix.
    root_milestone
        tip defining progenitor branch.
    milestones
        tips defining the progenies branches.
    t_span
        restrict correlations to a window of pseudotime
    confidence_level
        correlation confidence interval.
    layer
        adata layer to use for estimates.
    device
        whether to run the correlation matrix computation on a cpu or gpu.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns a copy of `adata`, otherwise it updates
        `adata` with the following field:

        `.uns['root_milestone->milestoneA<>milestoneB']['criticality drivers']`
            a df containing gene correlations with the critical transition index.


    """

    adata = adata.copy() if copy else adata

    logg.info("Calculating gene to critical transition index correlations",
              reset=True)

    name = root_milestone + "->" + "<>".join(milestones)
    obs_name = name + " CI fitted"

    if t_span is None:
        cells = adata.obs_names[~np.isnan(adata.obs[obs_name])]
    else:
        cells = adata.obs_names[(~np.isnan(adata.obs[obs_name]))
                                & (adata.obs.t > t_span[0])
                                & (adata.obs.t < t_span[1])]

    CI = adata[cells].obs[obs_name].values

    if layer is None:
        X = adata[cells].X
    else:
        X = adata[cells].layers[layer]

    if device == "cpu":
        from .utils import cor_mat_cpu

        X = X.A if sparse.issparse(X) else X
        corr = cor_mat_cpu(X, CI.reshape(-1, 1)).ravel()
    else:
        from .utils import cor_mat_gpu
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_gpu

        X = csr_gpu(X) if sparse.issparse(X) else cp.array(X)
        corr = cor_mat_gpu(X, cp.array(CI).reshape(-1, 1)).ravel().get()

    ### Fisher testing of correlations, CellRank implementation
    ### https://github.com/theislab/cellrank/blob/b6345d5e6dd148317782ffc9a9f96793ad98ead9/cellrank/tl/_utils.py#L488
    ### Copyright (c) 2019, Theis Lab

    n = adata.shape[0]
    ql = 1 - confidence_level - (1 - confidence_level) / 2.0
    qh = confidence_level + (1 - confidence_level) / 2.0

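    # Fisher z-transform: atanh(r) is approximately normally distributed
    # with standard error 1/sqrt(n - 3); this gives the analytic confidence
    # interval and the two-sided p-value below.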
    mean, se = np.arctanh(corr), 1.0 / np.sqrt(n - 3)
    z_score = (np.arctanh(corr) - np.arctanh(0)) * np.sqrt(n - 3)

    z = norm.ppf(qh)
    corr_ci_low = np.tanh(mean - z * se)
    corr_ci_high = np.tanh(mean + z * se)
    pvals = 2 * norm.cdf(-np.abs(z_score))

    ###

    res = pd.DataFrame(
        {
            "corr": corr,
            "pval": pvals,
            "ci_low": corr_ci_low,
            "ci_high": corr_ci_high
        },
        index=adata.var_names,
    )

    res["q_val"] = np.nan
    res.loc[~np.isnan(pvals),
            "q_val"] = multipletests(res[~np.isnan(pvals)].pval.values,
                                     alpha=0.05,
                                     method="fdr_bh")[1]

    adata.uns[name]["criticality drivers"] = res.sort_values(
        "corr", ascending=False).dropna()

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    .uns['" + name +
        "']['criticality drivers'], df containing gene correlation with critical index transition."
    )

    return adata if copy else None
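A short usage sketch (hedged: assumes the scFates entry point
`scf.tl.criticality_drivers` and that the critical transition index was
fitted beforehand by the previous example):

import scFates as scf

scf.tl.criticality_drivers(adata, root_milestone="root", milestones=["A", "B"])
# genes ranked by correlation with the fitted critical transition index
adata.uns["root->A<>B"]["criticality drivers"].head()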
Example n. 10
def synchro_path(
    adata: AnnData,
    root_milestone,
    milestones,
    genesetA: Optional[Iterable] = None,
    genesetB: Optional[Iterable] = None,
    n_map=1,
    n_jobs=None,
    layer: Optional[str] = None,
    perm=True,
    w=200,
    step=30,
    winp=10,
    loess_span=0.2,
    copy: bool = False,
):
    """\
    Estimates pseudotime trends of local intra- and inter-module correlations of fate-specific modules.

    Parameters
    ----------
    adata
        Annotated data matrix.
    root_milestone
        tip defining progenitor branch.
    milestones
        tips defining the progenies branches.
    n_map
        number of probabilistic cell projections to use for estimates.
    n_jobs
        number of cpu processes to perform estimates (per mapping).
    layer
        adata layer to use for estimates.
    perm
        estimate control trends for local permutations instead of real expression matrix.
    w
        local window, in number of cells, to estimate correlations.
    step
        steps, in number of cells, between local windows.
    winp
        window of permutation in cells.
    loess_span
        fraction of points to take in account for loess fit
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns a copy of `adata`, otherwise it updates
        `adata` with the following fields:

        `.uns['root_milestone->milestoneA<>milestoneB']['synchro']`
            Dataframe containing mean local gene-gene correlations of all possible gene pairs inside one module, or between the two modules.
        `.obs['intercor root_milestone->milestoneA<>milestoneB']`
            loess fit of inter-module mean local gene-gene correlations prior to bifurcation

    """

    adata = adata.copy() if copy else adata

    logg.info("computing local correlations", reset=True)

    graph = adata.uns["graph"]

    edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple,
                                                              axis=1).values
    img = igraph.Graph()
    img.add_vertices(
        np.unique(graph["pp_seg"][["from",
                                   "to"]].values.flatten().astype(str)))
    img.add_edges(edges)

    uns_temp = adata.uns.copy()

    if "milestones_colors" in adata.uns:
        mlsc = adata.uns["milestones_colors"].copy()

    dct = graph["milestones"]
    keys = np.array(list(dct.keys()))
    vals = np.array(list(dct.values()))

    leaves = list(map(lambda leave: dct[leave], milestones))
    root = dct[root_milestone]

    name = root_milestone + "->" + "<>".join(milestones)

    if genesetA is None:
        bif = adata.uns[name]["fork"]
        genesetA = bif.index[(bif.module == "early")
                             & (bif.branch == milestones[0])]
        genesetB = bif.index[(bif.module == "early")
                             & (bif.branch == milestones[1])]

    genesets = np.concatenate([genesetA, genesetB])

    if n_map == 1:
        logg.info("    single mapping")

    def synchro_map(m):
        df = adata.uns["pseudotime_list"][str(m)]
        edges = graph["pp_seg"][["from",
                                 "to"]].astype(str).apply(tuple, axis=1).values
        img = igraph.Graph()
        img.add_vertices(
            np.unique(graph["pp_seg"][["from",
                                       "to"]].values.flatten().astype(str)))
        img.add_edges(edges)

        def synchro_milestone(leave):
            cells = getpath(img, root, graph["tips"], leave, graph, df)
            cells = cells.sort_values("t").index

            X = get_X(adata, cells, genesets, layer)
            mat = pd.DataFrame(X, index=cells, columns=genesets)

            if permut:
                winperm = np.min([winp, mat.shape[0]])
                for i in np.arange(0, mat.shape[0] - winperm, winperm):
                    mat.iloc[i:(i + winperm), :] = (mat.iloc[i:(
                        i + winperm), :].apply(np.random.permutation,
                                               axis=0).values)

            def slide_path(i):
                cls = mat.index[i:(i + w)]
                cor = mat.loc[cls, :].corr(method="spearman")
                corA = cor.loc[:, genesetA].mean(axis=1)
                corB = cor.loc[:, genesetB].mean(axis=1)
                corA[genesetA] = ((corA[genesetA] - 1 / len(genesetA)) *
                                  len(genesetA) / (len(genesetA) - 1))
                corB[genesetB] = ((corB[genesetB] - 1 / len(genesetB)) *
                                  len(genesetB) / (len(genesetB) - 1))
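                # the rescaling above removes each gene's self-correlation
                # (which adds exactly 1/len(geneset) to the naive row mean)
                # so intra-module averages only cover distinct gene pairs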

                return pd.Series({
                    "t":
                    adata.obs.t[cls].mean(),
                    "dist":
                    (corA[genesetA].mean() - corA[genesetB].mean())**2 +
                    (corB[genesetA].mean() - corB[genesetB].mean())**2,
                    "corAA":
                    corA[genesetA].mean(),
                    "corBB":
                    corB[genesetB].mean(),
                    "corAB":
                    corA[genesetB].mean(),
                    "n_map":
                    m,
                })

            ww = np.arange(0, mat.shape[0] - w, step)

            res = ProgressParallel(
                n_jobs=n_jobs,
                total=len(ww),
                use_tqdm=n_map == 1,
                file=sys.stdout,
                desc="    to " + str(keys[vals == leave][0]),
            )(delayed(slide_path)(i) for i in ww)

            return pd.concat(res, axis=1).T

        return pd.concat(list(map(synchro_milestone, leaves)), keys=milestones)

    if n_map > 1:
        permut = False
        stats = ProgressParallel(n_jobs=n_jobs,
                                 total=n_map,
                                 file=sys.stdout,
                                 desc="    multi mapping")(
                                     delayed(synchro_map)(i)
                                     for i in range(n_map))
        allcor_r = pd.concat(stats)
        if perm:
            permut = True

            stats = ProgressParallel(
                n_jobs=n_jobs,
                total=n_map,
                file=sys.stdout,
                desc="    multi mapping permutations",
            )(delayed(synchro_map)(i) for i in range(n_map))
            allcor_p = pd.concat(stats)
            allcor = pd.concat([allcor_r, allcor_p], keys=["real", "permuted"])
        else:
            allcor = pd.concat([allcor_r], keys=["real"])
    else:
        permut = False
        allcor_r = pd.concat(list(map(synchro_map, range(n_map))))

        if perm:
            permut = True
            allcor_p = pd.concat(list(map(synchro_map, range(n_map))))
            allcor = pd.concat([allcor_r, allcor_p], keys=["real", "permuted"])
        else:
            allcor = pd.concat([allcor_r], keys=["real"])

    runs = pd.DataFrame(allcor.to_records())["level_0"].unique()

    dct_cormil = dict(
        zip(
            ["corAA", "corBB", "corAB"],
            [
                milestones[0] + "\nintra-module",
                milestones[1] + "\nintra-module"
            ] + [milestones[0] + " vs " + milestones[1] + "\ninter-module"],
        ))
    logg.info(" done, computing LOESS fit")
    for cc in ["corAA", "corBB", "corAB"]:
        allcor[cc + "_lowess"] = 0
        allcor[cc + "_ll"] = 0
        allcor[cc + "_ul"] = 0
        for r in range(len(runs)):
            for mil in milestones:
                res = allcor.loc[runs[r]].loc[mil]
                l = loess(res.t, res[cc], span=loess_span)
                l.fit()
                pred = l.predict(res.t, stderror=True)
                conf = pred.confidence()

                allcor.loc[(runs[r], mil), cc + "_lowess"] = pred.values
                allcor.loc[(runs[r], mil), cc + "_ll"] = conf.lower
                allcor.loc[(runs[r], mil), cc + "_ul"] = conf.upper

    fork = list(
        set(img.get_shortest_paths(str(root), str(leaves[0]))[0]).intersection(
            img.get_shortest_paths(str(root), str(leaves[1]))[0]))
    fork = np.array(img.vs["name"], dtype=int)[fork]
    fork_t = adata.uns["graph"]["pp_info"].loc[fork, "time"].max()
    res = allcor.loc[allcor.t < fork_t, :]
    res = res[~res.t.duplicated()]
    l = loess(res.t, res["corAB"], span=loess_span)
    l.fit()
    pred = l.predict(res.t, stderror=True)

    tval = adata.obs.t.copy()
    tval[tval > fork_t] = np.nan

    def inter_values(tv):
        if ~np.isnan(tv):
            return pred.values[np.argmin(np.abs(res.t.values - tv))]
        else:
            return tv

    adata.obs["inter_cor " + name] = list(map(inter_values, tval))

    df = adata.uns["pseudotime_list"][str(0)]
    cells = np.concatenate([
        getpath(img, root, graph["tips"], leaves[0], graph, df).index,
        getpath(img, root, graph["tips"], leaves[1], graph, df).index,
    ])

    adata.obs.loc[~adata.obs_names.isin(cells), "inter_cor " + name] = np.nan

    adata.uns = uns_temp

    allcor = dict(
        zip(
            allcor.index.levels[0],
            [
                dict(
                    zip(
                        allcor.loc[l1].index.levels[0],
                        [
                            allcor.loc[l1].loc[l2]
                            for l2 in allcor.loc[l1].index.levels[0]
                        ],
                    )) for l1 in allcor.index.levels[0]
            ],
        ))

    if name in adata.uns:
        adata.uns[name]["synchro"] = allcor
    else:
        adata.uns[name] = {"synchro": allcor}

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    .uns['" + name +
        "']['synchro'], mean local gene-gene correlations of all possible gene pairs inside one module, or between the two modules.\n"
        "    .obs['inter_cor " + name +
        "'], loess fit of inter-module mean local gene-gene correlations prior to bifurcation."
    )

    return adata if copy else None
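A usage sketch (hedged: assumes the scFates entry point `scf.tl.synchro_path`
and a prior bifurcation analysis that stored the early gene modules under
`.uns[...]['fork']`; milestone names are placeholders):

import scFates as scf

scf.tl.synchro_path(adata, root_milestone="root", milestones=["A", "B"],
                    w=200, step=30)
# mean local intra-/inter-module correlations along pseudotime
adata.uns["root->A<>B"]["synchro"]["real"]["A"].head()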
Example n. 11
def synchro_path_multi(adata: AnnData,
                       root_milestone,
                       milestones,
                       copy=False,
                       **kwargs):
    """\
    Wrapper that calls `tl.synchro_path` on all pairwise combinations of the selected branches.

    Parameters
    ----------
    adata
        Annotated data matrix.
    root_milestone
        tip defining progenitor branch.
    milestones
        tips defining the progenies branches.
    kwargs
        arguments to pass to tl.synchro_path.

    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns a copy of `adata`, otherwise it updates
        `adata` with the following fields:

        `.uns['root_milestone->milestoneA<>milestoneB']['synchro']`
            Dataframe containing mean local gene-gene correlations of all possible gene pairs inside one module, or between the two modules.
        `.obs['intercor root_milestone->milestoneA<>milestoneB']`
            loess fit of inter-module mean local gene-gene correlations prior to bifurcation

    """

    adata = adata.copy() if copy else adata

    logg.info("computing local correlations", reset=True)

    graph = adata.uns["graph"]

    edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple,
                                                              axis=1).values
    img = igraph.Graph()
    img.add_vertices(
        np.unique(graph["pp_seg"][["from",
                                   "to"]].values.flatten().astype(str)))
    img.add_edges(edges)

    uns_temp = adata.uns.copy()

    if "milestones_colors" in adata.uns:
        mlsc = adata.uns["milestones_colors"].copy()

    dct = graph["milestones"]
    keys = np.array(list(dct.keys()))
    vals = np.array(list(dct.values()))

    leaves = list(map(lambda leave: dct[leave], milestones))
    root = dct[root_milestone]

    name = root_milestone + "->" + "<>".join(milestones)

    bif = adata.uns[name]["fork"]

    genesets = dict(
        zip(
            milestones,
            [
                bif.index[(bif.module == "early") & (bif.branch == m)]
                for m in milestones
            ],
        ))

    pairs = list(itertools.combinations(milestones, 2))

    for m_pair in pairs:
        synchro_path(adata,
                     root_milestone,
                     m_pair,
                     genesetA=genesets[m_pair[0]],
                     genesetB=genesets[m_pair[1]],
                     **kwargs)

    return adata if copy else None
Example n. 12
def scanvi(
    train_adata: anndata.AnnData,
    test_adata: anndata.AnnData,
    cell_type_col: str,
    n_per_class: int = 100,
    **kwargs,
) -> (np.ndarray, pd.DataFrame, anndata.AnnData, scvi.model.scanvi.SCANVI):
    '''Use SCANVI to transfer annotations.
    
    Parameters
    ----------
    train_adata : anndata.AnnData
        [Cells, Genes] for training.
    test_adata : anndata.AnnData
        [Cells, Genes] for testing.
    cell_type_col : str
        column labeling ground truth cell types in
        `train_adata` and `test_adata`.
    n_per_class : int
        number of training examples per class. scANVI authors
        recommend `100` (default).
    
    Returns
    -------
    predictions : np.ndarray
        [Cells,] cell type label predictions.
    probabilities : pd.DataFrame
        [Cells, Class] probabilities.
        classes are column labels.
    adata : anndata.AnnData
        [Cells, Genes] concatenation of `train_adata` and `test_adata`
        with the learned scANVI latent space in `.obsm['X_scANVI']`.
        class predictions are in `.obs['C_scANVI']`.
    lvae : scvi.model.scanvi.SCANVI
        a trained scANVI model object.
    
    Notes
    -----
    This implementation exactly follows the working example
    from the `scvi` authors.
    
    https://www.scvi-tools.org/en/0.7.0-alpha.4/user_guide/notebooks/harmonization.html
    '''
    # check that train_adata and test_adata contain
    # raw counts
    tr_int = isinteger(train_adata.X)
    te_int = isinteger(test_adata.X)
    if not (tr_int and te_int):
        # check if the raw counts are in `.raw`
        tr_int = isinteger(train_adata.raw.X)
        te_int = isinteger(test_adata.raw.X)
        if tr_int and te_int:
            train_adata = train_adata.copy()
            test_adata = test_adata.copy()
            # set raw counts to `X`
            train_adata.X = train_adata.raw[:, train_adata.var_names].X
            test_adata.X = test_adata.raw[:, test_adata.var_names].X
        else:
            msg = 'Integer raw counts not found.'
            raise ValueError(msg)
    else:
        # raw counts are already set to X
        pass

    # `.concatenate()` creates batch labels in `.obs['batch']`
    adata = train_adata.concatenate(test_adata)

    # store raw counts in a new layer
    # normalize and select highly variable genes
    #
    # scVI uses only a set of highly variable genes
    # to perform data integration in the latent space
    adata.layers["counts"] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.raw = adata  # keep full dimension safe
    sc.pp.highly_variable_genes(adata,
                                flavor="seurat_v3",
                                n_top_genes=2000,
                                layer="counts",
                                batch_key="batch",
                                subset=True)

    # assign cell type labels for training
    # scANVI uses a special token `"Unknown"` for cells that are not labeled
    # the authors recommend using 100 cells per cell type
    # from the training set to balance classes
    adata.obs[cell_type_col] = pd.Categorical(adata.obs[cell_type_col])

    labels = np.repeat("Unknown", adata.shape[0])
    labels = labels.astype("<U43")
    for x in np.unique(adata.obs[cell_type_col]):
        idx = np.where((adata.obs[cell_type_col] == x)
                       & (adata.obs["batch"] == "0"))[0]
        sampled = np.random.choice(idx, np.min([n_per_class, len(idx)]),
                                   replace=False)
        labels[sampled] = adata.obs[cell_type_col][sampled]

    adata.obs["celltype_scanvi"] = labels

    # setup scANVI for training
    scvi.data.setup_anndata(
        adata,
        layer="counts",
        batch_key="batch",
        labels_key="celltype_scanvi",
    )

    # fit the semi-supervised scANVI model
    lvae = scvi.model.SCANVI(
        adata,
        "Unknown",
        use_cuda=True,
        n_latent=30,
        n_layers=2,
    )

    lvae.train(n_epochs_semisupervised=100)

    # extract labels
    adata.obs["C_scANVI"] = lvae.predict(adata)
    adata.obsm["X_scANVI"] = lvae.get_latent_representation(adata)

    predictions = np.array(adata.obs.loc[adata.obs['batch'] == '1',
                                         "C_scANVI"])
    # returns a [Cells, Classes] data frame with class
    # names as column labels and cell barcodes as indices
    probabilities = lvae.predict(adata, soft=True)
    probabilities = probabilities.loc[adata.obs['batch'] == '1']
    # scANVI will add the "Unknown" token as a class, usually
    # with very low probability
    # here we drop it, then renorm probabilities to == 1
    probabilities = probabilities.drop(columns=['Unknown'])
    probabilities = probabilities / np.tile(
        np.array(probabilities.sum(1)).reshape(-1, 1),
        (1, probabilities.shape[1]))
    # check probability normalization
    eq1 = np.allclose(
        probabilities.sum(1),
        np.ones(probabilities.shape[0]),
    )
    if not eq1:
        msg = 'Not all sum(probabilities) are close to 1. '
        n = np.sum(~np.isclose(probabilities.sum(1), 1.0))
        msg += f'{n} cells have probabilities that do not sum to 1.'
        raise ValueError(msg)

    r = (
        predictions,
        probabilities,
        adata,
        lvae,
    )

    return r
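A hedged usage sketch (`train_adata` and `test_adata` are placeholder AnnData
objects carrying integer raw counts and a shared `cell_type` column):

predictions, probabilities, adata_joint, lvae = scanvi(
    train_adata,
    test_adata,
    cell_type_col="cell_type",
    n_per_class=100,
)
# `probabilities` is a [test cells, classes] frame; `predictions` holds the
# per-cell class labels for the test batch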
Example n. 13
def scrublet(
    adata: AnnData,
    adata_sim: Optional[AnnData] = None,
    batch_key: str = None,
    sim_doublet_ratio: float = 2.0,
    expected_doublet_rate: float = 0.05,
    stdev_doublet_rate: float = 0.02,
    synthetic_doublet_umi_subsampling: float = 1.0,
    knn_dist_metric: str = 'euclidean',
    normalize_variance: bool = True,
    log_transform: bool = False,
    mean_center: bool = True,
    n_prin_comps: int = 30,
    use_approx_neighbors: bool = True,
    get_doublet_neighbor_parents: bool = False,
    n_neighbors: Optional[int] = None,
    threshold: Optional[float] = None,
    verbose: bool = True,
    copy: bool = False,
    random_state: int = 0,
) -> Optional[AnnData]:
    """\
    Predict doublets using Scrublet [Wolock19]_.

    Predict cell doublets using a nearest-neighbor classifier of observed
    transcriptomes and simulated doublets. Works best if the input is a raw
    (unnormalized) counts matrix from a single sample or a collection of
    similar samples from the same experiment.
    This function is a wrapper around functions that pre-process using Scanpy
    and directly call functions of Scrublet(). You may also undertake your own
    preprocessing, simulate doublets with
    scanpy.external.pp.scrublet_simulate_doublets(), and run the core scrublet
    function scanpy.external.pp.scrublet.scrublet().

    .. note::
        More information and bug reports `here
        <https://github.com/swolock/scrublet>`__.

    Parameters
    ----------
    adata
        The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows
        correspond to cells and columns to genes. Expected to be un-normalised
        where adata_sim is not supplied, in which case doublets will be
        simulated and pre-processing applied to both objects. If adata_sim is
        supplied, this should be the observed transcriptomes processed
        consistently (filtering, transform, normalisation, hvg) with adata_sim.
    adata_sim
        (Advanced use case) Optional annData object generated by
        sc.external.pp.scrublet_simulate_doublets(), with same number of vars
        as adata. This should have been built from adata_obs after
        filtering genes and cells and selecting highly-variable genes.
    batch_key
        Optional `adata.obs` column name discriminating between batches.
    sim_doublet_ratio
        Number of doublets to simulate relative to the number of observed
        transcriptomes.
    expected_doublet_rate
        Where adata_sim not supplied, the estimated doublet rate for the
        experiment.
    stdev_doublet_rate
        Where adata_sim not supplied, uncertainty in the expected doublet rate.
    synthetic_doublet_umi_subsampling
        Where adata_sim not supplied, rate for sampling UMIs when creating
        synthetic doublets. If 1.0, each doublet is created by simply adding
        the UMI counts from two randomly sampled observed transcriptomes. For
        values less than 1, the UMI counts are added and then randomly sampled
        at the specified rate.
    knn_dist_metric
        Distance metric used when finding nearest neighbors. For list of
        valid values, see the documentation for annoy (if `use_approx_neighbors`
        is True) or sklearn.neighbors.NearestNeighbors (if `use_approx_neighbors`
        is False).
    normalize_variance
        If True, normalize the data such that each gene has a variance of 1.
        `sklearn.decomposition.TruncatedSVD` will be used for dimensionality
        reduction, unless `mean_center` is True.
    log_transform
        Whether to use :func:`~scanpy.pp.log1p` to log-transform the data
        prior to PCA.
    mean_center
        If True, center the data such that each gene has a mean of 0.
        `sklearn.decomposition.PCA` will be used for dimensionality
        reduction.
    n_prin_comps
        Number of principal components used to embed the transcriptomes prior
        to k-nearest-neighbor graph construction.
    use_approx_neighbors
        Use approximate nearest neighbor method (annoy) for the KNN
        classifier.
    get_doublet_neighbor_parents
        If True, return (in .uns) the parent transcriptomes that generated the
        doublet neighbors of each observed transcriptome. This information can
        be used to infer the cell states that generated a given doublet state.
    n_neighbors
        Number of neighbors used to construct the KNN graph of observed
        transcriptomes and simulated doublets. If ``None``, this is
        automatically set to ``np.round(0.5 * np.sqrt(n_obs))``.
    threshold
        Doublet score threshold for calling a transcriptome a doublet. If
        `None`, this is set automatically by looking for the minimum between
        the two modes of the `doublet_scores_sim_` histogram. It is best
        practice to check the threshold visually using the
        `doublet_scores_sim_` histogram and/or based on co-localization of
        predicted doublets in a 2-D embedding.
    verbose
        If True, print progress updates.
    copy
        If ``True``, return a copy of the input ``adata`` with Scrublet results
        added. Otherwise, Scrublet results are added in place.
    random_state
        Initial state for doublet simulation and nearest neighbors.

    Returns
    -------
    adata : anndata.AnnData
        if ``copy=True`` it returns or else adds fields to ``adata``. Those fields:

        ``.obs['doublet_score']``
            Doublet scores for each observed transcriptome

        ``.obs['predicted_doublets']``
            Boolean indicating predicted doublet status

        ``.uns['scrublet']['doublet_scores_sim']``
            Doublet scores for each simulated doublet transcriptome

        ``.uns['scrublet']['doublet_parents']``
            Pairs of ``.obs_names`` used to generate each simulated doublet
            transcriptome

        ``.uns['scrublet']['parameters']``
            Dictionary of Scrublet parameters

    See also
    --------
    :func:`~scanpy.external.pp.scrublet_simulate_doublets`: Run Scrublet's doublet
        simulation separately for advanced usage.
    :func:`~scanpy.external.pl.scrublet_score_distribution`: Plot histogram of doublet
        scores for observed transcriptomes and simulated doublets.
    """
    try:
        import scrublet as sl
    except ImportError:
        raise ImportError(
            'Please install scrublet: `pip install scrublet` or `conda install scrublet`.'
        )

    if copy:
        adata = adata.copy()

    start = logg.info('Running Scrublet')

    adata_obs = adata.copy()

    def _run_scrublet(ad_obs, ad_sim=None):

        # With no adata_sim we assume the regular use case, starting with raw
        # counts and simulating doublets

        if ad_sim is None:

            pp.filter_genes(ad_obs, min_cells=3)
            pp.filter_cells(ad_obs, min_genes=3)

            # Doublet simulation will be based on the un-normalised counts, but on the
            # selection of genes following normalisation and variability filtering. So
            # we need to save the raw and subset at the same time.

            ad_obs.layers['raw'] = ad_obs.X.copy()
            pp.normalize_total(ad_obs)

            # HVG process needs log'd data.

            logged = pp.log1p(ad_obs, copy=True)
            pp.highly_variable_genes(logged)
            ad_obs = ad_obs[:, logged.var['highly_variable']]

            # Simulate the doublets based on the raw expressions from the normalised
            # and filtered object.

            ad_sim = scrublet_simulate_doublets(
                ad_obs,
                layer='raw',
                sim_doublet_ratio=sim_doublet_ratio,
                synthetic_doublet_umi_subsampling=
                synthetic_doublet_umi_subsampling,
            )

            if log_transform:
                pp.log1p(ad_obs)
                pp.log1p(ad_sim)

            # Now normalise simulated and observed in the same way

            pp.normalize_total(ad_obs, target_sum=1e6)
            pp.normalize_total(ad_sim, target_sum=1e6)

        ad_obs = _scrublet_call_doublets(
            adata_obs=ad_obs,
            adata_sim=ad_sim,
            n_neighbors=n_neighbors,
            expected_doublet_rate=expected_doublet_rate,
            stdev_doublet_rate=stdev_doublet_rate,
            mean_center=mean_center,
            normalize_variance=normalize_variance,
            n_prin_comps=n_prin_comps,
            use_approx_neighbors=use_approx_neighbors,
            knn_dist_metric=knn_dist_metric,
            get_doublet_neighbor_parents=get_doublet_neighbor_parents,
            threshold=threshold,
            random_state=random_state,
            verbose=verbose,
        )

        return {'obs': ad_obs.obs, 'uns': ad_obs.uns['scrublet']}

    if batch_key is not None:
        if batch_key not in adata.obs.keys():
            raise ValueError(
                '`batch_key` must be a column of .obs in the input annData object.'
            )

        # Run Scrublet independently on batches and return just the
        # scrublet-relevant parts of the objects to add to the input object

        batches = np.unique(adata.obs[batch_key])
        scrubbed = [
            _run_scrublet(
                adata_obs[adata_obs.obs[batch_key] == batch, ],
                adata_sim,
            ) for batch in batches
        ]
        scrubbed_obs = pd.concat([scrub['obs'] for scrub in scrubbed])

        # Now reset the obs to get the scrublet scores

        adata.obs = scrubbed_obs.loc[adata.obs_names.values]

        # Save the .uns from each batch separately

        adata.uns['scrublet'] = {}
        adata.uns['scrublet']['batches'] = dict(
            zip(batches, [scrub['uns'] for scrub in scrubbed]))

        # Record that we've done batched analysis, so e.g. the plotting
        # function knows what to do.

        adata.uns['scrublet']['batched_by'] = batch_key

    else:
        scrubbed = _run_scrublet(adata_obs, adata_sim)

        # Copy outcomes to input object from our processed version

        adata.obs['doublet_score'] = scrubbed['obs']['doublet_score']
        adata.obs['predicted_doublet'] = scrubbed['obs']['predicted_doublet']
        adata.uns['scrublet'] = scrubbed['uns']

    logg.info('    Scrublet finished', time=start)

    if copy:
        return adata
    else:
        return None
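A minimal usage sketch (hedged: the input path is a placeholder; the matrix
should contain raw counts from a single sample or a collection of similar
samples):

import scanpy as sc
import scanpy.external as sce

adata = sc.read_10x_mtx("filtered_feature_bc_matrix/")  # placeholder path
sce.pp.scrublet(adata)
adata.obs[["doublet_score", "predicted_doublet"]].head()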
Example n. 14
def magic(
    adata: AnnData,
    name_list: Union[str, Sequence[str], None] = None,
    k: int = 10,
    a: int = 15,
    t: str = 'auto',
    n_pca: int = 100,
    knn_dist: str = 'euclidean',
    random_state: Optional[Union[int, RandomState]] = None,
    n_jobs: Optional[int] = None,
    verbose: bool = False,
    copy: Optional[bool] = None,
    **kwargs,
) -> Optional[AnnData]:
    """\
    Markov Affinity-based Graph Imputation of Cells (MAGIC) API [vanDijk18]_.

    MAGIC is an algorithm for denoising and transcript recovery of single cells
    applied to single-cell sequencing data. MAGIC builds a graph from the data
    and uses diffusion to smooth out noise and recover the data manifold.

    More information and bug reports
    `here <https://github.com/KrishnaswamyLab/MAGIC>`__. For help, visit
    <https://krishnaswamylab.org/get-help>.

    Parameters
    ----------
    adata
        An anndata file with `.raw` attribute representing raw counts.
    name_list
        Denoised genes to return. The default `'all_genes'`/`None`
        may require a large amount of memory if the input data is sparse.
        Another possibility is `'pca_only'`.
    k
        number of nearest neighbors on which to build kernel
    a
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    t
        power to which the diffusion operator is powered.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    n_pca
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist
        recommended values: 'euclidean', 'cosine', 'precomputed'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph. If 'precomputed',
        `data` should be an n_samples x n_samples distance or
        affinity matrix
    random_state
        Random seed. Defaults to the global `numpy` random number generator
    n_jobs
        Number of threads to use in training. All cores are used by default.
    verbose
        If `True` or an integer `>= 2`, print status messages.
        If `None`, `sc.settings.verbosity` is used.
    copy
        If true, a copy of anndata is returned. If `None`, `copy` is True if
        `genes` is not `'all_genes'` or `'pca_only'`. `copy` may only be False
        if `genes` is `'all_genes'` or `'pca_only'`, as the resultant data
        will otherwise have different column names from the input data.
    kwargs
        Additional arguments to `magic.MAGIC`

    Returns
    -------
    If `copy` is True, AnnData object is returned.

    If `name_list` is not `'all_genes'`, PCA on MAGIC values of cells are
    stored in `adata.obsm['X_magic']` and `adata.X` is not modified.

    The raw counts are stored in `.raw` attribute of AnnData object.

    Examples
    --------
    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> adata = sc.datasets.paul15()
    >>> sc.pp.normalize_per_cell(adata)
    >>> sc.pp.sqrt(adata)  # or sc.pp.log1p(adata)
    >>> adata_magic = sce.pp.magic(adata, name_list=['Mpo', 'Klf1', 'Ifitm1'], k=5)
    >>> adata_magic.shape
    (2730, 3)
    >>> sce.pp.magic(adata, name_list='pca_only', k=5)
    >>> adata.obsm['X_magic'].shape
    (2730, 100)
    >>> sce.pp.magic(adata, name_list='all_genes', k=5)
    >>> adata.X.shape
    (2730, 3451)
    """

    try:
        from magic import MAGIC
    except ImportError:
        raise ImportError(
            'Please install magic package via `pip install --user '
            'git+git://github.com/KrishnaswamyLab/MAGIC.git#subdirectory=python`'
        )

    start = logg.info('computing MAGIC')
    all_or_pca = isinstance(name_list, (str, type(None)))
    if all_or_pca and name_list not in {"all_genes", "pca_only", None}:
        raise ValueError("Invalid string value for `name_list`: "
                         "Only `'all_genes'` and `'pca_only'` are allowed.")
    if copy is None:
        copy = not all_or_pca
    elif not all_or_pca and not copy:
        raise ValueError(
            "Can only perform MAGIC in-place with `name_list=='all_genes'` or "
            f"`name_list=='pca_only'` (got {name_list}). Consider setting "
            "`copy=True`")
    adata = adata.copy() if copy else adata
    n_jobs = settings.n_jobs if n_jobs is None else n_jobs

    X_magic = MAGIC(
        k=k,
        a=a,
        t=t,
        n_pca=n_pca,
        knn_dist=knn_dist,
        random_state=random_state,
        n_jobs=n_jobs,
        verbose=verbose,
        **kwargs,
    ).fit_transform(adata, genes=name_list)
    logg.info(
        '    finished',
        time=start,
        deep=("added\n    'X_magic', PCA on MAGIC coordinates (adata.obsm)"
              if name_list == "pca_only" else ''),
    )
    # update AnnData instance
    if name_list == "pca_only":
        # special case - update adata.obsm with smoothed values
        adata.obsm["X_magic"] = X_magic.X
    elif copy:
        # just return X_magic
        X_magic.raw = adata
        adata = X_magic
    else:
        # replace data with smoothed data
        adata.raw = adata
        adata.X = X_magic.X

    if copy:
        return adata
Example n. 15
def umap(adata: AnnData,
         min_dist: float = 0.5,
         spread: float = 1.0,
         n_components: int = 2,
         maxiter: Optional[int] = None,
         alpha: float = 1.0,
         gamma: float = 1.0,
         negative_sample_rate: int = 5,
         init_pos: Union[_InitPos, np.ndarray, None] = 'spectral',
         random_state: Optional[Union[int, RandomState]] = 0,
         a: Optional[float] = None,
         b: Optional[float] = None,
         copy: bool = False,
         method: Literal['umap', 'rapids'] = 'umap') -> Optional[AnnData]:
    """\
    Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold learning
    technique suitable for visualizing high-dimensional data. Besides tending to
    be faster than tSNE, it optimizes the embedding such that it best reflects
    the topology of the data, which we represent throughout Scanpy using a
    neighborhood graph. tSNE, by contrast, optimizes the distribution of
    nearest-neighbor distances in the embedding such that these best match the
    distribution of distances in the high-dimensional space.  We use the
    implementation of `umap-learn <https://github.com/lmcinnes/umap>`__
    [McInnes18]_. For a few comparisons of UMAP with tSNE, see this `preprint
    <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata
        Annotated data matrix.
    min_dist
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points on
        the manifold are drawn closer together, while larger values will result
        in a more even dispersal of points. The value should be set relative to
        the ``spread`` value, which determines the scale at which embedded
        points will be spread out. The default in the `umap-learn` package is
        0.1.
    spread
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components
        The number of dimensions of the embedding.
    maxiter
        The number of iterations (epochs) of the optimization. Called `n_epochs`
        in the original UMAP.
    alpha
        The initial learning rate for the embedding optimization.
    gamma
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP. Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state
        If `int`, `random_state` is the seed used by the random number generator;
        If `RandomState`, `random_state` is the random number generator;
        If `None`, the random number generator is the `RandomState` instance used
        by `np.random`.
    a
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    b
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    copy
        Return a copy instead of writing to adata.
    method
        Use the original 'umap' implementation, or 'rapids' (experimental, GPU only)

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.
    """
    adata = adata.copy() if copy else adata
    if 'neighbors' not in adata.uns:
        raise ValueError(
            'Did not find \'neighbors/connectivities\'. Run `sc.pp.neighbors` first.'
        )
    start = logg.info('computing UMAP')
    if ('params' not in adata.uns['neighbors']
            or adata.uns['neighbors']['params']['method'] != 'umap'):
        logg.warning(
            'neighbors/connectivities have not been computed using umap')
    from umap.umap_ import find_ab_params, simplicial_set_embedding
    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)
    adata.uns['umap'] = {'params': {'a': a, 'b': b}}
    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(adata, random_state=random_state)
    else:
        init_coords = init_pos  # Let umap handle it
    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords,
                                  dtype=np.float32,
                                  accept_sparse=False)

    random_state = check_random_state(random_state)
    neigh_params = adata.uns['neighbors']['params']
    X = _choose_representation(adata,
                               neigh_params.get('use_rep', None),
                               neigh_params.get('n_pcs', None),
                               silent=True)
    if method == 'umap':
        # the data matrix X is really only used for determining the number of connected components
        # for the init condition in the UMAP embedding
        n_epochs = 0 if maxiter is None else maxiter
        X_umap = simplicial_set_embedding(
            X,
            adata.uns['neighbors']['connectivities'].tocoo(),
            n_components,
            alpha,
            a,
            b,
            gamma,
            negative_sample_rate,
            n_epochs,
            init_coords,
            random_state,
            neigh_params.get('metric', 'euclidean'),
            neigh_params.get('metric_kwds', {}),
            verbose=settings.verbosity > 3,
        )
    elif method == 'rapids':
        metric = neigh_params.get('metric', 'euclidean')
        if metric != 'euclidean':
            raise ValueError(
                f'`sc.pp.neighbors` was called with `metric` {metric!r}, '
                "but umap `method` 'rapids' only supports the 'euclidean' metric."
            )
        from cuml import UMAP
        n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
        n_epochs = 500 if maxiter is None else maxiter  # 0 is not a valid value for rapids, unlike original umap
        X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
        umap = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            n_epochs=n_epochs,
            learning_rate=alpha,
            init=init_pos,
            min_dist=min_dist,
            spread=spread,
            negative_sample_rate=negative_sample_rate,
            a=a,
            b=b,
            verbose=settings.verbosity > 3,
        )
        X_umap = umap.fit_transform(X_contiguous)
    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates
    logg.info(
        '    finished',
        time=start,
        deep=('added\n'
              "    'X_umap', UMAP coordinates (adata.obsm)"),
    )
    return adata if copy else None
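A quick usage sketch on a bundled dataset (the neighbors graph must exist
before calling `umap`):

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata)
sc.tl.umap(adata, min_dist=0.5)
adata.obsm['X_umap'].shape  # (n_obs, 2)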
Example n. 16
def louvain(
    adata: AnnData,
    resolution: Optional[float] = None,
    random_state: Optional[Union[int, RandomState]] = 0,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    key_added: Optional[str] = 'louvain',
    adjacency: Optional[spmatrix] = None,
    flavor: str = 'vtraag',
    directed: bool = True,
    use_weights: bool = False,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    partition_kwargs: Optional[Mapping[str, Any]] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the implementation
    of [Traag17]_. The Louvain algorithm has been proposed for single-cell
    analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or :func:`~scanpy.external.pp.bbknn` first,
    or explicitly passing a ``adjacency`` matrix.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        For the default flavor (``'vtraag'``), you can provide a resolution
        (higher resolution means finding more and smaller clusters),
        which defaults to 1.0. See “Time as a resolution parameter” in [Lambiotte09]_.
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain ``(obs_key, list_of_categories)``.
    key_added
        Key under which to add the cluster labels. (default: ``'louvain'``)
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        ``adata.uns['neighbors']['connectivities']``.
    flavor : {``'vtraag'``, ``'igraph'``, ``'rapids'``}
        Choose between two packages for computing the clustering.
        ``'vtraag'`` is much more powerful, and the default.
    directed
        Interpret the ``adjacency`` matrix as directed graph?
    use_weights
        Use weights from knn graph.
    partition_type
        Type of partition to use.
        Only a valid argument if ``flavor`` is ``'vtraag'``.
    partition_kwargs
        Key word arguments to pass to partitioning,
        if ``vtraag`` method is being used.
    copy
        Copy adata or modify it inplace.

    Returns
    -------
    :obj:`None`
        By default (``copy=False``), updates ``adata`` with the following fields:

        ``adata.obs['louvain']`` (:class:`pandas.Series`, dtype ``category``)
            Array of dim (number of samples) that stores the subgroup id
            (``'0'``, ``'1'``, ...) for each cell.

    :class:`~anndata.AnnData`
        When ``copy=True`` is set, a copy of ``adata`` with those fields is returned.
    """
    start = logg.info('running Louvain clustering')
    if (flavor != 'vtraag') and (partition_type is not None):
        raise ValueError(
            '`partition_type` is only a valid argument when `flavor` is "vtraag"'
        )
    adata = adata.copy() if copy else adata
    if adjacency is None and 'neighbors' not in adata.uns:
        raise ValueError(
            'You need to run `pp.neighbors` first to compute a neighborhood graph.'
        )
    if adjacency is None:
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warning(
                '`resolution` parameter has no effect for flavor "igraph"')
        if directed and flavor == 'igraph':
            directed = False
        if not directed: logg.debug('    using the undirected graph')
        g = _utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if use_weights:
            weights = np.array(g.es["weight"]).astype(np.float64)
        else:
            weights = None
        if flavor == 'vtraag':
            import louvain
            if partition_kwargs is None:
                partition_kwargs = {}
            if partition_type is None:
                partition_type = louvain.RBConfigurationVertexPartition
            if resolution is not None:
                partition_kwargs["resolution_parameter"] = resolution
            if use_weights:
                partition_kwargs["weights"] = weights
            logg.info('    using the "louvain" package of Traag (2017)')
            louvain.set_rng_seed(random_state)
            part = louvain.find_partition(
                g,
                partition_type,
                **partition_kwargs,
            )
            # adata.uns['louvain_quality'] = part.quality()
        else:
            part = g.community_multilevel(weights=weights)
        groups = np.array(part.membership)
    elif flavor == 'rapids':
        # nvLouvain only works with undirected graphs, and `adjacency` must have a directed edge in both directions
        import cudf
        import cugraph
        offsets = cudf.Series(adjacency.indptr)
        indices = cudf.Series(adjacency.indices)
        if use_weights:
            sources, targets = adjacency.nonzero()
            weights = adjacency[sources, targets]
            if isinstance(weights, np.matrix):
                weights = weights.A1
            weights = cudf.Series(weights)
        else:
            weights = None
        g = cugraph.Graph()
        g.add_adj_list(offsets, indices, weights)
        logg.info('    using the "louvain" package of rapids')
        louvain_parts, _ = cugraph.nvLouvain(g)
        groups = louvain_parts.to_pandas().sort_values('vertex')[[
            'partition'
        ]].to_numpy().ravel()
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        raise ValueError(
            '`flavor` needs to be "vtraag", "igraph", "rapids" or "taynaud".')
    if restrict_to is not None:
        if key_added == 'louvain':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(np.unique(groups).astype('U')),
    )
    adata.uns['louvain'] = {}
    adata.uns['louvain']['params'] = {
        'resolution': resolution,
        'random_state': random_state
    }
    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
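A quick usage sketch (the default 'vtraag' flavor requires the `louvain`
package to be installed):

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata)
sc.tl.louvain(adata, resolution=1.0)
adata.obs['louvain'].value_counts()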
Example n. 17
def embedding_density(adata: AnnData,
                      basis: str,
                      key: str,
                      *,
                      group: Optional[str] = None,
                      color_map: Union[Colormap, str] = 'YlOrRd',
                      bg_dotsize: Optional[int] = 80,
                      fg_dotsize: Optional[int] = 180,
                      vmax: Optional[int] = 1,
                      vmin: Optional[int] = 0,
                      save: Union[bool, str, None] = None,
                      **kwargs):
    """Plot the density of cells in an embedding (per condition)

    Plots the gaussian kernel density estimates (over condition) from the
    `sc.tl.embedding_density()` output.

    This function was written by Sophie Tritschler and implemented into
    Scanpy by Malte Luecken.
    
    Parameters
    ----------
    adata
        The annotated data matrix.
    basis
        The embedding over which the density was calculated. This embedded
        representation should be found in `adata.obsm['X_[basis]']`.
    key
        Name of the `.obs` covariate that contains the density estimates
    group
        The category in the categorical observation annotation to be plotted.
        For example, 'G1' in the cell cycle 'phase' covariate.
    color_map
        Matplotlib color map to use for density plotting.
    bg_dotsize
        Dot size for background data points not in the `group`.
    fg_dotsize
        Dot size for foreground data points in the `group`.
    vmax
        Density that corresponds to color bar maximum.
    vmin
        Density that corresponds to color bar minimum.
    {show_save_ax}

    Examples
    --------
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.umap(adata)
    >>> sc.tl.embedding_density(adata, basis='umap', groupby='phase')
    >>> sc.pl.embedding_density(adata, basis='umap', key='umap_density_phase', 
    ...                         group='G1')
    >>> sc.pl.embedding_density(adata, basis='umap', key='umap_density_phase', 
    ...                         group='S')
    """
    sanitize_anndata(adata)

    # Test user inputs
    basis = basis.lower()

    if basis == 'fa':
        basis = 'draw_graph_fa'

    if 'X_' + basis not in adata.obsm_keys():
        raise ValueError(
            "Cannot find the embedded representation "
            f"`adata.obsm['X_{basis}']`. Compute the embedding first.")

    if key not in adata.obs:
        raise ValueError(
            'Please run `sc.tl.embedding_density()` first and specify the correct key.'
        )

    if key + '_params' not in adata.uns:
        raise ValueError(
            'Please run `sc.tl.embedding_density()` first and specify the correct key.'
        )

    if 'components' in kwargs:
        logg.warning('Components were specified, but will be ignored. Only the '
                     'components used to calculate the density can be plotted.')
        del kwargs['components']

    components = adata.uns[key + '_params']['components']
    groupby = adata.uns[key + '_params']['covariate']

    if (group is None) and (groupby is not None):
        raise ValueError('Densities were calculated over an `.obs` covariate. '
                         'Please specify a group from this covariate to plot.')

    if (group is not None) and (group
                                not in adata.obs[groupby].cat.categories):
        raise ValueError(
            'Please specify a group from the `.obs` category over which the density '
            'was calculated.')

    if (np.min(adata.obs[key]) < 0) or (np.max(adata.obs[key]) > 1):
        raise ValueError('Densities should be scaled between 0 and 1.')

    # Define plotting data
    dens_values = -np.ones(adata.n_obs)
    dot_sizes = np.ones(adata.n_obs) * bg_dotsize

    if group is not None:
        group_mask = (adata.obs[groupby] == group)
        dens_values[group_mask] = adata.obs[key][group_mask]
        dot_sizes[group_mask] = np.ones(sum(group_mask)) * fg_dotsize

    else:
        dens_values = adata.obs[key]
        dot_sizes = np.ones(adata.n_obs) * fg_dotsize

    # Make the color map
    if isinstance(color_map, str):
        # copy so `set_over`/`set_under` below don't mutate the registered map
        cmap = cm.get_cmap(color_map).copy()
    else:
        cmap = color_map

    adata_vis = adata.copy()
    adata_vis.obs['Density'] = dens_values

    norm = colors.Normalize(vmin=vmin, vmax=vmax)
    cmap.set_over('black')
    cmap.set_under('lightgray')

    # Ensure title is blank as default
    if 'title' not in kwargs:
        title = ""
    else:
        title = kwargs.pop('title')

    # Plot the graph
    return plot_scatter(adata_vis,
                        basis,
                        components=components,
                        color='Density',
                        color_map=cmap,
                        norm=norm,
                        size=dot_sizes,
                        vmax=vmax,
                        vmin=vmin,
                        save=save,
                        title=title,
                        **kwargs)
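The foreground/background masking above (background cells get density -1, which falls below `vmin` and is therefore drawn in the `set_under` color) can be illustrated without scanpy. A minimal matplotlib sketch on made-up data; the `.copy()` assumes matplotlib >= 3.4.

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm, colors

rng = np.random.default_rng(0)
coords = rng.normal(size=(200, 2))   # toy embedding
density = rng.uniform(size=200)      # toy density estimates in [0, 1]
in_group = rng.random(200) < 0.5     # toy group membership

# background points get -1, which is below vmin and rendered via set_under
values = np.full(200, -1.0)
values[in_group] = density[in_group]
sizes = np.where(in_group, 180, 80)  # fg_dotsize / bg_dotsize

cmap = cm.get_cmap('YlOrRd').copy()
cmap.set_under('lightgray')
norm = colors.Normalize(vmin=0, vmax=1)
plt.scatter(coords[:, 0], coords[:, 1], c=values, s=sizes, cmap=cmap, norm=norm)
plt.colorbar(extend='min')
plt.show()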
Example n. 18
def score_genes(
    adata: AnnData,
    gene_list: Sequence[str],
    ctrl_size: int = 50,
    gene_pool: Optional[Sequence[str]] = None,
    n_bins: int = 25,
    score_name: str = 'score',
    random_state: Optional[Union[int, RandomState]] = 0,
    copy: bool = False,
    use_raw: bool = False,
):  # we use the scikit-learn convention of calling the seed "random_state"
    """Score a set of genes [Satija15]_.

    The score is the average expression of a set of genes minus the
    average expression of a reference set of genes. The reference set is
    randomly sampled from the `gene_pool` for each binned expression value.

    This reproduces the approach in Seurat [Satija15]_ and has been implemented
    for Scanpy by Davide Cittaro.

    Parameters
    ----------
    adata
        The annotated data matrix.
    gene_list
        The list of gene names used for score calculation.
    ctrl_size
        Number of reference genes to be sampled. If `len(gene_list)` is not too
        low, you can set `ctrl_size=len(gene_list)`.
    gene_pool
        Genes for sampling the reference set. Default is all genes.
    n_bins
        Number of expression level bins for sampling.
    score_name
        Name of the field to be added in `.obs`.
    random_state
        The random seed for sampling.
    copy
        Copy `adata` or modify it inplace.
    use_raw
        Use `raw` attribute of `adata` if present.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with an additional field
    `score_name`.

    Examples
    --------
    See this `notebook <https://github.com/theislab/scanpy_usage/tree/master/180209_cell_cycle>`__.
    """
    start = logg.info(f'computing score {score_name!r}')
    adata = adata.copy() if copy else adata

    if random_state is not None:
        np.random.seed(random_state)

    gene_list_in_var = []
    var_names = adata.raw.var_names if use_raw else adata.var_names
    for gene in gene_list:
        if gene in var_names:
            gene_list_in_var.append(gene)
        else:
            logg.warning(
                f'gene: {gene} is not in adata.var_names and will be ignored')
    gene_list = set(gene_list_in_var)

    if not gene_pool:
        gene_pool = list(var_names)
    else:
        gene_pool = [x for x in gene_pool if x in var_names]

    # Trying here to match the Seurat approach in scoring cells.
    # Basically we need to compare genes against random genes in a matched
    # interval of expression.

    _adata = adata.raw if use_raw else adata
    # TODO: this densifies the whole data matrix for `gene_pool`
    if issparse(_adata.X):
        obs_avg = pd.Series(np.nanmean(_adata[:, gene_pool].X.toarray(),
                                       axis=0),
                            index=gene_pool)  # average expression of genes
    else:
        obs_avg = pd.Series(np.nanmean(_adata[:, gene_pool].X, axis=0),
                            index=gene_pool)  # average expression of genes

    # sometimes (and I don't know how) missing data may be there, as NaNs
    obs_avg = obs_avg[np.isfinite(obs_avg)]

    n_items = int(np.round(len(obs_avg) / (n_bins - 1)))
    obs_cut = obs_avg.rank(method='min') // n_items
    control_genes = set()

    # now pick `ctrl_size` genes from every cut
    for cut in np.unique(obs_cut.loc[gene_list]):
        r_genes = np.array(obs_cut[obs_cut == cut].index)
        np.random.shuffle(r_genes)
        # uses full r_genes if ctrl_size > len(r_genes)
        control_genes.update(set(r_genes[:ctrl_size]))

    # To index, we need a list - indexing implies an order.
    control_genes = list(control_genes - gene_list)
    gene_list = list(gene_list)

    X_list = _adata[:, gene_list].X
    if issparse(X_list):
        X_list = X_list.toarray()
    X_control = _adata[:, control_genes].X
    if issparse(X_control):
        X_control = X_control.toarray()
    X_control = np.nanmean(X_control, axis=1)

    if len(gene_list) == 0:
        # We shouldn't even get here, but just in case
        logg.hint(f'could not add \n'
                  f'    {score_name!r}, score of gene set (adata.obs)')
        return adata if copy else None
    elif len(gene_list) == 1:
        score = _adata[:, gene_list].X - X_control
    else:
        score = np.nanmean(X_list, axis=1) - X_control

    adata.obs[score_name] = pd.Series(np.array(score).ravel(),
                                      index=adata.obs_names)

    logg.info(
        '    finished',
        time=start,
        deep=('added\n'
              f'    {score_name!r}, score of gene set (adata.obs)'),
    )
    return adata if copy else None
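To make the binned-control sampling concrete, here is a standalone sketch on made-up average expression values: genes are ranked, divided into expression bins, and control genes are drawn from the same bins as the query genes. All names and numbers are illustrative.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
obs_avg = pd.Series(rng.gamma(2.0, 1.0, size=100),
                    index=[f'gene{i}' for i in range(100)])

n_bins, ctrl_size = 25, 5
n_items = int(np.round(len(obs_avg) / (n_bins - 1)))
obs_cut = obs_avg.rank(method='min') // n_items  # expression bin per gene

gene_list = {'gene1', 'gene2', 'gene3'}
control_genes = set()
for cut in np.unique(obs_cut.loc[list(gene_list)]):
    pool = np.array(obs_cut[obs_cut == cut].index)
    rng.shuffle(pool)
    control_genes.update(pool[:ctrl_size])
control_genes -= gene_list  # never use query genes as their own controls
print(len(control_genes))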
Example n. 19
def score_genes_cell_cycle(
    adata: AnnData,
    s_genes: Sequence[str],
    g2m_genes: Sequence[str],
    copy: bool = False,
    **kwargs,
) -> Optional[AnnData]:
    """\
    Score cell cycle genes [Satija15]_.

    Given two lists of genes associated with S phase and G2M phase, calculates
    scores and assigns a cell cycle phase (G1, S or G2M). See
    :func:`~scanpy.tl.score_genes` for more explanation.

    Parameters
    ----------
    adata
        The annotated data matrix.
    s_genes
        List of genes associated with S phase.
    g2m_genes
        List of genes associated with G2M phase.
    copy
        Copy `adata` or modify it inplace.
    **kwargs
        Are passed to :func:`~scanpy.tl.score_genes`. `ctrl_size` is not
        possible, as it's set as `min(len(s_genes), len(g2m_genes))`.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **S_score** : `adata.obs`, dtype `object`
        The score for S phase for each cell.
    **G2M_score** : `adata.obs`, dtype `object`
        The score for G2M phase for each cell.
    **phase** : `adata.obs`, dtype `object`
        The cell cycle phase (`S`, `G2M` or `G1`) for each cell.

    See also
    --------
    score_genes

    Examples
    --------
    See this `notebook <https://github.com/scverse/scanpy_usage/tree/master/180209_cell_cycle>`__.
    """
    logg.info('calculating cell cycle phase')

    adata = adata.copy() if copy else adata
    ctrl_size = min(len(s_genes), len(g2m_genes))
    # add s-score
    score_genes(
        adata, gene_list=s_genes, score_name='S_score', ctrl_size=ctrl_size, **kwargs
    )
    # add g2m-score
    score_genes(
        adata,
        gene_list=g2m_genes,
        score_name='G2M_score',
        ctrl_size=ctrl_size,
        **kwargs,
    )
    scores = adata.obs[['S_score', 'G2M_score']]

    # default phase is S
    phase = pd.Series('S', index=scores.index)

    # if G2M is higher than S, it's G2M
    phase[scores.G2M_score > scores.S_score] = 'G2M'

    # if all scores are negative, it's G1...
    phase[np.all(scores < 0, axis=1)] = 'G1'

    adata.obs['phase'] = phase
    logg.hint('    \'phase\', cell cycle phase (adata.obs)')
    return adata if copy else None
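The final phase assignment reduces to three vectorized rules; a standalone sketch with made-up scores:

import numpy as np
import pandas as pd

scores = pd.DataFrame({'S_score':   [0.5, -0.2, 0.1, -0.3, 0.0],
                       'G2M_score': [0.1, -0.1, 0.4, -0.5, 0.2]})
phase = pd.Series('S', index=scores.index)        # default phase is S
phase[scores.G2M_score > scores.S_score] = 'G2M'  # G2M wins if its score is higher
phase[np.all(scores < 0, axis=1)] = 'G1'          # all scores negative: G1
print(phase.tolist())  # ['S', 'G1', 'G2M', 'G1', 'G2M']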
Example n. 20
def rank_genes_groups(
    adata: AnnData,
    groupby: str,
    use_raw: Optional[bool] = None,
    groups: Union[Literal['all'], Iterable[str]] = 'all',
    reference: str = 'rest',
    n_genes: Optional[int] = None,
    rankby_abs: bool = False,
    pts: bool = False,
    key_added: Optional[str] = None,
    copy: bool = False,
    method: _Method = None,
    corr_method: _CorrMethod = 'benjamini-hochberg',
    tie_correct: bool = False,
    layer: Optional[str] = None,
    **kwds,
) -> Optional[AnnData]:
    """\
    Rank genes for characterizing groups.

    Expects logarithmized data.

    Parameters
    ----------
    adata
        Annotated data matrix.
    groupby
        The key of the observations grouping to consider.
    use_raw
        Use `raw` attribute of `adata` if present.
    layer
        Key from `adata.layers` whose value will be used to perform tests on.
    groups
        Subset of groups, e.g. [`'g1'`, `'g2'`, `'g3'`], to which comparison
        shall be restricted, or `'all'` (default), for all groups.
    reference
        If `'rest'`, compare each group to the union of the rest of the groups.
        If a group identifier, compare with respect to this group.
    n_genes
        The number of genes that appear in the returned tables.
        Defaults to all genes.
    method
        The default method is `'t-test'`,
        `'t-test_overestim_var'` overestimates variance of each group,
        `'wilcoxon'` uses Wilcoxon rank-sum,
        `'logreg'` uses logistic regression. See [Ntranos18]_,
        `here <https://github.com/scverse/scanpy/issues/95>`__ and `here
        <http://www.nxn.se/valent/2018/3/5/actionable-scrna-seq-clusters>`__,
        for why this is meaningful.
    corr_method
        p-value correction method.
        Used only for `'t-test'`, `'t-test_overestim_var'`, and `'wilcoxon'`.
    tie_correct
        Use tie correction for `'wilcoxon'` scores.
        Used only for `'wilcoxon'`.
    rankby_abs
        Rank genes by the absolute value of the score, not by the
        score. The returned scores are never the absolute values.
    pts
        Compute the fraction of cells expressing the genes.
    key_added
        The key in `adata.uns` information is saved to.
    **kwds
        Are passed to test methods. Currently this affects only parameters that
        are passed to :class:`sklearn.linear_model.LogisticRegression`.
        For instance, you can pass `penalty='l1'` to try to come up with a
        minimal set of genes that are good predictors (sparse solution meaning
        few non-zero fitted coefficients).

    Returns
    -------
    **names** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the gene
        names. Ordered according to scores.
    **scores** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the z-score
        underlying the computation of a p-value for each gene for each
        group. Ordered according to scores.
    **logfoldchanges** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the log2
        fold change for each gene for each group. Ordered according to
        scores. Only provided if method is 't-test' like.
        Note: this is an approximation calculated from mean-log values.
    **pvals** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        p-values.
    **pvals_adj** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Corrected p-values.
    **pts** : `pandas.DataFrame` (`.uns['rank_genes_groups']`)
        Fraction of cells expressing the genes for each group.
    **pts_rest** : `pandas.DataFrame` (`.uns['rank_genes_groups']`)
        Only if `reference` is set to `'rest'`.
        Fraction of cells from the union of the rest of each group
        expressing the genes.

    Notes
    -----
    There are slight inconsistencies depending on whether sparse
    or dense data are passed. See `here <https://github.com/scverse/scanpy/blob/master/scanpy/tests/test_rank_genes_groups.py>`__.

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon')
    >>> # to visualize the results
    >>> sc.pl.rank_genes_groups(adata)
    """
    if use_raw is None:
        use_raw = adata.raw is not None
    elif use_raw is True and adata.raw is None:
        raise ValueError("Received `use_raw=True`, but `adata.raw` is empty.")

    if method is None:
        logg.warning(
            "Default of the method has been changed to 't-test' from 't-test_overestim_var'"
        )
        method = 't-test'

    if 'only_positive' in kwds:
        rankby_abs = not kwds.pop('only_positive')  # backwards compat

    start = logg.info('ranking genes')
    avail_methods = {'t-test', 't-test_overestim_var', 'wilcoxon', 'logreg'}
    if method not in avail_methods:
        raise ValueError(f'Method must be one of {avail_methods}.')

    avail_corr = {'benjamini-hochberg', 'bonferroni'}
    if corr_method not in avail_corr:
        raise ValueError(f'Correction method must be one of {avail_corr}.')

    adata = adata.copy() if copy else adata
    _utils.sanitize_anndata(adata)
    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if reference != 'rest' and reference not in adata.obs[groupby].cat.categories:
        cats = adata.obs[groupby].cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.')

    if key_added is None:
        key_added = 'rank_genes_groups'
    adata.uns[key_added] = {}
    adata.uns[key_added]['params'] = dict(
        groupby=groupby,
        reference=reference,
        method=method,
        use_raw=use_raw,
        layer=layer,
        corr_method=corr_method,
    )

    test_obj = _RankGenes(adata, groups_order, groupby, reference, use_raw,
                          layer, pts)

    if check_nonnegative_integers(test_obj.X) and method != 'logreg':
        logg.warning(
            "It seems you are using rank_genes_groups on raw count data. "
            "Please logarithmize your data before calling rank_genes_groups.")

    # for clarity, rename variable
    n_genes_user = n_genes
    # make sure indices are not OoB in case there are fewer genes than n_genes
    # defaults to all genes
    if n_genes_user is None or n_genes_user > test_obj.X.shape[1]:
        n_genes_user = test_obj.X.shape[1]

    logg.debug(f'consider {groupby!r} groups:')
    logg.debug(
        f'with sizes: {np.count_nonzero(test_obj.groups_masks, axis=1)}')

    test_obj.compute_statistics(method, corr_method, n_genes_user, rankby_abs,
                                tie_correct, **kwds)

    if test_obj.pts is not None:
        groups_names = [str(name) for name in test_obj.groups_order]
        adata.uns[key_added]['pts'] = pd.DataFrame(test_obj.pts.T,
                                                   index=test_obj.var_names,
                                                   columns=groups_names)
    if test_obj.pts_rest is not None:
        adata.uns[key_added]['pts_rest'] = pd.DataFrame(
            test_obj.pts_rest.T,
            index=test_obj.var_names,
            columns=groups_names)

    test_obj.stats.columns = test_obj.stats.columns.swaplevel()

    dtypes = {
        'names': 'O',
        'scores': 'float32',
        'logfoldchanges': 'float32',
        'pvals': 'float64',
        'pvals_adj': 'float64',
    }

    for col in test_obj.stats.columns.levels[0]:
        adata.uns[key_added][col] = test_obj.stats[col].to_records(
            index=False, column_dtypes=dtypes[col])

    logg.info(
        '    finished',
        time=start,
        deep=
        (f'added to `.uns[{key_added!r}]`\n'
         "    'names', sorted np.recarray to be indexed by group ids\n"
         "    'scores', sorted np.recarray to be indexed by group ids\n" +
         ("    'logfoldchanges', sorted np.recarray to be indexed by group ids\n"
          "    'pvals', sorted np.recarray to be indexed by group ids\n"
          "    'pvals_adj', sorted np.recarray to be indexed by group ids" if
          method in {'t-test', 't-test_overestim_var', 'wilcoxon'} else '')),
    )
    return adata if copy else None
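After running the function, the structured arrays in `.uns` can be collected into a tidy per-group DataFrame. A short sketch using only the fields documented above:

import pandas as pd
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon')

res = adata.uns['rank_genes_groups']
group = res['names'].dtype.names[0]  # first group id
df = pd.DataFrame({f: res[f][group]
                   for f in ('names', 'scores', 'pvals', 'pvals_adj', 'logfoldchanges')})
print(df.head())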
Example n. 21
def dpt(
    adata: AnnData,
    n_dcs: int = 10,
    n_branchings: int = 0,
    min_group_size: float = 0.01,
    allow_kendall_tau_shift: bool = True,
    neighbors_key: Optional[str] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Infer progression of cells through geodesic distance along the graph
    [Haghverdi16]_ [Wolf19]_.

    Reconstruct the progression of a biological process from snapshot
    data. `Diffusion Pseudotime` has been introduced by [Haghverdi16]_ and
    implemented within Scanpy [Wolf18]_. Here, we use a further developed
    version, which is able to deal with disconnected graphs [Wolf19]_ and can
    be run in a `hierarchical` mode by setting the parameter
    `n_branchings>1`. We recommend, however, using
    :func:`~scanpy.tl.dpt` only for computing pseudotime (`n_branchings=0`) and
    detecting branchings via :func:`~scanpy.tl.paga`. For pseudotime, you need
    to annotate your data with a root cell. For instance::

        adata.uns['iroot'] = np.flatnonzero(adata.obs['cell_types'] == 'Stem')[0]

    This requires running :func:`~scanpy.pp.neighbors` first. To reproduce the
    original implementation of DPT, use `method=='gauss'` there. The default
    `method=='umap'` only leads to minor quantitative differences, though.

    .. versionadded:: 1.1

    :func:`~scanpy.tl.dpt` also requires running
    :func:`~scanpy.tl.diffmap` first. Since :func:`~scanpy.tl.dpt` previously
    came with a default of ``n_dcs=10`` while :func:`~scanpy.tl.diffmap`
    defaults to ``n_comps=15``, you need to pass ``n_comps=10`` to
    :func:`~scanpy.tl.diffmap` in order to exactly reproduce previous
    :func:`~scanpy.tl.dpt` results.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_dcs
        The number of diffusion components to use.
    n_branchings
        Number of branchings to detect.
    min_group_size
        During recursive splitting of branches ('dpt groups') for `n_branchings`
        > 1, do not consider groups that contain less than `min_group_size` data
        points. If a float, `min_group_size` refers to a fraction of the total
        number of data points.
    allow_kendall_tau_shift
        If a very small branch is detected upon splitting, shift away from
        maximum correlation in Kendall tau criterion of [Haghverdi16]_ to
        stabilize the splitting.
    neighbors_key
        If not specified, dpt looks in `.uns['neighbors']` for neighbors
        settings and in `.obsp['connectivities']` and `.obsp['distances']` for
        connectivities and distances, respectively (the default storage places
        of :func:`~scanpy.pp.neighbors`). If specified, dpt looks in
        `.uns[neighbors_key]` for neighbors settings and in
        `.obsp[.uns[neighbors_key]['connectivities_key']]` and
        `.obsp[.uns[neighbors_key]['distances_key']]` for connectivities and
        distances, respectively.
    copy
        Copy instance before computation and return a copy.
        Otherwise, perform computation inplace and return `None`.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    If `n_branchings==0`, no field `dpt_groups` will be written.

    `dpt_pseudotime` : :class:`pandas.Series` (`adata.obs`, dtype `float`)
        Array of dim (number of samples) that stores the pseudotime of each
        cell, that is, the DPT distance with respect to the root cell.
    `dpt_groups` : :class:`pandas.Series` (`adata.obs`, dtype `category`)
        Array of dim (number of samples) that stores the subgroup id ('0',
        '1', ...) for each cell. The groups typically correspond to
        'progenitor cells', 'undecided cells' or 'branches' of a process.

    Notes
    -----
    The tool is similar to the R package `destiny` of [Angerer16]_.
    """
    # standard errors, warnings etc.
    adata = adata.copy() if copy else adata

    if neighbors_key is None:
        neighbors_key = 'neighbors'
    if neighbors_key not in adata.uns:
        raise ValueError(
            'You need to run `pp.neighbors` and `tl.diffmap` first.')
    if 'iroot' not in adata.uns and 'xroot' not in adata.var:
        logg.warning(
            'No root cell found. To compute pseudotime, pass the index or '
            'expression vector of a root cell, one of:\n'
            '    adata.uns[\'iroot\'] = root_cell_index\n'
            '    adata.var[\'xroot\'] = adata[root_cell_name, :].X')
    if 'X_diffmap' not in adata.obsm.keys():
        logg.warning(
            'Trying to run `tl.dpt` without prior call of `tl.diffmap`. '
            'Falling back to `tl.diffmap` with default parameters.')
        _diffmap(adata, neighbors_key=neighbors_key)
    # start with the actual computation
    dpt = DPT(adata,
              n_dcs=n_dcs,
              min_group_size=min_group_size,
              n_branchings=n_branchings,
              allow_kendall_tau_shift=allow_kendall_tau_shift,
              neighbors_key=neighbors_key)
    start = logg.info(f'computing Diffusion Pseudotime using n_dcs={n_dcs}')
    if n_branchings > 1:
        logg.info('    this uses a hierarchical implementation')
    if dpt.iroot is not None:
        dpt._set_pseudotime()  # pseudotimes are distances from root point
        # update iroot; it might have changed, e.g. when subsampling
        adata.uns['iroot'] = dpt.iroot
        adata.obs['dpt_pseudotime'] = dpt.pseudotime
    # detect branchings and partition the data into segments
    if n_branchings > 0:
        dpt.branchings_segments()
        adata.obs['dpt_groups'] = pd.Categorical(
            values=dpt.segs_names.astype('U'),
            categories=natsorted(np.array(dpt.segs_names_unique).astype('U')))
        # the "change points" separate segments in the ordering above
        adata.uns['dpt_changepoints'] = dpt.changepoints
        # the tip points of segments
        adata.uns['dpt_grouptips'] = dpt.segs_tips
        # the ordering according to segments and pseudotime
        ordering_id = np.zeros(adata.n_obs, dtype=int)
        for count, idx in enumerate(dpt.indices):
            ordering_id[idx] = count
        adata.obs['dpt_order'] = ordering_id
        adata.obs['dpt_order_indices'] = dpt.indices
    logg.info(
        '    finished',
        time=start,
        deep=('added\n' + ("    'dpt_pseudotime', the pseudotime (adata.obs)"
                           if dpt.iroot is not None else '') +
              ("\n    'dpt_groups', the branching subgroups of dpt (adata.obs)"
               "\n    'dpt_order', cell order (adata.obs)"
               if n_branchings > 0 else '')),
    )
    return adata if copy else None
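A minimal usage sketch; the root cell chosen here is arbitrary (the first cell of one of the bundled `louvain` clusters) and stands in for a biologically motivated choice:

import numpy as np
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.diffmap(adata, n_comps=10)  # n_comps=10 matches n_dcs=10 below

# arbitrary root for illustration only
adata.uns['iroot'] = np.flatnonzero(adata.obs['louvain'] == '0')[0]

sc.tl.dpt(adata, n_dcs=10)
print(adata.obs['dpt_pseudotime'].describe())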
Example n. 22
def filter_genes(
    data: AnnData,
    min_counts: Optional[int] = None,
    min_cells: Optional[int] = None,
    max_counts: Optional[int] = None,
    max_cells: Optional[int] = None,
    inplace: bool = True,
    copy: bool = False,
) -> Union[AnnData, None, Tuple[np.ndarray, np.ndarray]]:
    """\
    Filter genes based on number of cells or counts.

    Keep genes that have at least `min_counts` counts or are expressed in at
    least `min_cells` cells or have at most `max_counts` counts or are expressed
    in at most `max_cells` cells.

    Only provide one of the optional parameters `min_counts`, `min_cells`,
    `max_counts`, `max_cells` per call.

    Parameters
    ----------
    data
        An annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    min_counts
        Minimum number of counts required for a gene to pass filtering.
    min_cells
        Minimum number of cells expressed required for a gene to pass filtering.
    max_counts
        Maximum number of counts required for a gene to pass filtering.
    max_cells
        Maximum number of cells expressed required for a gene to pass filtering.
    inplace
        Perform computation inplace or return result.

    Returns
    -------
    Depending on `inplace`, returns the following arrays or directly subsets
    and annotates the data matrix

    gene_subset
        Boolean index mask that does filtering. `True` means that the
        gene is kept. `False` means the gene is removed.
    number_per_gene
        Depending on what was thresholded (`counts` or `cells`), the array stores
        `n_counts` or `n_cells` per gene.
    """
    if copy:
        logg.warning('`copy` is deprecated, use `inplace` instead.')
    n_given_options = sum(
        option is not None
        for option in [min_cells, min_counts, max_cells, max_counts])
    if n_given_options != 1:
        raise ValueError(
            'Only provide one of the optional parameters `min_counts`, '
            '`min_cells`, `max_counts`, `max_cells` per call.')

    if isinstance(data, AnnData):
        adata = data.copy() if copy else data
        gene_subset, number = materialize_as_ndarray(
            filter_genes(adata.X,
                         min_cells=min_cells,
                         min_counts=min_counts,
                         max_cells=max_cells,
                         max_counts=max_counts))
        if not inplace:
            return gene_subset, number
        if min_cells is None and max_cells is None:
            adata.var['n_counts'] = number
        else:
            adata.var['n_cells'] = number
        adata._inplace_subset_var(gene_subset)
        return adata if copy else None

    X = data  # proceed with processing the data matrix
    min_number = min_counts if min_cells is None else min_cells
    max_number = max_counts if max_cells is None else max_cells
    number_per_gene = np.sum(
        X if min_cells is None and max_cells is None else X > 0, axis=0)
    if issparse(X):
        number_per_gene = number_per_gene.A1
    if min_number is not None:
        gene_subset = number_per_gene >= min_number
    if max_number is not None:
        gene_subset = number_per_gene <= max_number

    s = np.sum(~gene_subset)
    if s > 0:
        msg = f'filtered out {s} genes that are detected '
        if min_cells is not None or min_counts is not None:
            msg += 'in less than '
            msg += f'{min_cells} cells' if min_counts is None else f'{min_counts} counts'
        if max_cells is not None or max_counts is not None:
            msg += 'in more than '
            msg += f'{max_cells} cells' if max_counts is None else f'{max_counts} counts'
        logg.info(msg)
    return gene_subset, number_per_gene
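A quick usage sketch on simulated counts, showing both the inplace mode and the mask-returning mode (remember: exactly one threshold per call):

import numpy as np
import scanpy as sc
from anndata import AnnData

adata = AnnData(np.random.poisson(0.5, size=(100, 50)).astype(np.float32))

# inplace: subsets adata and writes adata.var['n_cells']
sc.pp.filter_genes(adata, min_cells=3)

# not inplace: only returns the boolean mask and the per-gene counts
gene_mask, n_counts = sc.pp.filter_genes(adata, min_counts=10, inplace=False)
print(adata.n_vars, int(gene_mask.sum()))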
Example n. 23
def test_recipe_plotting():
    sc.settings.autoshow = False
    adata = AnnData(np.random.randint(0, 1000, (1000, 1000)))
    # These shouldn't throw an error
    sc.pp.recipe_seurat(adata.copy(), plot=True)
    sc.pp.recipe_zheng17(adata.copy(), plot=True)
Example n. 24
def filter_cells(
    data: AnnData,
    min_counts: Optional[int] = None,
    min_genes: Optional[int] = None,
    max_counts: Optional[int] = None,
    max_genes: Optional[int] = None,
    inplace: bool = True,
    copy: bool = False,
) -> Optional[Tuple[np.ndarray, np.ndarray]]:
    """\
    Filter cell outliers based on counts and numbers of genes expressed.

    For instance, only keep cells with at least `min_counts` counts or
    `min_genes` genes expressed. This is to filter measurement outliers,
    i.e. “unreliable” observations.

    Only provide one of the optional parameters `min_counts`, `min_genes`,
    `max_counts`, `max_genes` per call.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    min_counts
        Minimum number of counts required for a cell to pass filtering.
    min_genes
        Minimum number of genes expressed required for a cell to pass filtering.
    max_counts
        Maximum number of counts required for a cell to pass filtering.
    max_genes
        Maximum number of genes expressed required for a cell to pass filtering.
    inplace
        Perform computation inplace or return result.

    Returns
    -------
    Depending on `inplace`, returns the following arrays or directly subsets
    and annotates the data matrix:

    cells_subset
        Boolean index mask that does filtering. `True` means that the
        cell is kept. `False` means the cell is removed.
    number_per_cell
        Depending on what was thresholded (`counts` or `genes`),
        the array stores `n_counts` or `n_genes` per cell.

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = sc.datasets.krumsiek11()
    >>> adata.n_obs
    640
    >>> adata.var_names
    ['Gata2' 'Gata1' 'Fog1' 'EKLF' 'Fli1' 'SCL' 'Cebpa'
     'Pu.1' 'cJun' 'EgrNab' 'Gfi1']
    >>> # add some true zeros
    >>> adata.X[adata.X < 0.3] = 0
    >>> # simply compute the number of genes per cell
    >>> sc.pp.filter_cells(adata, min_genes=0)
    >>> adata.n_obs
    640
    >>> adata.obs['n_genes'].min()
    1
    >>> # filter manually
    >>> adata_copy = adata[adata.obs['n_genes'] >= 3]
    >>> adata_copy.n_obs
    554
    >>> adata_copy.obs['n_genes'].min()
    3
    >>> # actually do some filtering
    >>> sc.pp.filter_cells(adata, min_genes=3)
    >>> adata.n_obs
    554
    >>> adata.obs['n_genes'].min()
    3
    """
    if copy:
        logg.warning('`copy` is deprecated, use `inplace` instead.')
    n_given_options = sum(
        option is not None
        for option in [min_genes, min_counts, max_genes, max_counts])
    if n_given_options != 1:
        raise ValueError(
            'Only provide one of the optional parameters `min_counts`, '
            '`min_genes`, `max_counts`, `max_genes` per call.')
    if isinstance(data, AnnData):
        adata = data.copy() if copy else data
        cell_subset, number = materialize_as_ndarray(
            filter_cells(adata.X, min_counts, min_genes, max_counts,
                         max_genes))
        if not inplace:
            return cell_subset, number
        if min_genes is None and max_genes is None:
            adata.obs['n_counts'] = number
        else:
            adata.obs['n_genes'] = number
        adata._inplace_subset_obs(cell_subset)
        return adata if copy else None
    X = data  # proceed with processing the data matrix
    min_number = min_counts if min_genes is None else min_genes
    max_number = max_counts if max_genes is None else max_genes
    number_per_cell = np.sum(
        X if min_genes is None and max_genes is None else X > 0, axis=1)
    if issparse(X):
        number_per_cell = number_per_cell.A1
    if min_number is not None:
        cell_subset = number_per_cell >= min_number
    if max_number is not None:
        cell_subset = number_per_cell <= max_number

    s = np.sum(~cell_subset)
    if s > 0:
        msg = f'filtered out {s} cells that have '
        if min_genes is not None or min_counts is not None:
            msg += 'less than '
            msg += f'{min_genes} genes expressed' if min_counts is None else f'{min_counts} counts'
        if max_genes is not None or max_counts is not None:
            msg += 'more than '
            msg += f'{max_genes} genes expressed' if max_counts is None else f'{max_counts} counts'
        logg.info(msg)
    return cell_subset, number_per_cell
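Since exactly one threshold is allowed per call, range filters are expressed by chaining calls; a short sketch on simulated counts:

import numpy as np
import scanpy as sc
from anndata import AnnData

adata = AnnData(np.random.poisson(1.0, size=(500, 200)).astype(np.float32))

# each call checks exactly one threshold and annotates adata.obs
sc.pp.filter_cells(adata, min_genes=10)    # writes adata.obs['n_genes']
sc.pp.filter_cells(adata, max_counts=400)  # writes adata.obs['n_counts']
print(adata.n_obs, list(adata.obs.columns))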
Example n. 25
def neighbors(adata: AnnData,
              n_neighbors: int = 15,
              n_pcs: Optional[int] = None,
              use_rep: Optional[str] = None,
              knn: bool = True,
              random_state: Optional[Union[int, RandomState]] = 0,
              method: str = 'umap',
              metric: Union[str, Metric] = 'euclidean',
              metric_kwds: Mapping[str, Any] = {},
              copy: bool = False) -> Optional[AnnData]:
    """\
    Compute a neighborhood graph of observations [McInnes18]_.

    The neighbor search efficiency of this relies heavily on UMAP [McInnes18]_,
    which also provides a method for estimating connectivities of data points -
    the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
    connectivities are computed according to [Coifman05]_, in the adaptation of
    [Haghverdi16]_.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    {n_pcs}
    {use_rep}
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {'umap', 'gauss', `None`} (default: `'umap'`)
        Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_
        with adaptive width [Haghverdi16]_) for computing connectivities.
    metric
        A known metric’s name or a callable that returns a distance.
    metric_kwds
        Options for the metric.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Depending on `copy`, updates or returns `adata` with the following:

    **connectivities** : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    **distances** : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Instead of decaying weights, this stores distances for each pair of
        neighbors.
    """
    start = logg.info('computing neighbors')
    adata = adata.copy() if copy else adata
    if adata.isview:  # we shouldn't need this here...
        adata._init_as_actual(adata.copy())
    neighbors = Neighbors(adata)
    neighbors.compute_neighbors(
        n_neighbors=n_neighbors,
        knn=knn,
        n_pcs=n_pcs,
        use_rep=use_rep,
        method=method,
        metric=metric,
        metric_kwds=metric_kwds,
        random_state=random_state,
    )
    adata.uns['neighbors'] = {}
    adata.uns['neighbors']['params'] = {
        'n_neighbors': n_neighbors,
        'method': method
    }
    adata.uns['neighbors']['params']['metric'] = metric
    if metric_kwds:
        adata.uns['neighbors']['params']['metric_kwds'] = metric_kwds
    if use_rep is not None:
        adata.uns['neighbors']['params']['use_rep'] = use_rep
    if n_pcs is not None:
        adata.uns['neighbors']['params']['n_pcs'] = n_pcs
    adata.uns['neighbors']['distances'] = neighbors.distances
    adata.uns['neighbors']['connectivities'] = neighbors.connectivities
    if neighbors.rp_forest is not None:
        adata.uns['neighbors']['rp_forest'] = neighbors.rp_forest
    logg.info(
        '    finished',
        time=start,
        deep=('added to `.uns[\'neighbors\']`\n'
              '    \'distances\', distances for each pair of neighbors\n'
              '    \'connectivities\', weighted adjacency matrix'),
    )
    return adata if copy else None
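Typical usage; note that the version shown here writes the graph to `.uns['neighbors']`, whereas recent scanpy releases store the matrices in `.obsp['distances']` and `.obsp['connectivities']` instead:

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40, metric='cosine')

# the parameters are recorded alongside the graph
print(adata.uns['neighbors']['params'])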
Example n. 26
def modules(
    adata: AnnData,
    root_milestone,
    milestones,
    color: str = "milestones",
    show_traj: bool = False,
    layer: Optional[str] = None,
    smooth: bool = False,
    show: Optional[bool] = None,
    save: Union[str, bool, None] = None,
    **kwargs,
):
    """\
    Plot the mean expression of the early and late modules.

    Parameters
    ----------
    adata
        Annotated data matrix.
    root_milestone
        tip defining the progenitor branch.
    milestones
        tips defining the progeny branches.
    color
        color the cells with a variable from `adata.obs`.
    show_traj
        show the trajectory on the early module plot.
    layer
        layer to use to compute the mean of the modules.
    show
        show the plot.
    save
        save the plot.
    kwargs
        arguments passed to :func:`scFates.pl.trajectory` if `show_traj=True`,
        else to :func:`scanpy.pl.embedding`.

    Returns
    -------
    If `show==False`, a tuple of :class:`~matplotlib.axes.Axes`.

    """

    plt.rcParams["axes.grid"] = False
    graph = adata.uns["graph"]

    dct = graph["milestones"]

    leaves = list(map(lambda leave: dct[leave], milestones))
    root = dct[root_milestone]

    name = root_milestone + "->" + "<>".join(milestones)

    stats = adata.uns[name]["fork"]

    if "milestones_colors" not in adata.uns or len(
            adata.uns["milestones_colors"]) == 1:
        from . import palette_tools

        palette_tools._set_default_colors_for_categorical_obs(
            adata, "milestones")

    mlsc = adata.uns["milestones_colors"].copy()
    mls = adata.obs.milestones.cat.categories.tolist()
    dct = dict(zip(mls, mlsc))
    df = adata.obs.copy(deep=True)
    edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple,
                                                              axis=1).values
    img = igraph.Graph()
    img.add_vertices(
        np.unique(graph["pp_seg"][["from",
                                   "to"]].values.flatten().astype(str)))
    img.add_edges(edges)

    cells = np.unique(
        np.concatenate([
            getpath(img, root, adata.uns["graph"]["tips"], leaves[0], graph,
                    df).index,
            getpath(img, root, adata.uns["graph"]["tips"], leaves[1], graph,
                    df).index,
        ]))

    if layer is None:
        if sparse.issparse(adata.X):
            X = pd.DataFrame(
                np.array(adata[:, stats.index].X.A),
                index=adata.obs_names,
                columns=stats.index,
            )
        else:
            X = pd.DataFrame(
                np.array(adata[:, stats.index].X),
                index=adata.obs_names,
                columns=stats.index,
            )
    else:
        if sparse.issparse(adata.layers[layer]):
            X = pd.DataFrame(
                np.array(adata[:, stats.index].layers[layer].A),
                index=adata.obs_names,
                columns=stats.index,
            )
        else:
            X = pd.DataFrame(
                np.array(adata[:, stats.index].layers[layer]),
                index=adata.obs_names,
                columns=stats.index,
            )

    early_1 = (stats.branch.values == milestones[0]) & (stats.module.values
                                                        == "early")
    late_1 = (stats.branch.values == milestones[0]) & (stats.module.values
                                                       == "late")

    early_2 = (stats.branch.values == milestones[1]) & (stats.module.values
                                                        == "early")
    late_2 = (stats.branch.values == milestones[1]) & (stats.module.values
                                                       == "late")

    X_early = pd.DataFrame({
        "early_" + milestones[0]:
        X.loc[:, early_1].mean(axis=1),
        "early_" + milestones[1]:
        X.loc[:, early_2].mean(axis=1),
    })

    X_late = pd.DataFrame({
        "late_" + milestones[0]:
        X.loc[:, late_1].mean(axis=1),
        "late_" + milestones[1]:
        X.loc[:, late_2].mean(axis=1),
    })

    adata_c = adata.copy()
    adata_c.obsm["X_early"] = X_early.values
    adata_c.obsm["X_late"] = X_late.values

    if smooth:
        adata_c.obsm["X_early"] = adata_c.obsp["connectivities"].dot(
            adata_c.obsm["X_early"])
        adata_c.obsm["X_late"] = adata_c.obsp["connectivities"].dot(
            adata_c.obsm["X_late"])

    axs, _, _, _ = setup_axes(panels=[0, 1])

    if show_traj:
        plot_trajectory(
            adata_c,
            basis="early",
            root_milestone=root_milestone,
            milestones=milestones,
            color_cells=color,
            show=False,
            title="",
            legend_loc="none",
            ax=axs[0],
            **kwargs,
        )
    else:
        sc.pl.embedding(
            adata_c[cells],
            basis="early",
            color=color,
            legend_loc="none",
            title="",
            show=False,
            ax=axs[0],
            **kwargs,
        )

    sc.pl.embedding(
        adata_c[cells],
        basis="late",
        color=color,
        legend_loc="none",
        show=False,
        title="",
        ax=axs[1],
        **kwargs,
    )

    axs[0].set_xlabel("early " + milestones[0])
    axs[0].set_ylabel("early " + milestones[1])
    axs[1].set_xlabel("late " + milestones[0])
    axs[1].set_ylabel("late " + milestones[1])

    savefig_or_show("modules", show=show, save=save)