Example #1
def test_indices_dtypes():
    adata = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        dict(obs_names=['A', 'B']),
        dict(var_names=['a', 'b', 'c']))
    adata.obs_names = ['ö', 'a']
    assert adata.obs_names.tolist() == ['ö', 'a']
Example #2
def test_set_obs():
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]))

    adata.obs = pd.DataFrame({'a': [3, 4]})
    assert adata.obs_names.tolist() == [0, 1]

    from pytest import raises
    with raises(ValueError):
        adata.obs = pd.DataFrame({'a': [3, 4, 5]})
    with raises(ValueError):
        adata.obs = {'a': [1, 2]}
Example #3
def test_append_col():
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]))

    adata.obs['new'] = [1, 2]
    # this worked in the initial AnnData, but not with a dataframe
    # adata.obs[['new2', 'new3']] = [['A', 'B'], ['c', 'd']]

    from pytest import raises
    with raises(ValueError):
        adata.obs['new4'] = 'far too long'.split()
Example #4
def test_transpose():
    adata = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        dict(obs_names=['A', 'B']),
        dict(var_names=['a', 'b', 'c']))

    adata1 = adata.T

    # make sure to not modify the original!
    assert adata.obs_names.tolist() == ['A', 'B']
    assert adata.var_names.tolist() == ['a', 'b', 'c']

    assert adata1.obs_names.tolist() == ['a', 'b', 'c']
    assert adata1.var_names.tolist() == ['A', 'B']
    assert adata1.X.shape == adata.X.T.shape

    adata2 = adata.transpose()
    assert np.array_equal(adata1.X, adata2.X)
    assert np.array_equal(adata1.obs, adata2.obs)
    assert np.array_equal(adata1.var, adata2.var)
Example #5
def test_rename_categories():
    X = np.ones((6, 3))
    obs = pd.DataFrame(
        {'cat_anno': pd.Categorical(['a', 'a', 'a', 'a', 'b', 'a'])})
    adata = AnnData(X=X, obs=obs)
    adata.uns['tool'] = {}
    adata.uns['tool']['cat_array'] = np.rec.fromarrays(
        [np.ones(2) for cat in adata.obs['cat_anno'].cat.categories],
        dtype=[(cat, 'float32') for cat in adata.obs['cat_anno'].cat.categories])
    adata.uns['tool']['params'] = {'groupby': 'cat_anno'}

    new_categories = ['c', 'd']
    adata.rename_categories('cat_anno', new_categories)

    assert list(adata.obs['cat_anno'].cat.categories) == new_categories
    assert list(adata.uns['tool']['cat_array'].dtype.names) == new_categories
Example #6
def annotate_cells(
    adata: AnnData,
    dataset: str,
    cluster: Optional[str] = "louvain",
    n_top_genes: Optional[int] = 1000,
    max_cell_types: Optional[int] = 50,
    min_annotated: Optional[int] = 50,
    select: Optional[bool] = True,
) -> None:
    """
    Assign a cell type to each cell based on H3K27ac reference profiles.
    """
    # Determine relevant reference cell types.
    # All other cell types will not be used for motif activity and
    # cell type annotation.
    data = ScepiaDataset(dataset)
    gene_df = data.load_reference_data(reftype="gene")

    if select:
        cell_types = relevant_cell_types(
            adata,
            gene_df,
            cluster=cluster,
            n_top_genes=n_top_genes,
            max_cell_types=max_cell_types,
        )
    else:
        logger.info("Selecting all reference cell types.")
        cell_types = gene_df.columns

    if "scepia" not in adata.uns:
        adata.uns["scepia"] = {"version": __version__}

    adata.uns["scepia"]["cell_types"] = list(cell_types)

    logger.info("Annotating cells.")
    annotation_result, df_coef = annotate_with_k27(
        adata,
        gene_df[cell_types],
        cluster=cluster,
        use_neighbors=True,
        model="BayesianRidge",
        subsample=False,
        use_raw=False,
    )
    adata.obsm["X_cell_types"] = df_coef.T[adata.uns["scepia"]
                                           ["cell_types"]].values

    # Annotate by highest mean coefficient
    coefs = pd.DataFrame(adata.obsm["X_cell_types"],
                         index=adata.obs_names,
                         columns=cell_types)
    coefs["cluster"] = adata.obs[cluster]
    cluster_anno = (coefs.groupby("cluster").mean().idxmax(
        axis=1).to_frame("cluster_annotation"))

    if "cluster_annotation" in adata.obs:
        adata.obs = adata.obs.drop(columns=["cluster_annotation"])

    adata.obs = adata.obs.join(cluster_anno, on=cluster)

    # Second round of annotation, including "other"
    assign_cell_types(adata, min_annotated=min_annotated)
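A minimal usage sketch for the function above, assuming a normalized, clustered AnnData object; the input file and the reference dataset name are placeholders, and the import path is an assumption:

import scanpy as sc
from scepia.sc import annotate_cells  # assumed import path

adata = sc.read_h5ad("pbmc.h5ad")  # hypothetical input, already normalized, with neighbors + louvain computed
annotate_cells(adata, dataset="ENCODE.H3K27ac.human", cluster="louvain")

# Fields written by annotate_cells():
print(adata.uns["scepia"]["cell_types"])               # selected reference cell types
print(adata.obs["cluster_annotation"].value_counts())  # per-cluster annotation
print(adata.obsm["X_cell_types"].shape)                # per-cell cell type coefficients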
Example #7
    def _generate_cost_matrices(
        self,
        adata: AnnData,
        cost_matrices: Optional[
            Union[str, Mapping[Tuple[float, float], np.ndarray]]
        ] = None,
    ) -> Tuple[Mapping[Tuple[float, float], Optional[np.ndarray]], str]:
        timepoints = self.experimental_time.cat.categories
        timepoints = list(zip(timepoints[:-1], timepoints[1:]))

        if cost_matrices is None:
            logg.info("Using default cost matrices")
            return {tpair: None for tpair in timepoints}, "default"

        if isinstance(cost_matrices, dict):
            logg.info("Using precomputed cost matrices")

            cmats = {}
            for tpair in timepoints:
                if tpair not in cost_matrices:
                    logg.warning(
                        f"Unable to find cost matrix for pair `{tpair}`. Using default"
                    )
                cmats[tpair] = cmat = cost_matrices.get(tpair, None)
                if cmat is not None:
                    n_start = len(np.where(self.experimental_time == tpair[0])[0])
                    n_end = len(np.where(self.experimental_time == tpair[1])[0])
                    try:
                        if cmat.shape != (n_start, n_end):
                            raise ValueError(
                                f"Expected cost matrix for time pair `{tpair}` to be "
                                f"of shape `{(n_start, n_end)}`, found `{cmat.shape}`."
                            )
                    except AttributeError:
                        logg.warning(
                            f"Unable to verify whether supplied cost matrix for time pair `{tpair}` "
                            f"has the correct shape `{(n_start, n_end)}`"
                        )

            # prevent equality comparison when comparing with cache
            return cmats, nstr("precomputed")

        if isinstance(cost_matrices, str):
            logg.info(f"Computing cost matrices using `{cost_matrices!r}` key")
            if cost_matrices == "X":
                cost_matrices = None

            try:
                features = adata._get_X(layer=cost_matrices)
                modifier = "layer"
            except KeyError:
                try:
                    features = adata.obsm[cost_matrices]
                    modifier = "obsm"
                except KeyError:
                    raise KeyError(
                        f"Unable to find key `{cost_matrices!r}` in `adata.layers` or `adata.obsm`."
                    ) from None

            cmats = {}
            for tpair in tqdm(timepoints, unit="cost matrix"):
                start_ixs = np.where(self.experimental_time == tpair[0])[0]
                end_ixs = np.where(self.experimental_time == tpair[1])[0]

                # being sparse is handled in WOT's function below
                cmats[tpair] = wot.ot.OTModel.compute_default_cost_matrix(
                    features[start_ixs], features[end_ixs]
                )

            return cmats, f"{modifier}:{cost_matrices}"

        raise NotImplementedError(
            f"Specifying cost matrices as "
            f"`{type(cost_matrices).__name__}` is not yet implemented."
        )
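For illustration, a standalone sketch (independent of the class above) of how a precomputed `cost_matrices` mapping has to be shaped to pass the validation in `_generate_cost_matrices`: one matrix per consecutive pair of time points, with shape (cells at the earlier time, cells at the later time).

import numpy as np
import pandas as pd

# Hypothetical experimental time annotation for six cells and three time points.
exp_time = pd.Series([0, 0, 1, 1, 1, 2]).astype("category")
timepoints = exp_time.cat.categories
pairs = list(zip(timepoints[:-1], timepoints[1:]))  # [(0, 1), (1, 2)]

# One cost matrix per consecutive time pair, shaped (n cells at t0, n cells at t1),
# matching the shape check performed above.
cost_matrices = {
    (t0, t1): np.random.rand((exp_time == t0).sum(), (exp_time == t1).sum())
    for t0, t1 in pairs
}
for (t0, t1), cmat in cost_matrices.items():
    print(t0, t1, cmat.shape)  # (0, 1) -> (2, 3) and (1, 2) -> (3, 1)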
Example #8
def trimap(
    adata: AnnData,
    n_components: int = 2,
    n_inliers: int = 10,
    n_outliers: int = 5,
    n_random: int = 5,
    metric: Literal['angular', 'euclidean', 'hamming',
                    'manhattan'] = 'euclidean',
    weight_adj: float = 500.0,
    lr: float = 1000.0,
    n_iters: int = 400,
    verbose: Union[bool, int, None] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    TriMap: Large-scale Dimensionality Reduction Using Triplets [Amid19]_.

    TriMap is a dimensionality reduction method that uses triplet constraints
    to form a low-dimensional embedding of a set of points. The triplet
    constraints are of the form "point i is closer to point j than point k".
    The triplets are sampled from the high-dimensional representation of the
    points and a weighting scheme is used to reflect the importance of each
    triplet.

    TriMap provides a significantly better global view of the data than the
    other dimensionality reduction methods such as t-SNE, LargeVis, and UMAP.
    The global structure includes relative distances of the clusters, multiple
    scales in the data, and the existence of possible outliers. We define a
    global score to quantify the quality of an embedding in reflecting the
    global structure of the data.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_components
        Number of dimensions of the embedding.
    n_inliers
        Number of inlier points for triplet constraints.
    n_outliers
        Number of outlier points for triplet constraints.
    n_random
        Number of random triplet constraints per point.
    metric
        Distance measure: 'angular', 'euclidean', 'hamming', 'manhattan'.
    weight_adj
        Adjusting the weights using a non-linear transformation.
    lr
        Learning rate.
    n_iters
        Number of iterations.
    verbose
        If `True`, print the progress report.
        If `None`, `sc.settings.verbosity` is used.
    copy
        Return a copy instead of writing to `adata`.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_trimap** : :class:`~numpy.ndarray`, (:attr:`~anndata.AnnData.obsm`, shape=(n_samples, n_components), dtype `float`)
        TriMap coordinates of data.

    Example
    -------
    
    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> pbmc = sc.datasets.pbmc68k_reduced()
    >>> pbmc = sce.tl.trimap(pbmc, copy=True)
    >>> sce.pl.trimap(pbmc, color=['bulk_labels'], s=10)
    """

    try:
        from trimap import TRIMAP
    except ImportError:
        raise ImportError(
            '\nplease install trimap: \n\n\tsudo pip install trimap')
    adata = adata.copy() if copy else adata
    start = logg.info('computing TriMap')
    verbosity = settings.verbosity if verbose is None else verbose
    verbose = verbosity if isinstance(verbosity, bool) else verbosity > 0

    if 'X_pca' in adata.obsm:
        n_dim_pca = adata.obsm['X_pca'].shape[1]
        X = adata.obsm['X_pca'][:, :min(n_dim_pca, 100)]
    else:
        X = adata.X
        if scp.issparse(X):
            raise ValueError(
                'trimap currently does not support sparse matrices. Please '
                'use a dense matrix or apply pca first.')
        logg.warning('`X_pca` not found. Run `sc.pp.pca` first for speedup.')
    X_trimap = TRIMAP(
        n_dims=n_components,
        n_inliers=n_inliers,
        n_outliers=n_outliers,
        n_random=n_random,
        lr=lr,
        distance=metric,
        weight_adj=weight_adj,
        n_iters=n_iters,
        verbose=verbose,
    ).fit_transform(X)
    adata.obsm['X_trimap'] = X_trimap
    logg.info(
        '    finished',
        time=start,
        deep="added\n    'X_trimap', TriMap coordinates (adata.obsm)",
    )
    return adata if copy else None
Example #9
def test_slicing_remove_unused_categories():
    adata = AnnData(
        np.array([[1, 2], [3, 4], [5, 6], [7, 8]]),
        dict(k=['a', 'a', 'b', 'b']))
    adata._sanitize()
    assert adata[3:5].obs['k'].cat.categories.tolist() == ['b']
Example #10
def filter_cells(
    data: AnnData,
    min_counts: Optional[int] = None,
    min_genes:  Optional[int] = None,
    max_counts: Optional[int] = None,
    max_genes:  Optional[int] = None,
    inplace: bool = True,
    copy: bool = False,
):
    """Filter cell outliers based on counts and numbers of genes expressed.

    For instance, only keep cells with at least `min_counts` counts or
    `min_genes` genes expressed. This is to filter measurement outliers,
    i.e. “unreliable” observations.

    Only provide one of the optional parameters ``min_counts``, ``min_genes``,
    ``max_counts``, ``max_genes`` per call.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` × ``n_vars``.
        Rows correspond to cells and columns to genes.
    min_counts
        Minimum number of counts required for a cell to pass filtering.
    min_genes
        Minimum number of genes expressed required for a cell to pass filtering.
    max_counts
        Maximum number of counts required for a cell to pass filtering.
    max_genes
        Maximum number of genes expressed required for a cell to pass filtering.
    inplace
        Perform computation inplace or return result.

    Returns
    -------
    `tuple`, `None`
        Depending on `inplace`, returns the following arrays or directly subsets
        and annotates the data matrix

        cells_subset : :class:`~numpy.ndarray`
            Boolean index mask that does filtering. `True` means that the
            cell is kept. `False` means the cell is removed.
        number_per_cell : :class:`~numpy.ndarray`
            Depending on what was thresholded (`counts` or `genes`), the array stores
            `n_counts` or `n_genes` per cell.

    Examples
    --------
    >>> adata = sc.datasets.krumsiek11()
    >>> adata.n_obs
    640
    >>> adata.var_names
    ['Gata2' 'Gata1' 'Fog1' 'EKLF' 'Fli1' 'SCL' 'Cebpa'
     'Pu.1' 'cJun' 'EgrNab' 'Gfi1']
    >>> # add some true zeros
    >>> adata.X[adata.X < 0.3] = 0
    >>> # simply compute the number of genes per cell
    >>> sc.pp.filter_cells(adata, min_genes=0)
    >>> adata.n_obs
    640
    >>> adata.obs['n_genes'].min()
    1
    >>> # filter manually
    >>> adata_copy = adata[adata.obs['n_genes'] >= 3]
    >>> adata_copy.n_obs
    554
    >>> adata_copy.obs['n_genes'].min()
    3
    >>> # actually do some filtering
    >>> sc.pp.filter_cells(adata, min_genes=3)
    >>> adata.n_obs
    554
    >>> adata.obs['n_genes'].min()
    3
    """
    if copy:
        logg.warn('`copy` is deprecated, use `inplace` instead.')
    n_given_options = sum(
        option is not None for option in
        [min_genes, min_counts, max_genes, max_counts])
    if n_given_options != 1:
        raise ValueError(
            'Only provide one of the optional parameters `min_counts`,'
            '`min_genes`, `max_counts`, `max_genes` per call.')
    if isinstance(data, AnnData):
        adata = data.copy() if copy else data
        cell_subset, number = materialize_as_ndarray(filter_cells(adata.X, min_counts, min_genes, max_counts, max_genes))
        if not inplace:
            return cell_subset, number
        if min_genes is None and max_genes is None: adata.obs['n_counts'] = number
        else: adata.obs['n_genes'] = number
        adata._inplace_subset_obs(cell_subset)
        return adata if copy else None
    X = data  # proceed with processing the data matrix
    min_number = min_counts if min_genes is None else min_genes
    max_number = max_counts if max_genes is None else max_genes
    number_per_cell = np.sum(X if min_genes is None and max_genes is None
                             else X > 0, axis=1)
    if issparse(X): number_per_cell = number_per_cell.A1
    if min_number is not None:
        cell_subset = number_per_cell >= min_number
    if max_number is not None:
        cell_subset = number_per_cell <= max_number

    s = np.sum(~cell_subset)
    if s > 0:
        logg.info('filtered out {} cells that have'.format(s), end=' ')
        if min_genes is not None or min_counts is not None:
            logg.info('less than',
                   str(min_genes) + ' genes expressed'
                   if min_counts is None else str(min_counts) + ' counts', no_indent=True)
        if max_genes is not None or max_counts is not None:
            logg.info('more than ',
                   str(max_genes) + ' genes expressed'
                   if max_counts is None else str(max_counts) + ' counts', no_indent=True)
    return cell_subset, number_per_cell
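A short usage sketch based on the docstring above. Because only one threshold may be given per call, combining a lower and an upper bound means calling the function twice; `inplace=False` returns the mask instead of subsetting (a sketch, assuming scanpy is installed):

import scanpy as sc

adata = sc.datasets.krumsiek11()   # small example dataset, as in the docstring
adata.X[adata.X < 0.3] = 0         # add some true zeros

# Only one threshold per call, so chain calls to combine filters.
sc.pp.filter_cells(adata, min_genes=3)
sc.pp.filter_cells(adata, max_genes=10)

# inplace=False returns (mask, per-cell numbers) instead of subsetting adata.
mask, counts_per_cell = sc.pp.filter_cells(adata, min_counts=1, inplace=False)
print(mask.sum(), "cells would be kept")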
Example #11
def combat(adata: AnnData, key: str = 'batch', inplace: bool = True):
    """
    ComBat function for batch effect correction [Johnson07]_ [Leek12]_ [Pedersen12]_.

    Corrects for batch effects by fitting linear models, gains statistical power
    via an EB framework where information is borrowed across genes. This uses the
    implementation of `ComBat <https://github.com/brentp/combat.py>`__ [Pedersen12]_.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix
    key: `str`, optional (default: `"batch"`)
        Key to a categorical annotation from adata.obs that will be used for batch effect removal
    inplace: bool, optional (default: `True`)
        Whether to replace adata.X or to return the corrected data

    Returns
    -------
    Depending on the value of `inplace`, either modifies `adata.X` in place
        or returns the corrected data matrix.
    """

    # check the input
    if key not in adata.obs.keys():
        raise ValueError(
            'Could not find the key {!r} in adata.obs'.format(key))

    # only works on dense matrices so far
    if issparse(adata.X):
        X = adata.X.A.T
    else:
        X = adata.X.T
    data = pd.DataFrame(
        data=X,
        index=adata.var_names,
        columns=adata.obs_names,
    )

    # construct a pandas series of the batch annotation
    batch = pd.Series(adata.obs[key])
    model = pd.DataFrame({'batch': batch})
    batch_items = model.groupby("batch").groups.items()
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # standardize across genes using a pooled variance estimator
    sys.stderr.write("Standardizing Data across genes.\n")
    s_data, design, var_pooled, stand_mean = stand_data(model, data)

    # fitting the parameters on the standardized data
    sys.stderr.write("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    # first estimate of the additive batch effect
    gamma_hat = np.dot(
        np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T),
        s_data.T)
    delta_hat = []

    # first estimate for the multiplicative batch effect
    for i, batch_idxs in enumerate(batch_info):
        delta_hat.append(s_data[batch_idxs].var(axis=1))

    # empirically fix the prior hyperparameters
    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)
    # a_prior and b_prior are the priors on lambda and theta from Johnson and Li (2006)
    a_prior = list(map(aprior, delta_hat))
    b_prior = list(map(bprior, delta_hat))

    sys.stderr.write("Finding parametric adjustments\n")
    # gamma star and delta star will be our empirical bayes (EB) estimators
    # for the additive and multiplicative batch effect per batch and cell
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        # temp stores our estimates for the batch effect parameters.
        # temp[0] is the additive batch effect
        # temp[1] is the multiplicative batch effect
        gamma, delta = _it_sol(
            s_data[batch_idxs].values,
            gamma_hat[i],
            delta_hat[i].values,
            gamma_bar[i],
            t2[i],
            a_prior[i],
            b_prior[i],
        )

        gamma_star.append(gamma)
        delta_star.append(delta)

    sys.stdout.write("Adjusting data\n")
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)

    # we now apply the parametric adjustment to the standardized data from above
    # loop over all batches in the data
    for j, batch_idxs in enumerate(batch_info):

        # we basically subtract the additive batch effect, rescale by the ratio
        # of multiplicative batch effect to pooled variance and add the overall gene
        # wise mean
        dsq = np.sqrt(delta_star[j, :])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j])))
        numer = np.array(bayesdata[batch_idxs] -
                         np.dot(batch_design.loc[batch_idxs], gamma_star).T)
        bayesdata[batch_idxs] = numer / denom

    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones(
        (1, int(n_array)))) + stand_mean

    # put back into the adata object or return
    if inplace:
        adata.X = bayesdata.values.transpose()
    else:
        return bayesdata.values.transpose()
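A minimal usage sketch for the function above on toy data, assuming its helper functions (`stand_data`, `_it_sol`, `aprior`, `bprior`) are importable from the same module and that `adata.obs` carries a `'batch'` column:

import numpy as np
from anndata import AnnData

# Toy data: six cells from two batches, four genes.
adata = AnnData(np.random.rand(6, 4).astype(np.float32))
adata.obs['batch'] = ['a', 'a', 'a', 'b', 'b', 'b']

corrected = combat(adata, key='batch', inplace=False)
print(corrected.shape)        # (n_obs, n_vars), same orientation as adata.X

combat(adata, key='batch')    # inplace=True (default) overwrites adata.X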
Example #12
from itertools import product

import numpy as np
from numpy import ma
import pandas as pd
import pytest
from scipy import sparse as sp
from scipy.sparse import csr_matrix, isspmatrix_csr, issparse

from anndata import AnnData, Raw
from helpers import assert_equal, gen_adata

# some test objects that we use below
adata_dense = AnnData(np.array([[1, 2], [3, 4]]))
adata_sparse = AnnData(
    csr_matrix([[0, 2, 3], [0, 5, 6]]),
    dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
    dict(var_names=["a", "b", "c"]),
)


def test_creation():
    AnnData(np.array([[1, 2], [3, 4]]))
    AnnData(np.array([[1, 2], [3, 4]]), {}, {})
    AnnData(ma.array([[1, 2], [3, 4]]), uns=dict(mask=[0, 1, 1, 0]))
    AnnData(sp.eye(2))
    X = np.array([[1, 2, 3], [4, 5, 6]])
    adata = AnnData(
        X=X,
        obs=dict(Obs=["A", "B"]),
        var=dict(Feat=["a", "b", "c"]),
    )
Example #13
def circular_projection(
    adata: AnnData,
    keys: Union[str, Sequence[str]],
    backward: bool = False,
    lineages: Optional[Union[str, Sequence[str]]] = None,
    early_cells: Optional[Union[Mapping[str, Sequence[str]],
                                Sequence[str]]] = None,
    lineage_order: Optional[Literal["default", "optimal"]] = None,
    metric: Union[str, Callable, np.ndarray, pd.DataFrame] = "correlation",
    normalize_by_mean: bool = True,
    ncols: int = 4,
    space: float = 0.25,
    use_raw: bool = False,
    text_kwargs: Mapping[str, Any] = MappingProxyType({}),
    labeldistance: float = 1.25,
    labelrot: Union[Literal["default", "best"], float] = "best",
    show_edges: bool = True,
    key_added: Optional[str] = None,
    figsize: Optional[Tuple[float, float]] = None,
    dpi: Optional[int] = None,
    save: Optional[Union[str, Path]] = None,
    **kwargs: Any,
):
    r"""
    Plot absorption probabilities on a circular embedding as in :cite:`velten:17`.

    Parameters
    ----------
    %(adata)s
    keys
        Keys in :attr:`anndata.AnnData.obs` or :attr:`anndata.AnnData.var_names`. Additional keys are:

            - `'kl_divergence'` - as in :cite:`velten:17`, computes KL-divergence between the fate probabilities
              of a cell and the average fate probabilities. See ``early_cells`` for more information.
            - `'entropy'` - as in :cite:`setty:19`, computes entropy over a cell's fate probabilities.

    %(backward)s
    lineages
        Lineages to plot. If `None`, plot all lineages.
    early_cells
        Cell ids or a mask marking early cells used to define the average fate probabilities. If `None`, use all cells.
        Only used when `'kl_divergence'` is in ``keys``. If a :class:`dict`, key specifies a cluster key in
        :attr:`anndata.AnnData.obs` and the values specify cluster labels containing early cells.
    lineage_order
        Can be one of the following:

            - `None` - it will be determined automatically, based on the number of lineages.
            - `'optimal'` - order lineages optimally by solving the Travelling salesman problem (TSP).
              Recommended for <= `20` lineages.
            - `'default'` - use the order as specified in ``lineages``.

    metric
        Metric to use when constructing pairwise distance matrix when ``lineage_order = 'optimal'``. For available
        options, see :func:`sklearn.metrics.pairwise_distances`.
    normalize_by_mean
        If `True`, normalize each lineage by its mean probability, as done in :cite:`velten:17`.
    ncols
        Number of columns when plotting multiple ``keys``.
    space
        Horizontal and vertical space between panels, passed to :func:`matplotlib.pyplot.subplots_adjust`.
    use_raw
        Whether to access :attr:`anndata.AnnData.raw` when there are ``keys`` in :attr:`anndata.AnnData.var_names`.
    text_kwargs
        Keyword arguments for :func:`matplotlib.pyplot.text`.
    labeldistance
        Distance at which the lineage labels will be drawn.
    labelrot
        How to rotate the labels. Valid options are:

            - `'best'` - rotate labels so that they are easily readable.
            - `'default'` - use :mod:`matplotlib`'s default.
            - `None` - same as `'default'`.

        If a :class:`float`, all labels will be rotated by this many degrees.
    show_edges
        Whether to show the edges surrounding the simplex.
    key_added
        Key in :attr:`anndata.AnnData.obsm` where to add the circular embedding. If `None`, it will be set to
        `'X_fate_simplex_{fwd,bwd}'`, based on ``backward``.
    %(plotting)s
    kwargs
        Keyword arguments for :func:`scvelo.pl.scatter`.

    Returns
    -------
    %(just_plots)s
        Also updates ``adata`` with the following fields:

            - :attr:`anndata.AnnData.obsm` ``['{key_added}']`` - the circular projection.
            - :attr:`anndata.AnnData.obs` ``['to_{initial,terminal}_states_{method}']`` - the priming degree,
              if a method is present in ``keys``.
    """
    if labeldistance is not None and labeldistance < 0:
        raise ValueError(
            f"Expected `delta` to be positive, found `{labeldistance}`.")

    if labelrot is None:
        labelrot = LabelRot.DEFAULT
    if isinstance(labelrot, str):
        labelrot = LabelRot(labelrot)

    suffix = "bwd" if backward else "fwd"
    if key_added is None:
        key_added = "X_fate_simplex_" + suffix

    if isinstance(keys, str):
        keys = (keys, )

    keys = _unique_order_preserving(keys)
    keys_ = _check_collection(
        adata, keys, "obs", key_name="Observation",
        raise_exc=False) + _check_collection(adata,
                                             keys,
                                             "var_names",
                                             key_name="Gene",
                                             raise_exc=False,
                                             use_raw=use_raw)
    haystack = {s.s for s in PrimingDegree}
    keys = keys_ + [k for k in keys if k in haystack]
    keys = _unique_order_preserving(keys)

    if not len(keys):
        raise ValueError("No valid keys have been selected.")

    lineage_key = str(AbsProbKey.BACKWARD if backward else AbsProbKey.FORWARD)
    if lineage_key not in adata.obsm:
        raise KeyError(
            f"Lineages key `{lineage_key!r}` not found in `adata.obsm`.")

    probs = adata.obsm[lineage_key]

    if isinstance(lineages, str):
        lineages = (lineages, )
    elif lineages is None:
        lineages = probs.names

    probs: Lineage = adata.obsm[lineage_key][lineages]
    n_lin = probs.shape[1]
    if n_lin < 3:
        raise ValueError(f"Expected at least `3` lineages, found `{n_lin}`.")

    X = probs.X.copy()
    if normalize_by_mean:
        X /= np.mean(X, axis=0)[None, :]
        X /= X.sum(1)[:, None]
        # this happens when cells for sel. lineages sum to 1 (or when the lineage average is 0, which is unlikely)
        X = np.nan_to_num(X, nan=1.0 / n_lin, copy=False)

    if lineage_order is None:
        lineage_order = (LineageOrder.OPTIMAL
                         if 3 < n_lin <= 20 else LineageOrder.DEFAULT)
        logg.debug(f"Set ordering to `{lineage_order}`")
    lineage_order = LineageOrder(lineage_order)

    if lineage_order == LineageOrder.OPTIMAL:
        logg.info(f"Solving TSP for `{n_lin}` states")
        _, order = _get_optimal_order(X, metric=metric)
    else:
        order = np.arange(n_lin)

    probs = probs[:, order]
    X = X[:, order]

    angle_vec = np.linspace(0, 2 * np.pi, n_lin, endpoint=False)
    angle_vec_sin = np.cos(angle_vec)
    angle_vec_cos = np.sin(angle_vec)

    x = np.sum(X * angle_vec_sin, axis=1)
    y = np.sum(X * angle_vec_cos, axis=1)
    adata.obsm[key_added] = np.c_[x, y]

    nrows = int(np.ceil(len(keys) / ncols))
    fig, ax = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(ncols * 5, nrows * 5) if figsize is None else figsize,
        dpi=dpi,
    )

    fig.subplots_adjust(wspace=space, hspace=space)
    axes = np.ravel([ax])

    text_kwargs = dict(text_kwargs)
    text_kwargs["ha"] = "center"
    text_kwargs["va"] = "center"

    _i = 0
    for _i, (k, ax) in enumerate(zip(keys, axes)):

        set_lognorm, colorbar = False, kwargs.pop("colorbar", True)
        try:
            _ = PrimingDegree(k)
            logg.debug(f"Calculating priming degree using `method={k}`")
            val = probs.priming_degree(method=k, early_cells=early_cells)
            k = f"{lineage_key}_{k}"
            adata.obs[k] = val
        except ValueError:
            pass

        scv.pl.scatter(
            adata,
            basis=key_added,
            color=k,
            show=False,
            ax=ax,
            use_raw=use_raw,
            norm=LogNorm() if set_lognorm else None,
            colorbar=colorbar,
            **kwargs,
        )
        if colorbar and set_lognorm:
            cbar = ax.collections[0].colorbar
            cax = cbar.locator.axis
            ticks = cax.minor.locator.tick_values(cbar.vmin, cbar.vmax)
            ticks = [ticks[0], ticks[len(ticks) // 2 + 1], ticks[-1]]
            cbar.set_ticks(ticks)
            cbar.set_ticklabels([f"{t:.2f}" for t in ticks])
            cbar.update_ticks()

        patches, texts = ax.pie(
            np.ones_like(angle_vec),
            labeldistance=labeldistance,
            rotatelabels=True,
            labels=probs.names[::-1],
            startangle=-360 / len(angle_vec) / 2,
            counterclock=False,
            textprops=text_kwargs,
        )

        for patch in patches:
            patch.set_visible(False)

        # clockwise
        for color, text in zip(probs.colors[::-1], texts):
            if isinstance(labelrot, (int, float)):
                text.set_rotation(labelrot)
            elif labelrot == LabelRot.BEST:
                rot = text.get_rotation()
                text.set_rotation(rot + 90 + (1 - rot // 180) * 180)
            elif labelrot != LabelRot.DEFAULT:
                raise NotImplementedError(
                    f"Label rotation `{labelrot}` is not yet implemented.")
            text.set_color(color)

        if not show_edges:
            continue

        for i, color in enumerate(probs.colors):
            next = (i + 1) % n_lin
            x = 1.04 * np.linspace(angle_vec_sin[i], angle_vec_sin[next], _N)
            y = 1.04 * np.linspace(angle_vec_cos[i], angle_vec_cos[next], _N)
            points = np.array([x, y]).T.reshape(-1, 1, 2)
            segments = np.concatenate([points[:-1], points[1:]], axis=1)

            cmap = LinearSegmentedColormap.from_list(
                "abs_prob_cmap", [color, probs.colors[next]], N=_N)
            lc = LineCollection(segments, cmap=cmap, zorder=-1)
            lc.set_array(np.linspace(0, 1, _N))
            lc.set_linewidth(2)
            ax.add_collection(lc)

    for j in range(_i + 1, len(axes)):
        axes[j].remove()

    if save is not None:
        save_fig(fig, save)
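A hedged usage sketch for the plot above, assuming the CellRank 1.x high-level API (`cr.tl.terminal_states`, `cr.tl.lineages`, `cr.pl.circular_projection`) and its bundled pancreas example data; names may differ between versions:

import cellrank as cr

adata = cr.datasets.pancreas_preprocessed()      # example dataset shipped with CellRank
cr.tl.terminal_states(adata, cluster_key="clusters")
cr.tl.lineages(adata)                            # fills adata.obsm with absorption probabilities
cr.pl.circular_projection(adata, keys="kl_divergence", lineage_order="optimal")
print(adata.obsm["X_fate_simplex_fwd"].shape)    # default key_added when backward=False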
Example #14
def SCALEX(data_list,
           batch_categories=None,
           profile='RNA',
           join='inner',
           batch_key='batch',
           batch_name='batch',
           min_features=600,
           min_cells=3,
           n_top_features=2000,
           batch_size=64,
           lr=2e-4,
           max_iteration=30000,
           seed=124,
           gpu=0,
           outdir='output/',
           projection=None,
           repeat=False,
           impute=None,
           chunk_size=20000,
           ignore_umap=False,
           verbose=False,
           assess=False,
           show=False,
           processed=False):
    """
    Single-Cell integrative Analysis via Latent feature Extraction
    
    Parameters
    ----------
    data_list
        A list of paths to AnnData matrices to concatenate. Each matrix is referred to as a 'batch'.
    batch_categories
        Categories for the batch annotation. By default, use increasing numbers.
    profile
        Specify the single-cell profile, RNA or ATAC. Default: RNA.
    join
        Use intersection ('inner') or union ('outer') of variables of different batches. 
    batch_key
        Add the batch annotation to obs using this key. By default, batch_key='batch'.
    batch_name
        Use this annotation in obs as batches for training model. Default: 'batch'.
    min_features
        Filter out cells with fewer than min_features features detected. Default: 600.
    min_cells
        Filter out genes detected in fewer than min_cells cells. Default: 3.
    n_top_features
        Number of highly-variable genes to keep. Default: 2000.
    batch_size
        Number of samples per batch to load. Default: 64.
    lr
        Learning rate. Default: 2e-4.
    max_iteration
        Max iterations for training. Training one batch_size samples is one iteration. Default: 30000.
    seed
        Random seed for torch and numpy. Default: 124.
    gpu
        Index of GPU to use if GPU is available. Default: 0.
    outdir
        Output directory. Default: 'output/'.
    projection
        Use for new dataset projection. Input the folder containing the pre-trained model. If None, don't do projection. Default: None. 
    repeat
        Use with projection. If False, concatenate the reference and projection datasets for downstream analysis. If True, only use projection datasets. Default: False.
    impute
        If True, calculate the imputed gene expression and store it at adata.layers['impute']. Default: False.
    chunk_size
        Number of samples from the same batch to transform. Default: 20000.
    ignore_umap
        If True, do not perform UMAP for visualization and leiden for clustering. Default: False.
    verbose
        Verbosity, True or False. Default: False.
    assess
        If True, calculate the entropy_batch_mixing score and silhouette score to evaluate integration results. Default: False.
    
    Returns
    -------
    The output folder contains:
    adata.h5ad
        The AnnData matrix after batch effect removal. The low-dimensional representation of the data is stored at adata.obsm['latent'].
    checkpoint
        model.pt contains the variables of the model and config.pt contains the parameters of the model.
    log.txt
        Records raw data information, filter conditions, model parameters etc.
    umap.pdf 
        UMAP plot for visualization.
    """

    np.random.seed(seed)  # seed
    torch.manual_seed(seed)

    if torch.cuda.is_available():  # cuda device
        device = 'cuda'
        torch.cuda.set_device(gpu)
    else:
        device = 'cpu'

    outdir = outdir + '/'
    os.makedirs(outdir + '/checkpoint', exist_ok=True)
    log = create_logger('', fh=outdir + 'log.txt')
    if not projection:
        adata, trainloader, testloader = load_data(
            data_list,
            batch_categories,
            join=join,
            profile=profile,
            n_top_features=n_top_features,
            batch_size=batch_size,
            chunk_size=chunk_size,
            min_features=min_features,
            min_cells=min_cells,
            batch_name=batch_name,
            batch_key=batch_key,
            log=log,
            processed=processed)

        early_stopping = EarlyStopping(patience=10,
                                       checkpoint_file=outdir +
                                       '/checkpoint/model.pt')
        x_dim, n_domain = adata.shape[1], len(
            adata.obs['batch'].cat.categories)

        # model config
        enc = [['fc', 1024, 1, 'relu'], ['fc', 10, '', '']]  # TO DO
        # enc = [['fc', 32, 1, 'relu'],['fc', 10, '', '']]
        dec = [['fc', x_dim, n_domain, 'sigmoid']]

        model = VAE(enc, dec, n_domain=n_domain)

        log.info('model\n' + model.__repr__())
        model.fit(
            trainloader,
            lr=lr,
            max_iteration=max_iteration,
            device=device,
            early_stopping=early_stopping,
            verbose=verbose,
        )
        torch.save(
            {
                'n_top_features': adata.var.index,
                'enc': enc,
                'dec': dec,
                'n_domain': n_domain
            }, outdir + '/checkpoint/config.pt')
    else:
        state = torch.load(projection + '/checkpoint/config.pt')
        n_top_features, enc, dec, n_domain = state['n_top_features'], state[
            'enc'], state['dec'], state['n_domain']
        model = VAE(enc, dec, n_domain=n_domain)
        model.load_model(projection + '/checkpoint/model.pt')
        model.to(device)

        adata, trainloader, testloader = load_data(
            data_list,
            batch_categories,
            join='outer',
            profile=profile,
            chunk_size=chunk_size,
            n_top_features=n_top_features,
            min_cells=0,
            min_features=min_features,
            batch_name=batch_name,
            batch_key=batch_key,
            log=log)


#         log.info('Processed dataset shape: {}'.format(adata.shape))

    adata.obsm['latent'] = model.encodeBatch(testloader,
                                             device=device)  # save latent rep
    if impute:
        adata.layers['impute'] = model.encodeBatch(testloader,
                                                   out='impute',
                                                   batch_id=impute,
                                                   device=device)
    log.info('Output dir: {}'.format(outdir))

    if projection and (not repeat):
        ref = sc.read_h5ad(projection + '/adata.h5ad')
        adata = AnnData.concatenate(ref,
                                    adata,
                                    batch_categories=['reference', 'query'],
                                    batch_key='projection',
                                    index_unique=None)
    adata.write(outdir + 'adata.h5ad', compression='gzip')
    if not ignore_umap:  #and adata.shape[0]<1e6:
        log.info('Plot umap')
        sc.pp.neighbors(adata, n_neighbors=30, use_rep='latent')
        sc.tl.umap(adata, min_dist=0.1)
        sc.tl.leiden(adata)

        # UMAP visualization
        sc.settings.figdir = outdir
        sc.set_figure_params(dpi=80, figsize=(10, 10), fontsize=20)
        cols = ['batch', 'celltype', 'leiden']
        color = [c for c in cols if c in adata.obs]
        if len(color) > 0:
            if projection and (not repeat):
                embedding(adata, groupby='projection', save='.pdf', show=show)
            else:
                sc.pl.umap(adata,
                           color=color,
                           save='.pdf',
                           wspace=0.4,
                           ncols=4,
                           show=show)

        if assess:
            if len(adata.obs['batch'].cat.categories) > 1:
                entropy_score = batch_entropy_mixing_score(
                    adata.obsm['X_umap'], adata.obs['batch'])
                log.info(
                    'batch_entropy_mixing_score: {:.3f}'.format(entropy_score))

            if 'celltype' in adata.obs:
                sil_score = silhouette_score(adata.obsm['X_umap'],
                                             adata.obs['celltype'].cat.codes)
                log.info("silhouette_score: {:.3f}".format(sil_score))

    adata.write(outdir + 'adata.h5ad', compression='gzip')

    return adata
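A hedged usage sketch for the function above; the input paths are placeholders for per-batch .h5ad files:

# Integrate two hypothetical batches; file paths are placeholders.
adata = SCALEX(
    ['batch1.h5ad', 'batch2.h5ad'],
    batch_categories=['b1', 'b2'],
    profile='RNA',
    min_features=600,
    min_cells=3,
    outdir='scalex_output/',
)

# Integration results produced by the run above:
print(adata.obsm['latent'].shape)            # latent representation
print(adata.obs['leiden'].value_counts())    # clustering (computed unless ignore_umap=True)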
Example #15
def test_normalize_total_layers(typ, dtype):
    adata = AnnData(typ(X_total), dtype=dtype)
    adata.layers["layer"] = adata.X.copy()
    sc.pp.normalize_total(adata, layers=["layer"])
    assert np.allclose(adata.layers["layer"].sum(axis=1), [3.0, 3.0, 3.0])
Example #16
def visualize_dictionary(ct,
                         X_dimred,
                         genes,
                         cell_types,
                         namespace,
                         dag_method,
                         verbose=True):
    from anndata import AnnData
    from scanorama import visualize
    import scanpy as sc
    import seaborn as sns

    # KNN and UMAP.

    if verbose:
        tprint('Constructing KNN graph...')
    adata = AnnData(X=X_dimred)
    sc.pp.neighbors(adata, use_rep='X')

    if verbose:
        tprint('Visualizing with UMAP...')
    sc.tl.umap(adata, min_dist=0.5)
    embedding = np.array(adata.obsm['X_umap'])
    embedding[embedding < -20] = -20
    embedding[embedding > 20] = 20

    # Visualize cell types.

    le = LabelEncoder().fit(cell_types)
    cell_types_int = le.transform(cell_types)
    visualize(None,
              cell_types_int,
              '{}_pan_umap_{}_type'.format(namespace, dag_method),
              np.array(sorted(set(cell_types))),
              embedding=embedding,
              image_suffix='.png')

    #max_intensity = ct.labels_.max()

    for c_idx in range(ct.labels_.shape[1]):
        intensity = ct.labels_[:, c_idx]
        intensity /= intensity.max()

        print('\nCluster {}'.format(c_idx))

        print_cell_types(cell_types, intensity)

        # Visualize cluster in UMAP coordinates.

        plt.figure()
        plt.title('Cluster {}'.format(c_idx))
        plt.scatter(embedding[:, 0],
                    embedding[:, 1],
                    c=intensity,
                    cmap=cm.get_cmap('Blues'),
                    s=1)
        plt.savefig('{}_pan_umap_{}_cluster{}.png'.format(
            namespace, dag_method, c_idx),
                    dpi=500)

        plt.figure()
        plt.title('Cluster {}'.format(c_idx))
        plt.hist(intensity.flatten(), bins=100)
        plt.savefig('{}_pan_umap_{}_intensehist{}.png'.format(
            namespace, dag_method, c_idx),
                    dpi=500)

        intensity = (intensity > 0.8) * 1

        plt.figure()
        plt.title('Cluster {}'.format(c_idx))
        plt.scatter(embedding[:, 0],
                    embedding[:, 1],
                    c=intensity,
                    cmap=cm.get_cmap('Blues'),
                    s=1)
        plt.savefig('{}_pan_umap_{}_member{}.png'.format(
            namespace, dag_method, c_idx),
                    dpi=500)

    for c_idx in range(ct.labels_.shape[1]):

        # Visualize covariance matrix.

        corr = ct.dictionary_[:, :, c_idx]
        corr[np.isnan(corr)] = 0

        #print('\nCluster {}'.format(c_idx))

        #print_gene_modules(corr, genes)

        gene_idx = np.sum(np.abs(corr), axis=1) > 0
        if np.sum(gene_idx) == 0:
            continue
        corr = corr[gene_idx]
        corr = corr[:, gene_idx]

        plt.figure()
        plt.title('Cluster {}'.format(c_idx))
        plt.rcParams.update({'font.size': 5})
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        corr_max = max(corr.max(), abs(corr.min()))
        sns.clustermap(corr,
                       xticklabels=genes[gene_idx],
                       yticklabels=genes[gene_idx],
                       cmap=cmap,
                       vmin=-corr_max,
                       vmax=corr_max)
        plt.xticks(rotation=90)
        plt.yticks(rotation=90)
        plt.savefig('{}_pan_cov_{}_cluster{}.png'.format(
            namespace, dag_method, c_idx),
                    dpi=500)
Example #17
def correlate_tf_motifs(
    adata: AnnData,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
    indirect: Optional[bool] = True,
) -> None:
    """Correlate inferred motif activity with TF expression.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    n_sketch : `int`, optional (default: 2500)
        If the number of cells is higher than `n_sketch`, use geometric
        sketching (Hie et al. 2019) to select a subset of `n_sketch`
        cells. This subset will be used to calculate the correlation between
        motif activity and transcription factor expression.
    n_permutations : `int`, optional (default: 100000)
        Number of permutations that is used to calculate the p-value. Can be
        decreased for quicker run-time, but should probably not be below 10000.
    indirect : `bool`, optional (default: True)
        Include indirect TF to motif assignments.
    """
    logger.info("correlating motif activity with factors")
    if indirect:
        logger.info("including indirect and/or predicted factors")
    # Get all TFs from motif database
    m2f = motif_mapping(indirect=True)
    batch_size = m2f.shape[0]
    f2m2 = pd.DataFrame(m2f["factors"].str.split(",").tolist(),
                        index=m2f.index).stack()
    f2m2 = f2m2.to_frame().reset_index().iloc[:, [0, 2]]
    f2m2.columns = ["motif", "factor"]
    unique_factors = f2m2["factor"].unique()

    if n_sketch is None or n_sketch > adata.shape[0]:
        logger.info(f"using all cells")
        my_adata = adata
    else:
        logger.info(f"creating sketch of {n_sketch} cells")
        idx = geosketch.gs(adata.obsm["X_pca"], n_sketch)
        my_adata = adata.copy()
        my_adata = my_adata[idx]

    detected = (my_adata.raw.var_names.str.upper().isin(unique_factors)) & (
        (my_adata.raw.X > 0).sum(0) > 3)
    detected = np.squeeze(np.asarray(detected))
    unique_factors = my_adata.raw.var_names[detected].str.upper()

    # Get the expression for all TFs
    expression = (np.squeeze(np.asarray(my_adata.raw.X.todense()))
                  if issparse(my_adata.raw.X) else my_adata.raw.X)
    expression = expression.T[detected]

    logger.info(
        f"calculating correlation of motif activity with {len(unique_factors)} factors"
    )
    real = fast_corr(
        expression,
        (my_adata.obsm["X_cell_types"]
         @ my_adata.uns["scepia"]["motif_activity"].T).T.values,
    )
    real = pd.DataFrame(
        real,
        index=unique_factors,
        columns=my_adata.uns["scepia"]["motif_activity"].index,
    )

    tmp = (real.reset_index().melt(
        id_vars="index", var_name="motif",
        value_name="correlation").rename(columns={
            "index": "factor"
        }).set_index(["motif", "factor"]))
    f2m2 = f2m2.set_index(["motif", "factor"]).join(tmp).dropna()
    f2m2["abs_correlation"] = f2m2["correlation"].abs()

    logger.info(f"calculating {n_permutations} permutations")
    permute_result = pd.DataFrame(index=unique_factors)
    shape = my_adata.uns["scepia"]["motif_activity"].shape
    for i in tqdm(range(0, n_permutations, batch_size)):
        random_activities = None
        while random_activities is None or random_activities.shape[
                0] < batch_size:
            x = my_adata.uns["scepia"]["motif_activity"].values.flatten()
            motif_activity = shuffle(x).reshape(shape[1], shape[0])
            cell_motif_activity = (
                my_adata.obsm["X_cell_types"] @ motif_activity).T
            if random_activities is None:
                random_activities = cell_motif_activity
            else:
                random_activities = np.vstack(
                    (random_activities, cell_motif_activity))

        random_activities = random_activities[:batch_size]
        batch_result = fast_corr(expression, random_activities)
        batch_result = pd.DataFrame(batch_result,
                                    index=unique_factors,
                                    columns=range(i, i + batch_size))
        permute_result = permute_result.join(batch_result)

    logger.info("calculating permutation-based p-values (all)")

    # Calculate p-value of correlation relative to all permuted correlations
    permuted_corrs = permute_result.values.flatten()
    pvals = [(100 - percentileofscore(permuted_corrs, corr)) / 100
             for corr in f2m2["correlation"]]
    f2m2["pval"] = pvals
    f2m2.loc[f2m2["correlation"] < 0,
             "pval"] = (1 - f2m2.loc[f2m2["correlation"] < 0, "pval"])
    logger.info("calculating permutation-based p-values (factor-specific)")

    # Calculate p-value of correlation relative to the permuted values of this factor
    for motif, factor in tqdm(f2m2.index):
        pval = (100 - percentileofscore(permute_result.loc[factor],
                                        real.loc[factor, motif])) / 100
        pval = 1 - pval if real.loc[factor, motif] < 0 else pval
        pval = 1 / permute_result.shape[1] if pval == 0 else pval
        f2m2.loc[(motif, factor), "permutation_pval"] = pval
        f2m2.loc[(motif, factor), "combined"] = combine_pvalues(
            f2m2.loc[(motif, factor), ["pval", "permutation_pval"]])[1]

    f2m2["p_adj"] = multipletests(f2m2["combined"], method="fdr_bh")[1]
    f2m2["-log10(p-value)"] = -np.log10(f2m2["p_adj"])

    cluster_cell_types = adata.obs["cluster_annotation"].unique()
    f2m2 = f2m2.join(
        (adata.uns["scepia"]["motif_activity"][cluster_cell_types].max(1) -
         adata.uns["scepia"]["motif_activity"][cluster_cell_types].min(1)
         ).to_frame("motif_stddev").rename_axis("motif"))

    f2m2 = f2m2.reset_index().set_index("factor")
    adata.uns["scepia"]["correlation"] = f2m2
Example #18
def infer_motifs(
    adata: AnnData,
    dataset: str,
    cluster: Optional[str] = "louvain",
    n_top_genes: Optional[int] = 1000,
    max_cell_types: Optional[int] = 50,
    pfm: Optional[str] = None,
    min_annotated: Optional[int] = 50,
    num_enhancers: Optional[int] = 10000,
    maelstrom: Optional[bool] = False,
    indirect: Optional[bool] = True,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
) -> None:
    """Infer motif ativity for single cell RNA-seq data.

    The adata object is modified with the following fields.

    **X_cell_types** : `adata.obsm` field
        Cell type coefficients.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    dataset : `str`
        Name of reference data set or directory with reference data.
    cluster : `str`, optional (default: "louvain")
        Name of the clustering, can be either louvain or leiden.
    n_top_genes : `int`, optional (default: 1000)
        Number of variable genes that is used. If `n_top_genes` is greater than the
        number of hypervariable genes in `adata` then all variable genes are
        used.
    max_cell_types : `int`, optional (default: 50)
        Maximum number of cell types to select.
    pfm : `str`, optional (default: None)
        Name of motif file in PFM format. The GimmeMotifs default is used
        if this parameter is not specified. This can be a filename, or a
        pfm name supported by GimmeMotifs such as `JASPAR2018_vertebrates`.
        If a custom PFM file is specified, there should also be an associated
        `.motif2factors.txt` file.
    min_annotated : `int`, optional (default: 50)
        Cells that are annotated with cell types less than this number will be
        annotated as "other".
    num_enhancers : `int`, optional (default: 10000)
        Number of enhancers to use for motif activity analysis.
    maelstrom : `boolean`, optional (default: False)
        Use maelstrom instead of ridge regression for motif activity analysis.
    """

    use_name = True

    validate_adata(adata)

    data = ScepiaDataset(dataset)

    if "scepia" not in adata.uns:
        adata.uns["scepia"] = {"version": __version__}

    # Annotate each cell with H3K27ac reference
    if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs:
        annotate_cells(
            adata,
            dataset=dataset,
            cluster=cluster,
            n_top_genes=n_top_genes,
            min_annotated=min_annotated,
            max_cell_types=max_cell_types,
        )

    logger.info("Linking variable genes to differential enhancers.")
    gene_map_file = data.gene_mapping

    link_file = data.link_file
    link = pd.read_feather(link_file)
    if use_name:
        ens2name = pd.read_csv(gene_map_file,
                               sep="\t",
                               index_col=0,
                               names=["identifier", "name"])
        link = link.join(ens2name, on="gene").dropna()
        link = link.set_index("name")

    link.index = link.index.str.upper()
    enh_genes = adata.var_names[adata.var_names.str.upper().isin(
        link.index)].str.upper()
    var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique()

    enhancer_df = data.load_reference_data(reftype="enhancer")
    enhancer_df.index = change_region_size(enhancer_df.index)
    enhancer_df = enhancer_df.loc[var_enhancers,
                                  adata.uns["scepia"]["cell_types"]]
    enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean()
    enhancer_df.loc[:, :] = scale(enhancer_df)

    main_cell_types = pd.concat((
        adata.obs["cluster_annotation"].astype(str),
        adata.obs["cell_annotation"].astype(str),
    ))
    main_cell_types = [x for x in main_cell_types.unique() if x != "other"]

    # Select top most variable enhancers of the most important annotated cell types
    enhancer_df = enhancer_df.loc[enhancer_df[main_cell_types].var(
        1).sort_values().tail(num_enhancers).index]
    # Center by mean of the most import cell types
    # Here we chose the majority cell type per cluster
    mean_value = enhancer_df[main_cell_types].mean(1)
    enhancer_df = enhancer_df.sub(mean_value, axis=0)
    fname = NamedTemporaryFile(delete=False).name
    enhancer_df.to_csv(fname, sep="\t")
    logger.info("inferring motif activity")

    pfm = pfmfile_location(pfm)
    if maelstrom:
        with TemporaryDirectory() as tmpdir:
            run_maelstrom(
                fname,
                data.genome,
                tmpdir,
                center=False,
                filter_redundant=True,
            )

            motif_act = pd.read_csv(
                os.path.join(tmpdir, "final.out.txt"),
                sep="\t",
                comment="#",
                index_col=0,
            )
            motif_act.columns = motif_act.columns.str.replace(
                r"z-score\s+", "")
            pfm = pfmfile_location(
                os.path.join(tmpdir, "nonredundant.motifs.pfm"))
    else:
        logger.info(f"Activity based on genome {data.genome}")
        motif_act = moap(
            fname,
            scoring="score",
            genome=data.genome,
            method="bayesianridge",
            pfmfile=pfm,
            ncpus=12,
        )
    adata.uns["scepia"]["pfm"] = pfm

    adata.uns["scepia"]["motif_activity"] = motif_act[adata.uns["scepia"]
                                                      ["cell_types"]]

    logger.info("calculating cell-specific motif activity")
    cell_motif_activity = (
        adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T).T
    cell_motif_activity.index = adata.obs_names
    adata.obs = adata.obs.drop(
        columns=cell_motif_activity.columns.intersection(adata.obs.columns))
    adata.obs = adata.obs.join(cell_motif_activity)

    correlate_tf_motifs(adata,
                        indirect=indirect,
                        n_sketch=n_sketch,
                        n_permutations=n_permutations)

    add_activity(adata)

    logger.info("Done with motif inference.")
Example #19
def test_from_df_and_dict():
    df = pd.DataFrame(dict(a=[0.1, 0.2, 0.3], b=[1.1, 1.2, 1.3]))
    adata = AnnData(df, dict(species=pd.Categorical(["a", "b", "a"])))
    assert adata.obs["species"].values.tolist() == ["a", "b", "a"]
Example #20
def test_strings_to_categoricals():
    adata = AnnData(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]),
                    dict(k=["a", "a", "b", "b"]))
    adata.strings_to_categoricals()
    assert adata.obs["k"].cat.categories.tolist() == ["a", "b"]
Ejemplo n.º 21
0
def flat_model(
    adata: AnnData,
    max_iterations: int = 1000000,
    epsilon: float = 0,
    equilibrate: bool = False,
    wait: int = 1000,
    nbreaks: int = 2,
    collect_marginals: bool = False,
    niter_collect: int = 10000,
    deg_corr: bool = True,
    multiflip: bool = True,
    fast_model: bool = False,
    n_init: int = 1,
    beta_range: Tuple[float, float] = (1., 100.),
    steps_anneal: int = 5,
    resume: bool = False,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'sbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    minimize_args: Optional[Dict] = {},
    equilibrate_args: Optional[Dict] = {},    
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the  Stochastic Block Model [Peixoto14]_, performing
    Bayesian inference on node groups. 

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    max_iterations
        Maximal number of iterations to be performed by the equilibrate step.
    epsilon
        Relative changes in entropy smaller than epsilon will
        not be considered as record-breaking.
    equilibrate
        Whether or not to perform the mcmc_equilibrate step.
        Equilibration should always be performed. Note, also, that without
        equilibration it won't be possible to collect marginals.
    collect_marginals
        Whether or not to collect the node probability of belonging
        to a specific partition.
    niter_collect
        Number of iterations to force when collecting marginals. This will
        increase the precision when calculating probabilities.
    wait
        Number of iterations to wait for a record-breaking event.
        Higher values result in longer computations. Set it to small values
        when performing quick tests.
    nbreaks
        Number of iteration intervals (of size `wait`) without
        record-breaking events necessary to stop the algorithm.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real-world networks this is the case, although it doesn't seem to be
        the case for the KNN graphs used in scanpy.
    multiflip
        Whether to perform MCMC sweep with multiple simultaneous moves to sample
        network partitions. It may result in slightly longer runtimes, but under
        the hood it allows for a more efficient space exploration.
    fast_model
        Whether to skip the initial minimization step and let the MCMC find a solution.
        This approach tends to be faster and consume less memory, but is
        less accurate.
    n_init
        Number of initial minimizations to be performed. The one with the smallest
        entropy is chosen.
    beta_range
        Inverse temperature at the beginning and the end of the equilibration
    steps_anneal
        Number of steps in which the simulated annealing is performed
    resume
        Start from a previously created model, if any, without initializing a new
        model.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['sbm']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    `adata.uns['sbm']['stats']`
        A dict with the values returned by mcmc_sweep
    `adata.uns['sbm']['cell_affinity']`
        A `np.ndarray` with cell probability of belonging to a specific group
    `adata.uns['sbm']['state']`
        The BlockModel state object
    """

    raise DeprecationWarning(
        "This function has been deprecated since version 0.5.0, "
        "please consider using planted_model instead."
    )

    if fast_model or resume: 
        # if the fast_model is chosen perform equilibration anyway
        equilibrate=True
        
    if resume and ('sbm' not in adata.uns or 'state' not in adata.uns['sbm']):
        # let the model proceed as default
        logg.warning('Resuming has been specified but a state was not found\n'
                     'Will continue with default minimization step')

        resume=False
        fast_model=False

    if random_seed is not None:
        np.random.seed(random_seed)
        gt.seed_rng(random_seed)

    if collect_marginals:
        logg.warning('Collecting marginals has a large impact on running time')
        if not equilibrate:
            raise ValueError(
                "You can't collect marginals without MCMC equilibrate "
                "step. Either set `equlibrate` to `True` or "
                "`collect_marginals` to `False`"
            )

    start = logg.info('minimizing the Stochastic Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = get_graph_tool_from_adjacency(adjacency, directed=directed)

    recs=[]
    rec_types=[]
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs=[g.ep.weight]
        rec_types=['real-normal']

    if fast_model:
        # do not minimize, start with a dummy state and perform only equilibrate
        state = gt.BlockState(g=g, B=1, sampling=True,
                              state_args=dict(deg_corr=deg_corr,
                              recs=recs,
                              rec_types=rec_types
                              ))
    elif resume:
        # create the state and make sure sampling is performed
        state = adata.uns['sbm']['state'].copy(sampling=True)
        g = state.g
    else:
        if n_init < 1:
            n_init = 1
        
        states = [gt.minimize_nested_blockmodel_dl(g, deg_corr=deg_corr, 
                  state_args=dict(recs=recs,  rec_types=rec_types), 
                  **minimize_args) for n in range(n_init)]
                  
        state = states[np.argmin([s.entropy() for s in states])]    

        logg.info('    done', time=start)
        state = state.copy(B=g.num_vertices())
    
    # equilibrate the Markov chain
    if equilibrate:
        logg.info('running MCMC equilibration step')
        equilibrate_args['wait'] = wait
        equilibrate_args['nbreaks'] = nbreaks
        equilibrate_args['max_niter'] = max_iterations
        equilibrate_args['multiflip'] = multiflip
        equilibrate_args['mcmc_args'] = {'niter':10}
        
        dS, nattempts, nmoves = gt.mcmc_anneal(state, 
                                               mcmc_equilibrate_args=equilibrate_args,
                                               niter=steps_anneal,
                                               beta_range=beta_range)

    if collect_marginals and equilibrate:
        # we only retain level_0 counts here, until I can figure out
        # how to correctly propagate counts to higher levels
        # I wonder if this should be placed after group definition or not
        logg.info('    collecting marginals')
        group_marginals = np.zeros(g.num_vertices() + 1)
        def _collect_marginals(s):
            group_marginals[s.get_nonempty_B()] += 1

        gt.mcmc_equilibrate(state, wait=wait, nbreaks=nbreaks, epsilon=epsilon,
                            max_niter=max_iterations, multiflip=False,
                            force_niter=niter_collect, mcmc_args=dict(niter=10),
                            callback=_collect_marginals)
        logg.info('    done', time=start)

    # everything is in place, we need to fill all slots
    # first build an array with the group assignment of each cell
    groups = pd.Series(state.get_blocks().get_array()).astype('category')
    new_cat_names = dict([(cx, u'%s' % cn) for cn, cx in enumerate(groups.cat.categories)])
    groups.cat.rename_categories(new_cat_names, inplace=True)

    if restrict_to is not None:
        groups.index = adata.obs[restrict_key].index
    else:
        groups.index = adata.obs_names

    # add column names
    adata.obs.loc[:, key_added] = groups

    # add some unstructured info

    adata.uns['sbm'] = {}
    adata.uns['sbm']['stats'] = dict(
        dS=dS,
        nattempts=nattempts,
        nmoves=nmoves,
        modularity=gt.modularity(g, state.get_blocks())
    )
    adata.uns['sbm']['state'] = state

    # now add marginal probabilities.

    if collect_marginals:
        # cell marginals will be a list of arrays with probabilities
        # of belonging to a specific group
        adata.uns['sbm']['group_marginals'] = group_marginals

    # calculate log-likelihood of cell moves over the remaining levels
    
    adata.uns['sbm']['cell_affinity'] = {'1':get_cell_loglikelihood(state, as_prob=True)}
    
    # last step is recording some parameters used in this analysis
    adata.uns['sbm']['params'] = dict(
        epsilon=epsilon,
        wait=wait,
        nbreaks=nbreaks,
        equilibrate=equilibrate,
        fast_model=fast_model,
        collect_marginals=collect_marginals,
        random_seed=random_seed
    )


    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {state.get_nonempty_B()} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
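Because the body above raises a DeprecationWarning before doing any work, a call can only illustrate the documented interface. The sketch below assumes an AnnData with a neighbor graph and shows where the results would land according to the docstring; nothing here is verified against a released version.

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata)

# Deprecated entry point: the call raises DeprecationWarning by design (see above);
# planted_model is the suggested replacement.
try:
    flat_model(adata, equilibrate=True, wait=100, key_added='sbm')
    print(adata.obs['sbm'].value_counts())   # cluster labels, per the docstring
    print(adata.uns['sbm']['params'])        # parameters used in the run
except DeprecationWarning:
    pass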
Ejemplo n.º 22
0
def test_slicing_remove_unused_categories():
    adata = AnnData(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]),
                    dict(k=["a", "a", "b", "b"]))
    adata._sanitize()
    assert adata[2:4].obs["k"].cat.categories.tolist() == ["b"]
Ejemplo n.º 23
0
def filter_genes(
    data: AnnData,
    min_counts: Optional[int] = None,
    min_cells:  Optional[int] = None,
    max_counts: Optional[int] = None,
    max_cells:  Optional[int] = None,
    inplace: bool = True,
    copy: bool = False,
):
    """Filter genes based on number of cells or counts.

    Keep genes that have at least ``min_counts`` counts or are expressed in at
    least ``min_cells`` cells or have at most ``max_counts`` counts or are expressed
    in at most ``max_cells`` cells.

    Only provide one of the optional parameters ``min_counts``, ``min_cells``,
    ``max_counts``, ``max_cells`` per call.

    Parameters
    ----------
    data
        An annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    min_counts
        Minimum number of counts required for a gene to pass filtering.
    min_cells
        Minimum number of cells expressed required for a gene to pass filtering.
    max_counts
        Maximum number of counts required for a gene to pass filtering.
    max_cells
        Maximum number of cells expressed required for a gene to pass filtering.
    inplace
        Perform computation inplace or return result.

    Returns
    -------
    `tuple`, `None`
        Depending on `inplace`, returns the following arrays or directly subsets
        and annotates the data matrix

        gene_subset : :class:`~numpy.ndarray`
            Boolean index mask that does filtering. `True` means that the
            gene is kept. `False` means the gene is removed.
        number_per_gene : :class:`~numpy.ndarray`
            Depending on what was thresholded (`counts` or `cells`), the array stores
            `n_counts` or `n_cells` per gene.
    """
    if copy:
        logg.warn('`copy` is deprecated, use `inplace` instead.')
    n_given_options = sum(
        option is not None for option in
        [min_cells, min_counts, max_cells, max_counts])
    if n_given_options != 1:
        raise ValueError(
            'Only provide one of the optional parameters `min_counts`, '
            '`min_cells`, `max_counts`, `max_cells` per call.')

    if isinstance(data, AnnData):
        adata = data.copy() if copy else data
        gene_subset, number = materialize_as_ndarray(
            filter_genes(adata.X, min_cells=min_cells,
                         min_counts=min_counts, max_cells=max_cells,
                         max_counts=max_counts))
        if not inplace:
            return gene_subset, number
        if min_cells is None and max_cells is None:
            adata.var['n_counts'] = number
        else:
            adata.var['n_cells'] = number
        adata._inplace_subset_var(gene_subset)
        return adata if copy else None

    X = data  # proceed with processing the data matrix
    min_number = min_counts if min_cells is None else min_cells
    max_number = max_counts if max_cells is None else max_cells
    number_per_gene = np.sum(X if min_cells is None and max_cells is None
                             else X > 0, axis=0)
    if issparse(X):
        number_per_gene = number_per_gene.A1
    if min_number is not None:
        gene_subset = number_per_gene >= min_number
    if max_number is not None:
        gene_subset = number_per_gene <= max_number

    s = np.sum(~gene_subset)
    if s > 0:
        logg.info('filtered out {} genes that are detected'.format(s), end=' ')
        if min_cells is not None or min_counts is not None:
            logg.info('in less than',
                   str(min_cells) + ' cells'
                   if min_counts is None else str(min_counts) + ' counts', no_indent=True)
        if max_cells is not None or max_counts is not None:
            logg.info('in more than ',
                   str(max_cells) + ' cells'
                   if max_counts is None else str(max_counts) + ' counts', no_indent=True)
    return gene_subset, number_per_gene
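A short usage sketch of the function above, following its docstring; the toy matrix and thresholds are illustrative only.

import numpy as np
from anndata import AnnData

# Keep genes detected in at least two cells; with an AnnData input and
# inplace=True the object is subset and adata.var['n_cells'] is added.
adata = AnnData(np.array([[0, 3, 0], [1, 0, 2], [0, 5, 0]], dtype=np.float32))
filter_genes(adata, min_cells=2)
print(adata.var['n_cells'])

# With a bare matrix the boolean mask and the per-gene counts are returned instead.
mask, counts_per_gene = filter_genes(np.ones((4, 3)), min_counts=2)
print(mask, counts_per_gene)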
Ejemplo n.º 24
0
def test_multicol():
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]))
    # 'c' keeps the columns as should be
    adata.obsm["c"] = np.array([[0.0, 1.0], [2, 3]])
    assert adata.obsm_keys() == ["c"]
    assert adata.obsm["c"].tolist() == [[0.0, 1.0], [2, 3]]
Ejemplo n.º 25
0
def pca(
    data: Union[AnnData, np.ndarray, spmatrix],
    n_comps: int = N_PCS,
    zero_center: Optional[bool] = True,
    svd_solver: str = 'auto',
    random_state: int = 0,
    return_info: bool = False,
    use_highly_variable: Optional[bool] = None,
    dtype: str = 'float32',
    copy: bool = False,
    chunked: bool = False,
    chunk_size: Optional[int] = None,
) -> Union[AnnData, np.ndarray, spmatrix]:
    """Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition. Uses the
    implementation of *scikit-learn* [Pedregosa11]_.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` × ``n_vars``.
        Rows correspond to cells and columns to genes.
    n_comps
        Number of principal components to compute.
    zero_center
        If `True`, compute standard PCA from covariance matrix.
        If ``False``, omit zero-centering variables
        (uses :class:`~sklearn.decomposition.TruncatedSVD`),
        which allows handling sparse input efficiently.
        Passing ``None`` decides automatically based on sparseness of the data.
    svd_solver
        SVD solver to use:

        ``'arpack'``
          for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`)

        ``'randomized'``
          for the randomized algorithm due to Halko (2009).

        ``'auto'`` (the default)
          chooses automatically depending on the size of the problem.

    random_state
        Change to use different initial states for the optimization.
    return_info
        Only relevant when not passing an :class:`~anndata.AnnData`:
        see “**Returns**”.
    use_highly_variable
        Whether to use highly variable genes only, stored in
        ``.var['highly_variable']``.
        By default uses them if they have been determined beforehand.
    dtype
        Numpy data type string to which to convert the result.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked
        If ``True``, perform an incremental PCA on segments of ``chunk_size``.
        The incremental PCA automatically zero centers and ignores settings of
        ``random_state`` and ``svd_solver``. If ``False``, perform a full PCA.
    chunk_size
        Number of observations to include in each chunk.
        Required if ``chunked=True`` was passed.

    Returns
    -------

    X_pca : :class:`scipy.sparse.spmatrix` or :class:`numpy.ndarray`
        If `data` is array-like and ``return_info=False`` was passed,
        this function only returns `X_pca`…
    adata : :class:`~anndata.AnnData`
        …otherwise it returns ``adata`` if ``copy=True``, or else adds the following fields to it:

        ``.obsm['X_pca']``
             PCA representation of data.

        ``.varm['PCs']``
             The principal components containing the loadings.

        ``.uns['pca']['variance_ratio']``
             Ratio of explained variance.

        ``.uns['pca']['variance']``
             Explained variance, equivalent to the eigenvalues of the covariance matrix.
    """
    # chunked calculation is not randomized, anyways
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'`. This will likely '
            'become the Scanpy default in the future.')

    data_is_AnnData = isinstance(data, AnnData)
    if data_is_AnnData:
        adata = data.copy() if copy else data
    else:
        adata = AnnData(data)

    logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4)

    if adata.n_vars < n_comps:
        n_comps = adata.n_vars - 1
        logg.msg('reducing number of computed PCs to',
               n_comps, 'as dim of data is only', adata.n_vars, v=4)

    if use_highly_variable is True and 'highly_variable' not in adata.var.keys():
        raise ValueError('Did not find adata.var[\'highly_variable\']. '
                         'Either your data already consists only of highly-variable genes '
                         'or consider running `pp.filter_genes_dispersion` first.')
    if use_highly_variable is None:
        use_highly_variable = 'highly_variable' in adata.var.keys()
    adata_comp = adata[:, adata.var['highly_variable']] if use_highly_variable else adata

    if chunked:
        if not zero_center or random_state or svd_solver != 'auto':
            logg.msg('Ignoring zero_center, random_state, svd_solver', v=4)

        from sklearn.decomposition import IncrementalPCA

        X_pca = np.zeros((adata_comp.X.shape[0], n_comps), adata_comp.X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        for chunk, _, _ in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        for chunk, start, end in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    else:
        if zero_center is None:
            zero_center = not issparse(adata_comp.X)
        if zero_center:
            from sklearn.decomposition import PCA
            if issparse(adata_comp.X):
                logg.msg('    as `zero_center=True`, '
                       'sparse input is densified and may '
                       'lead to huge memory consumption', v=4)
                X = adata_comp.X.toarray()  # Copying the whole adata_comp.X here, could cause memory problems
            else:
                X = adata_comp.X
            pca_ = PCA(n_components=n_comps, svd_solver=svd_solver, random_state=random_state)
        else:
            from sklearn.decomposition import TruncatedSVD
            logg.msg('    without zero-centering: \n'
                   '    the explained variance does not correspond to the exact statistical definition\n'
                   '    the first component, e.g., might be heavily influenced by different means\n'
                   '    the following components often resemble the exact PCA very closely', v=4)
            pca_ = TruncatedSVD(n_components=n_comps, random_state=random_state)
            X = adata_comp.X
        X_pca = pca_.fit_transform(X)

    if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        if use_highly_variable:
            adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
            adata.varm['PCs'][adata.var['highly_variable']] = pca_.components_.T
        else:
            adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca'] = {}
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.msg('    finished', t=True, end=' ', v=4)
        logg.msg('and added\n'
                 '    \'X_pca\', the PCA coordinates (adata.obsm)\n'
                 '    \'PCs\', the loadings (adata.varm)\n'
                 '    \'pca\', the variance and variance ratio (adata.uns)', v=4)
        return adata if copy else None
    else:
        if return_info:
            return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_
        else:
            return X_pca
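An illustrative call of the function above, assuming it is importable as shown; the matrix sizes are arbitrary.

import numpy as np
from anndata import AnnData

adata = AnnData(np.random.rand(50, 20).astype(np.float32))
pca(adata, n_comps=5)                        # annotates adata in place
print(adata.obsm['X_pca'].shape)             # (50, 5)
print(adata.varm['PCs'].shape)               # (20, 5)
print(adata.uns['pca']['variance_ratio'])

# On a plain array, return_info=True also returns loadings and variances.
X_pca, loadings, var_ratio, var = pca(np.random.rand(30, 10), n_comps=3, return_info=True)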
Ejemplo n.º 26
0
def test_n_obs():
    adata = AnnData(np.array([[1, 2], [3, 4], [5, 6]]))
    assert adata.n_obs == 3
    adata1 = adata[:2]
    assert adata1.n_obs == 2
Ejemplo n.º 27
0
def leiden(
    adata: AnnData,
    resolution: float = 1,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_state: Optional[Union[int, RandomState]] = 0,
    key_added: str = 'leiden',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = True,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    copy: bool = False,
    **partition_kwargs,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Traag18]_.

    Cluster cells using the Leiden algorithm [Traag18]_,
    an improved version of the Louvain algorithm [Blondel08]_.
    It has been proposed for single-cell analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        A parameter value controlling the coarseness of the clustering.
        Higher values lead to more clusters.
        Set to `None` if overriding `partition_type`
        to one that doesn’t accept a `resolution_parameter`.
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain `(obs_key, list_of_categories)`.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']`.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    n_iterations
        How many iterations of the Leiden clustering algorithm to perform.
        Positive values above 2 define the total number of iterations to perform,
        -1 has the algorithm run until it reaches its optimal clustering.
    partition_type
        Type of partition to use.
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`.
        For the available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    copy
        Whether to copy `adata` or modify it inplace.
    **partition_kwargs
        Any further arguments to pass to `~leidenalg.find_partition`
        (which in turn passes arguments to the `partition_type`).

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns['leiden']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        and `n_iterations`.
    """
    try:
        import leidenalg
    except ImportError:
        raise ImportError(
            'Please install the leiden algorithm: `conda install -c conda-forge leidenalg` or `pip3 install leidenalg`.'
        )
    partition_kwargs = dict(partition_kwargs)

    start = logg.info('running Leiden clustering')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if 'neighbors' not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph
    g = _utils.get_igraph_from_adjacency(adjacency, directed=directed)
    # flip to the default partition type if not overridden by the user
    if partition_type is None:
        partition_type = leidenalg.RBConfigurationVertexPartition
    # Prepare find_partition arguments as a dictionary,
    # appending to whatever the user provided. It needs to be this way
    # as this allows for the accounting of a None resolution
    # (in the case of a partition variant that doesn't take it on input)
    if use_weights:
        partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
    partition_kwargs['n_iterations'] = n_iterations
    partition_kwargs['seed'] = random_state
    if resolution is not None:
        partition_kwargs['resolution_parameter'] = resolution
    # clustering proper
    part = leidenalg.find_partition(g, partition_type, **partition_kwargs)
    # store output into adata.obs
    groups = np.array(part.membership)
    if restrict_to is not None:
        if key_added == 'leiden':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(np.unique(groups).astype('U')),
    )
    # store information on the clustering parameters
    adata.uns['leiden'] = {}
    adata.uns['leiden']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
    )
    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
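A brief usage sketch matching the docstring above; it assumes the function behaves like `sc.tl.leiden` and that a neighbor graph has been computed.

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata)
leiden(adata, resolution=0.5, key_added='leiden_r05')
print(adata.obs['leiden_r05'].value_counts())   # cluster sizes
print(adata.uns['leiden']['params'])            # resolution, random_state, n_iterations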
Ejemplo n.º 28
0
def test_extract_data_raw_None(self, adata: AnnData):
    adata = AnnData(adata.X, raw=None)
    with pytest.raises(ValueError):
        _extract_data(adata, use_raw=True)
Ejemplo n.º 29
0
def rank_genes_groups(adata: AnnData,
                      groupby: str,
                      use_raw: bool = True,
                      groups: Union[str, Iterable[str]] = 'all',
                      reference: str = 'rest',
                      n_genes: int = 100,
                      rankby_abs: bool = False,
                      key_added: Optional[str] = None,
                      copy: bool = False,
                      method: str = 't-test_overestim_var',
                      corr_method: str = 'benjamini-hochberg',
                      **kwds):
    """Rank genes for characterizing groups.

    Parameters
    ----------
    adata
        Annotated data matrix.
    groupby
        The key of the observations grouping to consider.
    use_raw : `bool`, optional (default: `True`)
        Use `raw` attribute of `adata` if present.
    groups
        Subset of groups, e.g. `['g1', 'g2', 'g3']`, to which comparison shall
        be restricted, or `'all'` (default), for all groups.
    reference
        If `'rest'`, compare each group to the union of the rest of the groups.  If
        a group identifier, compare with respect to this group.
    n_genes
        The number of genes that appear in the returned tables.
    method : `{'logreg', 't-test', 'wilcoxon', 't-test_overestim_var'}`, optional (default: 't-test_overestim_var')
        If 't-test', uses t-test, if 'wilcoxon', uses Wilcoxon-Rank-Sum. If
        't-test_overestim_var', overestimates variance of each group. If
        'logreg' uses logistic regression, see [Ntranos18]_, `here
        <https://github.com/theislab/scanpy/issues/95>`__ and `here
        <http://www.nxn.se/valent/2018/3/5/actionable-scrna-seq-clusters>`__, for
        why this is meaningful.
    corr_method : `{'benjamini-hochberg', 'bonferroni'}`, optional (default: 'benjamini-hochberg')
        p-value correction method. Used only for 't-test', 't-test_overestim_var',
        and 'wilcoxon' methods.
    rankby_abs
        Rank genes by the absolute value of the score, not by the
        score. The returned scores are never the absolute values.
    key_added
        The key in `adata.uns` information is saved to.
    **kwds : keyword parameters
        Are passed to test methods. Currently this affects only parameters that
        are passed to `sklearn.linear_model.LogisticRegression
        <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__.
        For instance, you can pass `penalty='l1'` to try to come up with a
        minimal set of genes that are good predictors (sparse solution meaning
        few non-zero fitted coefficients).

    Returns
    -------
    **names** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the gene
        names. Ordered according to scores.
    **scores** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the z-score
        underlying the computation of a p-value for each gene for each
        group. Ordered according to scores.
    **logfoldchanges** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Structured array to be indexed by group id storing the log2
        fold change for each gene for each group. Ordered according to
        scores. Only provided if method is 't-test' like.
        Note: this is an approximation calculated from mean-log values.
    **pvals** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        p-values.
    **pvals_adj** : structured `np.ndarray` (`.uns['rank_genes_groups']`)
        Corrected p-values.

    Notes
    -----
    There are slight inconsistencies depending on whether sparse
    or dense data are passed. See `here <https://github.com/theislab/scanpy/blob/master/scanpy/tests/test_rank_genes_groups.py>`__.

    Examples
    --------
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon')

    # to visualize the results
    >>> sc.pl.rank_genes_groups(adata)
    """
    if 'only_positive' in kwds:
        rankby_abs = not kwds.pop('only_positive')  # backwards compat

    start = logg.info('ranking genes')
    avail_methods = {'t-test', 't-test_overestim_var', 'wilcoxon', 'logreg'}
    if method not in avail_methods:
        raise ValueError('Method must be one of {}.'.format(avail_methods))

    avail_corr = {'benjamini-hochberg', 'bonferroni'}
    if corr_method not in avail_corr:
        raise ValueError(
            'Correction method must be one of {}.'.format(avail_corr))

    adata = adata.copy() if copy else adata
    utils.sanitize_anndata(adata)
    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if (reference != 'rest'
            and reference not in set(adata.obs[groupby].cat.categories)):
        cats = adata.obs[groupby].cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.')

    groups_order, groups_masks = utils.select_groups(adata, groups_order,
                                                     groupby)

    if key_added is None:
        key_added = 'rank_genes_groups'
    adata.uns[key_added] = {}
    adata.uns[key_added]['params'] = {
        'groupby': groupby,
        'reference': reference,
        'method': method,
        'use_raw': use_raw,
        'corr_method': corr_method,
    }

    # adata_comp mocks an AnnData object if use_raw is True
    # otherwise it's just the AnnData object
    adata_comp = adata
    if adata.raw is not None and use_raw:
        adata_comp = adata.raw
    X = adata_comp.X

    # for clarity, rename variable
    n_genes_user = n_genes
    # make sure indices are not OoB in case there are fewer genes than n_genes
    if n_genes_user > X.shape[1]:
        n_genes_user = X.shape[1]
    # in the following, n_genes is simply another name for the total number of genes
    n_genes = X.shape[1]

    n_groups = groups_masks.shape[0]
    ns = np.zeros(n_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        ns[imask] = np.where(mask)[0].size
    logg.debug(f'consider {groupby!r} groups:')
    logg.debug(f'with sizes: {ns}')
    if reference != 'rest':
        ireference = np.where(groups_order == reference)[0][0]
    reference_indices = np.arange(adata_comp.n_vars, dtype=int)

    rankings_gene_scores = []
    rankings_gene_names = []
    rankings_gene_logfoldchanges = []
    rankings_gene_pvals = []
    rankings_gene_pvals_adj = []

    if method in {'t-test', 't-test_overestim_var'}:
        from scipy import stats
        from statsmodels.stats.multitest import multipletests
        # loop over all masks and compute means, variances and sample numbers
        means = np.zeros((n_groups, n_genes))
        vars = np.zeros((n_groups, n_genes))

        for imask, mask in enumerate(groups_masks):
            means[imask], vars[imask] = _get_mean_var(X[mask])

        # test each either against the union of all other groups or against a
        # specific group
        for igroup in range(n_groups):
            if reference == 'rest':
                mask_rest = ~groups_masks[igroup]
            else:
                if igroup == ireference: continue
                else: mask_rest = groups_masks[ireference]
            mean_group, var_group = means[igroup], vars[igroup]
            mean_rest, var_rest = _get_mean_var(X[mask_rest])

            ns_group = ns[igroup]  # number of observations in group
            if method == 't-test': ns_rest = np.where(mask_rest)[0].size
            elif method == 't-test_overestim_var':
                ns_rest = ns[
                    igroup]  # hack for overestimating the variance for small groups
            else:
                raise ValueError('Method does not exist.')

            # TODO: Come up with better solution. Mask unexpressed genes?
            # See https://github.com/scipy/scipy/issues/10269
            with np.errstate(invalid="ignore"):
                scores, pvals = stats.ttest_ind_from_stats(
                    mean1=mean_group,
                    std1=np.sqrt(var_group),
                    nobs1=ns_group,
                    mean2=mean_rest,
                    std2=np.sqrt(var_rest),
                    nobs2=ns_rest,
                    equal_var=False  # Welch's
                )

            # Fold change
            foldchanges = (np.expm1(mean_group) + 1e-9) / (
                np.expm1(mean_rest) + 1e-9)  # add small value to remove 0's

            scores[np.isnan(
                scores
            )] = 0  # I think it's only nan when means are the same and vars are 0
            pvals[np.isnan(
                pvals)] = 1  # This also has to happen for Benjamini Hochberg

            if corr_method == 'benjamini-hochberg':
                _, pvals_adj, _, _ = multipletests(pvals,
                                                   alpha=0.05,
                                                   method='fdr_bh')
            elif corr_method == 'bonferroni':
                pvals_adj = np.minimum(pvals * n_genes, 1.0)

            scores_sort = np.abs(scores) if rankby_abs else scores
            partition = np.argpartition(scores_sort,
                                        -n_genes_user)[-n_genes_user:]
            partial_indices = np.argsort(scores_sort[partition])[::-1]
            global_indices = reference_indices[partition][partial_indices]
            rankings_gene_scores.append(scores[global_indices])
            rankings_gene_logfoldchanges.append(
                np.log2(foldchanges[global_indices]))
            rankings_gene_names.append(adata_comp.var_names[global_indices])
            rankings_gene_pvals.append(pvals[global_indices])
            rankings_gene_pvals_adj.append(pvals_adj[global_indices])

    elif method == 'logreg':
        # if reference is not set, then the groups listed will be compared to the rest
        # if reference is set, then the groups listed will be compared only to the other groups listed
        from sklearn.linear_model import LogisticRegression
        reference = groups_order[0]
        if len(groups) == 1:
            raise Exception(
                'Cannot perform logistic regression on a single cluster.')
        adata_copy = adata[adata.obs[groupby].isin(groups_order)]
        adata_comp = adata_copy
        if adata.raw is not None and use_raw:
            adata_comp = adata_copy.raw
        X = adata_comp.X

        clf = LogisticRegression(**kwds)
        clf.fit(X, adata_copy.obs[groupby].cat.codes)
        scores_all = clf.coef_
        for igroup, group in enumerate(groups_order):
            if len(groups_order) <= 2:  # binary logistic regression
                scores = scores_all[0]
            else:
                scores = scores_all[igroup]
            partition = np.argpartition(scores, -n_genes_user)[-n_genes_user:]
            partial_indices = np.argsort(scores[partition])[::-1]
            global_indices = reference_indices[partition][partial_indices]
            rankings_gene_scores.append(scores[global_indices])
            rankings_gene_names.append(adata_comp.var_names[global_indices])
            if len(groups_order) <= 2:
                break

    elif method == 'wilcoxon':
        from scipy import stats
        from statsmodels.stats.multitest import multipletests
        CONST_MAX_SIZE = 10000000
        means = np.zeros((n_groups, n_genes))
        vars = np.zeros((n_groups, n_genes))
        # initialize space for z-scores
        scores = np.zeros(n_genes)
        # First loop: Loop over all genes
        if reference != 'rest':
            for imask, mask in enumerate(groups_masks):
                means[imask], vars[imask] = _get_mean_var(
                    X[mask])  # for fold-change only

                if imask == ireference: continue

                else: mask_rest = groups_masks[ireference]
                ns_rest = np.where(mask_rest)[0].size
                mean_rest, var_rest = _get_mean_var(
                    X[mask_rest])  # for fold-change only

                if ns_rest <= 25 or ns[imask] <= 25:
                    logg.hint(
                        'Few observations in a group for '
                        'normal approximation (<=25). Lower test accuracy.')
                n_active = ns[imask]
                m_active = ns_rest

                # Now calculate gene expression ranking in chunks:
                chunk = []
                # Calculate chunk frames
                n_genes_max_chunk = floor(CONST_MAX_SIZE /
                                          (n_active + m_active))
                if n_genes_max_chunk < n_genes:
                    chunk_index = n_genes_max_chunk
                    while chunk_index < n_genes:
                        chunk.append(chunk_index)
                        chunk_index = chunk_index + n_genes_max_chunk
                    chunk.append(n_genes)
                else:
                    chunk.append(n_genes)

                left = 0
                # Calculate rank sums for each chunk for the current mask
                for chunk_index, right in enumerate(chunk):
                    # Check if issparse is true: AnnData objects are currently sparse.csr or ndarray.
                    if issparse(X):
                        df1 = pd.DataFrame(data=X[mask, left:right].todense())
                        df2 = pd.DataFrame(
                            data=X[mask_rest, left:right].todense(),
                            index=np.arange(start=n_active,
                                            stop=n_active + m_active))
                    else:
                        df1 = pd.DataFrame(data=X[mask, left:right])
                        df2 = pd.DataFrame(data=X[mask_rest, left:right],
                                           index=np.arange(start=n_active,
                                                           stop=n_active +
                                                           m_active))
                    df1 = pd.concat([df1, df2])  # DataFrame.append was removed in pandas 2.0
                    ranks = df1.rank()
                    # sum up adjusted_ranks to calculate W_m,n
                    scores[left:right] = np.sum(ranks.loc[0:n_active, :])
                    left = right

                scores = (scores - (n_active *
                                    (n_active + m_active + 1) / 2)) / sqrt(
                                        (n_active * m_active *
                                         (n_active + m_active + 1) / 12))
                scores[np.isnan(scores)] = 0
                pvals = 2 * stats.distributions.norm.sf(np.abs(scores))

                if corr_method == 'benjamini-hochberg':
                    pvals[np.isnan(
                        pvals
                    )] = 1  # set NaN values to 1 to properly correct with Benjamini-Hochberg
                    _, pvals_adj, _, _ = multipletests(pvals,
                                                       alpha=0.05,
                                                       method='fdr_bh')
                elif corr_method == 'bonferroni':
                    pvals_adj = np.minimum(pvals * n_genes, 1.0)

                # Fold change
                foldchanges = (np.expm1(means[imask]) + 1e-9) / (
                    np.expm1(mean_rest) + 1e-9
                )  # add small value to remove 0's
                scores_sort = np.abs(scores) if rankby_abs else scores
                partition = np.argpartition(scores_sort,
                                            -n_genes_user)[-n_genes_user:]
                partial_indices = np.argsort(scores_sort[partition])[::-1]
                global_indices = reference_indices[partition][partial_indices]
                rankings_gene_scores.append(scores[global_indices])
                rankings_gene_names.append(
                    adata_comp.var_names[global_indices])
                rankings_gene_logfoldchanges.append(
                    np.log2(foldchanges[global_indices]))
                rankings_gene_pvals.append(pvals[global_indices])
                rankings_gene_pvals_adj.append(pvals_adj[global_indices])

        # If no reference group exists, ranking needs only to be done once (full mask)
        else:
            scores = np.zeros((n_groups, n_genes))
            chunk = []
            n_cells = X.shape[0]
            n_genes_max_chunk = floor(CONST_MAX_SIZE / n_cells)
            if n_genes_max_chunk < n_genes:
                chunk_index = n_genes_max_chunk
                while chunk_index < n_genes:
                    chunk.append(chunk_index)
                    chunk_index = chunk_index + n_genes_max_chunk
                chunk.append(n_genes)
            else:
                chunk.append(n_genes)
            left = 0
            for chunk_index, right in enumerate(chunk):
                # Check if issparse is true
                if issparse(X):
                    df1 = pd.DataFrame(data=X[:, left:right].todense())
                else:
                    df1 = pd.DataFrame(data=X[:, left:right])
                ranks = df1.rank()
                # sum up adjusted_ranks to calculate W_m,n
                for imask, mask in enumerate(groups_masks):
                    scores[imask, left:right] = np.sum(ranks.loc[mask, :])
                left = right

            for imask, mask in enumerate(groups_masks):
                mask_rest = ~groups_masks[imask]
                means[imask], vars[imask] = _get_mean_var(
                    X[mask])  #for fold-change
                mean_rest, var_rest = _get_mean_var(
                    X[mask_rest])  # for fold-change

                scores[imask, :] = (scores[imask, :] -
                                    (ns[imask] * (n_cells + 1) / 2)) / sqrt(
                                        (ns[imask] * (n_cells - ns[imask]) *
                                         (n_cells + 1) / 12))
                scores[np.isnan(scores)] = 0
                pvals = 2 * stats.distributions.norm.sf(
                    np.abs(scores[imask, :]))

                if corr_method == 'benjamini-hochberg':
                    pvals[np.isnan(
                        pvals
                    )] = 1  # set NaN values to 1 to properly correct with Benjamini-Hochberg
                    _, pvals_adj, _, _ = multipletests(pvals,
                                                       alpha=0.05,
                                                       method='fdr_bh')
                elif corr_method == 'bonferroni':
                    pvals_adj = np.minimum(pvals * n_genes, 1.0)

                # Fold change
                foldchanges = (np.expm1(means[imask]) + 1e-9) / (
                    np.expm1(mean_rest) + 1e-9
                )  # add small value to remove 0's
                scores_sort = np.abs(scores) if rankby_abs else scores
                partition = np.argpartition(scores_sort[imask, :],
                                            -n_genes_user)[-n_genes_user:]
                partial_indices = np.argsort(scores_sort[imask,
                                                         partition])[::-1]
                global_indices = reference_indices[partition][partial_indices]
                rankings_gene_scores.append(scores[imask, global_indices])
                rankings_gene_names.append(
                    adata_comp.var_names[global_indices])
                rankings_gene_logfoldchanges.append(
                    np.log2(foldchanges[global_indices]))
                rankings_gene_pvals.append(pvals[global_indices])
                rankings_gene_pvals_adj.append(pvals_adj[global_indices])

    groups_order_save = [str(g) for g in groups_order]
    if (reference != 'rest' and method != 'logreg') or (method == 'logreg'
                                                        and len(groups) == 2):
        groups_order_save = [g for g in groups_order if g != reference]
    adata.uns[key_added]['scores'] = np.rec.fromarrays(
        [n for n in rankings_gene_scores],
        dtype=[(rn, 'float32') for rn in groups_order_save])
    adata.uns[key_added]['names'] = np.rec.fromarrays(
        [n for n in rankings_gene_names],
        dtype=[(rn, 'U50') for rn in groups_order_save])

    if method in {'t-test', 't-test_overestim_var', 'wilcoxon'}:
        adata.uns[key_added]['logfoldchanges'] = np.rec.fromarrays(
            [n for n in rankings_gene_logfoldchanges],
            dtype=[(rn, 'float32') for rn in groups_order_save])
        adata.uns[key_added]['pvals'] = np.rec.fromarrays(
            [n for n in rankings_gene_pvals],
            dtype=[(rn, 'float64') for rn in groups_order_save])
        adata.uns[key_added]['pvals_adj'] = np.rec.fromarrays(
            [n for n in rankings_gene_pvals_adj],
            dtype=[(rn, 'float64') for rn in groups_order_save])
    logg.info(
        '    finished',
        time=start,
        deep=
        (f'added to `.uns[{key_added!r}]`\n'
         "    'names', sorted np.recarray to be indexed by group ids\n"
         "    'scores', sorted np.recarray to be indexed by group ids\n" +
         ("    'logfoldchanges', sorted np.recarray to be indexed by group ids\n"
          "    'pvals', sorted np.recarray to be indexed by group ids\n"
          "    'pvals_adj', sorted np.recarray to be indexed by group ids" if
          method in {'t-test', 't-test_overestim_var', 'wilcoxon'} else '')),
    )
    return adata if copy else None
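Following the example in the docstring, a minimal sketch of how the structured results can be read back; the dataset and group key are the ones used in the docstring.

import pandas as pd
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
rank_genes_groups(adata, 'bulk_labels', method='wilcoxon', n_genes=20)

res = adata.uns['rank_genes_groups']
print(pd.DataFrame(res['names']).head())       # top-ranked gene names per group
print(pd.DataFrame(res['pvals_adj']).head())   # matching adjusted p-values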
Ejemplo n.º 30
0
def gen_adata(
    shape: Tuple[int, int],
    X_type=sparse.csr_matrix,
    X_dtype=np.float32,
    # obs_dtypes,
    # var_dtypes,
    obsm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray,
                                      pd.DataFrame),
    varm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray,
                                      pd.DataFrame),
    layers_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray,
                                        pd.DataFrame),
) -> AnnData:
    """\
    Helper function to generate a random AnnData for testing purposes.

    Note: For `obsm_types`, `varm_types`, and `layers_types` these currently
    just filter already created objects.
    In future, these should choose which objects are created.

    Params
    ------
    shape
        What shape you want the anndata to be.
    X_type
        What kind of container should `X` be? This will be called on a randomly
        generated 2d array.
    X_dtype
        What should the dtype of the `.X` container be?
    obsm_types
        What kinds of containers should be in `.obsm`?
    varm_types
        What kinds of containers should be in `.varm`?
    layers_types
        What kinds of containers should be in `.layers`?
    """
    M, N = shape
    obs_names = pd.Index(f"cell{i}" for i in range(shape[0]))
    var_names = pd.Index(f"gene{i}" for i in range(shape[1]))
    obs = gen_typed_df(M, obs_names)
    var = gen_typed_df(N, var_names)
    # For #147
    obs.rename(columns=dict(cat="obs_cat"), inplace=True)
    var.rename(columns=dict(cat="var_cat"), inplace=True)

    if X_type is None:
        X = None
    else:
        X = X_type(np.random.binomial(100, 0.005, (M, N)).astype(X_dtype))
    obsm = dict(
        array=np.random.random((M, 50)),
        sparse=sparse.random(M, 100, format="csr"),
        df=gen_typed_df(M, obs_names),
    )
    obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types}
    varm = dict(
        array=np.random.random((N, 50)),
        sparse=sparse.random(N, 100, format="csr"),
        df=gen_typed_df(N, var_names),
    )
    varm = {k: v for k, v in varm.items() if type(v) in varm_types}
    layers = dict(array=np.random.random((M, N)),
                  sparse=sparse.random(M, N, format="csr"))
    layers = {k: v for k, v in layers.items() if type(v) in layers_types}
    obsp = dict(array=np.random.random((M, M)),
                sparse=sparse.random(M, M, format="csr"))
    varp = dict(array=np.random.random((N, N)),
                sparse=sparse.random(N, N, format="csr"))
    uns = dict(
        O_recarray=gen_vstr_recarray(N, 5),
        nested=dict(
            scalar_str="str",
            scalar_int=42,
            scalar_float=3.0,
            nested_further=dict(array=np.arange(5)),
        ),
        # U_recarray=gen_vstr_recarray(N, 5, "U4")
    )
    adata = AnnData(
        X=X,
        obs=obs,
        var=var,
        obsm=obsm,
        varm=varm,
        layers=layers,
        obsp=obsp,
        varp=varp,
        dtype=X_dtype,
        uns=uns,
    )
    return adata
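An illustrative call of the helper above; restricting the container types is only meant to show how the filtering parameters are used.

import numpy as np

adata = gen_adata((10, 8))
print(adata)   # obs/var/obsm/varm/layers/obsp/varp/uns all populated

# Keep only dense ndarray containers in .obsm, .varm and .layers.
adata_dense = gen_adata(
    (10, 8),
    obsm_types=(np.ndarray,),
    varm_types=(np.ndarray,),
    layers_types=(np.ndarray,),
)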
Ejemplo n.º 31
0
def test_multicol():
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]))
    # 'c' keeps the columns as should be
    adata.obsm['c'] = np.array([[0., 1.], [2, 3]])
    assert adata.obsm_keys() == ['c']
    assert adata.obsm['c'].tolist() == [[0., 1.], [2, 3]]
Ejemplo n.º 32
0
def marker_gene_overlap(
    adata: AnnData,
    reference_markers: Union[Dict[str, set], Dict[str, list]],
    *,
    key: str = 'rank_genes_groups',
    method: _Method = 'overlap_count',
    normalize: Optional[Literal['reference', 'data']] = None,
    top_n_markers: Optional[int] = None,
    adj_pval_threshold: Optional[float] = None,
    key_added: str = 'marker_gene_overlap',
    inplace: bool = False,
):
    """\
    Calculate an overlap score between data-derived marker genes and
    provided markers.

    Marker gene overlap scores can be quoted as overlap counts, overlap
    coefficients, or Jaccard indices. The method returns a pandas dataframe
    which can be used to annotate clusters based on marker gene overlaps.

    This function was written by Malte Luecken.

    Parameters
    ----------
    adata
        The annotated data matrix.
    reference_markers
        A marker gene dictionary object. Keys should be strings with the
        cell identity name and values sets or lists of strings that match the
        format of `adata.var_names`.
    key
        The key in `adata.uns` where the rank_genes_groups output is stored.
        By default this is `'rank_genes_groups'`.
    method
        (default: `overlap_count`)
        Method to calculate marker gene overlap. `'overlap_count'` uses the
        intersection of the gene set, `'overlap_coef'` uses the overlap
        coefficient, and `'jaccard'` uses the Jaccard index.
    normalize
        Normalization option for the marker gene overlap output. This parameter
        can only be set when `method` is set to `'overlap_count'`. `'reference'`
        normalizes the data by the total number of marker genes given in the
        reference annotation per group. `'data'` normalizes the data by the
        total number of marker genes used for each cluster.
    top_n_markers
        The number of top data-derived marker genes to use. By default the top
        100 marker genes are used. If `adj_pval_threshold` is set along with
        `top_n_markers`, then `adj_pval_threshold` is ignored.
    adj_pval_threshold
        A significance threshold on the adjusted p-values to select marker
        genes. This can only be used when adjusted p-values are calculated by
        `sc.tl.rank_genes_groups()`. If `adj_pval_threshold` is set along with
        `top_n_markers`, then `adj_pval_threshold` is ignored.
    key_added
        Name of the `.uns` field that will contain the marker overlap scores.
    inplace
        Return a marker gene dataframe or store it inplace in `adata.uns`.

    Returns
    -------
    A pandas dataframe with the marker gene overlap scores if `inplace=False`.
    For `inplace=True`, `adata.uns` is updated with an additional field
    specified by the `key_added` parameter (default: `'marker_gene_overlap'`).

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.pp.pca(adata, svd_solver='arpack')
    >>> sc.pp.neighbors(adata)
    >>> sc.tl.louvain(adata)
    >>> sc.tl.rank_genes_groups(adata, groupby='louvain')
    >>> marker_genes = {
    ...     'CD4 T cells': {'IL7R'},
    ...     'CD14+ Monocytes': {'CD14', 'LYZ'},
    ...     'B cells': {'MS4A1'},
    ...     'CD8 T cells': {'CD8A'},
    ...     'NK cells': {'GNLY', 'NKG7'},
    ...     'FCGR3A+ Monocytes': {'FCGR3A', 'MS4A7'},
    ...     'Dendritic Cells': {'FCER1A', 'CST3'},
    ...     'Megakaryocytes': {'PPBP'}
    ... }
    >>> marker_matches = sc.tl.marker_gene_overlap(adata, marker_genes)
    """
    # Test user inputs
    if inplace:
        raise NotImplementedError(
            'Writing Pandas dataframes to h5ad is currently under development.'
            '\nPlease use `inplace=False`.')

    if key not in adata.uns:
        raise ValueError('Could not find marker gene data. '
                         'Please run `sc.tl.rank_genes_groups()` first.')

    avail_methods = {'overlap_count', 'overlap_coef', 'jaccard', 'enrich'}
    if method not in avail_methods:
        raise ValueError(f'Method must be one of {avail_methods}.')

    if normalize == 'None':
        normalize = None

    avail_norm = {'reference', 'data', None}
    if normalize not in avail_norm:
        raise ValueError(f'Normalize must be one of {avail_norm}.')

    if normalize is not None and method != 'overlap_count':
        raise ValueError('Can only normalize with method=`overlap_count`.')

    if not all(
            isinstance(val, cabc.Set) for val in reference_markers.values()):
        try:
            reference_markers = {
                key: set(val)
                for key, val in reference_markers.items()
            }
        except Exception:
            raise ValueError('Please ensure that `reference_markers` contains '
                             'sets or lists of markers as values.')

    if adj_pval_threshold is not None:
        if 'pvals_adj' not in adata.uns[key]:
            raise ValueError('Could not find adjusted p-value data. '
                             'Please run `sc.tl.rank_genes_groups()` with a '
                             'method that outputs adjusted p-values.')

        if adj_pval_threshold < 0:
            logg.warning(
                '`adj_pval_threshold` was set below 0. Threshold will be set to 0.'
            )
            adj_pval_threshold = 0
        elif adj_pval_threshold > 1:
            logg.warning(
                '`adj_pval_threshold` was set above 1. Threshold will be set to 1.'
            )
            adj_pval_threshold = 1

        if top_n_markers is not None:
            logg.warning(
                'Both `adj_pval_threshold` and `top_n_markers` are set. '
                '`adj_pval_threshold` will be ignored.')

    if top_n_markers is not None and top_n_markers < 1:
        logg.warning(
            '`top_n_markers` was set below 1. `top_n_markers` will be set to 1.'
        )
        top_n_markers = 1

    # Get data-derived marker genes in a dictionary of sets
    data_markers = dict()
    cluster_ids = adata.uns[key]['names'].dtype.names

    for group in cluster_ids:
        if top_n_markers is not None:
            n_genes = min(top_n_markers, adata.uns[key]['names'].shape[0])
            data_markers[group] = set(adata.uns[key]['names'][group][:n_genes])
        elif adj_pval_threshold is not None:
            n_genes = (adata.uns[key]['pvals_adj'][group] <
                       adj_pval_threshold).sum()
            data_markers[group] = set(adata.uns[key]['names'][group][:n_genes])
            if n_genes == 0:
                logg.warning(
                    'No marker genes passed the significance threshold of '
                    f'{adj_pval_threshold} for cluster {group!r}.')
        # Use the top 100 markers as the default if top_n_markers is None
        else:
            data_markers[group] = set(adata.uns[key]['names'][group][:100])

    # Find overlaps
    if method == 'overlap_count':
        marker_match = _calc_overlap_count(reference_markers, data_markers)
        if normalize == 'reference':
            # Ensure rows sum to 1
            ref_lengths = np.array([
                len(reference_markers[m_group])
                for m_group in reference_markers
            ])
            marker_match = marker_match / ref_lengths[:, np.newaxis]
            marker_match = np.nan_to_num(marker_match)
        elif normalize == 'data':
            # Ensure columns sum to 1
            data_lengths = np.array(
                [len(data_markers[dat_group]) for dat_group in data_markers])
            marker_match = marker_match / data_lengths
            marker_match = np.nan_to_num(marker_match)
    elif method == 'overlap_coef':
        marker_match = _calc_overlap_coef(reference_markers, data_markers)
    elif method == 'jaccard':
        marker_match = _calc_jaccard(reference_markers, data_markers)

    # Note:
    # Could add an 'enrich' option here
    # (Fisher's exact test or hypergeometric test; see the sketch below this example),
    # but that would require knowledge of the size of the space from which
    # the reference marker gene set was taken.
    # This is at best approximately known.

    # Create a pandas dataframe with the results
    marker_groups = list(reference_markers.keys())
    clusters = list(cluster_ids)
    marker_matching_df = pd.DataFrame(marker_match,
                                      index=marker_groups,
                                      columns=clusters)

    # Store the results
    if inplace:
        adata.uns[key_added] = marker_matching_df
        logg.hint(
            f'added\n    {key_added!r}, marker overlap scores (adata.uns)')
    else:
        return marker_matching_df
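
The three overlap scores described in the docstring above reduce to simple set operations. Below is a minimal illustrative sketch (the helpers `_calc_overlap_count`, `_calc_overlap_coef` and `_calc_jaccard` used in the function are scanpy internals; the toy marker sets and the gene-universe size here are made up):

# toy reference and data-derived marker sets
reference = {'B cells': {'MS4A1', 'CD79A'}}
derived = {'cluster0': {'MS4A1', 'CD79A', 'CD74', 'HLA-DRA'}}

ref = reference['B cells']
dat = derived['cluster0']

overlap_count = len(ref & dat)                            # 2
overlap_coef = len(ref & dat) / min(len(ref), len(dat))   # 2 / 2 = 1.0
jaccard = len(ref & dat) / len(ref | dat)                 # 2 / 4 = 0.5

# The 'enrich' option mentioned in the code is not implemented; with an
# assumed gene-universe size it could be approximated by a hypergeometric test:
from scipy.stats import hypergeom
universe = 20000  # assumed size of the gene space the markers were drawn from
p_enrich = hypergeom.sf(len(ref & dat) - 1, universe, len(ref), len(dat))
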
Ejemplo n.º 33
0
def test_concatenate_dense():
    # dense data
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]),
        obsm=dict(X_1=X1, X_2=X2, X_3=X3),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        obsm=dict(X_1=X1, X_2=X2, X_3=X3),
        layers={"Xs": X2},
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]),
        dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]),
        obsm=dict(X_1=X1, X_2=X2),
        layers=dict(Xs=X3),
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.X.astype(int).tolist() == X_combined
    assert adata.layers["Xs"].astype(int).tolist() == X_combined
    assert adata.obs_keys() == ["anno1", "anno2", "batch"]
    assert adata.var_keys() == ["annoA-0", "annoA-1", "annoB-2"]
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    assert adata.obsm_keys() == ["X_1", "X_2"]
    assert adata.obsm["X_1"].tolist() == np.concatenate([X1, X1, X1]).tolist()

    # with batch_key and batch_categories
    adata = adata1.concatenate(adata2, adata3, batch_key="batch1")
    assert adata.obs_keys() == ["anno1", "anno2", "batch1"]
    adata = adata1.concatenate(adata2,
                               adata3,
                               batch_categories=["a1", "a2", "a3"])
    assert adata.obs["batch"].cat.categories.tolist() == ["a1", "a2", "a3"]
    assert adata.var_names.tolist() == ["b", "c"]

    # outer join
    adata = adata1.concatenate(adata2, adata3, join="outer")
    from numpy import ma

    Xma = ma.masked_invalid(adata.X)
    Xma_ref = ma.masked_invalid(
        np.array([
            [1.0, 2.0, 3.0, np.nan],
            [4.0, 5.0, 6.0, np.nan],
            [np.nan, 3.0, 2.0, 1.0],
            [np.nan, 6.0, 5.0, 4.0],
            [np.nan, 3.0, 2.0, 1.0],
            [np.nan, 6.0, 5.0, 4.0],
        ]))
    assert np.array_equal(Xma.mask, Xma_ref.mask)
    assert np.allclose(Xma.compressed(), Xma_ref.compressed())
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(
        np.array([
            [0.0, np.nan, np.nan],
            [1.0, 2.0, 2.0],
            [2.0, 1.0, 1.0],
            [np.nan, 0.0, 0.0],
        ]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())
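
For reference, more recent anndata releases expose the same inner/outer join behavior through the top-level `anndata.concat` function. A minimal sketch using the `adata1`, `adata2`, `adata3` objects built above (assuming a version of anndata in which `anndata.concat` is available):

import anndata as ad

# inner join: keep only the variables shared by all objects ('b' and 'c' above)
inner = ad.concat([adata1, adata2, adata3], join='inner', label='batch')

# outer join: union of variables; missing dense entries are padded with NaN
outer = ad.concat([adata1, adata2, adata3], join='outer', label='batch')
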
Ejemplo n.º 34
0
Archivo: _model.py Proyecto: vals/scVI
    def from_scvi_model(
        cls,
        scvi_model: SCVI,
        adata: Optional[AnnData] = None,
        restrict_to_batch: Optional[str] = None,
    ):
        """
        Instantiate a SOLO model from an scvi model.

        Parameters
        ----------
        scvi_model
            Pre-trained model of :class:`~scvi.model.SCVI`. The
            adata object used to initialize this model should have only
            been setup with count data, and optionally a `batch_key`;
            i.e., no extra covariates or labels, etc.
        adata
            Optional anndata to use that is compatible with scvi_model.
        restrict_to_batch
        Batch category in `batch_key` used to set up the adata for `scvi_model`,
        to which the SOLO model will be restricted. This allows training a SOLO
        model on one batch of an `scvi_model` that was trained on multiple batches.

        Returns
        -------
        SOLO model
        """
        _validate_scvi_model(scvi_model, restrict_to_batch=restrict_to_batch)
        orig_adata = scvi_model.adata
        orig_batch_key = scvi_model.scvi_setup_dict_["categorical_mappings"][
            "_scvi_batch"
        ]["original_key"]

        if restrict_to_batch is not None:
            batch_mask = orig_adata.obs[orig_batch_key] == restrict_to_batch
            if np.sum(batch_mask) == 0:
                raise ValueError(
                    "Batch category given to restrict_to_batch not found.\n"
                    + "Available categories: {}".format(
                        orig_adata.obs[orig_batch_key].astype("category").cat.categories
                    )
                )
            # indices in adata with restrict_to_batch category
            batch_indices = np.where(batch_mask)[0]
        else:
            # use all indices
            batch_indices = None

        # anndata with only generated doublets
        doublet_adata = cls.create_doublets(orig_adata, indices=batch_indices)
        # if scvi wasn't trained with batch correction, having the
        # zeros here does nothing.
        doublet_adata.obs[orig_batch_key] = (
            restrict_to_batch if restrict_to_batch is not None else 0
        )

        # if the model uses the observed library size, the latent library size
        # is just the observed library size on the log scale, so no mean is needed
        give_mean_lib = not scvi_model.module.use_observed_lib_size

        # get latent representations and make input anndata
        latent_rep = scvi_model.get_latent_representation(
            orig_adata, indices=batch_indices
        )
        lib_size = scvi_model.get_latent_library_size(
            orig_adata, indices=batch_indices, give_mean=give_mean_lib
        )
        latent_adata = AnnData(np.concatenate([latent_rep, lib_size], axis=1))
        latent_adata.obs[LABELS_KEY] = "singlet"
        orig_obs_names = orig_adata.obs_names
        latent_adata.obs_names = (
            orig_obs_names[batch_indices]
            if batch_indices is not None
            else orig_obs_names
        )

        logger.info("Creating doublets, preparing SOLO model.")
        f = io.StringIO()
        with redirect_stdout(f):
            setup_anndata(doublet_adata, batch_key=orig_batch_key)
            doublet_latent_rep = scvi_model.get_latent_representation(doublet_adata)
            doublet_lib_size = scvi_model.get_latent_library_size(
                doublet_adata, give_mean=give_mean_lib
            )
            doublet_adata = AnnData(
                np.concatenate([doublet_latent_rep, doublet_lib_size], axis=1)
            )
            doublet_adata.obs[LABELS_KEY] = "doublet"

            full_adata = latent_adata.concatenate(doublet_adata)
            setup_anndata(full_adata, labels_key=LABELS_KEY)
        return cls(full_adata)
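
A minimal end-to-end sketch of how `from_scvi_model` is typically used. The import paths and the `setup_anndata` call below follow the older scvi-tools API that this example appears to target; `adata` (raw counts), the `batch` key and the `batch_1` category are placeholders:

import scvi
from scvi.external import SOLO

# register the count data and batch information (older scvi-tools API)
scvi.data.setup_anndata(adata, batch_key='batch')

vae = scvi.model.SCVI(adata)
vae.train()

# restrict the doublet classifier to a single batch of the trained model
solo = SOLO.from_scvi_model(vae, restrict_to_batch='batch_1')
solo.train()
doublet_scores = solo.predict()  # per-cell singlet/doublet probabilities
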
Ejemplo n.º 35
0
def test_concatenate():
    # dense data
    adata1 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s1', 's2'],
                      'anno1': ['c1', 'c2']},
                     {'var_names': ['a', 'b', 'c'],
                      'annoA': [0, 1, 2]})
    adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s3', 's4'],
                      'anno1': ['c3', 'c4']},
                     {'var_names': ['d', 'c', 'b'],
                      'annoA': [0, 1, 2]})
    adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s1', 's2'],
                      'anno2': ['d3', 'd4']},
                     {'var_names': ['d', 'c', 'b'],
                      'annoB': [0, 1, 2]})

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.astype(int).tolist() == [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch']
    assert adata.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2']
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    adata = adata1.concatenate(adata2, adata3, batch_key='batch1')
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch1']
    adata = adata1.concatenate(adata2, adata3, batch_categories=['a1', 'a2', 'a3'])
    assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
    assert adata.var_names.tolist() == ['b', 'c']

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    from numpy import ma
    Xma = ma.masked_invalid(adata.X)
    Xma_ref = ma.masked_invalid(np.array([
        [1.0, 2.0, 3.0, np.nan],
        [4.0, 5.0, 6.0, np.nan],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0]]))
    assert np.array_equal(Xma.mask, Xma_ref.mask)
    assert np.allclose(Xma.compressed(), Xma_ref.compressed())
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(np.array(
        [[0.0, np.nan, np.nan], [1.0, 2.0, 2.0], [2.0, 1.0, 1.0], [np.nan, 0.0, 0.0]]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())

    # sparse data
    from scipy.sparse import csr_matrix
    adata1 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
                     {'obs_names': ['s1', 's2'],
                      'anno1': ['c1', 'c2']},
                     {'var_names': ['a', 'b', 'c']})
    adata2 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
                     {'obs_names': ['s3', 's4'],
                      'anno1': ['c3', 'c4']},
                     {'var_names': ['d', 'c', 'b']})
    adata3 = AnnData(csr_matrix([[1, 2, 0], [0, 5, 6]]),
                     {'obs_names': ['s5', 's6'],
                      'anno2': ['d3', 'd4']},
                     {'var_names': ['d', 'c', 'b']})

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.toarray().astype(int).tolist() == [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]]

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    assert adata.X.toarray().tolist() == [
        [0.0, 2.0, 3.0, 0.0],
        [0.0, 5.0, 6.0, 0.0],
        [0.0, 3.0, 2.0, 0.0],
        [0.0, 6.0, 5.0, 0.0],
        [0.0, 0.0, 2.0, 1.0],
        [0.0, 6.0, 5.0, 0.0]]
Ejemplo n.º 36
0
def test_concatenate_with_raw():
    # dense data
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    X4 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        layers=dict(Xs=X2),
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]),
        dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]),
        layers=dict(Xs=X3),
    )

    adata4 = AnnData(
        X4,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c", "z"], annoA=[0, 1, 2, 3]),
        layers=dict(Xs=X4),
    )

    adata1.raw = adata1
    adata2.raw = adata2
    adata3.raw = adata3

    adata_all = AnnData.concatenate(adata1, adata2, adata3)
    assert isinstance(adata_all.raw, Raw)
    assert set(adata_all.raw.var_names) == {"b", "c"}
    assert_equal(adata_all.raw.to_adata().obs, adata_all.obs)
    assert np.array_equal(adata_all.raw.X, adata_all.X)

    adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer")
    assert isinstance(adata_all.raw, Raw)
    assert set(adata_all.raw.var_names) == set("abcd")
    assert_equal(adata_all.raw.to_adata().obs, adata_all.obs)
    assert np.array_equal(np.nan_to_num(adata_all.raw.X),
                          np.nan_to_num(adata_all.X))

    adata3.raw = adata4
    adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer")
    assert isinstance(adata_all.raw, Raw)
    assert set(adata_all.raw.var_names) == set("abcdz")
    assert set(adata_all.var_names) == set("abcd")
    assert not np.array_equal(np.nan_to_num(adata_all.raw.X),
                              np.nan_to_num(adata_all.X))

    del adata3.raw
    with pytest.warns(
            UserWarning,
            match=("Only some adata objects have `.raw` attribute, "
                   "not concatenating `.raw` attributes."),
    ):
        adata_all = AnnData.concatenate(adata1, adata2, adata3)
    assert adata_all.raw is None

    del adata1.raw
    del adata2.raw
    assert all(_adata.raw is None for _adata in (adata1, adata2, adata3))
    adata_all = AnnData.concatenate(adata1, adata2, adata3)
    assert adata_all.raw is None
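
As a usage note, `.raw` is usually set to a snapshot of the count matrix before normalization, so that concatenated objects carry both the processed `.X` and the original counts. A short sketch of that common pattern (not taken from the test above; `adata` is assumed to hold raw counts):

import scanpy as sc

adata.raw = adata              # freeze the current count matrix in .raw
sc.pp.normalize_total(adata)   # subsequent processing only modifies .X
sc.pp.log1p(adata)
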
Ejemplo n.º 37
0
def test_pickle():
    import pickle

    adata = AnnData()
    adata2 = pickle.loads(pickle.dumps(adata))
    assert adata2.obsm.parent is adata2
Ejemplo n.º 38
0
    if expr_type == 'harmony':
        X = correct_harmony(all_dimreds)
    if expr_type == 'scanorama':
        X = correct_scanorama(Xs, genes)
    if expr_type == 'scvi':
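        # place the scVI embedding into a full-size (n_cells x 30) matrix,
        # filling only rows with nonzero total counts, and zero out NaN/inf values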
        nonzero_idx = np.array(X.sum(1) > 0).flatten()
        X = np.zeros((X.shape[0], 30))
        X_scvi = correct_scvi(Xs, genes)
        X[nonzero_idx, :] = X_scvi
        X[np.isnan(X)] = 0
        X[np.isinf(X)] = 0

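    # collapse to one profile per sample: mean of the corrected embedding over that sample's cells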
    C = np.vstack([X[sample_idx].mean(0) for sample_idx in sample_idxs])

    adata = AnnData(X=C)
    adata.obs['study'] = studies

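    # build kNN graphs on the per-sample profiles and draw force-directed layouts for several neighborhood sizes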
    for knn in [15, 20, 30, 40]:
        sc.pp.neighbors(adata, n_neighbors=knn, use_rep='X')
        draw_graph(adata, layout='fa')
        sc.pl.draw_graph(adata,
                         color='study',
                         edges=True,
                         edges_color='#CCCCCC',
                         save='_{}_expr_gmean_k{}.png'.format(
                             NAMESPACE + '_' + expr_type, knn))
        sys.stdout.flush()

    adata = AnnData(X=X)
    adata.obs['study'] = ['_'.join(ct.split('_')[:3]) for ct in cell_types]