Example #1
def _regress_out_chunk(data):
    # data is a tuple of the selected columns from adata.X,
    # the regressors DataFrame, and the categorical flag
    data_chunk, regressors, variable_is_categorical = data

    responses_chunk_list = []
    # statsmodels is imported lazily inside the worker; numpy is added here
    # so that the snippet is self-contained
    import numpy as np
    import statsmodels.api as sm
    from statsmodels.tools.sm_exceptions import PerfectSeparationError

    for col_index in range(data_chunk.shape[1]):
        if variable_is_categorical:
            # design matrix: intercept plus the per-category means of this gene
            regres = np.c_[np.ones(regressors.shape[0]),
                           regressors[:, col_index]]
        else:
            regres = regressors
        try:
            result = sm.GLM(data_chunk[:, col_index],
                            regres,
                            family=sm.families.Gaussian()).fit()
            new_column = result.resid_response
        except PerfectSeparationError:  # this emulates R's behavior
            logg.warn(
                'Encountered PerfectSeparationError, setting to 0 as in R.')
            new_column = np.zeros(data_chunk.shape[0])

        responses_chunk_list.append(new_column)

    return np.vstack(responses_chunk_list)
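
For illustration only, here is a sketch of calling the worker directly on made-up data (assumes `pandas` and `statsmodels` are installed; the names and values are invented, and `logg` is only touched in the error branch):

import numpy as np
import pandas as pd

# toy chunk: 5 cells x 2 genes, plus an intercept and one ordinal regressor,
# mirroring the tuples that regress_out (Example #3) assembles
X_chunk = np.random.rand(5, 2)
regressors = pd.DataFrame({'ones': np.ones(5),
                           'n_counts': [10., 12., 9., 11., 13.]})
residuals = _regress_out_chunk((X_chunk, regressors, False))
print(residuals.shape)  # (2, 5): one row of residuals per gene column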
Example #2
def downsample_counts(adata, target_counts=20000, random_state=0, copy=False):
    """Downsample counts so that each cell has no more than `target_counts`.

    Cells with fewer counts than `target_counts` are unaffected by this. This
    has been implemented by M. D. Luecken.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    target_counts : `int`, optional (default: 20,000)
        Target number of counts for downsampling. Cells with more counts than
        `target_counts` will be downsampled to have `target_counts` counts.
    random_state : `int` or `None`, optional (default: 0)
        Random seed for subsampling.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Depending on `copy` returns or updates an `adata` with downsampled `.X`.
    """
    if target_counts < 1:
        raise ValueError(
            '`target_counts` must be a positive integer, got {}'
            .format(target_counts))
    if not isinstance(adata, AnnData):
        raise ValueError(
            '`adata` must be an `AnnData` object, got {}'.format(type(adata)))
    logg.msg('downsampling to {} counts'.format(target_counts), r=True)
    adata = adata.copy() if copy else adata
    np.random.seed(random_state)
    counts = adata.X.sum(axis=1)
    adata.obs['n_counts'] = counts
    for icell in range(adata.n_obs):
        if counts[icell] > target_counts:
            # expand the count vector into a flat list of gene indices,
            # repeating each gene index once per observed count
            idx_vec = []
            for ix, i in enumerate(adata.X[icell].astype(int)):
                idx_vec.extend([ix] * i)
            # sample without replacement so each original count is kept at most once
            downsamp = np.random.choice(idx_vec, target_counts, replace=False)
            cell_profile = np.zeros(adata.n_vars)
            indices, values = np.unique(downsamp, return_counts=True)
            cell_profile[indices] = values
            adata.X[icell] = cell_profile
    logg.msg('finished', t=True)
    return adata if copy else None
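
A rough usage sketch (invented counts; assumes `anndata` is installed and the module-level names `np`, `AnnData`, and `logg` are in scope, as in scanpy's preprocessing module):

import numpy as np
from anndata import AnnData

# 3 cells x 4 genes of made-up integer counts
counts = np.array([[50., 30., 10., 10.],
                   [ 5.,  3.,  1.,  1.],
                   [20., 20., 30., 30.]])
adata = AnnData(counts)
downsample_counts(adata, target_counts=20)
print(adata.X.sum(axis=1))  # no cell exceeds 20 counts anymore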
Example #3
def regress_out(adata, keys, n_jobs=None, copy=False):
    """Regress out unwanted sources of variation.

    Uses simple linear regression. This is inspired by Seurat's `regressOut`
    function in R [Satija15]_.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        The annotated data matrix.
    keys : `str` or list of `str`
        Keys for observation annotations on which to regress.
    n_jobs : `int` or `None`, optional (default: `None`)
        Number of jobs for parallel computation. If `None`, scanpy's global
        `n_jobs` setting (`sett.n_jobs`) is used.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Depending on `copy` returns or updates `adata` with the corrected data matrix.
    """
    logg.info('regressing out', keys, r=True)
    if issparse(adata.X):
        logg.info('    sparse input is densified and may '
                  'lead to high memory use')
    adata = adata.copy() if copy else adata
    if isinstance(keys, str):
        keys = [keys]

    if issparse(adata.X):
        adata.X = adata.X.toarray()

    n_jobs = sett.n_jobs if n_jobs is None else n_jobs

    # regress on a single categorical variable
    sanitize_anndata(adata)
    variable_is_categorical = False
    if keys[0] in adata.obs_keys() and is_categorical_dtype(adata.obs[keys[0]]):
        if len(keys) > 1:
            raise ValueError('If providing categorical variable, '
                             'only a single one is allowed. For this one '
                             'we regress on the mean for each category.')
        logg.msg('... regressing on per-gene means within categories')
        regressors = np.zeros(adata.X.shape, dtype='float32')
        for category in adata.obs[keys[0]].cat.categories:
            mask = (category == adata.obs[keys[0]]).values
            for ix, x in enumerate(adata.X.T):
                regressors[mask, ix] = x[mask].mean()
        variable_is_categorical = True
    # regress on one or several ordinal variables
    else:
        # create data frame with selected keys (if given)
        if keys:
            regressors = adata.obs[keys]
        else:
            regressors = adata.obs.copy()

        # add column of ones at index 0 (first column)
        regressors.insert(0, 'ones', 1.0)

    # choose the chunk length so that min(1000, n_vars) columns are spread
    # across the n_jobs workers
    len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int)
    n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int)

    tasks = []
    # split the adata.X matrix by columns into chunks of size len_chunk
    # (the last chunk may be smaller than the others)
    chunk_list = np.array_split(adata.X, n_chunks, axis=1)
    if variable_is_categorical:
        regressors_chunk = np.array_split(regressors, n_chunks, axis=1)
    for idx, data_chunk in enumerate(chunk_list):
        # each task is a tuple of a data chunk, e.g. adata.X[:, 0:100], and
        # the regressors; this is what gets passed to each worker job
        if variable_is_categorical:
            regres = regressors_chunk[idx]
        else:
            regres = regressors
        tasks.append((data_chunk, regres, variable_is_categorical))

    if n_jobs > 1 and n_chunks > 1:
        import multiprocessing
        pool = multiprocessing.Pool(n_jobs)
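        # the large timeout on .get() keeps the main process responsive to
        # KeyboardInterrupt while waiting (a bare .get() would swallow it)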
        res = pool.map_async(_regress_out_chunk, tasks).get(9999999)
        pool.close()

    else:
        res = list(map(_regress_out_chunk, tasks))

    # res is a list of chunks, each holding one row of residuals per gene
    # column; transpose to get back the cells x genes orientation
    adata.X = np.vstack(res).T.astype(adata.X.dtype)
    logg.info('    finished', t=True)
    return adata if copy else None
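
A minimal sketch of calling it (random data; again assuming the module-level scanpy names such as `logg`, `sanitize_anndata`, and `is_categorical_dtype` are in scope):

import numpy as np
from anndata import AnnData

adata = AnnData(np.random.rand(100, 20).astype(np.float32))
adata.obs['n_counts'] = adata.X.sum(axis=1)
# remove the linear effect of library size from every gene
regress_out(adata, 'n_counts', n_jobs=1)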
Example #4
def pca(data,
        n_comps=None,
        zero_center=True,
        svd_solver='auto',
        random_state=0,
        return_info=False,
        use_highly_variable=None,
        dtype='float32',
        copy=False,
        chunked=False,
        chunk_size=None):
    """Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition. Uses the
    implementation of *scikit-learn* [Pedregosa11]_.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    n_comps : `int`, optional (default: 50)
        Number of principal components to compute.
    zero_center : `bool` or `None`, optional (default: `True`)
        If `True`, compute standard PCA from covariance matrix. If `False`, omit
        zero-centering variables (uses *TruncatedSVD* from scikit-learn), which
        makes it possible to handle sparse input efficiently.
    svd_solver : `str`, optional (default: 'auto')
        SVD solver to use. Either 'arpack' for the ARPACK wrapper in SciPy
        (scipy.sparse.linalg.svds), or 'randomized' for the randomized algorithm
        due to Halko (2009). 'auto' chooses automatically depending on the size
        of the problem.
    random_state : `int`, optional (default: 0)
        Change to use different initial states for the optimization.
    return_info : `bool`, optional (default: `False`)
        Only relevant when not passing an :class:`~anndata.AnnData`: see
        "Returns".
    use_highly_variable : `bool`, optional (default: `None`)
        Whether to use highly variable genes only, stored in
        `.var['highly_variable']`. If `None`, they are used whenever
        `'highly_variable'` is present in `adata.var`.
    dtype : `str` (default: 'float32')
        Numpy data type string to which to convert the result.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked : `bool`, optional (default: `False`)
        If `True`, perform an incremental PCA on segments of `chunk_size`. The
        incremental PCA automatically zero centers and ignores settings of
        `random_state` and `svd_solver`. If `False`, perform a full PCA.
    chunk_size : `int`, optional (default: `None`)
        Number of observations to include in each chunk. Required if `chunked`
        is `True`.

    Returns
    -------
    If `data` is array-like and `return_info == False`, only returns `X_pca`,
    otherwise returns or adds to `adata`:
    X_pca : `.obsm`
         PCA representation of data.
    PCs : `.varm`
         The principal components containing the loadings.
    variance_ratio : `.uns['pca']`
         Ratio of explained variance.
    variance : `.uns['pca']`
         Explained variance, equivalent to the eigenvalues of the covariance matrix.
    """
    # chunked calculation is not randomized, anyways
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'`. This will likely '
            'become the Scanpy default in the future.')

    if n_comps is None: n_comps = N_PCS

    if isinstance(data, AnnData):
        data_is_AnnData = True
        adata = data.copy() if copy else data
    else:
        data_is_AnnData = False
        adata = AnnData(data)

    logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4)

    if adata.n_vars < n_comps:
        n_comps = adata.n_vars - 1
        logg.msg('reducing number of computed PCs to',
                 n_comps,
                 'as dim of data is only',
                 adata.n_vars,
                 v=4)

    if use_highly_variable is True and 'highly_variable' not in adata.var.keys():
        raise ValueError(
            'Did not find adata.var[\'highly_variable\']. '
            'Either your data already only consists of highly-variable genes '
            'or consider running `pp.filter_genes_dispersion` first.')
    if use_highly_variable is None:
        use_highly_variable = 'highly_variable' in adata.var.keys()
    adata_comp = (adata[:, adata.var['highly_variable']]
                  if use_highly_variable else adata)

    if chunked:
        if not zero_center or random_state or svd_solver != 'auto':
            logg.msg('Ignoring zero_center, random_state, svd_solver', v=4)

        from sklearn.decomposition import IncrementalPCA

        X_pca = np.zeros((adata_comp.X.shape[0], n_comps), adata_comp.X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        for chunk, _, _ in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        for chunk, start, end in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    else:
        if zero_center is None:
            zero_center = not issparse(adata_comp.X)
        if zero_center:
            from sklearn.decomposition import PCA
            if issparse(adata_comp.X):
                logg.msg(
                    '    as `zero_center=True`, '
                    'sparse input is densified and may '
                    'lead to huge memory consumption',
                    v=4)
                # densifying copies the whole of adata_comp.X, which can be
                # very memory-hungry
                X = adata_comp.X.toarray()
            else:
                X = adata_comp.X
            pca_ = PCA(n_components=n_comps,
                       svd_solver=svd_solver,
                       random_state=random_state)
        else:
            from sklearn.decomposition import TruncatedSVD
            logg.msg(
                '    without zero-centering: \n'
                '    the explained variance does not correspond to the exact statistical definition\n'
                '    the first component, e.g., might be heavily influenced by different means\n'
                '    the following components often resemble the exact PCA very closely',
                v=4)
            pca_ = TruncatedSVD(n_components=n_comps,
                                random_state=random_state)
            X = adata_comp.X
        X_pca = pca_.fit_transform(X)

    if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        if use_highly_variable:
            adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
            adata.varm['PCs'][
                adata.var['highly_variable']] = pca_.components_.T
        else:
            adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca'] = {}
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.msg('    finished', t=True, end=' ', v=4)
        logg.msg(
            'and added\n'
            '    \'X_pca\', the PCA coordinates (adata.obsm)\n'
            '    \'PCs\', the loadings (adata.varm)\n'
            '    \'pca\'[\'variance\'] and \'pca\'[\'variance_ratio\'], '
            'the explained variance and its ratio (adata.uns)',
            v=4)
        return adata if copy else None
    else:
        if return_info:
            return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_
        else:
            return X_pca
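
Finally, a small sketch of the array-like code path (illustrative random data; `N_PCS` and `logg` are module-level names assumed to be in scope):

import numpy as np

X = np.random.rand(200, 50).astype(np.float32)

# array-like input without return_info: only the coordinates come back
X_pca = pca(X, n_comps=10)
print(X_pca.shape)  # (200, 10)

# with return_info=True, loadings and explained variances are returned too
X_pca, components, variance_ratio, variance = pca(X, n_comps=10,
                                                  return_info=True)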