Example #1
0
def filter_data(data: AnnData) -> None:
    """ Filter cells and genes in place based on qc_metrics calculated in ``pg.qc_metrics``.

    Cells are kept where ``data.obs["passed_qc"]`` is set, and genes are kept
    where ``data.var["n_cells"]`` is greater than zero. A summary of the
    remaining cells, genes and robust genes (``data.var["robust"]``) is logged.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    Returns
    -------
    ``None``

    Update ``data`` with cells and genes after filtration.

    Raises
    ------
    AssertionError
        If ``data.obs`` does not contain a ``passed_qc`` column.

    Examples
    --------
    >>> pg.filter_data(adata)
    """

    assert "passed_qc" in data.obs, (
        "'passed_qc' not found in data.obs; run pg.qc_metrics before filtering."
    )

    # Subset cells first, then drop genes that have n_cells == 0.
    data._inplace_subset_obs(data.obs["passed_qc"].values)
    data._inplace_subset_var((data.var["n_cells"] > 0).values)

    logger.info(
        "After filtration, {nc} cells and {ng} genes are kept. Among {ng} genes, {nrb} genes are robust.".format(
            nc=data.shape[0], ng=data.shape[1], nrb=data.var["robust"].sum()
        )
    )
Example #2
0
def fit(
    adata: AnnData,
    root=None,
    leaves=None,
    layer: Optional[str] = None,
    n_map: int = 1,
    n_jobs: int = 1,
    gamma: float = 1.5,
    save_raw: bool = True,
    copy: bool = False,
):

    """\
    Model feature expression levels as a function of tree positions.

    The models are fit using *mgcv* R package. Note that adata can currently only keep the
    same dimensions for each of its layers. The dataset is therefore subsetted to keep only
    significant features, while the unsubsetted dataset is kept in adata.raw (save_raw parameter).


    Parameters
    ----------
    adata
        Annotated data matrix.
    root
        restrain the fit to a subset of the tree (in combination with leaves).
    leaves
        restrain the fit to a subset of the tree (in combination with root).
    layer
        adata layer to use for the fitting.
    n_map
        number of cell mappings from which to do the test.
    n_jobs
        number of cpu processes used to perform the test.
    gamma
        stringency of penalty.
    save_raw
        save the unsubsetted anndata to adata.raw
    copy
        Return a copy instead of writing to adata.

    Returns
    -------

    adata : anndata.AnnData
        if `copy=True` it returns the subsetted copy, or else (returning `None`) subsets
        `adata` in place (keeping only significant features) and adds fields to it:

        `.layers['fitted']`
            fitted features on the trajectory for all mappings.

    Raises
    ------
    ValueError
        If `adata.var` has no `signi` column (`tl.test_association` was not run).
    Exception
        If the module-level `check` flags one of the R/rpy2 imports.

    """

    if any(check):
        # Entries that are plain strings are raised as the error message
        # (presumably the text describing a failed R/rpy2 import -- confirm
        # against where `check` is built).
        r_imports = [Rpy2, R, rstats, rmgcv, Formula]
        idx = np.argwhere([isinstance(imp, str) for imp in r_imports]).min()
        raise Exception(np.array(r_imports)[idx])

    adata = adata.copy() if copy else adata

    if "signi" not in adata.var.columns:
        raise ValueError(
            "You need to run `tl.test_association` before fitting features."
        )

    genes = adata.var_names[adata.var.signi]

    graph = adata.uns["graph"]
    tips = graph["tips"]

    mlsc_temp = None
    if leaves is not None:
        # weird hack to keep milestones colors saved
        if "milestones_colors" in adata.uns:
            mlsc_temp = adata.uns["milestones_colors"].copy()
        dct = graph["milestones"]

        # Translate milestone names to their values in graph["milestones"].
        leaves = [dct[leave] for leave in leaves]
        # NOTE(review): `root` must be given together with `leaves`;
        # root=None ends up as a KeyError on this lookup.
        root = dct[root]

    if root is None:
        root = graph["root"]
        tips = tips[~np.isin(tips, root)]
    root2 = None
    if "root2" in graph:
        root2 = graph["root2"]
        tips = tips[~np.isin(tips, graph["root2"])]

    if leaves is not None:
        tips = leaves

    logg.info("fit features associated with the trajectory", reset=True, end="\n")

    # One fitted DataFrame per mapping, with one column per significant feature.
    fitted_maps = []

    for m in tqdm(
        range(n_map), disable=n_map == 1, file=sys.stdout, desc="    multi mapping "
    ):
        if "t_old" in adata.obs.columns:
            df = adata.obs.copy()
        else:
            df = adata.uns["pseudotime_list"][str(m)]

        # Build an igraph graph from the "from"/"to" columns of graph["pp_seg"].
        seg_nodes = graph["pp_seg"][["from", "to"]]
        edges = seg_nodes.astype(str).apply(tuple, axis=1).values
        img = igraph.Graph()
        img.add_vertices(np.unique(seg_nodes.values.flatten().astype(str)))
        img.add_edges(edges)

        subtree = pd.concat(
            [getpath(img, root, tips, tip, graph, df) for tip in tips],
            axis=0,
        )
        if root2 is not None:
            subtree = pd.concat(
                [
                    subtree,
                    pd.concat(
                        [getpath(img, root2, tips, tip, graph, df) for tip in tips],
                        axis=0,
                    ),
                ]
            )
        # `assign` returns a new frame, avoiding a chained assignment on the
        # column selection.
        subtree = subtree[["t", "branch"]].assign(gamma=gamma)

        Xgenes = get_X(adata, subtree.index, genes, layer, togenelist=True)

        # Pair the same subtree with each entry of Xgenes for the fitting jobs.
        data = list(zip([subtree] * len(Xgenes), Xgenes))

        stat = ProgressParallel(
            n_jobs=n_jobs,
            total=len(data),
            use_tqdm=n_map == 1,
            file=sys.stdout,
            desc="    single mapping ",
        )(delayed(gt_fun)(item) for item in data)

        fitted_map = pd.concat(stat, axis=1)
        fitted_map.columns = genes
        fitted_maps.append(fitted_map)

    if n_map == 1:
        fitted = fitted_maps[0]
    else:
        # Average over mappings; entries missing from one mapping count as 0.
        fitted = reduce(lambda x, y: x.add(y, fill_value=0), fitted_maps) / n_map

    # Keep the unsubsetted data before adata is subsetted below.
    if save_raw:
        adata.raw = adata

    adata._inplace_subset_obs(np.unique(fitted_maps[0].index))
    adata._inplace_subset_var(genes)

    adata.layers["fitted"] = fitted.loc[adata.obs_names, :]

    if mlsc_temp is not None:
        adata.uns["milestones_colors"] = mlsc_temp

    logg.info(
        "    finished (adata subsetted to keep only fitted features!)",
        time=True,
        end=" " if settings.verbosity > 2 else "\n",
    )

    if save_raw:
        logg.hint(
            "added\n"
            "    .layers['fitted'], fitted features on the trajectory for all mappings.\n"
            "    .raw, unfiltered data."
        )
    else:
        logg.hint(
            "added\n"
            "    .layers['fitted'], fitted features on the trajectory for all mappings."
        )

    return adata if copy else None
Example #3
0
def filter_cells(
    adata: AnnData,
    min_counts: int = -1,
    max_counts: int = -1,
    max_mt_ratio: int = 20,
    verbose=True,
):
    """Filter problematic cells in an AnnData, in place.

    Genes whose name starts with ``"MT-"`` are flagged in ``adata.var["mt"]``,
    QC metrics are computed with ``sc.pp.calculate_qc_metrics`` and only cells
    passing both the total-count and the mitochondrial filters are kept.

    Args:
      adata(AnnData): The AnnData object to be pre-processed. It is modified
        in place: ``X`` is converted to a dense array when it exposes
        ``toarray``, QC columns are added, and cells are subsetted.
      min_counts(int): Cells must have strictly more total counts than this.
        `-1` -> max(0, median(counts) - std(counts))
      max_counts(int): Cells must have strictly fewer total counts than this.
        `-1` -> median(counts) + std(counts)
      max_mt_ratio(int): Cells must have ``pct_counts_mt`` strictly below this
        value (``pct_counts_mt`` is written by ``sc.pp.calculate_qc_metrics``;
        per the scanpy documentation it is a percentage).
      verbose: Print how many cells pass each filter. (Default value = True)

    Returns:
      None. ``adata`` is updated in place.
    """
    # NOTE: an optional Scrublet-based doublet detection step was previously
    # sketched here as commented-out code; it is not implemented.

    # -- sparse -> array
    # Duck-typed check instead of matching 'ndarray' in the type name, which
    # misclassified ndarray subclasses/views and then failed on `.toarray()`.
    if hasattr(adata.X, "toarray"):
        adata.X = adata.X.toarray()

    # -- Mitochondrial content
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(
        adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
    )

    # -- min/max suggestion (only needed when a threshold is left at -1)
    if min_counts == -1 or max_counts == -1:
        counts = adata.X.sum(axis=1)
        md = np.median(counts)
        sd = np.std(counts)
        if min_counts == -1:
            min_counts = max(0, md - sd)
        if max_counts == -1:
            max_counts = md + sd

    total_counts = adata.obs["total_counts"]
    count_ok = (total_counts > min_counts) & (total_counts < max_counts)
    mt_ok = adata.obs["pct_counts_mt"] < max_mt_ratio

    if verbose:
        print(int(count_ok.sum()), "cells pass the count filter")
        print(int(mt_ok.sum()), " cells pass the mt filter")

    # Positional indices of the cells passing both filters (sorted ascending).
    ind_cells = np.flatnonzero((count_ok & mt_ok).values)
    if verbose:
        print("Cells selected", len(ind_cells))

    adata._inplace_subset_obs(ind_cells)
    gc.collect()
Example #4
0
def filter_cells(adata: AnnData,
                 device="cpu",
                 p_level=None,
                 subset=True,
                 plot=False,
                 copy=False):
    """\
    Filter cells based on the gene/molecule relationship.

    Code has been translated from pagoda2 R function gene.vs.molecule.cell.filter.

    A robust linear model of log1p(number of detected genes) against
    log1p(total counts) is fitted; cells falling outside the band around the
    fit derived from `p_level` are flagged as outliers.


    Parameters
    ----------
    adata
        Annotated data matrix.
    device
        Run gene and molecule counting on either `cpu` or on `gpu`.
    p_level
        Statistical confidence level for deviation from the main trend, used for cell filtering (default=min(1e-3,1/adata.shape[0]))
    subset
        if False, add a column `outlier` in adata.obs, otherwise subset the adata.
    plot
        Plot the molecule distribution and the gene/molecule dependency fit.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------

    adata : anndata.AnnData
        if `copy=True` the modified copy is returned, otherwise `None` is returned and
        `adata` is modified in place. With `subset=True` outliers are removed, or else
        the following field is added:

        `.obs['outlier']`
            whether a cell is an outlier.

    Raises
    ------
    ValueError
        If `device` is neither `cpu` nor `gpu`.

    """

    # Fail early: an unknown device previously fell through both branches and
    # only surfaced later as an unbound-variable error.
    if device not in ("cpu", "gpu"):
        raise ValueError(
            "device must be either 'cpu' or 'gpu', got {!r}".format(device))

    adata = adata.copy() if copy else adata

    logg.info("Filtering cells", reset=True)
    X = adata.X.copy()

    logg.info("    obtaining gene and molecule counts")
    # NOTE(review): both branches overwrite `X.data`, which assumes adata.X is
    # a sparse matrix -- confirm dense input is not expected here.
    if device == "cpu":
        log1p_total_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
        # Setting every stored value to 1 makes the row sums count entries.
        X.data = np.ones_like(X.data)
        log1p_n_genes_by_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
    else:
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu

        X = csr_matrix_gpu(X)
        log1p_total_counts = cp.log1p(X.sum(axis=1)).get().ravel()
        X.data = cp.ones_like(X.data)
        log1p_n_genes_by_counts = cp.log1p(X.sum(axis=1)).get().ravel()

    df = pd.DataFrame(
        {
            "log1p_total_counts": log1p_total_counts,
            "log1p_n_genes_by_counts": log1p_n_genes_by_counts,
        },
        index=adata.obs_names,
    )

    logg.info("    fitting RLM")

    rlm_model = sm.RLM.from_formula(
        "log1p_n_genes_by_counts ~ log1p_total_counts",
        df,
    ).fit()

    p_level = min(1e-3, 1 / adata.shape[0]) if p_level is None else p_level

    # Fitted values on the training data; computed once and reused below.
    fitted = rlm_model.predict()

    SSE_line = ((df.log1p_n_genes_by_counts - fitted)**2).sum()
    MSE = SSE_line / df.shape[0]
    # Lower and upper Student-t quantiles for the two-sided level.
    z = t.ppf((p_level / 2, 1 - p_level / 2), df.shape[0])

    se = np.zeros(df.shape[0])
    # The return value is not used, so get_SE presumably fills `se` in place.
    get_SE(MSE, df.log1p_total_counts.values, se)
    pr = pd.DataFrame(
        {
            0: fitted,
            1: fitted + se * z[0],
            2: fitted + se * z[1],
        },
        index=adata.obs_names,
    )

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")

    outlier = (df.log1p_n_genes_by_counts <
               pr[1]) | (df.log1p_n_genes_by_counts > pr[2])

    if plot:
        fig, ax = plt.subplots()
        idx = df.sort_values("log1p_total_counts").index
        # Shade the band between the cells with the lowest and highest totals.
        ax.fill_between(
            df.log1p_total_counts[[idx[0], idx[-1]]],
            pr[1][[idx[0], idx[-1]]],
            pr[2][[idx[0], idx[-1]]],
            color="yellow",
            alpha=0.3,
        )
        df.loc[~outlier].plot.scatter(x="log1p_total_counts",
                                      y="log1p_n_genes_by_counts",
                                      c="k",
                                      ax=ax,
                                      s=1)
        df.loc[outlier].plot.scatter(x="log1p_total_counts",
                                     y="log1p_n_genes_by_counts",
                                     c="grey",
                                     ax=ax,
                                     s=1)

    if subset:
        adata._inplace_subset_obs(adata.obs_names[~outlier])
        logg.hint("subsetted adata.")

    else:
        adata.obs["outlier"] = outlier
        logg.hint("added \n"
                  "    .obs['outlier'], boolean column indicating outliers.")

    return adata if copy else None
Example #5
0
def merge(adata: AnnData,
          ldata: AnnData,
          copy: bool = True) -> Optional[AnnData]:
    """Merge two annotated data matrices.

    Both objects are restricted to their shared observations; when they share
    variables but their variable names differ, both are also restricted to the
    shared variables. Entries of ``ldata`` that are missing from ``adata``
    (``obs``, ``obsm``, ``uns``, and, when variables are shared, ``layers``,
    ``var`` and ``varm``) are then added to the result.

    Note that variable names of both inputs are made unique in place, and
    ``set_initial_size`` / ``clean_obs_names`` may be applied to the inputs,
    independently of ``copy``.

    Arguments
    ---------
    adata
        Annotated data matrix (reference data set).
    ldata
        Annotated data matrix (to be merged into adata).
    copy
        Boolean flag to manipulate original AnnData or a copy of it.

    Returns
    -------
    Optional[:class:`anndata.AnnData`]
        The merged :class:`~anndata.AnnData` object if ``copy`` is true,
        otherwise ``None`` (``adata`` is updated in place).

    Raises
    ------
    ValueError
        If variables are shared and both objects end up with the same number
        of variables but different variable names.
    """

    def _lacks_initial_size(data):
        # Has a "spliced" layer but no recorded initial size yet.
        return ("spliced" in data.layers.keys()
                and "initial_size_spliced" not in data.obs.keys())

    for data in (adata, ldata):
        data.var_names_make_unique()

    # At most one of the two objects gets its initial size set; ldata first.
    if _lacks_initial_size(ldata):
        set_initial_size(ldata)
    elif _lacks_initial_size(adata):
        set_initial_size(adata)

    shared_obs = pd.unique(adata.obs_names.intersection(ldata.obs_names))
    shared_vars = pd.unique(adata.var_names.intersection(ldata.var_names))

    # No overlap at all: retry after cleaning the observation names.
    if len(shared_obs) == 0:
        clean_obs_names(adata)
        clean_obs_names(ldata)
        shared_obs = adata.obs_names.intersection(ldata.obs_names)

    if not copy:
        adata._inplace_subset_obs(shared_obs)
        _adata = adata
    else:
        _adata = adata[shared_obs].copy()
    _ldata = ldata[shared_obs].copy()

    _adata.var_names_make_unique()
    _ldata.var_names_make_unique()

    def _var_names_match():
        # Same length and identical names, position by position.
        return (len(_adata.var_names) == len(_ldata.var_names)
                and np.all(_adata.var_names == _ldata.var_names))

    def _adopt_missing(slot):
        # Copy every key of `_ldata.<slot>` that `_adata.<slot>` lacks.
        for key in getattr(_ldata, slot).keys():
            if key not in getattr(_adata, slot).keys():
                getattr(_adata, slot)[key] = getattr(_ldata, slot)[key]

    same_vars = _var_names_match()
    join_vars = len(shared_vars) > 0

    if join_vars and not same_vars:
        for data in (_adata, _ldata):
            data._inplace_subset_var(shared_vars)

    for slot in ("obs", "obsm", "uns"):
        _adopt_missing(slot)

    if join_vars:
        _adopt_missing("layers")

        if _adata.shape[1] == _ldata.shape[1]:
            # Re-check after the possible subsetting above.
            if not _var_names_match():
                raise ValueError("Variable names are not identical.")
            for slot in ("var", "varm"):
                _adopt_missing(slot)

    return _adata if copy else None