def filter_data(data: AnnData) -> None:
    """
    Filter data based on qc_metrics calculated in ``pg.qc_metrics``.

    Cells are kept according to the ``passed_qc`` column of ``data.obs``;
    genes are kept when their ``n_cells`` entry in ``data.var`` is positive.
    A summary of the remaining cells, genes and robust genes is logged.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.
        ``data.obs`` must contain ``passed_qc``; ``data.var`` is read for
        ``n_cells`` and ``robust``.

    Returns
    -------
    ``None``

    Update ``data`` in place with cells and genes after filtration.

    Examples
    --------
    >>> pg.filter_data(adata)
    """
    assert "passed_qc" in data.obs

    # Cells first: the ``passed_qc`` values are used directly as the selector.
    cell_selector = data.obs["passed_qc"].values
    data._inplace_subset_obs(cell_selector)

    # Then genes: drop every gene whose ``n_cells`` is not positive.
    gene_selector = (data.var["n_cells"] > 0).values
    data._inplace_subset_var(gene_selector)

    # Report the shape after both subsets have been applied.
    kept_cells, kept_genes = data.shape[0], data.shape[1]
    robust_genes = data.var["robust"].sum()
    template = "After filteration, {nc} cells and {ng} genes are kept. Among {ng} genes, {nrb} genes are robust."
    logger.info(template.format(nc=kept_cells, ng=kept_genes, nrb=robust_genes))
def fit(
    adata: AnnData,
    root=None,
    leaves=None,
    layer: Optional[str] = None,
    n_map: int = 1,
    n_jobs: int = 1,
    gamma: float = 1.5,
    save_raw: bool = True,
    copy: bool = False,
):
    """\
    Model feature expression levels as a function of tree positions.

    The models are fit using *mgcv* R package. Note that adata can currently
    only keep the same dimensions for each of its layers: the dataset is
    subsetted to keep only significant features, while the unsubsetted
    dataset is kept in adata.raw (see the `save_raw` parameter).

    Parameters
    ----------
    adata
        Annotated data matrix. `adata.var` must contain a `signi` column and
        `adata.uns['graph']` must be present.
    root
        restrain the fit to a subset of the tree (in combination with leaves).
        When `leaves` is given, `root` is looked up in the graph milestones
        and therefore must be given as well.
    leaves
        restrain the fit to a subset of the tree (in combination with root).
    layer
        adata layer to use for the fitting.
    n_map
        number of cell mappings from which to do the test.
    n_jobs
        number of cpu processes used to perform the test.
    gamma
        stringency of penalty.
    save_raw
        save the unsubsetted anndata to adata.raw
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns subsetted or else subset (keeping only
        significant features) and add fields to `adata`:

        `.layers['fitted']`
            fitted features on the trajectory for all mappings.

    Raises
    ------
    Exception
        If `check` reports a problem; the message is the first string found
        among the R bridge objects.
    ValueError
        If `adata.var` has no `signi` column.
    """
    if any(check):
        # NOTE(review): presumably each of these names holds an error message
        # string when its import failed -- confirm where they are defined.
        r_deps = [Rpy2, R, rstats, rmgcv, Formula]
        idx = np.argwhere([isinstance(imp, str) for imp in r_deps]).min()
        raise Exception(np.array(r_deps)[idx])

    adata = adata.copy() if copy else adata

    if "signi" not in adata.var.columns:
        raise ValueError(
            "You need to run `tl.test_association` before fitting features."
        )

    genes = adata.var_names[adata.var.signi]

    graph = adata.uns["graph"]
    tips = graph["tips"]
    mlsc_temp = None
    if leaves is not None:
        # weird hack to keep milestones colors saved: they are restored into
        # adata.uns after the subsetting at the end of the function.
        if "milestones_colors" in adata.uns:
            mlsc_temp = adata.uns["milestones_colors"].copy()
        dct = graph["milestones"]
        # Translate milestone names into the values stored in the graph.
        leaves = [dct[leave] for leave in leaves]
        root = dct[root]

    if root is None:
        root = graph["root"]
        tips = tips[~np.isin(tips, root)]
    root2 = None
    if "root2" in graph:
        root2 = graph["root2"]
        tips = tips[~np.isin(tips, graph["root2"])]

    if leaves is not None:
        tips = leaves

    logg.info("fit features associated with the trajectory", reset=True, end="\n")

    # Edge and vertex lists depend only on the graph, so build them once
    # instead of once per mapping.
    pp_seg_ends = graph["pp_seg"][["from", "to"]]
    edges = pp_seg_ends.astype(str).apply(tuple, axis=1).values
    vertices = np.unique(pp_seg_ends.values.flatten().astype(str))

    stat_assoc = []

    for m in tqdm(
        range(n_map), disable=n_map == 1, file=sys.stdout, desc=" multi mapping "
    ):
        if "t_old" in adata.obs.columns:
            df = adata.obs.copy()
        else:
            df = adata.uns["pseudotime_list"][str(m)]

        img = igraph.Graph()
        img.add_vertices(vertices)
        img.add_edges(edges)

        subtree = pd.concat(
            [getpath(img, root, tips, tip, graph, df) for tip in tips],
            axis=0,
        )
        if root2 is not None:
            subtree = pd.concat(
                [
                    subtree,
                    pd.concat(
                        [getpath(img, root2, tips, tip, graph, df) for tip in tips],
                        axis=0,
                    ),
                ]
            )
        subtree = subtree[["t", "branch"]]
        subtree["gamma"] = gamma
        Xgenes = get_X(adata, subtree.index, genes, layer, togenelist=True)

        # Every feature is paired with the same subtree table.
        data = list(zip([subtree] * len(Xgenes), Xgenes))

        stat = ProgressParallel(
            n_jobs=n_jobs,
            total=len(data),
            use_tqdm=n_map == 1,
            file=sys.stdout,
            desc=" single mapping ",
        )(delayed(gt_fun)(item) for item in data)
        stat_assoc.append(stat)

    # One table per mapping, with one column per fitted feature.
    fitted_per_map = []
    for stat in stat_assoc:
        table = pd.concat(stat, axis=1)
        table.columns = genes
        fitted_per_map.append(table)

    if n_map == 1:
        fitted = fitted_per_map[0]
    else:
        # Average over mappings; cells missing from a mapping contribute 0.
        fitted = reduce(lambda x, y: x.add(y, fill_value=0), fitted_per_map) / n_map

    if save_raw:
        adata.raw = adata

    adata._inplace_subset_obs(np.unique(fitted_per_map[0].index))
    adata._inplace_subset_var(genes)

    adata.layers["fitted"] = fitted.loc[adata.obs_names, :]

    if mlsc_temp is not None:
        adata.uns["milestones_colors"] = mlsc_temp

    logg.info(
        " finished (adata subsetted to keep only fitted features!)",
        time=True,
        end=" " if settings.verbosity > 2 else "\n",
    )

    if save_raw:
        logg.hint(
            "added\n"
            " .layers['fitted'], fitted features on the trajectory for all mappings.\n"
            " .raw, unfiltered data."
        )
    else:
        logg.hint(
            "added\n"
            " .layers['fitted'], fitted features on the trajectory for all mappings."
        )

    return adata if copy else None
def filter_cells(
    adata: AnnData,
    min_counts: int = -1,
    max_counts: int = -1,
    max_mt_ratio: int = 20,
    verbose=True,
):
    """Filter problematic cells in an AnnData, in place.

    Cells are kept when their ``total_counts`` lies strictly between
    ``min_counts`` and ``max_counts`` and their ``pct_counts_mt`` is strictly
    below ``max_mt_ratio``.

    Args:
      adata(AnnData): The AnnData object to be pre-processed.
      min_counts(int): Minimum number of counts required for a cell to pass
        filtering. `-1` -> max(0, median(counts) - std(counts))
      max_counts(int): Maximum number of counts allowed for a cell to pass
        filtering. `-1` -> median(counts) + std(counts)
      max_mt_ratio(int): Upper bound on the ``pct_counts_mt`` value of a cell
        (the column is expected to come from the ``calculate_qc_metrics``
        call made here).
      verbose: Print how many cells pass each filter. (Default value = True)

    Returns:
      None. ``adata`` is modified in place: ``adata.X`` is converted to a
      dense array if it is not one already, ``adata.var["mt"]`` is added,
      QC metrics are written by ``sc.pp.calculate_qc_metrics``, and the
      observations are subset to the selected cells.
    """
    # NOTE: a Scrublet-based doublet filter (``doublet_detection`` /
    # ``scrublet_kwargs`` parameters) used to be sketched here as
    # commented-out code; it is not implemented.

    # -- sparse -> array
    # isinstance also recognises ndarray subclasses whose type name does not
    # contain "ndarray"; those have no ``toarray`` method.
    if not isinstance(adata.X, np.ndarray):
        adata.X = adata.X.toarray()

    # -- Mitochondrial content
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(
        adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
    )

    # -- min/max suggestion
    counts = adata.X.sum(axis=1)
    md = np.median(counts)
    sd = np.std(counts)
    if min_counts == -1:
        min_counts = max(0, md - sd)
    if max_counts == -1:
        max_counts = md + sd

    total_counts = adata.obs["total_counts"]
    count_ok = ((total_counts > min_counts) & (total_counts < max_counts)).to_numpy()
    mt_ok = (adata.obs["pct_counts_mt"] < max_mt_ratio).to_numpy()

    if verbose:
        print(int(count_ok.sum()), "cells pass the count filter")
        print(int(mt_ok.sum()), " cells pass the mt filter")

    # Positional indices of the cells passing both filters.
    ind_cells = np.flatnonzero(count_ok & mt_ok)
    if verbose:
        print("Cells selected", len(ind_cells))

    adata._inplace_subset_obs(ind_cells)
    gc.collect()
def filter_cells(adata: AnnData, device="cpu", p_level=None, subset=True, plot=False, copy=False):
    """\
    Filter cells using the gene/molecule relationship.

    Code has been translated from pagoda2 R function gene.vs.molecule.cell.filter.

    A robust linear model of log1p(number of detected genes) against
    log1p(total counts) is fitted; cells falling outside the band around the
    fitted line are flagged as outliers.

    Parameters
    ----------
    adata
        Annotated data matrix.
    device
        Run gene and molecule counting on either `cpu` or on `gpu`.
    p_level
        Statistical confidence level for deviation from the main trend, used for cell filtering (default=min(1e-3,1/adata.shape[0]))
    subset
        if False, add a column `outlier` in adata.obs, otherwise subset the adata.
    plot
        Plot the molecule distribution and the gene/molecule dependency fit.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        Returned only if `copy=True`, otherwise `None` is returned and `adata`
        is modified. With `subset=True` the outliers are removed; with
        `subset=False` the following field is added instead:

        `.obs['outlier']`
            whether a cell is an outlier.

    Raises
    ------
    ValueError
        If `device` is neither `cpu` nor `gpu`.
    """
    # Reject unknown devices up front; otherwise neither branch below would
    # define the count vectors and the failure would be an obscure name error.
    if device not in ("cpu", "gpu"):
        raise ValueError(f"device must be 'cpu' or 'gpu', got {device!r}")

    adata = adata.copy() if copy else adata

    logg.info("Filtering cells", reset=True)
    # Work on a copy because the stored values are overwritten with ones below.
    # NOTE(review): assumes adata.X is a sparse matrix with a writable
    # ``.data`` attribute -- confirm behaviour for dense input.
    X = adata.X.copy()

    logg.info(" obtaining gene and molecule counts")
    if device == "cpu":
        log1p_total_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
        # Setting every stored value to 1 turns the row sum into a count of
        # stored entries per cell.
        X.data = np.ones_like(X.data)
        log1p_n_genes_by_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
    elif device == "gpu":
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu

        X = csr_matrix_gpu(X)
        log1p_total_counts = cp.log1p(X.sum(axis=1)).get().ravel()
        X.data = cp.ones_like(X.data)
        log1p_n_genes_by_counts = cp.log1p(X.sum(axis=1)).get().ravel()

    df = pd.DataFrame(
        {
            "log1p_total_counts": log1p_total_counts,
            "log1p_n_genes_by_counts": log1p_n_genes_by_counts,
        },
        index=adata.obs_names,
    )

    logg.info(" fitting RLM")
    rlm_model = sm.RLM.from_formula(
        "log1p_n_genes_by_counts ~ log1p_total_counts",
        df,
    ).fit()

    p_level = min(1e-3, 1 / adata.shape[0]) if p_level is None else p_level

    # In-sample predictions are reused several times; compute them once.
    fitted = rlm_model.predict()
    SSE_line = ((df.log1p_n_genes_by_counts - fitted) ** 2).sum()
    MSE = SSE_line / df.shape[0]

    # Lower and upper quantiles of Student's t for the requested level.
    z = t.ppf((p_level / 2, 1 - p_level / 2), df.shape[0])

    se = np.zeros(df.shape[0])
    # NOTE(review): get_SE is defined elsewhere; it is called only for its
    # effect on `se`, which it presumably fills in place -- confirm.
    get_SE(MSE, df.log1p_total_counts.values, se)

    # Column 0: fitted line, columns 1 and 2: lower and upper bounds.
    pr = pd.DataFrame(
        {
            0: fitted,
            1: fitted + se * z[0],
            2: fitted + se * z[1],
        },
        index=adata.obs_names,
    )

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")

    outlier = (df.log1p_n_genes_by_counts < pr[1]) | (df.log1p_n_genes_by_counts > pr[2])

    if plot:
        _, ax = plt.subplots()
        idx = df.sort_values("log1p_total_counts").index
        # Shade the band between the cells with the lowest and highest counts.
        ends = [idx[0], idx[-1]]
        ax.fill_between(
            df.log1p_total_counts[ends],
            pr[1][ends],
            pr[2][ends],
            color="yellow",
            alpha=0.3,
        )
        df.loc[~outlier].plot.scatter(x="log1p_total_counts", y="log1p_n_genes_by_counts", c="k", ax=ax, s=1)
        df.loc[outlier].plot.scatter(x="log1p_total_counts", y="log1p_n_genes_by_counts", c="grey", ax=ax, s=1)

    if subset:
        adata._inplace_subset_obs(adata.obs_names[~outlier])
        logg.hint("subsetted adata.")
    else:
        adata.obs["outlier"] = outlier
        logg.hint("added \n" " .obs['outlier'], boolean column indicating outliers.")

    return adata if copy else None
def merge(adata: AnnData, ldata: AnnData, copy: bool = True) -> Optional[AnnData]:
    """Merge two annotated data matrices.

    Both objects are restricted to their shared observations (and, when they
    share some but not all variables, to their shared variables). Every
    annotation present in ``ldata`` but missing from ``adata`` is then carried
    over; existing entries of ``adata`` are never overwritten.

    Arguments
    ---------
    adata
        Annotated data matrix (reference data set).
    ldata
        Annotated data matrix (to be merged into adata).
    copy
        Boolean flag to manipulate original AnnData or a copy of it.
        Note that variable names of both inputs are made unique (and initial
        sizes / observation names may be adjusted) on the inputs themselves
        regardless of this flag.

    Returns
    -------
    Optional[:class:`anndata.AnnData`]
        Returns a :class:`~anndata.AnnData` object if ``copy`` is true,
        otherwise ``None`` (``adata`` is then updated in place).

    Raises
    ------
    ValueError
        If both objects end up with the same number of variables but their
        variable names differ.
    """

    def _lacks_initial_size(obj):
        # True when spliced counts exist but no initial size was stored yet.
        return (
            "spliced" in obj.layers.keys()
            and "initial_size_spliced" not in obj.obs.keys()
        )

    def _var_names_match(left, right):
        # Same length and element-wise identical variable names.
        return len(left.var_names) == len(right.var_names) and np.all(
            left.var_names == right.var_names
        )

    def _adopt_missing(target, source, slot):
        # Copy every key of source.<slot> that target.<slot> does not have.
        for attr in getattr(source, slot).keys():
            if attr not in getattr(target, slot).keys():
                getattr(target, slot)[attr] = getattr(source, slot)[attr]

    adata.var_names_make_unique()
    ldata.var_names_make_unique()

    # Only one of the two objects gets its initial size set; ldata wins.
    if _lacks_initial_size(ldata):
        set_initial_size(ldata)
    elif _lacks_initial_size(adata):
        set_initial_size(adata)

    common_obs = pd.unique(adata.obs_names.intersection(ldata.obs_names))
    common_vars = pd.unique(adata.var_names.intersection(ldata.var_names))

    # No overlap at all: retry after cleaning the observation names.
    if not len(common_obs):
        clean_obs_names(adata)
        clean_obs_names(ldata)
        common_obs = adata.obs_names.intersection(ldata.obs_names)

    if not copy:
        adata._inplace_subset_obs(common_obs)
        _adata, _ldata = adata, ldata[common_obs].copy()
    else:
        _adata = adata[common_obs].copy()
        _ldata = ldata[common_obs].copy()

    _adata.var_names_make_unique()
    _ldata.var_names_make_unique()

    same_vars = _var_names_match(_adata, _ldata)
    join_vars = len(common_vars) > 0

    if join_vars and not same_vars:
        _adata._inplace_subset_var(common_vars)
        _ldata._inplace_subset_var(common_vars)

    # Observation-level annotations can always be merged.
    for slot in ("obs", "obsm", "uns"):
        _adopt_missing(_adata, _ldata, slot)

    if join_vars:
        _adopt_missing(_adata, _ldata, "layers")

        # Variable-level annotations are merged only for matching shapes,
        # and then the names themselves have to agree.
        if _adata.shape[1] == _ldata.shape[1]:
            if not _var_names_match(_adata, _ldata):
                raise ValueError("Variable names are not identical.")
            for slot in ("var", "varm"):
                _adopt_missing(_adata, _ldata, slot)

    return _adata if copy else None