def postprocess_mnnpy(adata, bdata):
    """
    Rebuild a fully functional AnnData object after mnnpy's mnn_correct.

    mnn_correct truncates `.raw` to the highly variable genes, so this
    function assembles a fresh AnnData object that takes `.X` from the
    batch-corrected object ``bdata`` while restoring `.raw` (which still
    contains all genes) from the original ``adata``. Both matrices are
    sorted by cell barcode beforehand so the cell labelings line up.

    parameters
    ----------
    adata: the uncorrected AnnData object
    bdata: the batch corrected AnnData object

    returns
    -------
    AnnData
        AnnData object with adata.X containing the corrected values
        and .raw all of the original values
    """
    # corrected expression values, reordered by cell barcode
    corrected = DataFrame(
        data=bdata.X,
        index=bdata.obs_names.tolist(),
        columns=bdata.var_names.tolist(),
    )
    corrected.sort_index(inplace=True)

    new_adata = AnnData(corrected.values)
    new_adata.obs = bdata.obs.sort_index()
    new_adata.var_names = bdata.var_names
    new_adata.obs_names = bdata.obs_names.sort_values()
    new_adata.var = bdata.var

    # the raw matrix must be sorted the same way to match the corrected order
    raw_values = (
        adata.raw.X.todense()
        if scipy.sparse.issparse(adata.raw.X)
        else adata.raw.X
    )
    raw_frame = DataFrame(
        data=raw_values,
        index=adata.obs_names.tolist(),
        columns=adata.raw.var_names.tolist(),
    )
    raw_frame.sort_index(inplace=True)

    # recreate the raw AnnData and attach it to the new object
    raw = AnnData(raw_frame.values)
    raw.var_names = adata.raw.var_names
    raw.obs_names = adata.obs_names.sort_values()
    raw.var = adata.raw.var
    new_adata.raw = raw

    # ensure that the indices of the input object are preserved
    # (assumes adata.obs has a CELL barcode column — TODO confirm upstream)
    adata.obs_names = adata.obs.CELL
    adata.obs.index = adata.obs.CELL

    return new_adata
def _tcr_objs_to_anndata(tcr_objs: Collection) -> AnnData:
    """Build an AnnData object holding a list of TcrCells in `.obs`."""
    records = (_process_tcr_cell(obj) for obj in tcr_objs)
    obs = pd.DataFrame.from_records(records, index="cell_id")
    # zero-width X: the object only carries per-cell annotations
    adata = AnnData(X=np.empty([obs.shape[0], 0]), obs=obs)
    _sanitize_anndata(adata)
    return adata
def _make_anndata(X: np.ndarray, observation: DataFrame, variables: Optional[DataFrame] = None) -> AnnData:
    '''Assemble a scanpy AnnData object out of its pieces

    :Param X: numpy array with biological data, e.g. expression
    :Param observation: annotation for biological data
    :Param variables: some data along second dimension of expression, e.g. genes
    :Return: AnnData object
    '''
    result = AnnData(X, observation, variables)
    return result
def process(self): """ A method to run `harmony` on input Data Frame """ # Harmony augmented affinity matrix logg.info('Harmony augmented affinity matrix ...', r=True) self.tp = pd.Series(index=self.data_df.index) for t in self.timepoints: cells = self.data_df.index[self.data_df.index.str.contains(t)] self.tp[cells] = t self.timepoint_connections = pd.DataFrame(columns=[0, 1]) index = 0 for i in range(len(self.timepoints) - 1): self.timepoint_connections.loc[ index, :] = self.timepoints[i:i + 2] index += 1 # compute the augmented and non-augmented affinity matrices self.aug_aff, self.aff = self.harmony.core.augmented_affinity_matrix( self.data_df, self.tp, self.timepoint_connections) # Visualization using force directed layouts self.layout = self.harmony.plot.force_directed_layout( self.aug_aff, self.data_df.index) # push outputs to a new scanpy.AnnDana from scanpy import AnnData self.harmony_adata = AnnData(self.data_df) self.harmony_adata.obsm['layout'] = np.array(self.layout) self.harmony_adata.uns['tp'] = self.tp self.harmony_adata.uns['aff'] = self.aff self.harmony_adata.uns['aug_aff'] = self.aug_aff self.harmony_adata.uns['sample_names'] = self.sample_names self.harmony_adata.uns['timepoints'] = self.timepoints self.harmony_adata.uns[ 'timepoint_connections'] = self.timepoint_connections logg.info('End of processing, start plotting.', r=True) return self.harmony_adata
def test_chain_pairing():
    """Each row of fake TCR annotations must map to its pairing category."""
    columns = [
        "has_tcr",
        "multi_chain",
        "TRA_1_cdr3",
        "TRA_2_cdr3",
        "TRB_1_cdr3",
        "TRB_2_cdr3",
    ]
    rows = [
        ["False", "nan", "nan", "nan", "nan", "nan"],
        ["True", "True", "AAAA", "BBBB", "CCCC", "DDDD"],
        ["True", "False", "AAAA", "BBBB", "CCCC", "DDDD"],
        ["True", "nan", "AAAA", "nan", "nan", "nan"],
        ["True", "False", "AAAA", "nan", "CCCC", "nan"],
        ["True", "False", "AAAA", "BBBB", "nan", "nan"],
        ["True", "False", "AAAA", "BBBB", "CCCC", "nan"],
        ["True", "False", "nan", "nan", "CCCC", "nan"],
        ["True", "False", "nan", "nan", "CCCC", "DDDD"],
        ["True", "False", "AAAA", "nan", "CCCC", "DDDD"],
    ]
    adata = AnnData(obs=pd.DataFrame.from_records(rows, columns=columns))

    res = st.tl.chain_pairing(adata, inplace=False)

    expected = [
        "No TCR",
        "Multichain",
        "Two full chains",
        "Orphan alpha",
        "Single pair",
        "Orphan alpha",
        "Extra alpha",
        "Orphan beta",
        "Orphan beta",
        "Extra beta",
    ]
    npt.assert_equal(res, expected)
def test_group_abundance():
    """Counts, fractions, and swapped groupby/target must all be correct."""
    cells = pd.DataFrame.from_records(
        [
            ["cell1", "A", "ct1"],
            ["cell2", "A", "ct1"],
            ["cell3", "A", "ct1"],
            ["cell3", "A", "NaN"],
            ["cell4", "B", "ct1"],
            ["cell5", "B", "ct2"],
        ],
        columns=["cell_id", "group", "clonotype"],
    ).set_index("cell_id")
    adata = AnnData(obs=cells)

    # Check counts
    counts = st.tl.group_abundance(
        adata, groupby="clonotype", target_col="group", fraction=False
    )
    expected = pd.DataFrame.from_dict(
        {"ct1": {"A": 3.0, "B": 1.0}, "ct2": {"A": 0.0, "B": 1.0},},
        orient="index",
    )
    npt.assert_equal(counts.values, expected.values)

    # Check fractions
    fracs = st.tl.group_abundance(
        adata, groupby="clonotype", target_col="group", fraction=True
    )
    expected = pd.DataFrame.from_dict(
        {"ct1": {"A": 0.75, "B": 0.25}, "ct2": {"A": 0.0, "B": 1.0},},
        orient="index",
    )
    npt.assert_equal(fracs.values, expected.values)

    # Check swapped groupby / target_col
    swapped = st.tl.group_abundance(
        adata, groupby="group", target_col="clonotype", fraction=True
    )
    expected = pd.DataFrame.from_dict(
        {"A": {"ct1": 1.0, "ct2": 0.0}, "B": {"ct1": 0.5, "ct2": 0.5},},
        orient="index",
    )
    npt.assert_equal(swapped.values, expected.values)
def test_chain_qc():
    """chain_qc must classify receptor type and subtype for each cell.

    16 observation rows are constructed; the expected arrays therefore
    must each contain 16 entries. (The second "multichain" entry of the
    `rec_subtype` expectation had been commented out, leaving only 15
    entries and making the assertion fail on a shape mismatch — it is
    restored here: per the note below, multichain takes precedence over
    ambiguous, so the multi_chain TRA/IGH row is "multichain" too.)
    """
    obs = pd.DataFrame.from_records(
        [
            ["False", "nan", "nan", "nan", "nan", "nan"],
            ["True", "True", "TRA", "TRB", "TRA", "TRB"],
            # multichain takes precedence over ambiguous
            ["True", "True", "TRA", "IGH", "nan", "nan"],
            ["True", "False", "TRA", "TRB", "nan", "nan"],
            ["True", "False", "TRA", "TRB", "TRA", "nan"],
            ["True", "False", "TRA", "TRB", "nan", "TRB"],
            ["True", "False", "TRA", "TRB", "TRA", "TRB"],
            ["True", "False", "IGK", "IGH", "nan", "nan"],
            ["True", "False", "IGL", "IGH", "IGL", "IGH"],
            ["True", "False", "IGL", "IGH", "IGK", "IGH"],
            ["True", "False", "nan", "IGH", "nan", "IGH"],
            ["True", "False", "TRA", "TRB", "TRG", "TRB"],
            ["True", "False", "IGK", "TRB", "nan", "nan"],
            ["True", "False", "TRA", "nan", "nan", "nan"],
            ["True", "False", "IGL", "nan", "nan", "nan"],
            ["True", "False", "nan", "TRD", "nan", "nan"],
        ],
        columns=[
            "has_ir",
            "multi_chain",
            "IR_VJ_1_locus",
            "IR_VDJ_1_locus",
            "IR_VJ_2_locus",
            "IR_VDJ_2_locus",
        ],
    )
    # fake junction sequences: present exactly where a locus is present
    for chain, chain_number in itertools.product(["VJ", "VDJ"], ["1", "2"]):
        obs[f"IR_{chain}_{chain_number}_junction_aa"] = [
            "AAA" if x != "nan" else "nan"
            for x in obs[f"IR_{chain}_{chain_number}_locus"]
        ]
    adata = AnnData(obs=obs)
    adata.uns["scirpy_version"] = "0.7"

    ir.tl.chain_qc(adata, key_added=("rec_type", "rec_subtype", "ch_pairing"))

    npt.assert_equal(
        adata.obs["rec_type"],
        np.array([
            "no IR",
            "multichain",
            "multichain",
            "TCR",
            "TCR",
            "TCR",
            "TCR",
            "BCR",
            "BCR",
            "BCR",
            "BCR",
            "TCR",
            "ambiguous",
            "TCR",
            "BCR",
            "TCR",
        ]),
    )
    npt.assert_equal(
        adata.obs["rec_subtype"],
        np.array([
            "no IR",
            "multichain",
            "multichain",
            "TRA+TRB",
            "TRA+TRB",
            "TRA+TRB",
            "TRA+TRB",
            "IGH+IGK",
            "IGH+IGL",
            "ambiguous",
            "IGH",
            "ambiguous",
            "ambiguous",
            "TRA+TRB",
            "IGH+IGL",
            "TRG+TRD",
        ]),
    )
def test_chain_pairing():
    """Each fake IR annotation row must map to its chain-pairing category."""
    columns = [
        "has_ir",
        "multi_chain",
        "IR_VJ_1_junction_aa",
        "IR_VJ_2_junction_aa",
        "IR_VDJ_1_junction_aa",
        "IR_VDJ_2_junction_aa",
        "IR_VJ_1_locus",
        "IR_VJ_2_locus",
        "IR_VDJ_1_locus",
        "IR_VDJ_2_locus",
    ]
    # (row, expected category) pairs keep the input and the expectation aligned
    cases = [
        (["False", "nan", "nan", "nan", "nan", "nan",
          "nan", "nan", "nan", "nan"], "no IR"),
        (["True", "True", "AA", "BB", "CC", "DD",
          "TRA", "TRA", "TRA", "TRB"], "multichain"),
        (["True", "False", "AA", "BB", "CC", "DD",
          "TRA", "TRA", "TRB", "TRB"], "two full chains"),
        (["True", "False", "AA", "nan", "nan", "nan",
          "TRA", "nan", "nan", "nan"], "orphan VJ"),
        (["True", "False", "AA", "nan", "CC", "nan",
          "TRA", "nan", "TRB", "nan"], "single pair"),
        (["True", "False", "AA", "BB", "nan", "nan",
          "TRA", "TRA", "nan", "nan"], "orphan VJ"),
        (["True", "False", "AA", "BB", "CC", "nan",
          "TRA", "TRA", "TRB", "TRB"], "extra VJ"),
        (["True", "False", "nan", "nan", "CC", "nan",
          "nan", "nan", "TRB", "nan"], "orphan VDJ"),
        (["True", "False", "nan", "nan", "CC", "DD",
          "nan", "nan", "TRB", "TRB"], "orphan VDJ"),
        (["True", "False", "AA", "nan", "CC", "DD",
          "TRA", "nan", "TRB", "TRB"], "extra VDJ"),
        (["True", "False", "AA", "nan", "CC", "DD",
          "TRA", "nan", "TRB", "IGH"], "ambiguous"),
    ]
    rows = [row for row, _ in cases]
    expected = [category for _, category in cases]

    adata = AnnData(obs=pd.DataFrame.from_records(rows, columns=columns))
    adata.uns["scirpy_version"] = "0.7"

    res = ir.tl.chain_pairing(adata, inplace=False)
    npt.assert_equal(res, expected)
def mast(
    adata: AnnData,
    *,
    groupby: str,
    groups: Union[Literal["all"], Sequence[str]],
    cofactors: Optional[Sequence[str]] = None,
    layer: Optional[str] = None,
    n_cores_per_job: int = 4,
    n_jobs: int = 4,
):
    """
    Perform DE analysis using MAST.

    Requires that an R installation and the following packages are available

        MAST
        BiocParallel

    Install them with `conda install bioconductor-mast bioconductor-biocparallel`.

    Parameters
    ----------
    adata
        annotated data matrix. X must contain normalized and log-transformed values.
    groupby
        The column in adata.obs to test for DE
    groups
        Which groups to test. NOTE(review): currently unused — a one-vs-rest
        contrast is built for every group in `adata.obs[groupby]` regardless.
    cofactors
        Additional columns to include into the model
    layer
        layer in adata that contains raw counts. If None, use `X`.
    n_cores_per_job
        Number of cores to run per job (including BLAS parallelization)
    n_jobs
        Number of tests to run in parallel.

    Returns
    -------
    DataFrame with the MAST hurdle-model results: one row per gene and
    comparison, with the LRT p-value (`Pr(>Chisq)`), log fold change
    (`coef`) and the group tested (`comparison`).
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.robjects import pandas2ri
        from rpy2.robjects.conversion import localconverter
        from rpy2 import robjects as ro
        import anndata2ri
    except ImportError:
        raise ImportError(
            "MAST requires rpy2 and anndata2ri to be installed."
        )

    try:
        mast = importr("MAST")
        bcparallel = importr("BiocParallel")
    except ImportError:
        raise ImportError(
            "MAST requires a valid R installation with the following packages: "
            "MAST, BiocParallel")

    bcparallel.register(bcparallel.MulticoreParam(n_jobs))

    logging.info("Preparing AnnData")
    # Work on a copy so the column/value renaming below does not touch `adata`.
    tmp_adata = AnnData(
        X=adata.X if layer is None else adata.layers[layer],
        obs=adata.obs,
        var=adata.var,
    )
    # R formulas require syntactically valid names.
    tmp_adata.obs.columns = _make_names(tmp_adata.obs.columns)
    tmp_adata.obs[groupby] = _make_names(tmp_adata.obs[groupby])

    # One boolean indicator column (and contrast) per group: one-vs-rest tests.
    contrasts = []
    for group in tmp_adata.obs[groupby].unique():
        contrasts.append(f"is_group_{group}")
        tmp_adata.obs[f"is_group_{group}"] = tmp_adata.obs[groupby] == group

    logging.info("Preparing R objects")
    with localconverter(anndata2ri.converter):
        sce = ro.conversion.py2rpy(tmp_adata)
    sca = mast.SceToSingleCellAssay(sce)
    groupby = _make_names([groupby])[0]
    cofactor_formula = ("" if cofactors is None else
                        "+ " + " + ".join(_make_names(cofactors)))

    logging.info("Running MAST")
    ro.globalenv["cpus_per_thread"] = n_cores_per_job
    ro.globalenv["contrasts"] = contrasts
    ro.globalenv["cofactor_formula"] = cofactor_formula
    ro.globalenv["sca"] = sca
    ro.r("""
    library(dplyr)
    de_res = bplapply(contrasts, function(model_col) {
        op = options(mc.cores=cpus_per_thread)
        on.exit(options(op))
        contrast_to_test = paste0(model_col, "TRUE")
        fit = zlm(as.formula(paste0("~", model_col, cofactor_formula)), sca)
        res = summary(fit, doLRT=contrast_to_test)$datatable
        merge(
            res[contrast==contrast_to_test & component=='H', .(primerid, `Pr(>Chisq)`)], #P-vals
            res[contrast==contrast_to_test & component=='logFC', .(primerid, coef)],
            by='primerid'
        ) %>% mutate(comparison=model_col)
    }) %>% bind_rows()
    """)

    with localconverter(ro.default_converter + pandas2ri.converter):
        de_res = ro.conversion.rpy2py(ro.globalenv["de_res"])

    # Strip the indicator-column prefix so `comparison` holds the group label.
    de_res["comparison"] = de_res["comparison"].str.replace("is_group_", "")
    return de_res