def _prepare_data(self, n_cells: int, n_genes: int, noise_model: str): """ :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(5, 10, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.glm_norm import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) num_non_de = n_genes // 2 sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=2) sim.generate_params(rand_fn_loc=rand_fn_loc, rand_fn_scale=rand_fn_scale) sim.a_var[1, :num_non_de] = 0 sim.b_var[1, :num_non_de] = 0 self.isDE = np.arange(n_genes) >= num_non_de sim.generate_data() return sim
def test(self):
    """
    Check that factors that are numeric receive the correct number of coefficients.

    Two random numeric covariates are appended to the simulated sample
    description and a Wald test is fitted with both declared as numeric; the
    fitted location model must then have exactly four coefficients.

    :return: True if the assertion passes.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=2000, num_features=2)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    sim.generate_params()
    sim.generate_data()

    numeric_columns = ["numeric1", "numeric2"]
    covariates = sim.sample_description
    for column in numeric_columns:
        covariates[column] = np.random.random(size=sim.nobs)

    fitted = de.test.wald(
        data=sim.input_data,
        sample_description=covariates,
        formula_loc="~ 1 + condition + numeric1 + numeric2",
        formula_scale="~ 1",
        factor_loc_totest="condition",
        as_numeric=numeric_columns,
        training_strategy="DEFAULT"
    )

    # Check that number of coefficients is correct.
    n_loc_coefficients = fitted.model_estim.a_var.shape[0]
    assert n_loc_coefficients == 4

    return True
def get_simulator(self):
    """
    Build the simulator that matches ``self.noise_model``.

    :return: A negative-binomial ``Simulator`` with 1000 observations and
        50 features.
    :raises ValueError: If ``self.noise_model`` is None or is not "nb".
    """
    model = self.noise_model
    if model is None:
        raise ValueError("noise_model is None")
    if model != "nb":
        raise ValueError("noise_model not recognized")

    from batchglm.api.models.glm_nb import Simulator

    return Simulator(num_observations=1000, num_features=50)
def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
    """
    Simulate a data set with two conditions and no batches using the
    simulator's default parameter generation.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: The simulator after data generation.
    """
    simulator = Simulator(num_observations=n_cells, num_features=n_genes)
    simulator.generate_sample_description(num_batches=0, num_conditions=2)
    simulator.generate_params()
    simulator.generate_data()

    return simulator
def _test_null_distribution_wald(self, n_cells: int, n_genes: int, noise_model: str): """ Test if de.wald() generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.glm_nb import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.glm_norm import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate_params(rand_fn_scale=rand_fn_scale) sim.generate_data() random_sample_description = pd.DataFrame({ "condition": np.random.randint(2, size=sim.nobs), "batch": np.random.randint(2, size=sim.nobs) }) test = de.test.wald(data=sim.input_data, sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", batch_size=500, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64") _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue logging.getLogger("diffxpy").info( 'KS-test pvalue for null model match of wald(): %f' % pval_h0) assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)) return True
def _test_residuals_fit(self, n_cells: int, n_genes: int, noise_model: str): """ Test if de.wald() (multivariate mode) generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.glm_nb import Simulator elif noise_model == "norm": from batchglm.api.models.glm_norm import Simulator else: raise ValueError("noise model %s not recognized" % noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() random_sample_description = pd.DataFrame({ "condition": np.random.randint(2, size=sim.nobs), "batch": np.random.randint(2, size=sim.nobs) }) res = de.fit.residuals(data=sim.input_data, sample_description=random_sample_description, formula_loc="~ 1 + condition + batch", noise_model=noise_model) return True
def test_forfatal_functions(self):
    """
    Test if de.test.continuous() DifferentialExpressionTestSingle object functions work fine.

    A small data set (10 observations, 2 features) is fitted with
    de.test.continuous_1d() and every accessor is called once with
    ``nonnumeric=False`` and once with ``nonnumeric=True``. Return values
    are discarded; the test only checks that nothing raises.

    :return: True if all calls complete.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=10, num_features=2)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    pseudotime = np.random.random(size=sim.num_observations)
    batch_labels = np.random.randint(2, size=sim.num_observations)
    random_sample_description = pd.DataFrame({
        "pseudotime": pseudotime,
        "batch": batch_labels
    })

    fitted = de.test.continuous_1d(
        data=sim.X,
        continuous="pseudotime",
        df=3,
        formula_loc="~ 1 + pseudotime + batch",
        formula_scale="~ 1",
        factor_loc_totest="pseudotime",
        test="wald",
        sample_description=random_sample_description,
        quick_scale=True,
        batch_size=None,
        training_strategy="DEFAULT",
        dtype="float64"
    )

    fitted.summary()
    gene_ids = fitted.gene_ids

    # Test all additional functions which depend on model computation:
    # first with the continuous model only (nonnumeric=False), then with
    # the full model (nonnumeric=True).
    per_gene_accessors = ("log_fold_change", "max", "min", "argmax", "argmin")
    for include_nonnumeric in (False, True):
        for accessor in per_gene_accessors:
            getattr(fitted, accessor)(genes=gene_ids, nonnumeric=include_nonnumeric)
        fitted.summary(nonnumeric=include_nonnumeric)

    return True
def _test_null_distribution_rank(self, n_cells: int, n_genes: int):
    """
    Test if de.test.rank_test() generates a uniform p-value distribution
    if it is given data simulated based on the null model.

    The rank-test p-values are compared against a uniform distribution
    with a Kolmogorov-Smirnov test; the method asserts that the resulting
    KS p-value exceeds 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion passes.
    """
    from batchglm.api.models.glm_norm import Simulator

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Group labels are drawn independently of the simulated data.
    group_labels = np.random.randint(2, size=sim.nobs)
    random_sample_description = pd.DataFrame({"condition": group_labels})

    result = de.test.rank_test(
        data=sim.input_data,
        sample_description=random_sample_description,
        grouping="condition"
    )
    result.summary()

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(result.pval, 'uniform').pvalue

    logging.getLogger("diffxpy").info(
        'KS-test pvalue for null model match of rank_test(): %f' % pval_h0)
    assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5))

    return True
def test_rank_test_zero_variance(self):
    """
    Test if rank test works if it is given genes with zero variance.

    Gene 0 is overwritten with zeros and gene 1 with the constant 5, so
    both genes have zero variance in every group. The test asserts that
    the p-value of gene 0 is NaN and that the p-value of gene 1 equals 1.

    :return: True if the assertion passes.
    """
    logging.getLogger("tensorflow").setLevel(logging.ERROR)
    logging.getLogger("batchglm").setLevel(logging.WARNING)
    logging.getLogger("diffxpy").setLevel(logging.WARNING)

    # Fixed seed so that the random grouping is reproducible.
    np.random.seed(1)
    sim = Simulator(num_observations=1000, num_features=10)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()
    sim.input_data.x[:, 0] = 0
    sim.input_data.x[:, 1] = 5

    random_sample_description = pd.DataFrame(
        {"condition": np.random.randint(2, size=sim.nobs)})

    test = de.test.rank_test(
        data=sim.input_data,
        sample_description=random_sample_description,
        grouping="condition",
        is_sig_zerovar=True
    )

    # The failure message describes exactly what the condition checks
    # (NaN for gene 0, 1 for gene 1); it previously spoke of a
    # "p-value of zero", which the condition never tested.
    assert np.isnan(test.pval[0]) and test.pval[1] == 1, \
        "rank test did not return NaN for the all-zero gene and a p-value of 1 " \
        "for the constant non-zero gene, %f, %f" % \
        (test.pval[0], test.pval[1])

    return True
def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
    """
    Simulate a data set with two conditions and no batches, using custom
    random functions for parameter generation.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: The simulator after data generation.
    """
    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    sim.generate_params(
        rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1,
        # np.random.uniform expects (low, high); the bounds were previously
        # passed as (1, 0.5), which NumPy documents as undefined behaviour.
        # Draws from [0.5, 1) are already positive, so no np.abs is needed.
        rand_fn=lambda shape: np.random.uniform(0.5, 1, shape))
    sim.generate_data()

    return sim
def simulate(self, n_cells: int = 20, n_genes: int = 2):
    """
    Simulate a data set without conditions or batches and pair it with a
    random two-group sample description.

    :param n_cells: Number of cells to simulate (number of observations).
    :param n_genes: Number of genes to simulate (number of features).
    :return: Tuple of the simulated data ``sim.X`` and a data frame with a
        random binary "condition" column.
    """
    simulator = Simulator(num_observations=n_cells, num_features=n_genes)
    simulator.generate_sample_description(num_batches=0, num_conditions=0)
    simulator.generate()

    group_labels = np.random.randint(2, size=simulator.num_observations)
    random_sample_description = pd.DataFrame({"condition": group_labels})

    return simulator.X, random_sample_description
def test_null_distribution_wald_constrained(self, n_genes: int = 100):
    """
    Test if de.wald() with constraints generates a uniform p-value distribution
    if it is given data simulated based on the null model.

    The Wald-test p-values are compared against a uniform distribution
    with a Kolmogorov-Smirnov test; the method asserts that the resulting
    KS p-value exceeds 0.05.

    n_cells is constant as the design matrix and constraints depend on it.

    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion passes.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    n_cells = 2000

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: blocks of 1000 cells per condition and
    # blocks of 500 cells per batch.
    cell_index = range(n_cells)
    sample_description = pd.DataFrame({
        "cond": ["cond" + str(idx // 1000) for idx in cell_index],
        "batch": ["batch" + str(idx // 500) for idx in cell_index]
    })

    # Build constraints: identical specification for location and scale
    # models, differing only in the dimension names.
    def constrained_design(dims):
        return de.utils.constraint_matrix_from_dict(
            sample_description=sample_description,
            formula="~1+cond+batch",
            constraints={"batch": "cond"},
            dims=dims
        )

    dmat_loc, constraints_loc = constrained_design(["design_loc_params", "loc_params"])
    dmat_scale, constraints_scale = constrained_design(["design_scale_params", "scale_params"])

    result = de.test.wald(
        data=sim.x,
        dmat_loc=dmat_loc,
        dmat_scale=dmat_scale,
        constraints_loc=constraints_loc,
        constraints_scale=constraints_scale,
        coef_to_test=["cond[T.cond1]"]
    )
    result.summary()

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(result.pval, 'uniform').pvalue

    logging.getLogger("diffxpy").info(
        'KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True
def _prepate_data(self, n_cells: int, n_genes: int, n_groups: int): if self.noise_model == "nb": from batchglm.api.models.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(0.1, 1, shape) rand_fn_scale = lambda shape: np.random.uniform(0.5, 1, shape) elif self.noise_model == "norm" or self.noise_model is None: from batchglm.api.models.glm_norm import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % self.noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate_params(rand_fn_loc=rand_fn_loc, rand_fn_scale=rand_fn_scale) sim.generate_data() random_sample_description = pd.DataFrame( {"condition": np.random.randint(n_groups, size=sim.nobs)}) return sim, random_sample_description
def simulate(self):
    """
    Simulate a small negative-binomial data set (500 observations,
    4 features, 2 conditions, 2 batches) and store the simulator in
    ``self.sim``.

    :raises ValueError: If ``self.noise_model`` is None or is not "nb".
    """
    model = self.noise_model
    if model is None:
        raise ValueError("noise_model is None")
    if model != "nb":
        raise ValueError("noise_model not recognized")

    from batchglm.api.models.glm_nb import Simulator

    simulator = Simulator(num_observations=500, num_features=4)
    simulator.generate_sample_description(num_conditions=2, num_batches=2)
    simulator.generate()

    self.sim = simulator
def test_forfatal_from_string(self):
    """
    Test if _from_string interface is working.

    A hand-built design matrix with four "bio" indicator columns and one
    "treatment1" column is combined with string constraints and passed to
    de.test.wald(). No result is asserted; the test only checks that the
    calls complete.

    n_cells is constant as the design matrix and constraints depend on it.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    n_cells = 2000
    n_genes = 2

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: column 0 is the intercept, every other column is
    # an indicator for a contiguous block of rows (start, stop, column).
    coefficient_names = ['intercept', 'bio1', 'bio2', 'bio3', 'bio4', 'treatment1']
    indicator_blocks = (
        (0, 500, 1),      # bio rep 1
        (500, 1000, 2),   # bio rep 2
        (1000, 1500, 3),  # bio rep 3
        (1500, 2000, 4),  # bio rep 4
        (1000, 2000, 5),  # condition effect
    )
    dmat = np.zeros([n_cells, 6])
    dmat[:, 0] = 1
    for start, stop, column in indicator_blocks:
        dmat[start:stop, column] = 1

    dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names)
    dmat_est_loc = de.utils.design_matrix(dmat=dmat_est)
    dmat_est_scale = de.utils.design_matrix(dmat=dmat_est)

    # Build constraints: the same pairs are constrained in both models.
    def string_constraints(design):
        return de.utils.constraint_matrix_from_string(
            dmat=design,
            constraints=["bio1+bio2=0", "bio3+bio4=0"]
        )

    constraints_loc = string_constraints(dmat_est_loc)
    constraints_scale = string_constraints(dmat_est_scale)

    result = de.test.wald(
        data=sim.x,
        dmat_loc=dmat_est_loc,
        dmat_scale=dmat_est_scale,
        constraints_loc=constraints_loc,
        constraints_scale=constraints_scale,
        coef_to_test=["treatment1"]
    )
    result.summary()
def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
    """
    Test if de.test.versus_rest() with the Wald test generates a uniform
    p-value distribution if it is given data simulated based on the null model.

    The flattened p-values are compared against a uniform distribution
    with a Kolmogorov-Smirnov test; the method asserts that the resulting
    KS p-value exceeds 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :param n_groups: Number of groups from which the random "condition"
        label of each observation is drawn.
    :return: True if the assertion passes.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Group labels are drawn independently of the simulated data.
    group_labels = np.random.randint(n_groups, size=sim.num_observations)
    random_sample_description = pd.DataFrame({"condition": group_labels})

    result = de.test.versus_rest(
        data=sim.X,
        grouping="condition",
        test="wald",
        noise_model="nb",
        sample_description=random_sample_description,
        batch_size=500,
        training_strategy="DEFAULT",
        dtype="float64"
    )
    result.summary()

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(result.pval.flatten(), 'uniform').pvalue

    logging.getLogger("diffxpy").info(
        'KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True
def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200):
    """
    Test if de.lrt() generates a uniform p-value distribution
    if it is given data simulated based on the null model.

    The data are partitioned by the simulated condition and a likelihood
    ratio test for a random covariate is run on the partition. The
    flattened p-values are compared against a uniform distribution with
    a Kolmogorov-Smirnov test; the method asserts that the resulting KS
    p-value exceeds 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion passes.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    sim.generate()

    # Random covariates plus the simulated condition used for partitioning.
    covar1 = np.random.randint(2, size=sim.nobs)
    covar2 = np.random.randint(2, size=sim.nobs)
    sample_description = pd.DataFrame({"covar1": covar1, "covar2": covar2})
    sample_description["cond"] = sim.sample_description["condition"].values

    partition = de.test.partition(
        data=sim.x,
        parts="cond",
        sample_description=sample_description
    )
    lrt_result = partition.lrt(
        full_formula_loc="~ 1 + covar1",
        full_formula_scale="~ 1",
        reduced_formula_loc="~ 1",
        reduced_formula_scale="~ 1",
        training_strategy="DEFAULT",
        dtype="float64"
    )
    lrt_result.summary()

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(lrt_result.pval.flatten(), 'uniform').pvalue

    logging.getLogger("diffxpy").info('KS-test pvalue for null model match of lrt(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

    return True
def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100):
    """
    Test if de.test.continuous() generates a uniform p-value distribution in the wald test
    if it is given data simulated based on the null model.

    The p-values are compared against a uniform distribution with a
    Kolmogorov-Smirnov test; the method asserts that the resulting KS
    p-value exceeds 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion passes.
    """
    # Note that the tensorflow logger is set to INFO in this test.
    for logger_name, level in (
            ("tensorflow", logging.INFO),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Pseudotime is drawn independently of the simulated data.
    pseudotime = np.random.random(size=sim.num_observations)
    random_sample_description = pd.DataFrame({"pseudotime": pseudotime})

    result = de.test.continuous_1d(
        data=sim.X,
        continuous="pseudotime",
        df=3,
        formula_loc="~ 1 + pseudotime",
        formula_scale="~ 1",
        factor_loc_totest="pseudotime",
        test="wald",
        sample_description=random_sample_description,
        quick_scale=True,
        batch_size=None,
        training_strategy="DEFAULT",
        dtype="float64"
    )
    result.summary()

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(result.pval, 'uniform').pvalue

    logging.getLogger("diffxpy").info(
        'KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True
def test_null_distribution_z_lazy(self, n_cells: int = 2000, n_genes: int = 100):
    """
    Test if de.pairwise() generates a uniform p-value distribution for lazy z-tests
    if it is given data simulated based on the null model.

    The p-values of the comparison between group 0 and group 1 are
    compared against a uniform distribution with a Kolmogorov-Smirnov
    test; the method asserts that the resulting KS p-value exceeds 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion passes.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Four random groups, drawn independently of the simulated data.
    group_labels = np.random.randint(4, size=sim.num_observations)
    random_sample_description = pd.DataFrame({"condition": group_labels})

    result = de.test.pairwise(
        data=sim.X,
        grouping="condition",
        test='z-test',
        lazy=True,
        noise_model="nb",
        pval_correction="global",
        quick_scale=True,
        sample_description=random_sample_description,
        dtype="float64"
    )

    # Compare p-value distribution under null model against uniform distribution.
    pair_pvals = result.pval_pairs(groups0=0, groups1=1)
    pval_h0 = stats.kstest(pair_pvals.flatten(), 'uniform').pvalue

    logging.getLogger("diffxpy").info(
        'KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True
def test_forfatal_from_dict(self):
    """
    Test if dictionary-based constraint interface is working.

    Design and constraint matrices are built with
    de.utils.constraint_matrix_from_dict() and passed to de.test.wald().
    No result is asserted; the test only checks that the calls complete.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    n_cells = 2000
    n_genes = 2

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: blocks of 1000 cells per condition and
    # blocks of 500 cells per batch.
    cell_index = range(n_cells)
    sample_description = pd.DataFrame({
        "cond": ["cond" + str(idx // 1000) for idx in cell_index],
        "batch": ["batch" + str(idx // 500) for idx in cell_index]
    })

    # Build constraints: identical specification for location and scale
    # models, differing only in the dimension names.
    def constrained_design(dims):
        return de.utils.constraint_matrix_from_dict(
            sample_description=sample_description,
            formula="~1+cond+batch",
            constraints={"batch": "cond"},
            dims=dims
        )

    dmat_loc, constraints_loc = constrained_design(["design_loc_params", "loc_params"])
    dmat_scale, constraints_scale = constrained_design(["design_scale_params", "scale_params"])

    result = de.test.wald(
        data=sim.x,
        dmat_loc=dmat_loc,
        dmat_scale=dmat_scale,
        constraints_loc=constraints_loc,
        constraints_scale=constraints_scale,
        coef_to_test=["cond[T.cond1]"]
    )
    result.summary()
def test_sparse_anndata(self, n_cells: int = 2000, n_genes: int = 100):
    """
    Test if de.wald() generates a uniform p-value distribution
    if it is given data simulated based on the null model, with the data
    supplied as an AnnData object that wraps a sparse CSR matrix.

    The p-values are compared against a uniform distribution with a
    Kolmogorov-Smirnov test; the method asserts that the resulting KS
    p-value exceeds 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion passes.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Group labels are drawn independently of the simulated data.
    group_labels = np.random.randint(2, size=sim.num_observations)
    random_sample_description = pd.DataFrame({"condition": group_labels})

    sparse_counts = scipy.sparse.csr_matrix(sim.X.values)
    adata = anndata.AnnData(sparse_counts)

    result = de.test.wald(
        data=adata,
        factor_loc_totest="condition",
        formula="~ 1 + condition",
        sample_description=random_sample_description,
        quick_scale=True,
        training_strategy="DEFAULT",
        dtype="float64"
    )
    result.summary()

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(result.pval, 'uniform').pvalue

    logging.getLogger("diffxpy").info(
        'KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True
def test_for_fatal(self):
    """
    Check that the gene set enrichment interface runs without errors.

    A Wald test is fitted on a small simulated data set and
    de.enrich.test() is run against two manually defined reference sets
    for every combination of ``incl_all_zero`` and ``clean_ref``. The
    accessors of each enrichment result are called once; their return
    values are discarded.

    :return: True if all calls complete.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=50, num_features=10)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    sim.generate()

    # Genes are named by their column index, as strings.
    gene_names = list(map(str, range(sim.X.shape[1])))
    det = de.test.wald(
        data=sim.X,
        factor_loc_totest="condition",
        formula_loc="~ 1 + condition",
        sample_description=sim.sample_description,
        gene_names=gene_names,
        training_strategy="DEFAULT",
        dtype="float64"
    )

    # Set up reference gene sets.
    ref_sets = de.enrich.RefSets()
    ref_sets.add(id="set1", source="manual", gene_ids=["1", "3"])
    ref_sets.add(id="set2", source="manual", gene_ids=["5", "6"])

    flag_pairs = [
        (incl_zero, clean)
        for incl_zero in (True, False)
        for clean in (True, False)
    ]
    for incl_zero, clean in flag_pairs:
        enrichment = de.enrich.test(
            ref=ref_sets,
            det=det,
            threshold=0.05,
            incl_all_zero=incl_zero,
            clean_ref=clean,
        )
        enrichment.summary()
        enrichment.significant_set_ids()
        enrichment.significant_sets()
        enrichment.set_summary(id="set1")

    return True
def test_t_test_zero_variance(self, n_cells: int = 2000, n_genes: int = 100):
    """
    Test de.t_test() on null-model data in which the first gene has been
    overwritten with a constant value (zero variance).

    The t-test p-values are compared against a uniform distribution with
    a Kolmogorov-Smirnov test; the method asserts that the resulting KS
    p-value exceeds 0.05.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: The p-value of the Kolmogorov-Smirnov test.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Replace the first gene by a constant taken from the simulated
    # parameters so that it has zero variance.
    constant_value = np.exp(sim.a)[0, 0]
    sim.data.X[:, 0] = constant_value

    group_labels = np.random.randint(2, size=sim.num_observations)
    random_sample_description = pd.DataFrame({"condition": group_labels})

    result = de.test.t_test(
        data=sim.X,
        grouping="condition",
        sample_description=random_sample_description
    )

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(result.pval, 'uniform').pvalue

    print('KS-test pvalue for null model match of t_test(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return pval_h0
def test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100):
    """
    Test if de.wald() with constraints generates a uniform p-value distribution
    if it is given data simulated based on the null model.

    The location design has a treatment column, eight "bio" indicator
    columns and four "tech" indicator columns, with pairwise sum-to-zero
    constraints on the bio and tech columns. The scale model uses the
    intercept only and is unconstrained. The p-values are compared against
    a uniform distribution with a Kolmogorov-Smirnov test; the method
    asserts that the resulting KS p-value exceeds 0.05.

    n_cells is constant as the design matrix and constraints depend on it.

    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion passes.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    n_cells = 12000

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: column 0 is the intercept, every other column is
    # an indicator for one or two contiguous blocks of rows
    # (start, stop, column).
    coefficient_names = [
        'intercept', 'treatment1',
        'bio1', 'bio2', 'bio3', 'bio4', 'bio5', 'bio6', 'bio7', 'bio8',
        'tech1', 'tech2', 'tech3', 'tech4'
    ]
    indicator_blocks = (
        (6000, 12000, 1),   # condition effect
        (0, 1000, 2),       # bio rep 1
        (1000, 3000, 3),    # bio rep 2
        (3000, 5000, 4),    # bio rep 3
        (5000, 6000, 5),    # bio rep 4
        (6000, 7000, 6),    # bio rep 5
        (7000, 9000, 7),    # bio rep 6
        (9000, 11000, 8),   # bio rep 7
        (11000, 12000, 9),  # bio rep 8
        (1000, 2000, 10),   # tech rep 1
        (7000, 8000, 10),   # tech rep 1
        (2000, 3000, 11),   # tech rep 2
        (8000, 9000, 11),   # tech rep 2
        (3000, 4000, 12),   # tech rep 3
        (9000, 10000, 12),  # tech rep 3
        (4000, 5000, 13),   # tech rep 4
        (10000, 11000, 13), # tech rep 4
    )
    dmat = np.zeros([n_cells, 14])
    dmat[:, 0] = 1
    for start, stop, column in indicator_blocks:
        dmat[start:stop, column] = 1

    dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names)
    dmat_est_loc = de.test.design_matrix(dmat=dmat_est)
    # The scale model only uses the intercept column.
    dmat_est_scale = de.test.design_matrix(dmat=dmat_est.iloc[:, [0]])

    # Build constraints:
    constraints_loc = de.utils.data_utils.build_equality_constraints_string(
        dmat=dmat_est_loc,
        constraints=[
            "bio1+bio2=0",
            "bio3+bio4=0",
            "bio5+bio6=0",
            "bio7+bio8=0",
            "tech1+tech2=0",
            "tech3+tech4=0"
        ],
        dims=["design_loc_params", "loc_params"]
    )
    constraints_scale = None

    result = de.test.wald(
        data=sim.X,
        dmat_loc=dmat_est_loc.data_vars['design'],
        dmat_scale=dmat_est_scale.data_vars['design'],
        init_a="standard",
        init_b="standard",
        constraints_loc=constraints_loc,
        constraints_scale=constraints_scale,
        coef_to_test=["treatment1"],
        training_strategy="DEFAULT",
        quick_scale=False,
        dtype="float64"
    )
    result.summary()

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(result.pval, 'uniform').pvalue

    logging.getLogger("diffxpy").info(
        'KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True
def test_null_distribution_wald_multi_constrained_2layer(
        self, n_genes: int = 50):
    """
    Test if de.wald() for multiple coefficients with constraints
    generates a uniform p-value distribution
    if it is given data simulated based on the null model.

    Two treatment coefficients are tested jointly on a design with six
    "bio" indicator columns under pairwise sum-to-zero constraints. The
    p-values are compared against a uniform distribution with a
    Kolmogorov-Smirnov test; the method asserts that the resulting KS
    p-value exceeds 0.05.

    n_cells is constant as the design matrix and constraints depend on it.

    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the assertion passes.
    """
    for logger_name, level in (
            ("tensorflow", logging.ERROR),
            ("batchglm", logging.WARNING),
            ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    n_cells = 3000

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: column 0 is the intercept, every other column is
    # an indicator for a contiguous block of rows (start, stop, column).
    coefficient_names = [
        'intercept',
        'bio1', 'bio2', 'bio3', 'bio4', 'bio5', 'bio6',
        'treatment1', 'treatment2'
    ]
    indicator_blocks = (
        (0, 500, 1),      # bio rep 1
        (500, 1000, 2),   # bio rep 2
        (1000, 1500, 3),  # bio rep 3
        (1500, 2000, 4),  # bio rep 4
        (2000, 2500, 5),  # bio rep 5
        (2500, 3000, 6),  # bio rep 6
        (1000, 2000, 7),  # condition effect 1
        (2000, 3000, 8),  # condition effect 2
    )
    dmat = np.zeros([n_cells, 9])
    dmat[:, 0] = 1
    for start, stop, column in indicator_blocks:
        dmat[start:stop, column] = 1

    dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names)
    dmat_est_loc = de.utils.design_matrix(dmat=dmat_est)
    dmat_est_scale = de.utils.design_matrix(dmat=dmat_est)

    # Build constraints: the same pairs are constrained in both models.
    def string_constraints(design):
        return de.utils.constraint_matrix_from_string(
            dmat=design,
            constraints=["bio1+bio2=0", "bio3+bio4=0", "bio5+bio6=0"]
        )

    constraints_loc = string_constraints(dmat_est_loc)
    constraints_scale = string_constraints(dmat_est_scale)

    result = de.test.wald(
        data=sim.x,
        dmat_loc=dmat_est_loc,
        dmat_scale=dmat_est_scale,
        constraints_loc=constraints_loc,
        constraints_scale=constraints_scale,
        coef_to_test=["treatment1", "treatment2"]
    )
    result.summary()

    # Compare p-value distribution under null model against uniform distribution.
    pval_h0 = stats.kstest(result.pval, 'uniform').pvalue

    logging.getLogger("diffxpy").info(
        'KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % pval_h0

    return True
def test_ztest_de(self, n_cells: int = 2000, n_genes: int = 500):
    """
    Run the pairwise z-test on a two-condition data set in which row 1 of
    the "a" and "b" parameters is zeroed for the first half of the genes.

    The fraction of genes with q-value < 0.05 is logged separately for
    the zeroed genes and for the remaining genes. Nothing is asserted yet
    (see the TODO at the end).

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if all calls complete.
    """
    logging.getLogger("tensorflow").setLevel(logging.ERROR)
    logging.getLogger("batchglm").setLevel(logging.WARNING)
    logging.getLogger("diffxpy").setLevel(logging.WARNING)

    num_non_de = n_genes // 2
    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    # simulate: coefficients ~ log(N(1, 0.5)).
    # re-sample if N(1, 0.5) <= 0
    # np.inf is used instead of the np.infty alias, which was removed in
    # NumPy 2.0; np.inf is available in every NumPy version.
    sim.generate_params(rand_fn=lambda shape: 1 + stats.truncnorm.rvs(
        -1 / 0.5, np.inf, scale=0.5, size=shape))
    sim.params["a"][1, :num_non_de] = 0
    sim.params["b"][1, :num_non_de] = 0
    sim.params["isDE"] = ("features", ), np.arange(n_genes) >= num_non_de
    sim.generate_data()

    sample_description = sim.sample_description

    test = de.test.pairwise(
        data=sim.X,
        grouping="condition",
        test="z-test",
        noise_model="nb",
        sample_description=sample_description,
    )
    test.summary()

    # Only off-diagonal group pairs are real comparisons. The factor 2 in
    # the denominators is the number of off-diagonal entries for two groups.
    off_diagonal = ~np.eye(test.pval.shape[0]).astype(bool)

    logging.getLogger("diffxpy").info(
        'fraction of non-DE genes with q-value < 0.05: %.1f%%' %
        float(100 * np.mean(
            np.sum(test.qval[off_diagonal, :num_non_de] < 0.05) /
            (2 * num_non_de))))
    logging.getLogger("diffxpy").info(
        'fraction of DE genes with q-value < 0.05: %.1f%%' %
        float(100 * np.mean(
            np.sum(test.qval[off_diagonal, num_non_de:] < 0.05) /
            (2 * (n_genes - num_non_de)))))

    # TODO asserts
    return True
def _test_compute_hessians(self):
    """
    Compare the Hessians obtained under the three settings of
    pkg_constants.HESSIAN_MODE: "obs_batched", "feature" and "tf".

    Data is simulated for 500 observations and 4 features with two
    conditions and two batches. For every mode an estimator is created via
    self.estimate(), its hessians attribute is read while being timed, and
    its session is closed. Estimators, Hessians and run times are stored on
    self as estimator_*, H_* and t_* with the suffixes ob, fw and tf.

    The check passes if the maximal relative deviation of the "obs_batched"
    and of the "feature" Hessian from the "tf" Hessian is below 1e-10.

    :return: True if both assertions hold.
    :raises ValueError: If self.noise_model is None or is not "nb".
    """
    model = self.noise_model
    if model is None:
        raise ValueError("noise_model is None")
    if model != "nb":
        raise ValueError("noise_model not recognized")
    from batchglm.api.models.glm_nb import Simulator, InputData

    n_observations = 500
    n_conditions = 2

    sim = Simulator(num_observations=n_observations, num_features=4)
    sim.generate_sample_description(num_conditions=n_conditions, num_batches=2)
    sim.generate()

    sample_desc = data_utils.sample_description_from_xarray(
        sim.data, dim="observations"
    )
    loc_design = data_utils.design_matrix(
        sample_desc, formula="~ 1 + condition + batch"
    )
    scale_design = data_utils.design_matrix(
        sample_desc, formula="~ 1 + condition"
    )
    input_data = InputData.new(
        sim.X, design_loc=loc_design, design_scale=scale_design
    )

    # --- analytic Hessian, batched over observations ---
    logger.debug("* Running analytic Hessian by observation tests")
    pkg_constants.HESSIAN_MODE = "obs_batched"
    estimator = self.estimate(input_data)
    self.estimator_ob = estimator
    started = time.time()
    self.H_ob = estimator.hessians
    stopped = time.time()
    estimator.close_session()
    self.t_ob = stopped - started

    # --- analytic Hessian, feature-wise ---
    logger.debug("* Running analytic Hessian by feature tests")
    pkg_constants.HESSIAN_MODE = "feature"
    estimator = self.estimate(input_data)
    self.estimator_fw = estimator
    started = time.time()
    self.H_fw = estimator.hessians
    stopped = time.time()
    estimator.close_session()
    self.t_fw = stopped - started

    # --- tensorflow Hessian, feature-wise ---
    logger.debug("* Running tensorflow Hessian by feature tests")
    pkg_constants.HESSIAN_MODE = "tf"
    estimator = self.estimate(input_data)
    self.estimator_tf = estimator
    started = time.time()
    # tensorflow computes the negative hessian as the
    # objective is the negative log-likelihood.
    # NOTE(review): no sign flip is applied in this method; presumably the
    # estimator already accounts for it - confirm.
    self.H_tf = estimator.hessians
    stopped = time.time()
    estimator.close_session()
    self.t_tf = stopped - started

    # Index of the single slice along the first axis that is logged below.
    shown = 1
    logger.info("run time observation batch-wise analytic solution: %f" % self.t_ob)
    logger.info("run time feature-wise analytic solution: %f" % self.t_fw)
    logger.info("run time feature-wise tensorflow solution: %f" % self.t_tf)

    logger.info(
        "ratio of tensorflow feature-wise hessian to analytic observation batch-wise hessian:"
    )
    hess_tf = self.H_tf.values
    hess_ob = self.H_ob.values
    logger.info(hess_tf[shown, :, :] / hess_ob[shown, :, :])

    logger.info(
        "ratio of tensorflow feature-wise hessian to analytic feature-wise hessian:"
    )
    hess_fw = self.H_fw.values
    logger.info(hess_tf[shown, :, :] / hess_fw[shown, :, :])

    # Largest element-wise deviation relative to the tensorflow Hessian.
    dev_ob = np.max(np.abs((hess_tf - hess_ob) / hess_tf))
    dev_fw = np.max(np.abs((hess_tf - hess_fw) / hess_tf))
    assert dev_ob < 1e-10
    assert dev_fw < 1e-10

    return True