def _prepare_data(self, n_cells: int, n_genes: int, noise_model: str): """ :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.tf1.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(5, 10, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) num_non_de = n_genes // 2 sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=2) sim.generate_params(rand_fn_loc=rand_fn_loc, rand_fn_scale=rand_fn_scale) sim.a_var[1, :num_non_de] = 0 sim.b_var[1, :num_non_de] = 0 self.isDE = np.arange(n_genes) >= num_non_de sim.generate_data() return sim
def _test_wald_de( self, constrained: bool, spline_basis: str, ngenes: int ): if self.noise_model == "nb": from batchglm.api.models.tf1.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(2, 5, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif self.noise_model == "norm": from batchglm.api.models import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % self.noise_model) n_timepoints = 7 sim = Simulator(num_observations=n_timepoints*200, num_features=ngenes) sim.generate_sample_description( num_batches=0, num_conditions=n_timepoints ) sim.generate_params( rand_fn_loc=rand_fn_loc, rand_fn_scale=rand_fn_scale ) num_non_de = round(ngenes / 2) sim.a_var[1:, :num_non_de] = 0 # Set all condition effects of non DE genes to zero. sim.b_var[1:, :] = 0 # Use constant dispersion across all conditions. self.isDE = np.arange(ngenes) >= num_non_de sim.generate_data() random_sample_description = sim.sample_description random_sample_description["continuous"] = [int(x) for x in random_sample_description["condition"]] random_sample_description["batch"] = [ str(int(x)) + str(np.random.randint(0, 3)) for x in random_sample_description["continuous"] ] test = de.test.continuous_1d( data=sim.input_data, sample_description=random_sample_description, gene_names=["gene" + str(i) for i in range(sim.input_data.num_features)], formula_loc="~ 1 + continuous + batch" if constrained else "~ 1 + continuous", formula_scale="~ 1", factor_loc_totest="continuous", continuous="continuous", constraints_loc={"batch": "continuous"} if constrained else None, df=5, spline_basis=spline_basis, test="wald", quick_scale=True, noise_model=self.noise_model ) self._eval(sim=sim, test=test)
def _test_interaction(self, ngenes: int, test: str, constrained: bool, spline_basis: str):
    """
    Simulate data and fit a continuous-by-condition interaction model.

    The sample description is drawn at random: a float "continuous" covariate in
    [0, 5), a binary "condition" label, a "batch" label built from the condition label and
    random "size_factors" in [0.9, 1.1). The fit itself is delegated to
    self._fit_continuous_interaction().

    :param ngenes: Number of genes to simulate (number of tests).
    :param test: Test identifier passed through to self._fit_continuous_interaction().
    :param constrained: Passed through to self._fit_continuous_interaction().
    :param spline_basis: Passed through to self._fit_continuous_interaction().
    :return: Whatever self._fit_continuous_interaction() returns.
    """
    n_timepoints = 5
    sim = Simulator(num_observations=n_timepoints * 200, num_features=ngenes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate_params()
    sim.generate_data()

    timepoints = np.random.randint(0, n_timepoints, size=sim.nobs)
    random_sample_description = pd.DataFrame({
        "continuous": np.asarray(timepoints, dtype=float)
    })
    # One random condition label per observation.
    random_sample_description["condition"] = [
        str(np.random.randint(0, 2))
        for _ in random_sample_description["continuous"]
    ]
    # Batch labels extend the condition label by one random digit.
    random_sample_description["batch"] = [
        condition + str(np.random.randint(0, 3))
        for condition in random_sample_description["condition"]
    ]
    # TODO put into simulation.
    random_sample_description["size_factors"] = np.random.uniform(0.9, 1.1, sim.nobs)

    return self._fit_continuous_interaction(
        sim=sim,
        sample_description=random_sample_description,
        test=test,
        constrained=constrained,
        spline_basis=spline_basis,
    )
def test(self):
    """
    Check that factors that are numeric receive the correct number of coefficients.

    Two random numeric covariates are added to a simulated two-condition sample description
    and declared via as_numeric. The fitted location model is then expected to have four
    rows in a_var.

    :return: True if the assertion passed.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=2000, num_features=2)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    sim.generate_params()
    sim.generate_data()

    sample_description = sim.sample_description
    for column in ("numeric1", "numeric2"):
        sample_description[column] = np.random.random(size=sim.nobs)

    test = de.test.wald(
        data=sim.input_data,
        sample_description=sample_description,
        formula_loc="~ 1 + condition + numeric1 + numeric2",
        formula_scale="~ 1",
        factor_loc_totest="condition",
        as_numeric=["numeric1", "numeric2"],
        training_strategy="DEFAULT",
    )

    # Check that number of coefficients is correct.
    n_coefficients = test.model_estim.a_var.shape[0]
    assert n_coefficients == 4

    return True
def _test_single_full_rank(self): """ Test if de.wald() generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if self.noise_model == "nb": from batchglm.api.models.tf1.glm_nb import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif self.noise_model == "norm": from batchglm.api.models import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % self.noise_model) sim = Simulator(num_observations=200, num_features=2) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate_params(rand_fn_scale=rand_fn_scale) sim.generate_data() random_sample_description = pd.DataFrame({ "condition": [str(x) for x in np.random.randint(2, size=sim.nobs)] }) try: random_sample_description["batch"] = random_sample_description["condition"] _ = de.test.wald( data=sim.input_data, sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", noise_model=self.noise_model ) except ValueError as error: logging.getLogger("diffxpy").info(error) else: raise ValueError("rank error was erroneously not thrown on under-determined unconstrained system") try: random_sample_description["batch"] = [ x + str(np.random.randint(0, 2)) for x in random_sample_description["condition"].values ] _ = de.test.wald( data=sim.input_data, sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", constraints_loc={"batch": "condition"}, noise_model=self.noise_model ) except ValueError as error: raise ValueError("rank 
error was erroneously thrown on defined constrained system")
def _test_null_distribution_wald(self, n_cells: int, n_genes: int, noise_model: str): """ Test if de.wald() generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.tf1.glm_nb import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate_params(rand_fn_scale=rand_fn_scale) sim.generate_data() random_sample_description = pd.DataFrame({ "condition": np.random.randint(2, size=sim.nobs), "batch": np.random.randint(2, size=sim.nobs) }) random_sf = np.random.uniform(0.5, 1.5, sim.nobs) test = de.test.wald(data=sim.input_data, sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", size_factors=random_sf, batch_size=500, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64") _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue logging.getLogger("diffxpy").info( 'KS-test pvalue for null model match of wald(): %f' % pval_h0) assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)) return True
def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
    """
    Simulate a data set with two conditions and no batches.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: The simulator after parameters and data were generated.
    """
    simulator = Simulator(
        num_observations=n_cells,
        num_features=n_genes,
    )
    simulator.generate_sample_description(num_batches=0, num_conditions=2)
    # Parameters first, then observations drawn from them.
    simulator.generate_params()
    simulator.generate_data()
    return simulator
def _test_all_moments(self): if self.noise_model is None: raise ValueError("noise_model is None") else: if self.noise_model == "nb": from batchglm.api.models.tf1.glm_nb import Simulator elif self.noise_model == "norm": from batchglm.api.models import Simulator elif self.noise_model == "beta": from batchglm.api.models.tf1.glm_beta import Simulator else: raise ValueError("noise_model not recognized") self.sim = Simulator(num_observations=100000, num_features=10) self.sim.generate_sample_description(num_batches=1, num_conditions=1) self.sim.generate_params() self.sim.generate_data() success = self.eval_simulation_mean() assert success, "mean of simulation was inaccurate" return True
def test_rank_test_zero_variance(self):
    """
    Test if rank test works if it is given genes with zero variance.

    The first two features of the simulated data are overwritten with the constants 0 and 5.
    The rank test (run with is_sig_zerovar=True) is expected to return NaN for the first and
    1 for the second feature.

    NOTE(review): the assertion message speaks of a "p-value of zero" while the check expects
    NaN and 1 - confirm the intended wording.

    :return: True if the assertion passed.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    np.random.seed(1)
    sim = Simulator(num_observations=1000, num_features=10)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Make the first two genes constant across all observations.
    sim.input_data.x[:, 0] = 0
    sim.input_data.x[:, 1] = 5

    condition_labels = np.random.randint(2, size=sim.nobs)
    random_sample_description = pd.DataFrame({"condition": condition_labels})

    test = de.test.rank_test(
        data=sim.input_data,
        sample_description=random_sample_description,
        grouping="condition",
        is_sig_zerovar=True,
    )

    assert np.isnan(test.pval[0]) and test.pval[1] == 1, (
        "rank test did not assign p-value of zero to groups with zero variance and same mean, %f, %f"
        % (test.pval[0], test.pval[1])
    )

    return True
def _test_null_distribution_rank(self, n_cells: int, n_genes: int):
    """
    Check that de.test.rank_test() yields uniformly distributed p-values on null-model data.

    Data are simulated with the glm_norm simulator without condition structure, a random binary
    grouping is attached, and the resulting p-values are compared against a uniform
    distribution with a Kolmogorov-Smirnov test.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the KS-test p-value exceeds 0.05.
    """
    from batchglm.api.models.tf1.glm_norm import Simulator

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    grouping = np.random.randint(2, size=sim.nobs)
    random_sample_description = pd.DataFrame({"condition": grouping})

    test = de.test.rank_test(
        data=sim.input_data,
        sample_description=random_sample_description,
        grouping="condition",
    )
    _ = test.summary()

    # Compare p-value distribution under null model against uniform distribution.
    ks_result = stats.kstest(test.pval, 'uniform')
    pval_h0 = ks_result.pvalue

    diffxpy_logger = logging.getLogger("diffxpy")
    diffxpy_logger.info('KS-test pvalue for null model match of rank_test(): %f' % pval_h0)
    assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5))

    return True
def test_forfatal_from_dict(self):
    """
    Test if dictionary-based constraint interface is working.

    Runs de.test.wald() with {"batch": "cond"} constraints on both the location and the scale
    model and calls summary() on the result; the test passes if nothing raises.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    np.random.seed(1)
    n_cells = 2000
    n_genes = 2

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: two conditions of 1000 cells, four batches of 500 cells.
    cell_index = range(n_cells)
    sample_description = pd.DataFrame({
        "cond": [f"cond{idx // 1000}" for idx in cell_index],
        "batch": [f"batch{idx // 500}" for idx in cell_index],
    })

    formula = "~1+cond+batch"
    test = de.test.wald(
        data=sim.input_data,
        sample_description=sample_description,
        formula_loc=formula,
        formula_scale=formula,
        constraints_loc={"batch": "cond"},
        constraints_scale={"batch": "cond"},
        coef_to_test=["cond[T.cond1]"],
    )
    _ = test.summary()
def get_simulator(self):
    """
    Create a simulator (10000 observations, 10 features) for self.noise_model.

    :return: A new Simulator instance of the class matching the noise model.
    :raises ValueError: If self.noise_model is None or not one of "nb", "norm", "beta".
    """
    if self.noise_model is None:
        raise ValueError("noise_model is None")

    # Resolve the simulator class lazily so only the required backend is imported.
    if self.noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
    elif self.noise_model == "norm":
        from batchglm.api.models import Simulator
    elif self.noise_model == "beta":
        from batchglm.api.models.tf1.glm_beta import Simulator
    else:
        raise ValueError("noise_model not recognized")

    simulator = Simulator(num_observations=10000, num_features=10)
    return simulator
class TestSimulationGlmAll:
    """
    Checks on simulated GLM data for the noise model named by the noise_model attribute.
    """

    sim: _SimulatorGLM
    input_data: InputDataGLM
    noise_model: str

    def eval_simulation_mean(self):
        """
        Compare the first row of self.sim.a_var with the link-transformed observed feature means.

        The observed means are taken over axis 0 of self.sim.input_data.x and passed through
        self.sim.link_loc(). Mean and standard deviation of the difference are logged and
        compared against fixed thresholds (1e-2 and 1e-1).

        :return: True if both deviations are below their thresholds, False otherwise.
        :raises ValueError: If self.noise_model is None or not one of "nb", "norm", "beta".
        """
        if self.noise_model is None:
            raise ValueError("noise_model is None")
        if self.noise_model not in ("nb", "norm", "beta"):
            raise ValueError("noise_model not recognized")

        # All supported noise models currently share the same tolerances.
        threshold_dev = 1e-2
        threshold_std = 1e-1

        means_sim = self.sim.a_var[0, :]
        means_obs = self.sim.link_loc(np.mean(self.sim.input_data.x, axis=0))
        deviation = means_sim - means_obs
        mean_dev = np.mean(deviation)
        std_dev = np.std(deviation)

        batchglm_logger = logging.getLogger("batchglm")
        batchglm_logger.info("mean_dev_a %f" % mean_dev)
        batchglm_logger.info("std_dev_a %f" % std_dev)

        return bool(np.abs(mean_dev) < threshold_dev and std_dev < threshold_std)

    def _test_all_moments(self):
        """
        Simulate a large data set for self.noise_model and check its mean.

        The simulator (100000 observations, 10 features) is stored in self.sim.

        :return: True if eval_simulation_mean() reported success.
        :raises ValueError: If self.noise_model is None or not one of "nb", "norm", "beta".
        """
        if self.noise_model is None:
            raise ValueError("noise_model is None")

        if self.noise_model == "nb":
            from batchglm.api.models.tf1.glm_nb import Simulator
        elif self.noise_model == "norm":
            from batchglm.api.models import Simulator
        elif self.noise_model == "beta":
            from batchglm.api.models.tf1.glm_beta import Simulator
        else:
            raise ValueError("noise_model not recognized")

        self.sim = Simulator(num_observations=100000, num_features=10)
        self.sim.generate_sample_description(num_batches=1, num_conditions=1)
        self.sim.generate_params()
        self.sim.generate_data()

        assert self.eval_simulation_mean(), "mean of simulation was inaccurate"
        return True
def simulate(self, n_cells: int = 200, n_genes: int = 2):
    """
    Simulate data without condition or batch structure and attach a random binary condition.

    :param n_cells: Number of cells to simulate (number of observations).
    :param n_genes: Number of genes to simulate (number of features).
    :return: Tuple of sim.x and a DataFrame with one random "condition" column.
    """
    simulator = Simulator(num_observations=n_cells, num_features=n_genes)
    simulator.generate_sample_description(num_batches=0, num_conditions=0)
    simulator.generate()

    n_observations = simulator.input_data.num_observations
    condition_labels = np.random.randint(2, size=n_observations)
    random_sample_description = pd.DataFrame({"condition": condition_labels})

    return simulator.x, random_sample_description
def _test_model_fit_partition( self, n_cells: int, n_genes: int, noise_model: str ): """ Test if de.wald() generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.tf1.glm_nb import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.tf1.glm_norm import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate_params(rand_fn_scale=rand_fn_scale) sim.generate_data() random_sample_description = pd.DataFrame({ "condition": np.random.randint(2, size=sim.nobs), "batch": np.random.randint(2, size=sim.nobs) }) partition = de.fit.partition( data=sim.input_data, sample_description=random_sample_description, parts="condition" ) estim = partition.model( formula_loc="~ 1 + batch", noise_model=noise_model ) return True
def test_forfatal_from_string(self):
    """
    Test if _from_string interface is working.

    A design matrix with an intercept, four biological replicate columns and one treatment
    column is built by hand, string constraints are converted with
    de.utils.constraint_matrix_from_string() and de.test.wald() is run on the result.

    n_cells is constant as the design matrix and constraints depend on it.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    np.random.seed(1)
    n_cells = 2000
    n_genes = 2

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: (first row, end row, column) of every indicator block.
    indicator_blocks = [
        (0, 500, 1),      # bio rep 1
        (500, 1000, 2),   # bio rep 2
        (1000, 1500, 3),  # bio rep 3
        (1500, 2000, 4),  # bio rep 4
        (1000, 2000, 5),  # condition effect
    ]
    dmat = np.zeros([n_cells, 6])
    dmat[:, 0] = 1
    for start, stop, column in indicator_blocks:
        dmat[start:stop, column] = 1

    coefficient_names = ['intercept', 'bio1', 'bio2', 'bio3', 'bio4', 'treatment1']
    dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names)

    dmat_est_loc, _ = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe")
    dmat_est_scale, _ = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe")

    # Build constraints: the same sum-to-zero constraints for location and scale.
    constraint_strings = ["bio1+bio2=0", "bio3+bio4=0"]
    constraints_loc = de.utils.constraint_matrix_from_string(
        dmat=dmat_est_loc.values,
        coef_names=dmat_est_loc.columns,
        constraints=list(constraint_strings),
    )
    constraints_scale = de.utils.constraint_matrix_from_string(
        dmat=dmat_est_scale.values,
        coef_names=dmat_est_scale.columns,
        constraints=list(constraint_strings),
    )

    test = de.test.wald(
        data=sim.input_data,
        dmat_loc=dmat_est_loc,
        dmat_scale=dmat_est_scale,
        constraints_loc=constraints_loc,
        constraints_scale=constraints_scale,
        coef_to_test=["treatment1"],
    )
    _ = test.summary()
def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200):
    """
    Check that the partitioned likelihood-ratio test yields uniform p-values on null-model data.

    Data are simulated with two conditions, partitioned by that condition via
    de.test.partition() and tested with partition.lrt() for the random covariate "covar1".
    The flattened p-values are compared against a uniform distribution with a
    Kolmogorov-Smirnov test.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the KS-test p-value exceeds 0.05.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    sim.generate()

    # Random covariates plus the simulated condition, which defines the partitions.
    sample_description = pd.DataFrame({
        "covar1": np.random.randint(2, size=sim.nobs),
        "covar2": np.random.randint(2, size=sim.nobs),
    })
    sample_description["cond"] = sim.sample_description["condition"].values

    partition = de.test.partition(
        data=sim.x,
        parts="cond",
        sample_description=sample_description,
    )
    det = partition.lrt(
        full_formula_loc="~ 1 + covar1",
        full_formula_scale="~ 1",
        reduced_formula_loc="~ 1",
        reduced_formula_scale="~ 1",
        training_strategy="DEFAULT",
        dtype="float64",
    )
    _ = det.summary()

    # Compare p-value distribution under null model against uniform distribution.
    ks_result = stats.kstest(det.pval.flatten(), 'uniform')
    pval_h0 = ks_result.pvalue

    diffxpy_logger = logging.getLogger("diffxpy")
    diffxpy_logger.info('KS-test pvalue for null model match of lrt(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

    return True
def _prepate_data(self, n_cells: int, n_genes: int, n_groups: int): if self.noise_model == "nb": from batchglm.api.models.tf1.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(0.1, 1, shape) rand_fn_scale = lambda shape: np.random.uniform(0.5, 1, shape) elif self.noise_model == "norm" or self.noise_model is None: from batchglm.api.models import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % self.noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate_params(rand_fn_loc=rand_fn_loc, rand_fn_scale=rand_fn_scale) sim.generate_data() random_sample_description = pd.DataFrame({ "condition": [str(x) for x in np.random.randint(n_groups, size=sim.nobs)] }) return sim, random_sample_description
def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
    """
    Check that de.test.versus_rest() with a Wald test yields uniform p-values on null-model data.

    Data are simulated with the glm_nb simulator without condition structure, a random
    grouping with n_groups levels is attached, and the flattened p-values are compared
    against a uniform distribution with a Kolmogorov-Smirnov test.

    :param n_cells: Number of cells to simulate (number of observations per test).
    :param n_genes: Number of genes to simulate (number of tests).
    :param n_groups: Number of groups in the random grouping.
    :return: True if the KS-test p-value exceeds 0.05.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    from batchglm.api.models.tf1.glm_nb import Simulator

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    group_labels = np.random.randint(n_groups, size=sim.nobs)
    random_sample_description = pd.DataFrame({"condition": group_labels})

    test = de.test.versus_rest(
        data=sim.x,
        grouping="condition",
        test="wald",
        noise_model="nb",
        sample_description=random_sample_description,
        batch_size=500,
        training_strategy="DEFAULT",
        dtype="float64",
    )
    _ = test.summary()

    # Compare p-value distribution under null model against uniform distribution.
    ks_result = stats.kstest(test.pval.flatten(), 'uniform')
    pval_h0 = ks_result.pvalue

    diffxpy_logger = logging.getLogger("diffxpy")
    diffxpy_logger.info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

    return True
def _test_null_distribution_lrt(self, n_cells: int, n_genes: int, noise_model: str): """ Test if de.lrt() generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": from batchglm.api.models.tf1.glm_nb import Simulator elif noise_model == "norm": from batchglm.api.models.tf1.glm_norm import Simulator else: raise ValueError("noise model %s not recognized" % noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() random_sample_description = pd.DataFrame( {"condition": np.random.randint(2, size=sim.nobs)}) test = de.test.lrt(data=sim.input_data, sample_description=random_sample_description, full_formula_loc="~ 1 + condition", full_formula_scale="~ 1", reduced_formula_loc="~ 1", reduced_formula_scale="~ 1", noise_model=noise_model) _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue logging.getLogger("diffxpy").info( 'KS-test pvalue for null model match of lrt(): %f' % pval_h0) assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)) return True
def test_null_distribution_wald_constrained(self, n_genes: int = 100):
    """
    Check that de.test.wald() with constraints yields uniform p-values on null-model data.

    Batches are nested in conditions and constrained via {"batch": "cond"} in both the
    location and the scale model. The p-values for the coefficient "cond[T.cond1]" are
    compared against a uniform distribution with a Kolmogorov-Smirnov test.

    n_cells is constant as the design matrix and constraints depend on it.

    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the KS-test p-value exceeds 0.05.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    np.random.seed(1)
    n_cells = 2000

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: two conditions of 1000 cells, four batches of 500 cells.
    cell_index = range(n_cells)
    sample_description = pd.DataFrame({
        "cond": [f"cond{idx // 1000}" for idx in cell_index],
        "batch": [f"batch{idx // 500}" for idx in cell_index],
    })

    formula = "~1+cond+batch"
    test = de.test.wald(
        data=sim.input_data,
        sample_description=sample_description,
        formula_loc=formula,
        formula_scale=formula,
        constraints_loc={"batch": "cond"},
        constraints_scale={"batch": "cond"},
        coef_to_test=["cond[T.cond1]"],
    )
    _ = test.summary()

    # Compare p-value distribution under null model against uniform distribution.
    ks_result = stats.kstest(test.pval, 'uniform')
    pval_h0 = ks_result.pvalue

    diffxpy_logger = logging.getLogger("diffxpy")
    diffxpy_logger.info('KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True
def simulate(self):
    """
    Simulate a small data set for self.noise_model and store the simulator in self.sim.

    The simulation uses 500 observations, 4 features, 2 conditions and 2 batches.

    :raises ValueError: If self.noise_model is None or not one of "nb", "norm", "beta".
    """
    if self.noise_model is None:
        raise ValueError("noise_model is None")

    # Pick the simulator class that belongs to the noise model.
    if self.noise_model == "nb":
        from batchglm.api.models.tf1.glm_nb import Simulator
    elif self.noise_model == "norm":
        from batchglm.api.models import Simulator
    elif self.noise_model == "beta":
        from batchglm.api.models.tf1.glm_beta import Simulator
    else:
        raise ValueError("noise_model not recognized")

    num_observations = 500
    simulator = Simulator(num_observations=num_observations, num_features=4)
    simulator.generate_sample_description(num_conditions=2, num_batches=2)
    simulator.generate()

    self.sim = simulator
def test_for_fatal(self):
    """
    Smoke-test de.enrich.test() on the output of a Wald test.

    A Wald test is fitted on a small simulated data set, two manual reference gene sets are
    registered, and the enrichment test plus its summary accessors are called for every
    combination of incl_all_zero and clean_ref. The test passes if nothing raises.

    :return: True if all calls completed.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    sim = Simulator(num_observations=50, num_features=10)
    sim.generate_sample_description(num_batches=0, num_conditions=2)
    sim.generate()

    # Genes are named by their column index so the reference sets below can refer to them.
    gene_names = list(map(str, range(sim.X.shape[1])))
    test = de.test.wald(
        data=sim.X,
        factor_loc_totest="condition",
        formula_loc="~ 1 + condition",
        sample_description=sim.sample_description,
        gene_names=gene_names,
        training_strategy="DEFAULT",
        dtype="float64",
    )

    # Set up reference gene sets.
    rs = de.enrich.RefSets()
    rs.add(id="set1", source="manual", gene_ids=["1", "3"])
    rs.add(id="set2", source="manual", gene_ids=["5", "6"])

    flag_combinations = [
        (incl_all_zero, clean_ref)
        for incl_all_zero in (True, False)
        for clean_ref in (True, False)
    ]
    for incl_all_zero, clean_ref in flag_combinations:
        enrichment = de.enrich.test(
            ref=rs,
            det=test,
            threshold=0.05,
            incl_all_zero=incl_all_zero,
            clean_ref=clean_ref,
        )
        _ = enrichment.summary()
        _ = enrichment.significant_set_ids()
        _ = enrichment.significant_sets()
        _ = enrichment.set_summary(id="set1")

    return True
def _test_compute_hessians(self, sparse): if self.noise_model is None: raise ValueError("noise_model is None") else: if self.noise_model == "nb": from batchglm.api.models.tf1.glm_nb import Simulator, InputDataGLM elif self.noise_model == "norm": from batchglm.api.models import Simulator, InputDataGLM elif self.noise_model == "beta": from batchglm.api.models.tf1.glm_beta import Simulator, InputDataGLM else: raise ValueError("noise_model not recognized") num_observations = 500 num_conditions = 2 sim = Simulator(num_observations=num_observations, num_features=4) sim.generate_sample_description(num_conditions=num_conditions, num_batches=2) sim.generate() sample_description = data_utils.sample_description_from_xarray( sim.data, dim="observations") design_loc = data_utils.design_matrix( sample_description, formula="~ 1 + condition + batch") design_scale = data_utils.design_matrix(sample_description, formula="~ 1 + condition") if sparse: input_data = InputDataGLM(data=scipy.sparse.csr_matrix(sim.X), design_loc=design_loc, design_scale=design_scale) else: input_data = InputDataGLM(data=sim.X, design_loc=design_loc, design_scale=design_scale) # Compute hessian based on analytic solution. pkg_constants.HESSIAN_MODE = "analytic" t0_analytic = time.time() h_analytic = self.get_hessians(input_data) t1_analytic = time.time() t_analytic = t1_analytic - t0_analytic # Compute hessian based on tensorflow auto-differentiation. 
pkg_constants.HESSIAN_MODE = "tf1" t0_tf = time.time() h_tf = self.get_hessians(input_data) t1_tf = time.time() t_tf = t1_tf - t0_tf logging.getLogger("batchglm").info( "run time observation batch-wise analytic solution: %f" % t_analytic) logging.getLogger("batchglm").info("run time tensorflow solution: %f" % t_tf) logging.getLogger("batchglm").info("MAD: %f" % np.max(np.abs((h_tf - h_analytic)))) #i = 1 #print(h_tf[i, :, :]) #print(h_analytic[i, :, :]) #print(h_tf[i, :, :] - h_analytic[i, :, :]) # Make sure that hessians are not all zero which might make evaluation of equality difficult. assert np.sum(np.abs(h_analytic)) > 1e-10, \ "hessians too small to perform test: %f" % np.sum(np.abs(h_analytic)) mad = np.max(np.abs(h_tf - h_analytic)) assert mad < 1e-15, mad return True
def _test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100):
    """
    Check that de.test.wald() with two layers of constraints yields uniform p-values.

    A hand-built design matrix with an intercept, one treatment column, eight biological
    replicate columns and four technical replicate columns is used for the location model;
    the scale model only uses the intercept column. The p-values for "treatment1" are
    compared against a uniform distribution with a Kolmogorov-Smirnov test.

    n_cells is constant as the design matrix and constraints depend on it.

    :param n_genes: Number of genes to simulate (number of tests).
    :return: True if the KS-test p-value exceeds 0.05.
    """
    for logger_name, level in (
        ("tensorflow", logging.ERROR),
        ("batchglm", logging.WARNING),
        ("diffxpy", logging.WARNING),
    ):
        logging.getLogger(logger_name).setLevel(level)

    np.random.seed(1)
    n_cells = 12000

    sim = Simulator(num_observations=n_cells, num_features=n_genes)
    sim.generate_sample_description(num_batches=0, num_conditions=0)
    sim.generate()

    # Build design matrix: for every column, the row ranges that are set to one.
    indicator_rows = {
        1: [(6000, 12000)],                 # condition effect
        2: [(0, 1000)],                     # bio rep 1 - treated 1
        3: [(1000, 3000)],                  # bio rep 2 - treated 2
        4: [(3000, 5000)],                  # bio rep 3 - treated 3
        5: [(5000, 6000)],                  # bio rep 4 - treated 4
        6: [(6000, 7000)],                  # bio rep 5 - untreated 1
        7: [(7000, 9000)],                  # bio rep 6 - untreated 2
        8: [(9000, 11000)],                 # bio rep 7 - untreated 3
        9: [(11000, 12000)],                # bio rep 8 - untreated 4
        10: [(1000, 2000), (7000, 8000)],   # tech rep 1
        11: [(2000, 3000), (8000, 9000)],   # tech rep 2
        12: [(3000, 4000), (9000, 10000)],  # tech rep 3
        13: [(4000, 5000), (10000, 11000)], # tech rep 4
    }
    dmat = np.zeros([n_cells, 14])
    dmat[:, 0] = 1
    for column, row_ranges in indicator_rows.items():
        for start, stop in row_ranges:
            dmat[start:stop, column] = 1

    coefficient_names = (
        ['intercept', 'treatment1']
        + ['bio%d' % idx for idx in range(1, 9)]
        + ['tech%d' % idx for idx in range(1, 5)]
    )
    dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names)

    # NOTE(review): test_forfatal_from_string unpacks `dmat, _ = de.utils.design_matrix(...)`;
    # confirm which return shape applies here before enabling this test.
    dmat_est_loc = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe")
    dmat_est_scale = de.utils.design_matrix(dmat=dmat_est.iloc[:, [0]], return_type="dataframe")

    # Build constraints: pairwise sum-to-zero constraints on replicate coefficients.
    constraints_loc = de.utils.constraint_matrix_from_string(
        dmat=dmat_est_loc.values,
        coef_names=dmat_est_loc.columns,
        constraints=[
            "bio1+bio2=0",
            "bio3+bio4=0",
            "bio5+bio6=0",
            "bio7+bio8=0",
            "tech1+tech2=0",
            "tech3+tech4=0",
        ],
    )
    constraints_scale = None

    test = de.test.wald(
        data=sim.input_data,
        dmat_loc=dmat_est_loc,
        dmat_scale=dmat_est_scale,
        constraints_loc=constraints_loc,
        constraints_scale=constraints_scale,
        coef_to_test=["treatment1"],
    )
    _ = test.summary()

    # Compare p-value distribution under null model against uniform distribution.
    ks_result = stats.kstest(test.pval, 'uniform')
    pval_h0 = ks_result.pvalue

    diffxpy_logger = logging.getLogger("diffxpy")
    diffxpy_logger.info('KS-test pvalue for null model match of wald(): %f' % pval_h0)
    assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

    return True