Example #1
0
    def _prepare_data(self, n_cells: int, n_genes: int, noise_model: str):
        """
        Simulate a two-condition data set and record which genes keep an effect.

        For the first ``n_genes // 2`` genes, index 1 along the first axis of
        ``a_var`` and ``b_var`` is set to zero (presumably the condition
        coefficient - confirm against batchglm). ``self.isDE`` is set to a
        boolean mask that is True for the remaining genes.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :param noise_model: Noise model to use for data fitting ("nb" or "norm").
        :return: The simulator instance after data generation.
        :raises ValueError: If noise_model is neither "nb" nor "norm".
        """
        if noise_model == "nb":
            from batchglm.api.models.glm_nb import Simulator
            loc_low, loc_high = 5, 10
        elif noise_model == "norm":
            from batchglm.api.models.glm_norm import Simulator
            loc_low, loc_high = 500, 1000
        else:
            raise ValueError("noise model %s not recognized" % noise_model)

        def draw_loc(shape):
            # Location parameters, range depends on the noise model.
            return np.random.uniform(loc_low, loc_high, shape)

        def draw_scale(shape):
            # Scale parameters share the same range for both noise models.
            return np.random.uniform(1, 2, shape)

        n_null_genes = n_genes // 2
        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=2)
        simulator.generate_params(rand_fn_loc=draw_loc, rand_fn_scale=draw_scale)

        # Zero index 1 of both parameter matrices for the leading half of the genes.
        simulator.a_var[1, :n_null_genes] = 0
        simulator.b_var[1, :n_null_genes] = 0
        self.isDE = np.arange(n_genes) >= n_null_genes

        simulator.generate_data()
        return simulator
    def test(self):
        """
        Check that factors that are numeric receive the correct number of coefficients.

        Two random covariates are added to the simulated sample description and
        passed to de.test.wald() via ``as_numeric``. The fitted location model
        must then have 4 entries along the first axis of ``a_var``.

        :return: True if the assertion passed.
        """
        log_levels = {
            "tensorflow": logging.ERROR,
            "batchglm": logging.WARNING,
            "diffxpy": logging.WARNING,
        }
        for logger_name, level in log_levels.items():
            logging.getLogger(logger_name).setLevel(level)

        simulator = Simulator(num_observations=2000, num_features=2)
        simulator.generate_sample_description(num_batches=0, num_conditions=2)
        simulator.generate_params()
        simulator.generate_data()

        # Extend the simulated annotation with two continuous covariates.
        covariates = simulator.sample_description
        for column in ("numeric1", "numeric2"):
            covariates[column] = np.random.random(size=simulator.nobs)

        wald_result = de.test.wald(
            data=simulator.input_data,
            sample_description=covariates,
            formula_loc="~ 1 + condition + numeric1 + numeric2",
            formula_scale="~ 1",
            factor_loc_totest="condition",
            as_numeric=["numeric1", "numeric2"],
            training_strategy="DEFAULT")

        # Check that number of coefficients is correct.
        n_loc_coefficients = wald_result.model_estim.a_var.shape[0]
        assert n_loc_coefficients == 4

        return True
Example #3
0
    def get_simulator(self):
        """
        Build a simulator matching ``self.noise_model``.

        Only the "nb" noise model is supported.

        :return: A batchglm glm_nb Simulator with 1000 observations and 50 features.
        :raises ValueError: If ``self.noise_model`` is None or not "nb".
        """
        if self.noise_model is None:
            raise ValueError("noise_model is None")
        if self.noise_model != "nb":
            raise ValueError("noise_model not recognized")

        # Imported lazily so batchglm is only needed once the model is validated.
        from batchglm.api.models.glm_nb import Simulator

        return Simulator(num_observations=1000, num_features=50)
    def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
        """
        Simulate a data set with two conditions, no batches and default parameters.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :return: The simulator instance after data generation.
        """
        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=2)
        # Parameters first, then data drawn from them.
        simulator.generate_params()
        simulator.generate_data()
        return simulator
    def _test_null_distribution_wald(self, n_cells: int, n_genes: int,
                                     noise_model: str):
        """
        Check that de.test.wald() yields uniform p-values on null-model data.

        Data are simulated with ``num_conditions=0``, cells get random two-level
        "condition" and "batch" labels, and the p-values for "condition" are
        compared against a uniform distribution with a two-sided
        Kolmogorov-Smirnov test.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :param noise_model: Noise model to use for data fitting ("nb" or "norm").
        :return: True if the KS-test p-value is above 0.05.
        :raises ValueError: If noise_model is neither "nb" nor "norm".
        """
        if noise_model == "nb":
            from batchglm.api.models.glm_nb import Simulator
        elif noise_model == "norm":
            from batchglm.api.models.glm_norm import Simulator
        else:
            raise ValueError("noise model %s not recognized" % noise_model)

        def draw_scale(shape):
            # Both noise models use the same scale parameter range.
            return np.random.uniform(1, 2, shape)

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate_params(rand_fn_scale=draw_scale)
        simulator.generate_data()

        # Random labels that are unrelated to the simulated expression values.
        condition_labels = np.random.randint(2, size=simulator.nobs)
        batch_labels = np.random.randint(2, size=simulator.nobs)
        annotation = pd.DataFrame({
            "condition": condition_labels,
            "batch": batch_labels,
        })

        wald_result = de.test.wald(
            data=simulator.input_data,
            sample_description=annotation,
            factor_loc_totest="condition",
            formula_loc="~ 1 + condition + batch",
            batch_size=500,
            noise_model=noise_model,
            training_strategy="DEFAULT",
            dtype="float64")
        wald_result.summary()

        # Compare p-value distribution under null model against uniform distribution.
        ks_result = stats.kstest(wald_result.pval, 'uniform')
        pval_h0 = ks_result.pvalue

        diffxpy_logger = logging.getLogger("diffxpy")
        diffxpy_logger.info(
            'KS-test pvalue for null model match of wald(): %f' % pval_h0)
        assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" %
                                np.round(pval_h0, 5))

        return True
Example #6
0
    def _test_residuals_fit(self, n_cells: int, n_genes: int,
                            noise_model: str):
        """
        Smoke-test de.fit.residuals() on simulated data.

        Data are simulated with the requested noise model, cells get random
        two-level "condition" and "batch" labels, and the residuals fit is run
        on the model "~ 1 + condition + batch". Nothing about the fit result is
        asserted; the test passes if no exception is raised.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :param noise_model: Noise model to use for data fitting ("nb" or "norm").
        :return: True if the fit completed.
        :raises ValueError: If noise_model is neither "nb" nor "norm".
        """
        if noise_model == "nb":
            from batchglm.api.models.glm_nb import Simulator
        elif noise_model == "norm":
            from batchglm.api.models.glm_norm import Simulator
        else:
            raise ValueError("noise model %s not recognized" % noise_model)

        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()

        random_sample_description = pd.DataFrame({
            "condition": np.random.randint(2, size=sim.nobs),
            "batch": np.random.randint(2, size=sim.nobs)
        })

        # The result is intentionally discarded: only the absence of errors is checked.
        de.fit.residuals(data=sim.input_data,
                         sample_description=random_sample_description,
                         formula_loc="~ 1 + condition + batch",
                         noise_model=noise_model)
        return True
Example #7
0
    def test_forfatal_functions(self):
        """
        Test if the accessor functions of the de.test.continuous_1d() result object work fine.

        A small data set (10 observations, 2 features) is fitted and every
        accessor is called once with ``nonnumeric=False`` and once with
        ``nonnumeric=True``. Results are discarded; only the absence of
        exceptions is checked.

        :return: True if all calls completed.
        """
        for logger_name, level in (("tensorflow", logging.ERROR),
                                   ("batchglm", logging.WARNING),
                                   ("diffxpy", logging.WARNING)):
            logging.getLogger(logger_name).setLevel(level)

        simulator = Simulator(num_observations=10, num_features=2)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate()

        pseudotime_values = np.random.random(size=simulator.num_observations)
        batch_labels = np.random.randint(2, size=simulator.num_observations)
        annotation = pd.DataFrame({
            "pseudotime": pseudotime_values,
            "batch": batch_labels,
        })

        continuous_result = de.test.continuous_1d(
            data=simulator.X,
            continuous="pseudotime",
            df=3,
            formula_loc="~ 1 + pseudotime + batch",
            formula_scale="~ 1",
            factor_loc_totest="pseudotime",
            test="wald",
            sample_description=annotation,
            quick_scale=True,
            batch_size=None,
            training_strategy="DEFAULT",
            dtype="float64")

        continuous_result.summary()
        gene_ids = continuous_result.gene_ids

        # Test all additional functions which depend on model computation:
        # first on the continuous model only (nonnumeric=False), then on the
        # full model (nonnumeric=True).
        per_gene_accessors = ("log_fold_change", "max", "min", "argmax", "argmin")
        for use_nonnumeric in (False, True):
            for accessor_name in per_gene_accessors:
                accessor = getattr(continuous_result, accessor_name)
                accessor(genes=gene_ids, nonnumeric=use_nonnumeric)
            continuous_result.summary(nonnumeric=use_nonnumeric)

        return True
    def _test_null_distribution_rank(self, n_cells: int, n_genes: int):
        """
        Check that de.test.rank_test() yields uniform p-values on null-model data.

        Data come from the batchglm glm_norm simulator with ``num_conditions=0``.
        Cells are assigned at random to one of two groups, and the resulting
        p-values are compared against a uniform distribution with a two-sided
        Kolmogorov-Smirnov test.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :return: True if the KS-test p-value is above 0.05.
        """
        from batchglm.api.models.glm_norm import Simulator

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate()

        group_labels = np.random.randint(2, size=simulator.nobs)
        annotation = pd.DataFrame({"condition": group_labels})

        rank_result = de.test.rank_test(
            data=simulator.input_data,
            sample_description=annotation,
            grouping="condition")
        rank_result.summary()

        # Compare p-value distribution under null model against uniform distribution.
        ks_result = stats.kstest(rank_result.pval, 'uniform')
        pval_h0 = ks_result.pvalue

        diffxpy_logger = logging.getLogger("diffxpy")
        diffxpy_logger.info(
            'KS-test pvalue for null model match of rank_test(): %f' % pval_h0)
        assert pval_h0 > 0.05, ("KS-Test failed: pval_h0=%f is <= 0.05!" %
                                np.round(pval_h0, 5))

        return True
Example #9
0
    def test_rank_test_zero_variance(self):
        """
        Test if rank test works if it is given genes with zero variance.

        The first two genes of the simulated data are overwritten with the
        constants 0 and 5, so both have zero variance and identical means in
        every group. With ``is_sig_zerovar=True`` the rank test is expected to
        report NaN for the all-zero gene and 1 for the constant non-zero gene.

        :return: True if the assertion passed.
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.WARNING)

        # Fixed seed so the simulated data and random grouping are reproducible.
        np.random.seed(1)
        sim = Simulator(num_observations=1000, num_features=10)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()
        sim.input_data.x[:, 0] = 0  # constant, all-zero gene
        sim.input_data.x[:, 1] = 5  # constant, non-zero gene

        random_sample_description = pd.DataFrame(
            {"condition": np.random.randint(2, size=sim.nobs)})

        test = de.test.rank_test(data=sim.input_data,
                                 sample_description=random_sample_description,
                                 grouping="condition",
                                 is_sig_zerovar=True)

        # NOTE(review): the NaN for gene 0 presumably comes from diffxpy's
        # handling of all-zero genes - confirm against the diffxpy source.
        # The message now names the values the assertion actually checks;
        # it previously claimed a p-value of zero was expected.
        assert np.isnan(test.pval[0]) and test.pval[1] == 1, \
            "rank test did not assign the expected p-values (nan, 1) to genes with zero variance and same mean, " \
            "got %f, %f" % (test.pval[0], test.pval[1])
        return True
Example #10
0
    def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
        """
        Simulate a two-condition data set with custom parameter generators.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :return: The simulator instance after data generation.
        """
        def draw_average(shape):
            # Poisson draws shifted by one so the value is never zero.
            return np.random.poisson(500, shape) + 1

        def draw_parameter(shape):
            # NOTE(review): the bounds are passed as (1, 0.5), i.e. low > high;
            # kept as-is so the random stream is unchanged.
            return np.abs(np.random.uniform(1, 0.5, shape))

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=2)
        simulator.generate_params(rand_fn_ave=draw_average,
                                  rand_fn=draw_parameter)
        simulator.generate_data()
        return simulator
Example #11
0
    def simulate(self, n_cells: int = 20, n_genes: int = 2):
        """
        Simulate a small data set together with a random two-group annotation.

        :param n_cells: Number of cells to simulate.
        :param n_genes: Number of genes to simulate.
        :return: Tuple of the simulator's ``X`` and a data frame with a random
            binary "condition" column.
        """
        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate()

        group_labels = np.random.randint(2, size=simulator.num_observations)
        annotation = pd.DataFrame({"condition": group_labels})
        return simulator.X, annotation
Example #12
0
    def test_null_distribution_wald_constrained(self, n_genes: int = 100):
        """
        Check that de.test.wald() with constraints yields uniform p-values on null-model data.

        Design and constraint matrices are built from a sample description with
        two "cond" levels (blocks of 1000 cells) and four "batch" levels (blocks
        of 500 cells), with batch constrained within cond. The p-values for
        "cond[T.cond1]" are compared against a uniform distribution with a
        two-sided Kolmogorov-Smirnov test.

        n_cells is constant as the design matrix and constraints depend on it.

        :param n_genes: Number of genes to simulate (number of tests).
        :return: True if the KS-test p-value is above 0.05.
        """
        for logger_name, level in (("tensorflow", logging.ERROR),
                                   ("batchglm", logging.WARNING),
                                   ("diffxpy", logging.WARNING)):
            logging.getLogger(logger_name).setLevel(level)

        n_cells = 2000

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate()

        # Build design matrix:
        cell_indices = range(n_cells)
        annotation = pd.DataFrame({
            "cond": ["cond%d" % (idx // 1000) for idx in cell_indices],
            "batch": ["batch%d" % (idx // 500) for idx in cell_indices],
        })

        def build_constrained_design(dims):
            # Same formula and constraints for location and scale; only dims differ.
            return de.utils.constraint_matrix_from_dict(
                sample_description=annotation,
                formula="~1+cond+batch",
                constraints={"batch": "cond"},
                dims=dims)

        # Build constraints:
        dmat_loc, constraints_loc = build_constrained_design(
            ["design_loc_params", "loc_params"])
        dmat_scale, constraints_scale = build_constrained_design(
            ["design_scale_params", "scale_params"])

        wald_result = de.test.wald(
            data=simulator.x,
            dmat_loc=dmat_loc,
            dmat_scale=dmat_scale,
            constraints_loc=constraints_loc,
            constraints_scale=constraints_scale,
            coef_to_test=["cond[T.cond1]"])
        wald_result.summary()

        # Compare p-value distribution under null model against uniform distribution.
        ks_result = stats.kstest(wald_result.pval, 'uniform')
        pval_h0 = ks_result.pvalue

        diffxpy_logger = logging.getLogger("diffxpy")
        diffxpy_logger.info(
            'KS-test pvalue for null model match of wald(): %f' % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

        return True
Example #13
0
    def _prepate_data(self, n_cells: int, n_genes: int, n_groups: int):
        """
        Simulate data for ``self.noise_model`` and draw a random group assignment.

        A noise model of None is treated like "norm".

        :param n_cells: Number of cells to simulate.
        :param n_genes: Number of genes to simulate.
        :param n_groups: Number of groups to assign cells to at random.
        :return: Tuple of the simulator and a data frame with a random
            "condition" column holding values in ``[0, n_groups)``.
        :raises ValueError: If ``self.noise_model`` is not "nb", "norm" or None.
        """
        if self.noise_model == "nb":
            from batchglm.api.models.glm_nb import Simulator
            loc_bounds, scale_bounds = (0.1, 1), (0.5, 1)
        elif self.noise_model is None or self.noise_model == "norm":
            from batchglm.api.models.glm_norm import Simulator
            loc_bounds, scale_bounds = (500, 1000), (1, 2)
        else:
            raise ValueError("noise model %s not recognized" %
                             self.noise_model)

        def draw_loc(shape):
            return np.random.uniform(loc_bounds[0], loc_bounds[1], shape)

        def draw_scale(shape):
            return np.random.uniform(scale_bounds[0], scale_bounds[1], shape)

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate_params(rand_fn_loc=draw_loc, rand_fn_scale=draw_scale)
        simulator.generate_data()

        group_labels = np.random.randint(n_groups, size=simulator.nobs)
        annotation = pd.DataFrame({"condition": group_labels})
        return simulator, annotation
    def simulate(self):
        """
        Simulate a data set for ``self.noise_model`` and store it on ``self.sim``.

        Only the "nb" noise model is supported. The simulator is created with
        500 observations, 4 features, 2 conditions and 2 batches.

        :raises ValueError: If ``self.noise_model`` is None or not "nb".
        """
        if self.noise_model is None:
            raise ValueError("noise_model is None")
        if self.noise_model != "nb":
            raise ValueError("noise_model not recognized")

        # Imported lazily so batchglm is only needed once the model is validated.
        from batchglm.api.models.glm_nb import Simulator

        n_observations = 500
        simulator = Simulator(num_observations=n_observations, num_features=4)
        simulator.generate_sample_description(num_conditions=2, num_batches=2)
        simulator.generate()

        self.sim = simulator
Example #15
0
    def test_forfatal_from_string(self):
        """
        Test if _from_string interface is working.

        A design matrix with an intercept, four biological replicate indicators
        (500 cells each) and one treatment indicator (last 1000 cells) is built
        by hand. String constraints tie the replicates pairwise, and a Wald
        test on "treatment1" is run. Only the absence of exceptions is checked.

        n_cells is constant as the design matrix and constraints depend on it.
        """
        for logger_name, level in (("tensorflow", logging.ERROR),
                                   ("batchglm", logging.WARNING),
                                   ("diffxpy", logging.WARNING)):
            logging.getLogger(logger_name).setLevel(level)

        n_cells = 2000
        n_genes = 2

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate()

        # Build design matrix:
        design = np.zeros([n_cells, 6])
        design[:, 0] = 1  # intercept
        for replicate in range(4):
            # bio rep 1..4 occupy consecutive blocks of 500 cells in columns 1..4
            block_start = replicate * 500
            design[block_start:block_start + 500, replicate + 1] = 1
        design[1000:2000, 5] = 1  # condition effect
        column_names = [
            'intercept', 'bio1', 'bio2', 'bio3', 'bio4', 'treatment1'
        ]
        design_frame = pd.DataFrame(data=design, columns=column_names)

        design_loc = de.utils.design_matrix(dmat=design_frame)
        design_scale = de.utils.design_matrix(dmat=design_frame)

        # Build constraints:
        constraints_loc = de.utils.constraint_matrix_from_string(
            dmat=design_loc,
            constraints=["bio1+bio2=0", "bio3+bio4=0"])
        constraints_scale = de.utils.constraint_matrix_from_string(
            dmat=design_scale,
            constraints=["bio1+bio2=0", "bio3+bio4=0"])

        wald_result = de.test.wald(
            data=simulator.x,
            dmat_loc=design_loc,
            dmat_scale=design_scale,
            constraints_loc=constraints_loc,
            constraints_scale=constraints_scale,
            coef_to_test=["treatment1"])
        wald_result.summary()
Example #16
0
    def test_null_distribution_wald(self,
                                    n_cells: int = 2000,
                                    n_genes: int = 100,
                                    n_groups: int = 2):
        """
        Check that de.test.versus_rest() with Wald tests yields uniform p-values on null-model data.

        Cells are assigned at random to ``n_groups`` groups and the flattened
        p-values are compared against a uniform distribution with a two-sided
        Kolmogorov-Smirnov test.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :param n_groups: Number of random groups to assign cells to.
        :return: True if the KS-test p-value is above 0.05.
        """
        log_levels = {
            "tensorflow": logging.ERROR,
            "batchglm": logging.WARNING,
            "diffxpy": logging.WARNING,
        }
        for logger_name, level in log_levels.items():
            logging.getLogger(logger_name).setLevel(level)

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate()

        group_labels = np.random.randint(n_groups,
                                         size=simulator.num_observations)
        annotation = pd.DataFrame({"condition": group_labels})

        versus_rest_result = de.test.versus_rest(
            data=simulator.X,
            grouping="condition",
            test="wald",
            noise_model="nb",
            sample_description=annotation,
            batch_size=500,
            training_strategy="DEFAULT",
            dtype="float64")
        versus_rest_result.summary()

        # Compare p-value distribution under null model against uniform distribution.
        flat_pvals = versus_rest_result.pval.flatten()
        pval_h0 = stats.kstest(flat_pvals, 'uniform').pvalue

        diffxpy_logger = logging.getLogger("diffxpy")
        diffxpy_logger.info(
            'KS-test pvalue for null model match of test_wald_loc(): %f' %
            pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

        return True
Example #17
0
    def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200):
        """
        Check that a partitioned likelihood-ratio test yields uniform p-values under the null.

        The data are partitioned by the simulated condition. Within each
        partition a full model with a random binary covariate is compared to an
        intercept-only model. The flattened p-values are compared against a
        uniform distribution with a two-sided Kolmogorov-Smirnov test.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :return: True if the KS-test p-value is above 0.05.
        """
        for logger_name, level in (("tensorflow", logging.ERROR),
                                   ("batchglm", logging.WARNING),
                                   ("diffxpy", logging.WARNING)):
            logging.getLogger(logger_name).setLevel(level)

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=2)
        simulator.generate()

        # Two random covariates plus the simulated condition used for partitioning.
        covar1_values = np.random.randint(2, size=simulator.nobs)
        covar2_values = np.random.randint(2, size=simulator.nobs)
        annotation = pd.DataFrame({
            "covar1": covar1_values,
            "covar2": covar2_values,
        })
        annotation["cond"] = simulator.sample_description["condition"].values

        partitioned = de.test.partition(
            data=simulator.x,
            parts="cond",
            sample_description=annotation
        )
        lrt_result = partitioned.lrt(
            full_formula_loc="~ 1 + covar1",
            full_formula_scale="~ 1",
            reduced_formula_loc="~ 1",
            reduced_formula_scale="~ 1",
            training_strategy="DEFAULT",
            dtype="float64"
        )
        lrt_result.summary()

        # Compare p-value distribution under null model against uniform distribution.
        flat_pvals = lrt_result.pval.flatten()
        pval_h0 = stats.kstest(flat_pvals, 'uniform').pvalue

        diffxpy_logger = logging.getLogger("diffxpy")
        diffxpy_logger.info('KS-test pvalue for null model match of lrt(): %f' % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

        return True
Example #18
0
    def test_null_distribution_wald(self,
                                    n_cells: int = 2000,
                                    n_genes: int = 100):
        """
        Check that de.test.continuous_1d() with Wald tests yields uniform p-values on null-model data.

        Each cell gets a random "pseudotime" value, and the p-values for the
        pseudotime effect are compared against a uniform distribution with a
        two-sided Kolmogorov-Smirnov test.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :return: True if the KS-test p-value is above 0.05.
        """
        # The tensorflow logger is set to INFO in this test.
        for logger_name, level in (("tensorflow", logging.INFO),
                                   ("batchglm", logging.WARNING),
                                   ("diffxpy", logging.WARNING)):
            logging.getLogger(logger_name).setLevel(level)

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate()

        pseudotime_values = np.random.random(size=simulator.num_observations)
        annotation = pd.DataFrame({"pseudotime": pseudotime_values})

        continuous_result = de.test.continuous_1d(
            data=simulator.X,
            continuous="pseudotime",
            df=3,
            formula_loc="~ 1 + pseudotime",
            formula_scale="~ 1",
            factor_loc_totest="pseudotime",
            test="wald",
            sample_description=annotation,
            quick_scale=True,
            batch_size=None,
            training_strategy="DEFAULT",
            dtype="float64")
        continuous_result.summary()

        # Compare p-value distribution under null model against uniform distribution.
        ks_result = stats.kstest(continuous_result.pval, 'uniform')
        pval_h0 = ks_result.pvalue

        diffxpy_logger = logging.getLogger("diffxpy")
        diffxpy_logger.info(
            'KS-test pvalue for null model match of wald(): %f' % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

        return True
Example #19
0
    def test_null_distribution_z_lazy(self,
                                      n_cells: int = 2000,
                                      n_genes: int = 100):
        """
        Test if de.test.pairwise() generates a uniform p-value distribution for lazy z-tests
        if it is given data simulated based on the null model.

        Cells are assigned at random to four groups. The p-values of the
        comparison of group 0 against group 1 are compared against a uniform
        distribution with a two-sided Kolmogorov-Smirnov test.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :return: True if the KS-test p-value is above 0.05.
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.WARNING)

        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()

        random_sample_description = pd.DataFrame(
            {"condition": np.random.randint(4, size=sim.num_observations)})

        test = de.test.pairwise(data=sim.X,
                                grouping="condition",
                                test='z-test',
                                lazy=True,
                                noise_model="nb",
                                pval_correction="global",
                                quick_scale=True,
                                sample_description=random_sample_description,
                                dtype="float64")

        # Compare p-value distribution under null model against uniform distribution.
        # With lazy=True the p-values are requested per group pair.
        pvals = test.pval_pairs(groups0=0, groups1=1)
        pval_h0 = stats.kstest(pvals.flatten(), 'uniform').pvalue

        # The message names the test actually run; it previously said "wald()".
        logging.getLogger("diffxpy").info(
            'KS-test pvalue for null model match of lazy z-test pairwise(): %f'
            % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

        return True
Example #20
0
    def test_forfatal_from_dict(self):
        """
        Test if dictionary-based constraint interface is working.

        Design and constraint matrices are built from a sample description with
        two "cond" levels (blocks of 1000 cells) and four "batch" levels (blocks
        of 500 cells), with batch constrained within cond. A Wald test on
        "cond[T.cond1]" is run; only the absence of exceptions is checked.
        """
        log_levels = {
            "tensorflow": logging.ERROR,
            "batchglm": logging.WARNING,
            "diffxpy": logging.WARNING,
        }
        for logger_name, level in log_levels.items():
            logging.getLogger(logger_name).setLevel(level)

        n_cells = 2000
        n_genes = 2

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate()

        # Build design matrix:
        cell_indices = range(n_cells)
        annotation = pd.DataFrame({
            "cond": ["cond%d" % (idx // 1000) for idx in cell_indices],
            "batch": ["batch%d" % (idx // 500) for idx in cell_indices],
        })

        def build_constrained_design(dims):
            # Same formula and constraints for location and scale; only dims differ.
            return de.utils.constraint_matrix_from_dict(
                sample_description=annotation,
                formula="~1+cond+batch",
                constraints={"batch": "cond"},
                dims=dims)

        # Build constraints:
        dmat_loc, constraints_loc = build_constrained_design(
            ["design_loc_params", "loc_params"])
        dmat_scale, constraints_scale = build_constrained_design(
            ["design_scale_params", "scale_params"])

        wald_result = de.test.wald(
            data=simulator.x,
            dmat_loc=dmat_loc,
            dmat_scale=dmat_scale,
            constraints_loc=constraints_loc,
            constraints_scale=constraints_scale,
            coef_to_test=["cond[T.cond1]"])
        wald_result.summary()
Example #21
0
    def test_sparse_anndata(self, n_cells: int = 2000, n_genes: int = 100):
        """
        Check that de.test.wald() yields uniform p-values when given a sparse AnnData object.

        The simulated matrix is converted to a scipy CSR matrix and wrapped in
        an AnnData object. Cells are assigned at random to two groups, and the
        p-values are compared against a uniform distribution with a two-sided
        Kolmogorov-Smirnov test.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :return: True if the KS-test p-value is above 0.05.
        """
        for logger_name, level in (("tensorflow", logging.ERROR),
                                   ("batchglm", logging.WARNING),
                                   ("diffxpy", logging.WARNING)):
            logging.getLogger(logger_name).setLevel(level)

        simulator = Simulator(num_observations=n_cells, num_features=n_genes)
        simulator.generate_sample_description(num_batches=0, num_conditions=0)
        simulator.generate()

        group_labels = np.random.randint(2, size=simulator.num_observations)
        annotation = pd.DataFrame({"condition": group_labels})

        # Wrap the dense simulated counts as a sparse matrix inside AnnData.
        sparse_counts = scipy.sparse.csr_matrix(simulator.X.values)
        adata = anndata.AnnData(sparse_counts)

        wald_result = de.test.wald(
            data=adata,
            factor_loc_totest="condition",
            formula="~ 1 + condition",
            sample_description=annotation,
            quick_scale=True,
            training_strategy="DEFAULT",
            dtype="float64")
        wald_result.summary()

        # Compare p-value distribution under null model against uniform distribution.
        ks_result = stats.kstest(wald_result.pval, 'uniform')
        pval_h0 = ks_result.pvalue

        diffxpy_logger = logging.getLogger("diffxpy")
        diffxpy_logger.info(
            'KS-test pvalue for null model match of wald(): %f' % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"

        return True
Example #22
0
    def test_for_fatal(self):
        """
        Smoke-test de.enrich.test() on a Wald test result.

        A Wald test is fitted on a small simulated data set whose genes are
        named "0".."9". Two manual reference sets are defined, and the
        enrichment test is run for every combination of ``incl_all_zero`` and
        ``clean_ref``. Only the absence of exceptions is checked.

        :return: True if all calls completed.
        """
        log_levels = {
            "tensorflow": logging.ERROR,
            "batchglm": logging.WARNING,
            "diffxpy": logging.WARNING,
        }
        for logger_name, level in log_levels.items():
            logging.getLogger(logger_name).setLevel(level)

        simulator = Simulator(num_observations=50, num_features=10)
        simulator.generate_sample_description(num_batches=0, num_conditions=2)
        simulator.generate()

        wald_result = de.test.wald(
            data=simulator.X,
            factor_loc_totest="condition",
            formula_loc="~ 1 + condition",
            sample_description=simulator.sample_description,
            gene_names=[str(idx) for idx in range(simulator.X.shape[1])],
            training_strategy="DEFAULT",
            dtype="float64")

        # Set up reference gene sets.
        reference_sets = de.enrich.RefSets()
        reference_sets.add(id="set1", source="manual", gene_ids=["1", "3"])
        reference_sets.add(id="set2", source="manual", gene_ids=["5", "6"])

        # Every combination of the two boolean options, outer option varying slowest.
        option_grid = [(incl_all_zero, clean_ref)
                       for incl_all_zero in (True, False)
                       for clean_ref in (True, False)]
        for incl_all_zero, clean_ref in option_grid:
            enrichment = de.enrich.test(
                ref=reference_sets,
                det=wald_result,
                threshold=0.05,
                incl_all_zero=incl_all_zero,
                clean_ref=clean_ref,
            )
            enrichment.summary()
            enrichment.significant_set_ids()
            enrichment.significant_sets()
            enrichment.set_summary(id="set1")

        return True
Example #23
0
    def test_t_test_zero_variance(self,
                                  n_cells: int = 2000,
                                  n_genes: int = 100):
        """
        Test if de.test.t_test() generates a uniform p-value distribution
        if it is given data simulated based on the null model in which the
        first gene has been overwritten with a constant (zero variance).
        Returns the p-value of the two-sided Kolmogorov-Smirnov test for
        equality of the observed p-value distribution and a uniform distribution.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :return: p-value of the KS-test against the uniform distribution.
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.WARNING)

        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()
        # Make the first gene constant across all cells: this is the
        # zero-variance case the test is named after.
        sim.data.X[:, 0] = np.exp(sim.a)[0, 0]

        # Group labels are drawn independently of the data, so no gene is
        # differentially expressed by construction.
        random_sample_description = pd.DataFrame(
            {"condition": np.random.randint(2, size=sim.num_observations)})

        test = de.test.t_test(data=sim.X,
                              grouping="condition",
                              sample_description=random_sample_description)

        # Compare p-value distribution under null model against uniform distribution.
        pval_h0 = stats.kstest(test.pval, 'uniform').pvalue

        print('KS-test pvalue for null model match of t_test(): %f' % pval_h0)

        # Report the observed value so that a failure is diagnosable from
        # the assertion message alone.
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % pval_h0

        return pval_h0
Example #24
0
    def test_null_distribution_wald_constrained_2layer(self,
                                                       n_genes: int = 100):
        """
        Test if de.wald() with constraints generates a uniform p-value distribution
        if it is given data simulated based on the null model. Asserts that the
        p-value of the two-sided Kolmogorov-Smirnov test for equality of the observed
        p-value distribution and a uniform distribution exceeds 0.05.

        n_cells is constant as the design matrix and constraints depend on it.

        :param n_genes: Number of genes to simulate (number of tests).
        :return: True
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.WARNING)

        n_cells = 12000

        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()

        # Build design matrix:
        dmat = np.zeros([n_cells, 14])
        dmat[:, 0] = 1
        dmat[6000:12000, 1] = 1  # condition effect
        dmat[:1000, 2] = 1  # bio rep 1 - treated 1
        dmat[1000:3000, 3] = 1  # bio rep 2 - treated 2
        dmat[3000:5000, 4] = 1  # bio rep 3 - treated 3
        dmat[5000:6000, 5] = 1  # bio rep 4 - treated 4
        dmat[6000:7000, 6] = 1  # bio rep 5 - untreated 1
        dmat[7000:9000, 7] = 1  # bio rep 6 - untreated 2
        dmat[9000:11000, 8] = 1  # bio rep 7 - untreated 3
        dmat[11000:12000, 9] = 1  # bio rep 8 - untreated 4
        dmat[1000:2000, 10] = 1  # tech rep 1
        dmat[7000:8000, 10] = 1  # tech rep 1
        dmat[2000:3000, 11] = 1  # tech rep 2
        dmat[8000:9000, 11] = 1  # tech rep 2
        dmat[3000:4000, 12] = 1  # tech rep 3
        dmat[9000:10000, 12] = 1  # tech rep 3
        dmat[4000:5000, 13] = 1  # tech rep 4
        dmat[10000:11000, 13] = 1  # tech rep 4

        coefficient_names = [
            'intercept', 'treatment1', 'bio1', 'bio2', 'bio3', 'bio4', 'bio5',
            'bio6', 'bio7', 'bio8', 'tech1', 'tech2', 'tech3', 'tech4'
        ]
        dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names)

        # Location model uses the full design; scale model is intercept-only.
        dmat_est_loc = de.test.design_matrix(dmat=dmat_est)
        dmat_est_scale = de.test.design_matrix(dmat=dmat_est.iloc[:, [0]])

        # Build constraints:
        constraints_loc = de.utils.data_utils.build_equality_constraints_string(
            dmat=dmat_est_loc,
            constraints=[
                "bio1+bio2=0", "bio3+bio4=0", "bio5+bio6=0", "bio7+bio8=0",
                "tech1+tech2=0", "tech3+tech4=0"
            ],
            dims=["design_loc_params", "loc_params"])
        constraints_scale = None

        test = de.test.wald(data=sim.X,
                            dmat_loc=dmat_est_loc.data_vars['design'],
                            dmat_scale=dmat_est_scale.data_vars['design'],
                            init_a="standard",
                            init_b="standard",
                            constraints_loc=constraints_loc,
                            constraints_scale=constraints_scale,
                            coef_to_test=["treatment1"],
                            training_strategy="DEFAULT",
                            quick_scale=False,
                            dtype="float64")
        # Called only to check that summary generation does not raise.
        _ = test.summary()

        # Compare p-value distribution under null model against uniform distribution.
        pval_h0 = stats.kstest(test.pval, 'uniform').pvalue

        logging.getLogger("diffxpy").info(
            'KS-test pvalue for null model match of wald(): %f' % pval_h0)
        # Report the observed value so that a failure is diagnosable from
        # the assertion message alone.
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % pval_h0

        return True
Example #25
0
    def test_null_distribution_wald_multi_constrained_2layer(
            self, n_genes: int = 50):
        """
        Check the null distribution of a constrained multi-coefficient Wald test.

        Data are simulated under the null model, de.test.wald() is run on
        two treatment coefficients with equality constraints on the
        biological replicate coefficients, and the resulting p-values are
        compared against a uniform distribution with a Kolmogorov-Smirnov
        test. The test asserts that the KS p-value exceeds 0.05.

        n_cells is fixed because the design matrix layout and the
        constraints are written for exactly that many rows.

        :param n_genes: Number of genes to simulate (number of tests).
        :return: True
        """
        for logger_name, level in (("tensorflow", logging.ERROR),
                                   ("batchglm", logging.WARNING),
                                   ("diffxpy", logging.WARNING)):
            logging.getLogger(logger_name).setLevel(level)

        n_cells = 3000
        n_bio_reps = 6
        rep_size = 500

        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()

        # Design matrix: intercept, six biological replicates occupying
        # consecutive blocks of 500 rows, then two condition effects.
        dmat = np.zeros((n_cells, 9))
        dmat[:, 0] = 1
        for rep in range(n_bio_reps):
            dmat[rep * rep_size:(rep + 1) * rep_size, rep + 1] = 1
        dmat[1000:2000, 7] = 1  # condition effect 1
        dmat[2000:3000, 8] = 1  # condition effect 2

        coefficient_names = (
            ['intercept']
            + ['bio%d' % k for k in range(1, n_bio_reps + 1)]
            + ['treatment1', 'treatment2']
        )
        dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names)

        dmat_est_loc = de.utils.design_matrix(dmat=dmat_est)
        dmat_est_scale = de.utils.design_matrix(dmat=dmat_est)

        # The same pairwise sum-to-zero constraints apply to the location
        # and the scale model; each call receives its own list object.
        bio_constraints = ("bio1+bio2=0", "bio3+bio4=0", "bio5+bio6=0")
        constraints_loc = de.utils.constraint_matrix_from_string(
            dmat=dmat_est_loc,
            constraints=list(bio_constraints))
        constraints_scale = de.utils.constraint_matrix_from_string(
            dmat=dmat_est_scale,
            constraints=list(bio_constraints))

        test = de.test.wald(
            data=sim.x,
            dmat_loc=dmat_est_loc,
            dmat_scale=dmat_est_scale,
            constraints_loc=constraints_loc,
            constraints_scale=constraints_scale,
            coef_to_test=["treatment1", "treatment2"],
        )
        # Called only to check that summary generation does not raise.
        test.summary()

        # Under the null model the p-values should look uniform.
        ks_result = stats.kstest(test.pval, 'uniform')
        pval_h0 = ks_result.pvalue

        logging.getLogger("diffxpy").info(
            'KS-test pvalue for null model match of wald(): %f' % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % pval_h0

        return True
Example #26
0
    def test_ztest_de(self, n_cells: int = 2000, n_genes: int = 500):
        """
        Run de.test.pairwise() with the z-test on simulated data in which
        the first half of the genes has no condition effect and the second
        half does, and log the fraction of genes with q-value < 0.05 in
        each half.

        No assertion on the fractions is made yet (see TODO below).

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        :return: True
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.WARNING)

        num_non_de = n_genes // 2
        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=2)
        # simulate: coefficients ~ log(N(1, 0.5)).
        # re-sample if N(1, 0.5) <= 0
        # np.inf is used instead of the np.infty alias, which was removed
        # in NumPy 2.0.
        sim.generate_params(rand_fn=lambda shape: 1 + stats.truncnorm.rvs(
            -1 / 0.5, np.inf, scale=0.5, size=shape))
        # Zero the condition coefficient of the first num_non_de genes so
        # that they are not differentially expressed.
        sim.params["a"][1, :num_non_de] = 0
        sim.params["b"][1, :num_non_de] = 0
        sim.params["isDE"] = ("features", ), np.arange(n_genes) >= num_non_de
        sim.generate_data()

        sample_description = sim.sample_description

        test = de.test.pairwise(
            data=sim.X,
            grouping="condition",
            test="z-test",
            noise_model="nb",
            sample_description=sample_description,
        )
        # Called only to check that summary generation does not raise.
        _ = test.summary()

        # Select the off-diagonal group pairs; the diagonal compares a
        # group with itself.
        off_diagonal = ~np.eye(test.pval.shape[0]).astype(bool)
        # The factor 2 in the denominators matches the two off-diagonal
        # entries obtained with the two simulated conditions.
        n_sig_non_de = np.sum(test.qval[off_diagonal, :num_non_de] < 0.05)
        n_sig_de = np.sum(test.qval[off_diagonal, num_non_de:] < 0.05)

        logging.getLogger("diffxpy").info(
            'fraction of non-DE genes with q-value < 0.05: %.1f%%' %
            float(100 * n_sig_non_de / (2 * num_non_de)))
        logging.getLogger("diffxpy").info(
            'fraction of DE genes with q-value < 0.05: %.1f%%' %
            float(100 * n_sig_de / (2 * (n_genes - num_non_de))))

        # TODO asserts
        return True
Example #27
0
    def _test_compute_hessians(self):
        """
        Compare Hessians obtained under three pkg_constants.HESSIAN_MODE settings.

        For each of "obs_batched", "feature" and "tf" the mode is set,
        self.estimate() is called on the same simulated input and the
        estimator's hessians attribute is read and timed. The test asserts
        that the maximal relative deviation of both analytic results from
        the "tf" result is below 1e-10.

        Side effects: stores the estimators, Hessians and timings on self
        and leaves pkg_constants.HESSIAN_MODE set to "tf".

        :return: True
        :raises ValueError: if self.noise_model is None or not "nb".
        """
        if self.noise_model is None:
            raise ValueError("noise_model is None")
        if self.noise_model != "nb":
            raise ValueError("noise_model not recognized")
        from batchglm.api.models.glm_nb import Simulator, InputData

        num_observations = 500
        num_conditions = 2

        sim = Simulator(num_observations=num_observations, num_features=4)
        sim.generate_sample_description(num_conditions=num_conditions,
                                        num_batches=2)
        sim.generate()

        sample_description = data_utils.sample_description_from_xarray(
            sim.data, dim="observations")
        design_loc = data_utils.design_matrix(
            sample_description, formula="~ 1 + condition + batch")
        design_scale = data_utils.design_matrix(
            sample_description, formula="~ 1 + condition")

        input_data = InputData.new(
            sim.X,
            design_loc=design_loc,
            design_scale=design_scale,
        )

        # Each section times only the read of the hessians attribute, not
        # the preceding self.estimate() call.
        logger.debug("* Running analytic Hessian by observation tests")
        pkg_constants.HESSIAN_MODE = "obs_batched"
        self.estimator_ob = self.estimate(input_data)
        started = time.time()
        self.H_ob = self.estimator_ob.hessians
        finished = time.time()
        self.estimator_ob.close_session()
        self.t_ob = finished - started

        logger.debug("* Running analytic Hessian by feature tests")
        pkg_constants.HESSIAN_MODE = "feature"
        self.estimator_fw = self.estimate(input_data)
        started = time.time()
        self.H_fw = self.estimator_fw.hessians
        finished = time.time()
        self.estimator_fw.close_session()
        self.t_fw = finished - started

        logger.debug("* Running tensorflow Hessian by feature tests")
        pkg_constants.HESSIAN_MODE = "tf"
        self.estimator_tf = self.estimate(input_data)
        started = time.time()
        # tensorflow computes the negative hessian as the
        # objective is the negative log-likelihood.
        self.H_tf = self.estimator_tf.hessians
        finished = time.time()
        self.estimator_tf.close_session()
        self.t_tf = finished - started

        hess_tf = self.H_tf.values
        hess_ob = self.H_ob.values
        hess_fw = self.H_fw.values

        # Index along the first axis of the Hessian arrays that is logged.
        shown = 1
        logger.info("run time observation batch-wise analytic solution: %f" %
                    self.t_ob)
        logger.info("run time feature-wise analytic solution: %f" % self.t_fw)
        logger.info("run time feature-wise tensorflow solution: %f" %
                    self.t_tf)
        logger.info(
            "ratio of tensorflow feature-wise hessian to analytic observation batch-wise hessian:"
        )
        logger.info(hess_tf[shown, :, :] / hess_ob[shown, :, :])
        logger.info(
            "ratio of tensorflow feature-wise hessian to analytic feature-wise hessian:"
        )
        logger.info(hess_tf[shown, :, :] / hess_fw[shown, :, :])

        # Deviations are measured relative to the tensorflow result.
        max_rel_dev1 = np.max(np.abs((hess_tf - hess_ob) / hess_tf))
        max_rel_dev2 = np.max(np.abs((hess_tf - hess_fw) / hess_tf))
        assert max_rel_dev1 < 1e-10
        assert max_rel_dev2 < 1e-10
        return True