Ejemplo n.º 1
0
def test_ksd():
    """Test the quadratic-time kernel Stein discrepancy (KSD) test.

    The model density p is a 2-d isotropic normal with zero mean and unit
    variance. The test is run twice on n = 800 samples: once on samples
    drawn from the same normal (H0 must not be rejected) and once on
    Laplace samples (H0 must be rejected).

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/gof_kernel_stein.ipynb
    """
    seed = 42

    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    def run_ksd(ds):
        """Draw n samples from ``ds`` and return the KSD test result vs. p."""
        samples = ds.sample(n, seed=seed + 1)

        # Gaussian kernel with median heuristic
        sig2 = util.meddistance(samples.data(), subsample=1000)**2
        k = kernel.KGauss(sig2)
        print(f"Kernel bandwidth: {sig2}")

        # KSD
        kstein = gof.KernelSteinTest(p,
                                     k,
                                     bootstrapper=gof.bootstrapper_rademacher,
                                     alpha=0.01,
                                     n_simulate=500,
                                     seed=seed + 1)
        test_result = kstein.perform_test(samples,
                                          return_simulated_stats=False,
                                          return_ustat_gram=False)
        print(test_result)
        return test_result

    # Samples from same density: H0 must be kept
    same_result = run_ksd(data.DSIsotropicNormal(mean, variance))
    assert not same_result["h0_rejected"]

    # KSD with samples from different density: H0 must be rejected
    diff_result = run_ksd(data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2)))
    assert diff_result["h0_rejected"]
Ejemplo n.º 2
0
def get_ns_pqsource(prob_label):
    """
    Return the tuple (ns, p, ds) for the problem named by prob_label, where
    - ns: a list of sample sizes
    - p: a Density representing the distribution p
    - ds: a DataSource which generates samples from q.

    Only the requested problem is constructed; the label is validated
    before anything is built.

    Raises ValueError if prob_label is not a known problem label.
    """
    gmd_d10_ns = [1000, 3000, 5000]
    # Sample sizes shared by the Gaussian-Bernoulli RBM problems.
    rbm_ns_1k_to_4k = [i * 1000 for i in range(1, 4 + 1)]
    rbm_ns_2k_to_5k = [i * 1000 for i in range(2, 5 + 1)]

    # Each value is a zero-argument factory so that looking up one problem
    # does not construct all of the others.
    # gbrbm_perturb(...) must return a tuple, since it is concatenated to
    # (ns, ) -- presumably (p, ds); confirm against its definition.
    prob2tuples = {
        # d = 10. P = N(0, I), Q = N((0.03, 0, ..., 0), I)
        "gmd_p03_d10_ns": lambda: (
            gmd_d10_ns,
            density.IsotropicNormal(np.zeros(10), 1),
            data.DSIsotropicNormal(np.hstack((0.03, np.zeros(10 - 1))), 1),
        ),
        # Gaussian Bernoulli RBM. dx=50, dh=10
        # Perturbation variance to B[0, 0] is 0.1
        "gbrbm_dx50_dh10_vp1": lambda: (rbm_ns_1k_to_4k, ) +
        gbrbm_perturb(var_perturb_B=0.1, dx=50, dh=10),
        # Gaussian Bernoulli RBM. dx=50, dh=40
        # Perturbation variance to B[0, 0] is 0.1
        "gbrbm_dx50_dh40_vp1": lambda: (rbm_ns_1k_to_4k, ) +
        gbrbm_perturb(var_perturb_B=0.1, dx=50, dh=40),
        # Gaussian Bernoulli RBM. dx=50, dh=10
        # No perturbation
        "gbrbm_dx50_dh10_h0": lambda: (rbm_ns_1k_to_4k, ) +
        gbrbm_perturb(var_perturb_B=0, dx=50, dh=10),
        # Gaussian Bernoulli RBM. dx=50, dh=40
        # No perturbation
        "gbrbm_dx50_dh40_h0": lambda: (rbm_ns_1k_to_4k, ) +
        gbrbm_perturb(var_perturb_B=0, dx=50, dh=40),
        # Gaussian Bernoulli RBM. dx=20, dh=10
        # Perturbation variance to B[0, 0] is 0.1
        "gbrbm_dx20_dh10_vp1": lambda: (rbm_ns_2k_to_5k, ) +
        gbrbm_perturb(var_perturb_B=0.1, dx=20, dh=10),
        # Gaussian Bernoulli RBM. dx=20, dh=10
        # No perturbation
        "gbrbm_dx20_dh10_h0": lambda: (rbm_ns_2k_to_5k, ) +
        gbrbm_perturb(var_perturb_B=0, dx=20, dh=10),
    }
    if prob_label not in prob2tuples:
        raise ValueError("Unknown problem label. Need to be one of %s" %
                         str(prob2tuples.keys()))
    return prob2tuples[prob_label]()
Ejemplo n.º 3
0
def test_fssd():
    """Test FSSD with Gaussian kernel (median heuristic) and randomized test locations

    The model density p is a 2-d isotropic normal with zero mean and unit
    variance. The test is run on n = 800 samples from the same normal
    (H0 must not be rejected) and on Laplace samples (H0 must be rejected).

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/kgof/ex/ex1_vary_n.py
    """
    seed = 42

    d = 2  # dimensionality
    n = 800  # samples
    # Number of test locations.
    # NOTE: the different-density case fails with J=8, passes with J=10 (chance)
    J = 10

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    def run_fssd(ds, n_simulate):
        """Draw n samples from ``ds`` and return the FSSD test result vs. p."""
        samples = ds.sample(n, seed=seed + 1)

        # Gaussian kernel with median heuristic
        # NOTE: For the Laplace samples this works much better with the
        # bandwidth that was optimized under FSSD:
        # sig2 = 0.3228712361986835
        sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
        k = kernel.KGauss(sig2)
        print(f"Kernel bandwidth: {sig2}")

        # FSSD
        null_sim = gof.FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed)
        # Fit a multivariate normal to the data X (n x d) and draw J points from the fit.
        # TODO: is this what we want if samples come from another distribution ?!
        V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)
        fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
        test_result = fssd_med.perform_test(samples)
        print(test_result)
        return test_result

    # Samples from same density: H0 must be kept
    same_result = run_fssd(data.DSIsotropicNormal(mean, variance),
                           n_simulate=2000)
    assert not same_result["h0_rejected"]

    # FSSD with samples from different density: H0 must be rejected
    diff_result = run_fssd(data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2)),
                           n_simulate=3000)
    assert diff_result["h0_rejected"]
Ejemplo n.º 4
0
    def test_optimized_fssd(self):
        """
        Check that the FSSD test, with test locations and Gaussian width
        optimized on a training split, rejects H0 when the sample mean is
        shifted by 4 relative to the model p.
        """
        seed = 4
        n = 179  # sample size
        alpha = 0.01
        # Keyword arguments forwarded to the optimizer.
        optimizer_kwargs = dict(reg=1e-2, max_iter=10, tol_fun=1e-3, disp=False)

        for dim in [1, 3]:
            mean = np.zeros(dim)
            variance = 1.0
            p = density.IsotropicNormal(mean, variance)
            # Mean difference. obvious reject
            shifted_source = data.DSIsotropicNormal(mean + 4, variance + 0)
            dat = shifted_source.sample(n, seed=seed)

            for n_locs in [1, 4]:
                tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

                # Initial width from the median heuristic on the training data
                train_x = tr.X
                init_width = util.meddistance(train_x, subsample=1000)**2
                # random initial test locations
                init_locs = util.fit_gaussian_draw(train_x, n_locs,
                                                   seed=seed + 1)
                locs, width, _ = gof.GaussFSSD.optimize_locs_widths(
                    p, tr, init_width, init_locs, **optimizer_kwargs)

                # construct a test from the optimized parameters
                simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
                test = gof.FSSD(p,
                                kernel.KGauss(width),
                                locs,
                                null_sim=simulator,
                                alpha=alpha)
                outcome = test.perform_test(te, return_simulated_stats=True)
                assert outcome["h0_rejected"]
Ejemplo n.º 5
0
    def test_auto_init_opt_fssd(self):
        """
        Check that the FSSD test, with parameters optimized from an
        automatic initialization on a training split, rejects H0 when the
        sample mean is shifted by 4 relative to the model p.
        """
        seed = 5
        n = 191  # sample size
        alpha = 0.01
        # Keyword arguments forwarded to the optimizer.
        optimizer_kwargs = dict(reg=1e-2, max_iter=10, tol_fun=1e-3, disp=False)

        for dim in [1, 4]:
            mean = np.zeros(dim)
            variance = 1.0
            p = density.IsotropicNormal(mean, variance)
            # Mean difference. obvious reject
            shifted_source = data.DSIsotropicNormal(mean + 4, variance + 0)
            dat = shifted_source.sample(n, seed=seed)

            for n_locs in [1, 3]:
                tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

                locs, width, _ = gof.GaussFSSD.optimize_auto_init(
                    p, tr, n_locs, **optimizer_kwargs)

                # construct a test from the optimized parameters
                simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
                test = gof.FSSD(p,
                                kernel.KGauss(width),
                                locs,
                                null_sim=simulator,
                                alpha=alpha)
                outcome = test.perform_test(te, return_simulated_stats=True)
                assert outcome["h0_rejected"]
Ejemplo n.º 6
0
def test_fssd_opt():
    """Test FSSD with optimized test locations

    The model density p is a 2-d isotropic normal with zero mean and unit
    variance. For each data source, n = 800 samples are split into a
    training part (20%), used to optimize the test location and Gaussian
    width, and a test part on which the FSSD test is performed. Samples from
    the same normal must not reject H0; Laplace samples must reject it.

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/demo_kgof.ipynb
    """
    seed = 42

    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Optimization settings, shared by both cases
    opts = {
        "reg": 1e-2,  # regularization parameter in the optimization objective
        "max_iter": 50,  # maximum number of gradient ascent iterations
        "tol_fun": 1e-7,  # termination tolerance of the objective
    }
    # J is the number of test locations (or features). Typically not larger than 10
    J = 1

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)

    # Split dataset
    tr, te = samples.split_tr_te(tr_proportion=0.2, seed=2)

    # Optimization
    V_opt, gw_opt, opt_info = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)
    print(V_opt)
    print(f"Kernel bandwidth: {gw_opt}")
    print(opt_info)

    # FSSD
    fssd_opt = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.01)
    test_result = fssd_opt.perform_test(te)
    print(test_result)
    assert not test_result["h0_rejected"]

    # FSSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    tr, te = samples.split_tr_te(tr_proportion=0.2, seed=2)
    V_opt, gw_opt, _ = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)
    print(f"Kernel bandwidth: {gw_opt}")

    # FSSD
    fssd_opt = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.01)
    test_result = fssd_opt.perform_test(te)
    print(test_result)
    assert test_result["h0_rejected"]
Ejemplo n.º 7
0
def get_pqsource_list(prob_label):
    """
    Return [(prob_param, p, ds) for ... ], a list of tuples
    where
    - prob_param: a problem parameter. Each parameter has to be a
      scalar (so that we can plot them later). Parameters are preferably
      positive integers.
    - p: a Density representing the distribution p
    - ds: a DataSource, each corresponding to one parameter setting.
        The DataSource generates sample from q.

    Only the requested problem is constructed; the label is validated
    before anything is built.

    Raises ValueError if prob_label is not a known problem label.
    """
    sg_ds = [1, 5, 10, 15]
    gmd_ds = [5, 20, 40, 60]
    # vary the mean
    gmd_d10_ms = [0, 0.02, 0.04, 0.06]
    gvinc_d1_vs = [1, 1.5, 2, 2.5]
    gvinc_d5_vs = [1, 1.5, 2, 2.5]
    gvsub1_d1_vs = [0.1, 0.3, 0.5, 0.7]
    gvd_ds = [1, 5, 10, 15]

    gb_rbm_dx50_dh10_stds = [0, 0.02, 0.04, 0.06]
    gb_rbm_dx50_dh40_stds = [0, 0.01, 0.02, 0.04, 0.06]
    glaplace_ds = [1, 5, 10, 15]

    # Each value is a zero-argument factory so that looking up one problem
    # does not construct all of the others.
    prob2tuples = {
        # H0 is true. vary d. P = Q = N(0, I)
        "sg": lambda: [(
            d,
            density.IsotropicNormal(np.zeros(d), 1),
            data.DSIsotropicNormal(np.zeros(d), 1),
        ) for d in sg_ds],
        # vary d. P = N(0, I), Q = N( (1, 0, ..., 0), I)
        "gmd": lambda: [(
            d,
            density.IsotropicNormal(np.zeros(d), 1),
            data.DSIsotropicNormal(np.hstack((1, np.zeros(d - 1))), 1),
        ) for d in gmd_ds],
        # d=10. P = N(0, I), Q = N( (m, 0, ..., 0), I). Vary m
        "gmd_d10_ms": lambda: [(
            m,
            density.IsotropicNormal(np.zeros(10), 1),
            data.DSIsotropicNormal(np.hstack((m, np.zeros(9))), 1),
        ) for m in gmd_d10_ms],
        # d=1. Increase the variance. P = N(0, I). Q = N(0, v*I)
        "gvinc_d1": lambda: [(
            var,
            density.IsotropicNormal(np.zeros(1), 1),
            data.DSIsotropicNormal(np.zeros(1), var),
        ) for var in gvinc_d1_vs],
        # d=5. Increase the variance. P = N(0, I). Q = N(0, v*I)
        "gvinc_d5": lambda: [(
            var,
            density.IsotropicNormal(np.zeros(5), 1),
            data.DSIsotropicNormal(np.zeros(5), var),
        ) for var in gvinc_d5_vs],
        # d=1. P = N(0, 1), Q = N(0, v). Consider the variance below 1.
        "gvsub1_d1": lambda: [(
            var,
            density.IsotropicNormal(np.zeros(1), 1),
            data.DSIsotropicNormal(np.zeros(1), var),
        ) for var in gvsub1_d1_vs],
        # Gaussian variance difference problem. Only the variance
        # of the first dimension differs. d varies.
        "gvd": lambda: [(
            d,
            density.Normal(np.zeros(d), np.eye(d)),
            data.DSNormal(np.zeros(d), np.diag(np.hstack(
                (2, np.ones(d - 1))))),
        ) for d in gvd_ds],
        # Gaussian Bernoulli RBM. dx=50, dh=10
        "gbrbm_dx50_dh10": lambda: gaussbern_rbm_probs(
            gb_rbm_dx50_dh10_stds, dx=50, dh=10, n=sample_size),
        # Gaussian Bernoulli RBM. dx=50, dh=40
        "gbrbm_dx50_dh40": lambda: gaussbern_rbm_probs(
            gb_rbm_dx50_dh40_stds, dx=50, dh=40, n=sample_size),
        # p: N(0, I), q: standard Laplace. Vary d
        "glaplace": lambda: [
            (
                d,
                density.IsotropicNormal(np.zeros(d), 1),
                # Scaling of 1/sqrt(2) will make the variance 1.
                data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2)),
            ) for d in glaplace_ds
        ],
    }
    if prob_label not in prob2tuples:
        raise ValueError("Unknown problem label. Need to be one of %s" %
                         str(prob2tuples.keys()))
    return prob2tuples[prob_label]()
Ejemplo n.º 8
0
 def get_datasource(self):
     """Return a data.DSIsotropicNormal built from self.mean and self.variance."""
     source = data.DSIsotropicNormal(self.mean, self.variance)
     return source
Ejemplo n.º 9
0
def get_pqsource(prob_label):
    """
    Return (p, ds) for the problem named by prob_label, a tuple of
    - p: a Density representing the distribution p
    - ds: a DataSource which generates samples from q.

    Only the requested problem is constructed; the label is validated
    before anything is built.

    Raises ValueError if prob_label is not a known problem label.
    """
    # Each value is a zero-argument factory so that looking up one problem
    # does not construct all of the others.
    prob2tuples = {
        # H0 is true. d=5. P = Q = N(0, I)
        "sg5": lambda: (
            density.IsotropicNormal(np.zeros(5), 1),
            data.DSIsotropicNormal(np.zeros(5), 1),
        ),
        # d=5. P = N(0, I), Q = N( (0.2, 0, ..., 0), I)
        "gmd5": lambda: (
            density.IsotropicNormal(np.zeros(5), 1),
            data.DSIsotropicNormal(np.hstack((0.2, np.zeros(4))), 1),
        ),
        # d=1. P = N(0, 1), Q = N(0.2, 1)
        "gmd1": lambda: (
            density.IsotropicNormal(np.zeros(1), 1),
            data.DSIsotropicNormal(np.ones(1) * 0.2, 1),
        ),
        # d=100. P = N(0, I), Q = N( (1, 0, ..., 0), I)
        "gmd100": lambda: (
            density.IsotropicNormal(np.zeros(100), 1),
            data.DSIsotropicNormal(np.hstack((1, np.zeros(99))), 1),
        ),
        # Gaussian variance difference problem. Only the variance
        # of the first dimension differs (2 instead of 1).
        "gvd5": lambda: (
            density.Normal(np.zeros(5), np.eye(5)),
            data.DSNormal(np.zeros(5), np.diag(np.hstack((2, np.ones(4))))),
        ),
        "gvd10": lambda: (
            density.Normal(np.zeros(10), np.eye(10)),
            data.DSNormal(np.zeros(10), np.diag(np.hstack((2, np.ones(9))))),
        ),
        # Gaussian Bernoulli RBM. dx=50, dh=10. H0 is true
        "gbrbm_dx50_dh10_v0": lambda: gaussbern_rbm_tuple(
            0, dx=50, dh=10, n=sample_size),
        # Gaussian Bernoulli RBM. dx=5, dh=3. H0 is true
        "gbrbm_dx5_dh3_v0": lambda: gaussbern_rbm_tuple(
            0, dx=5, dh=3, n=sample_size),
        # Gaussian Bernoulli RBM. dx=50, dh=10. First argument is 1e-3.
        "gbrbm_dx50_dh10_v1em3": lambda: gaussbern_rbm_tuple(
            1e-3, dx=50, dh=10, n=sample_size),
        # Gaussian Bernoulli RBM. dx=5, dh=3. First argument is 5e-3.
        # NOTE(review): the original comment said "Perturb with noise = 1e-2",
        # which does not match the 5e-3 passed here -- confirm which is meant.
        "gbrbm_dx5_dh3_v5em3": lambda: gaussbern_rbm_tuple(
            5e-3, dx=5, dh=3, n=sample_size),
        # Gaussian mixture of two components. Uniform mixture weights.
        # p = 0.5*N(0, 1) + 0.5*N(3, 0.01)
        # q = 0.5*N(-3, 0.01) + 0.5*N(0, 1)
        "gmm_d1": lambda: (
            density.IsoGaussianMixture(np.array([[0], [3.0]]),
                                       np.array([1, 0.01])),
            data.DSIsoGaussianMixture(np.array([[-3.0], [0]]),
                                      np.array([0.01, 1])),
        ),
        # p = N(0, I)
        # q = 0.1*N(0, 0.0001) + 0.9*N(0, 1), both components centered at 0
        # NOTE(review): the original comment described the first component as
        # N([-10, 0,..0], 0.001); the code uses mean 0 and variance 0.0001 --
        # confirm which is intended.
        "g_vs_gmm_d5": lambda: (
            density.IsotropicNormal(np.zeros(5), 1),
            data.DSIsoGaussianMixture(
                np.vstack((np.hstack((0.0, np.zeros(4))), np.zeros(5))),
                np.array([0.0001, 1]),
                pmix=[0.1, 0.9],
            ),
        ),
        # d=2. q = 0.1*N(0, 0.01) + 0.9*N(0, 1)
        "g_vs_gmm_d2": lambda: (
            density.IsotropicNormal(np.zeros(2), 1),
            data.DSIsoGaussianMixture(
                np.vstack((np.hstack((0.0, np.zeros(1))), np.zeros(2))),
                np.array([0.01, 1]),
                pmix=[0.1, 0.9],
            ),
        ),
        # d=1. q = 0.1*N(0, 0.01) + 0.9*N(0, 1)
        "g_vs_gmm_d1": lambda: (
            density.IsotropicNormal(np.zeros(1), 1),
            data.DSIsoGaussianMixture(np.array([[0.0], [0]]),
                                      np.array([0.01, 1]),
                                      pmix=[0.1, 0.9]),
        ),
    }
    if prob_label not in prob2tuples:
        raise ValueError("Unknown problem label. Need to be one of %s" %
                         str(prob2tuples.keys()))
    return prob2tuples[prob_label]()