# Assumed imports (not part of the original snippets); module paths follow the
# wittawatj/kernel-gof repository layout.
import numpy as np

import kgof.data as data
import kgof.density as density
import kgof.goftest as gof
import kgof.kernel as kernel
import kgof.util as util


def test_ksd():
    """Test quadratic time KSD

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/gof_kernel_stein.ipynb
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)

    # Gaussian kernel with median heuristic
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")

    # KSD
    bootstrapper = gof.bootstrapper_rademacher
    kstein = gof.KernelSteinTest(
        p, k, bootstrapper=bootstrapper, alpha=0.01, n_simulate=500, seed=seed + 1)
    test_result = kstein.perform_test(
        samples, return_simulated_stats=False, return_ustat_gram=False)
    print(test_result)
    assert test_result["h0_rejected"] == False

    # KSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    print(f"Kernel bandwidth: {sig2}")
    k = kernel.KGauss(sig2)
    bootstrapper = gof.bootstrapper_rademacher
    kstein = gof.KernelSteinTest(
        p, k, bootstrapper=bootstrapper, alpha=0.01, n_simulate=500, seed=seed + 1)
    test_result = kstein.perform_test(
        samples, return_simulated_stats=False, return_ustat_gram=False)
    print(test_result)
    assert test_result["h0_rejected"] == True
def get_ns_pqsource(prob_label):
    """
    Return (ns, p, ds), a tuple where

    - ns: a list of sample sizes
    - p: a Density representing the distribution p
    - ds: a DataSource, each corresponding to one parameter setting.
        The DataSource generates samples from q.
    """
    gmd_p01_d10_ns = [1000, 3000, 5000]
    # gb_rbm_dx50_dh10_vars = [0, 1e-3, 2e-3, 3e-3]
    prob2tuples = {
        # P = N(0, I), Q = N((0.03, 0, ..., 0), I). d=10. Vary n.
        "gmd_p03_d10_ns": (
            gmd_p01_d10_ns,
            density.IsotropicNormal(np.zeros(10), 1),
            data.DSIsotropicNormal(np.hstack((0.03, np.zeros(10 - 1))), 1),
        ),
        # Gaussian Bernoulli RBM. dx=50, dh=10
        # Perturbation variance to B[0, 0] is 0.1
        "gbrbm_dx50_dh10_vp1":
            ([i * 1000 for i in range(1, 4 + 1)], ) +
            # ([1000, 5000], ) +
            gbrbm_perturb(var_perturb_B=0.1, dx=50, dh=10),
        # Gaussian Bernoulli RBM. dx=50, dh=40
        # Perturbation variance to B[0, 0] is 0.1
        "gbrbm_dx50_dh40_vp1":
            ([i * 1000 for i in range(1, 4 + 1)], ) +
            # ([1000, 5000], ) +
            gbrbm_perturb(var_perturb_B=0.1, dx=50, dh=40),
        # Gaussian Bernoulli RBM. dx=50, dh=10
        # No perturbation
        "gbrbm_dx50_dh10_h0":
            ([i * 1000 for i in range(1, 4 + 1)], ) +
            # ([1000, 5000], ) +
            gbrbm_perturb(var_perturb_B=0, dx=50, dh=10),
        # Gaussian Bernoulli RBM. dx=50, dh=40
        # No perturbation
        "gbrbm_dx50_dh40_h0":
            ([i * 1000 for i in range(1, 4 + 1)], ) +
            # ([1000, 5000], ) +
            gbrbm_perturb(var_perturb_B=0, dx=50, dh=40),
        # Gaussian Bernoulli RBM. dx=20, dh=10
        # Perturbation variance to B[0, 0] is 0.1
        "gbrbm_dx20_dh10_vp1":
            ([i * 1000 for i in range(2, 5 + 1)], ) +
            gbrbm_perturb(var_perturb_B=0.1, dx=20, dh=10),
        # Gaussian Bernoulli RBM. dx=20, dh=10
        # No perturbation
        "gbrbm_dx20_dh10_h0":
            ([i * 1000 for i in range(2, 5 + 1)], ) +
            gbrbm_perturb(var_perturb_B=0, dx=20, dh=10),
    }
    if prob_label not in prob2tuples:
        raise ValueError("Unknown problem label. Need to be one of %s" %
                         str(prob2tuples.keys()))
    return prob2tuples[prob_label]
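# Usage sketch for get_ns_pqsource (assumes the imports above and that
# gbrbm_perturb is defined elsewhere in the experiment script, since the whole
# prob2tuples dict is built eagerly).
ns, p, ds = get_ns_pqsource("gmd_p03_d10_ns")
for n in ns:
    dat = ds.sample(n, seed=0)
    print(n, dat.data().shape)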
def test_fssd():
    """Test FSSD with Gaussian kernel (median heuristic) and randomized test locations

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/kgof/ex/ex1_vary_n.py
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)

    # Gaussian kernel with median heuristic
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")

    # FSSD
    J = 10
    null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=seed)
    # Fit a multivariate normal to the data X (n x d) and draw J points from the fit.
    V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)
    fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
    test_result = fssd_med.perform_test(samples)
    print(test_result)
    assert test_result["h0_rejected"] == False

    # FSSD with samples from different density
    J = 10  # Fails with J=8, passes with J=10 (chance)
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    # NOTE: Works much better with the bandwidth that was optimized under FSSD:
    # sig2 = 0.3228712361986835
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")
    null_sim = gof.FSSDH0SimCovObs(n_simulate=3000, seed=seed)
    # TODO: is this what we want if samples come from another distribution?
    V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)
    fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
    test_result = fssd_med.perform_test(samples)
    print(test_result)
    assert test_result["h0_rejected"] == True
def test_optimized_fssd(self):
    """
    Test FSSD test with parameter optimization.
    """
    seed = 4
    # sample size
    n = 179
    alpha = 0.01
    for d in [1, 3]:
        mean = np.zeros(d)
        variance = 1.0
        p = density.IsotropicNormal(mean, variance)

        # Mean difference. Obvious reject
        ds = data.DSIsotropicNormal(mean + 4, variance + 0)
        dat = ds.sample(n, seed=seed)

        # test
        for J in [1, 4]:
            opts = {
                "reg": 1e-2,
                "max_iter": 10,
                "tol_fun": 1e-3,
                "disp": False
            }
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)
            Xtr = tr.X
            gwidth0 = util.meddistance(Xtr, subsample=1000) ** 2
            # random test locations
            V0 = util.fit_gaussian_draw(Xtr, J, seed=seed + 1)
            V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_locs_widths(
                p, tr, gwidth0, V0, **opts)

            # construct a test
            k_opt = kernel.KGauss(gw_opt)
            null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
            fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
            fssd_opt_result = fssd_opt.perform_test(
                te, return_simulated_stats=True)
            assert fssd_opt_result["h0_rejected"]
def test_auto_init_opt_fssd(self):
    """
    Test FSSD-opt test with automatic parameter initialization.
    """
    seed = 5
    # sample size
    n = 191
    alpha = 0.01
    for d in [1, 4]:
        mean = np.zeros(d)
        variance = 1.0
        p = density.IsotropicNormal(mean, variance)

        # Mean difference. Obvious reject
        ds = data.DSIsotropicNormal(mean + 4, variance + 0)
        dat = ds.sample(n, seed=seed)

        # test
        for J in [1, 3]:
            opts = {
                "reg": 1e-2,
                "max_iter": 10,
                "tol_fun": 1e-3,
                "disp": False
            }
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)
            V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_auto_init(
                p, tr, J, **opts)

            # construct a test
            k_opt = kernel.KGauss(gw_opt)
            null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
            fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
            fssd_opt_result = fssd_opt.perform_test(
                te, return_simulated_stats=True)
            assert fssd_opt_result["h0_rejected"]
def test_fssd_opt():
    """Test FSSD with optimized test locations

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/demo_kgof.ipynb
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)

    # Split dataset
    tr, te = samples.split_tr_te(tr_proportion=0.2, seed=2)

    # Optimization
    opts = {
        "reg": 1e-2,  # regularization parameter in the optimization objective
        "max_iter": 50,  # maximum number of gradient ascent iterations
        "tol_fun": 1e-7,  # termination tolerance of the objective
    }
    # J is the number of test locations (or features). Typically not larger than 10
    J = 1
    V_opt, gw_opt, opt_info = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)
    print(V_opt)
    print(f"Kernel bandwidth: {gw_opt}")
    print(opt_info)

    # FSSD
    fssd_opt = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.01)
    test_result = fssd_opt.perform_test(te)
    print(test_result)
    assert test_result["h0_rejected"] == False

    # FSSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    tr, te = samples.split_tr_te(tr_proportion=0.2, seed=2)
    opts = {
        "reg": 1e-2,  # regularization parameter in the optimization objective
        "max_iter": 50,  # maximum number of gradient ascent iterations
        "tol_fun": 1e-7,  # termination tolerance of the objective
    }
    J = 1  # J is the number of test locations (or features)
    V_opt, gw_opt, opt_info = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)
    print(f"Kernel bandwidth: {gw_opt}")

    # FSSD
    fssd_opt = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.01)
    test_result = fssd_opt.perform_test(te)
    print(test_result)
    assert test_result["h0_rejected"] == True
def get_pqsource_list(prob_label):
    """
    Return [(prob_param, p, ds) for ... ], a list of tuples where

    - prob_param: a problem parameter. Each parameter has to be a scalar (so
        that we can plot them later). Parameters are preferably positive integers.
    - p: a Density representing the distribution p
    - ds: a DataSource, each corresponding to one parameter setting.
        The DataSource generates samples from q.
    """
    sg_ds = [1, 5, 10, 15]
    gmd_ds = [5, 20, 40, 60]
    # vary the mean
    gmd_d10_ms = [0, 0.02, 0.04, 0.06]
    gvinc_d1_vs = [1, 1.5, 2, 2.5]
    gvinc_d5_vs = [1, 1.5, 2, 2.5]
    gvsub1_d1_vs = [0.1, 0.3, 0.5, 0.7]
    gvd_ds = [1, 5, 10, 15]
    # gb_rbm_dx50_dh10_stds = [0, 0.01, 0.02, 0.03]
    gb_rbm_dx50_dh10_stds = [0, 0.02, 0.04, 0.06]
    # gb_rbm_dx50_dh10_stds = [0]
    gb_rbm_dx50_dh40_stds = [0, 0.01, 0.02, 0.04, 0.06]
    glaplace_ds = [1, 5, 10, 15]
    prob2tuples = {
        # H0 is true. vary d. P = Q = N(0, I)
        "sg": [(
            d,
            density.IsotropicNormal(np.zeros(d), 1),
            data.DSIsotropicNormal(np.zeros(d), 1),
        ) for d in sg_ds],
        # vary d. P = N(0, I), Q = N( (c,..0), I)
        "gmd": [(
            d,
            density.IsotropicNormal(np.zeros(d), 1),
            data.DSIsotropicNormal(np.hstack((1, np.zeros(d - 1))), 1),
        ) for d in gmd_ds],
        # P = N(0, I), Q = N( (m, ..0), I). Vary m
        "gmd_d10_ms": [(
            m,
            density.IsotropicNormal(np.zeros(10), 1),
            data.DSIsotropicNormal(np.hstack((m, np.zeros(9))), 1),
        ) for m in gmd_d10_ms],
        # d=1. Increase the variance. P = N(0, I). Q = N(0, v*I)
        "gvinc_d1": [(
            var,
            density.IsotropicNormal(np.zeros(1), 1),
            data.DSIsotropicNormal(np.zeros(1), var),
        ) for var in gvinc_d1_vs],
        # d=5. Increase the variance. P = N(0, I). Q = N(0, v*I)
        "gvinc_d5": [(
            var,
            density.IsotropicNormal(np.zeros(5), 1),
            data.DSIsotropicNormal(np.zeros(5), var),
        ) for var in gvinc_d5_vs],
        # d=1. P = N(0, 1), Q = N(0, v). Consider variances below 1.
        "gvsub1_d1": [(
            var,
            density.IsotropicNormal(np.zeros(1), 1),
            data.DSIsotropicNormal(np.zeros(1), var),
        ) for var in gvsub1_d1_vs],
        # Gaussian variance difference problem. Only the variance
        # of the first dimension differs. d varies.
        "gvd": [(
            d,
            density.Normal(np.zeros(d), np.eye(d)),
            data.DSNormal(np.zeros(d), np.diag(np.hstack((2, np.ones(d - 1))))),
        ) for d in gvd_ds],
        # Gaussian Bernoulli RBM. dx=50, dh=10
        "gbrbm_dx50_dh10":
            gaussbern_rbm_probs(gb_rbm_dx50_dh10_stds, dx=50, dh=10, n=sample_size),
        # Gaussian Bernoulli RBM. dx=50, dh=40
        "gbrbm_dx50_dh40":
            gaussbern_rbm_probs(gb_rbm_dx50_dh40_stds, dx=50, dh=40, n=sample_size),
        # p: N(0, I), q: standard Laplace. Vary d
        "glaplace": [(
            d,
            density.IsotropicNormal(np.zeros(d), 1),
            # Scaling of 1/sqrt(2) will make the variance 1.
            data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2)),
        ) for d in glaplace_ds],
    }
    if prob_label not in prob2tuples:
        raise ValueError("Unknown problem label. Need to be one of %s" %
                         str(prob2tuples.keys()))
    return prob2tuples[prob_label]
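# Usage sketch for get_pqsource_list (assumes the imports above and that
# gaussbern_rbm_probs and sample_size are defined elsewhere in the experiment
# script, since the whole prob2tuples dict is built eagerly).
for prob_param, p, ds in get_pqsource_list("gmd"):
    dat = ds.sample(500, seed=7)
    print(prob_param, dat.data().shape)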
def get_datasource(self):
    return data.DSIsotropicNormal(self.mean, self.variance)
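# Sketch of the Density/DataSource pairing this method provides, assuming it is
# the get_datasource of density.IsotropicNormal (as self.mean and self.variance
# suggest).
p = density.IsotropicNormal(np.zeros(3), 1.0)
ds = p.get_datasource()  # a DataSource that samples from p itself
X = ds.sample(100, seed=0).data()
print(X.shape)  # (100, 3)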
def get_pqsource(prob_label):
    """
    Return (p, ds), a tuple of

    - p: a Density representing the distribution p
    - ds: a DataSource, each corresponding to one parameter setting.
        The DataSource generates samples from q.
    """
    prob2tuples = {
        # H0 is true. P = Q = N(0, I). d=5
        "sg5": (
            density.IsotropicNormal(np.zeros(5), 1),
            data.DSIsotropicNormal(np.zeros(5), 1),
        ),
        # P = N(0, I), Q = N( (0.2,..0), I)
        "gmd5": (
            density.IsotropicNormal(np.zeros(5), 1),
            data.DSIsotropicNormal(np.hstack((0.2, np.zeros(4))), 1),
        ),
        "gmd1": (
            density.IsotropicNormal(np.zeros(1), 1),
            data.DSIsotropicNormal(np.ones(1) * 0.2, 1),
        ),
        # P = N(0, I), Q = N( (1,..0), I)
        "gmd100": (
            density.IsotropicNormal(np.zeros(100), 1),
            data.DSIsotropicNormal(np.hstack((1, np.zeros(99))), 1),
        ),
        # Gaussian variance difference problem. Only the variance
        # of the first dimension differs. d varies.
        "gvd5": (
            density.Normal(np.zeros(5), np.eye(5)),
            data.DSNormal(np.zeros(5), np.diag(np.hstack((2, np.ones(4))))),
        ),
        "gvd10": (
            density.Normal(np.zeros(10), np.eye(10)),
            data.DSNormal(np.zeros(10), np.diag(np.hstack((2, np.ones(9))))),
        ),
        # Gaussian Bernoulli RBM. dx=50, dh=10. H0 is true
        "gbrbm_dx50_dh10_v0":
            gaussbern_rbm_tuple(0, dx=50, dh=10, n=sample_size),
        # Gaussian Bernoulli RBM. dx=5, dh=3. H0 is true
        "gbrbm_dx5_dh3_v0":
            gaussbern_rbm_tuple(0, dx=5, dh=3, n=sample_size),
        # Gaussian Bernoulli RBM. dx=50, dh=10.
        "gbrbm_dx50_dh10_v1em3":
            gaussbern_rbm_tuple(1e-3, dx=50, dh=10, n=sample_size),
        # Gaussian Bernoulli RBM. dx=5, dh=3. Perturb with noise = 5e-3.
        "gbrbm_dx5_dh3_v5em3":
            gaussbern_rbm_tuple(5e-3, dx=5, dh=3, n=sample_size),
        # Gaussian mixture of two components. Uniform mixture weights.
        # p = 0.5*N(0, 1) + 0.5*N(3, 0.01)
        # q = 0.5*N(-3, 0.01) + 0.5*N(0, 1)
        "gmm_d1": (
            density.IsoGaussianMixture(np.array([[0], [3.0]]), np.array([1, 0.01])),
            data.DSIsoGaussianMixture(np.array([[-3.0], [0]]), np.array([0.01, 1])),
        ),
        # p = N(0, I)
        # q = 0.1*N([0, 0,..0], 1e-4) + 0.9*N([0, 0,..0], 1)
        "g_vs_gmm_d5": (
            density.IsotropicNormal(np.zeros(5), 1),
            data.DSIsoGaussianMixture(
                np.vstack((np.hstack((0.0, np.zeros(4))), np.zeros(5))),
                np.array([0.0001, 1]),
                pmix=[0.1, 0.9],
            ),
        ),
        "g_vs_gmm_d2": (
            density.IsotropicNormal(np.zeros(2), 1),
            data.DSIsoGaussianMixture(
                np.vstack((np.hstack((0.0, np.zeros(1))), np.zeros(2))),
                np.array([0.01, 1]),
                pmix=[0.1, 0.9],
            ),
        ),
        "g_vs_gmm_d1": (
            density.IsotropicNormal(np.zeros(1), 1),
            data.DSIsoGaussianMixture(np.array([[0.0], [0]]),
                                      np.array([0.01, 1]),
                                      pmix=[0.1, 0.9]),
        ),
    }
    if prob_label not in prob2tuples:
        raise ValueError("Unknown problem label. Need to be one of %s" %
                         str(prob2tuples.keys()))
    return prob2tuples[prob_label]
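# Usage sketch: wire a (p, ds) pair from get_pqsource into the KSD test shown
# above (assumes the imports at the top and that gaussbern_rbm_tuple and
# sample_size are defined elsewhere in the experiment script).
p, ds = get_pqsource("gmd5")
dat = ds.sample(400, seed=3)
sig2 = util.meddistance(dat.data(), subsample=1000) ** 2
kstein = gof.KernelSteinTest(p, kernel.KGauss(sig2),
                             bootstrapper=gof.bootstrapper_rademacher,
                             alpha=0.01, n_simulate=500, seed=3)
print(kstein.perform_test(dat))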