def test_ksd():
    """Test the quadratic-time kernel Stein discrepancy (KSD) test.

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/gof_kernel_stein.ipynb

    The model ``p`` is an isotropic normal with zero mean and unit variance.
    The test is run twice, each time with a Gaussian kernel whose bandwidth
    is the squared median distance of the sample and with the Rademacher
    bootstrapper:

    1. on samples from the matching normal data source, where H0 must not
       be rejected;
    2. on samples from a Laplace data source, where H0 must be rejected.
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)

    # Gaussian kernel with median heuristic
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")

    # KSD
    bootstrapper = gof.bootstrapper_rademacher
    kstein = gof.KernelSteinTest(
        p, k, bootstrapper=bootstrapper, alpha=0.01, n_simulate=500, seed=seed + 1
    )
    test_result = kstein.perform_test(
        samples, return_simulated_stats=False, return_ustat_gram=False
    )
    print(test_result)
    # Assert on truthiness rather than comparing with `== False`.
    assert not test_result["h0_rejected"]

    # KSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    print(f"Kernel bandwidth: {sig2}")
    k = kernel.KGauss(sig2)
    bootstrapper = gof.bootstrapper_rademacher
    kstein = gof.KernelSteinTest(
        p, k, bootstrapper=bootstrapper, alpha=0.01, n_simulate=500, seed=seed + 1
    )
    test_result = kstein.perform_test(
        samples, return_simulated_stats=False, return_ustat_gram=False
    )
    print(test_result)
    assert test_result["h0_rejected"]
def test_fssd():
    """Test FSSD with Gaussian kernel (median heuristic) and randomized test locations.

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/kgof/ex/ex1_vary_n.py

    The model ``p`` is an isotropic normal with zero mean and unit variance.
    The test is run twice with J = 10 test locations drawn from a Gaussian
    fitted to the sample:

    1. on samples from the matching normal data source, where H0 must not
       be rejected;
    2. on samples from a Laplace data source, where H0 must be rejected.
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples
    # Number of test locations, shared by both runs.
    # The Laplace run fails with J=8, passes with J=10 (chance).
    J = 10

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)

    # Gaussian kernel with median heuristic
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")

    # FSSD
    null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=seed)
    # Fit a multivariate normal to the data X (n x d) and draw J points from the fit.
    V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)
    fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
    test_result = fssd_med.perform_test(samples)
    print(test_result)
    # Assert on truthiness rather than comparing with `== False`.
    assert not test_result["h0_rejected"]

    # FSSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    # NOTE: Works much better with the bandwidth that was optimized under FSSD:
    # sig2 = 0.3228712361986835
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")
    null_sim = gof.FSSDH0SimCovObs(n_simulate=3000, seed=seed)
    # TODO: is this what we want if samples come from another distribution ?!
    V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)
    fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
    test_result = fssd_med.perform_test(samples)
    print(test_result)
    assert test_result["h0_rejected"]
def job_mmd_med(p, data_source, tr, te, r):
    """
    Quadratic-time MMD test (Gretton et al., 2012) used as a goodness-of-fit
    test, with a Gaussian kernel set by a median heuristic.

    p must be able to produce samples, i.e., p.get_datasource() has to
    return a non-None data source.

    p: the model; must provide get_datasource()
    data_source: not used by this job
    tr, te: the two parts of the observed sample; they are added together
        and the test uses the combined sample
    r: trial number; seeds the model sample (r + 294) and the test (r)

    The significance level `alpha` is read from the enclosing module scope.

    Return a dictionary with keys "test_result" and "time_secs".
    """
    # Combined sample on which the test is run.
    full_sample = tr + te
    X = full_sample.data()

    with util.ContextTimer() as timer:
        # Draw as many points from the model as there are observations.
        model_source = p.get_datasource()
        model_sample = model_source.sample(full_sample.sample_size(), seed=r + 294)
        Y = model_sample.data()
        pooled = np.vstack((X, Y))

        # If p, q differ very little, the median may be very small, rejecting H0
        # when it should not?
        # The bandwidth is the average of the three median distances below.
        med_x = util.meddistance(X, subsample=1000)
        med_y = util.meddistance(Y, subsample=1000)
        med_pooled = util.meddistance(pooled, subsample=1000)
        med_avg = (med_x + med_y + med_pooled) / 3.0
        gauss_kernel = kernel.KGauss(med_avg**2)

        tester = mgof.QuadMMDGof(p, gauss_kernel, n_permute=400, alpha=alpha, seed=r)
        mmd_result = tester.perform_test(full_sample)

    # The elapsed time is read once the timed block has been left.
    return {"test_result": mmd_result, "time_secs": timer.secs}
def job_mmd_opt(p, data_source, tr, te, r):
    """
    Quadratic-time MMD test (Gretton et al., 2012) used as a goodness-of-fit
    test, with the Gaussian kernel chosen from a list of candidates.

    p must be able to produce samples, i.e., p.get_datasource() has to
    return a non-None data source.

    p: the model; must provide get_datasource()
    data_source: not used by this job
    tr, te: the two parts of the observed sample; they are added together
        and the combined sample is handed to the test
    r: trial number; seeds the model sample (r + 294) and the test (r)

    `alpha` and `tr_proportion` are read from the enclosing module scope.

    Return a dictionary with keys "test_result" and "time_secs".
    """
    full_sample = tr + te
    X = full_sample.data()

    with util.ContextTimer() as timer:
        # Median heuristic on the observed sample stacked with a model sample.
        model_source = p.get_datasource()
        model_sample = model_source.sample(full_sample.sample_size(), seed=r + 294)
        Y = model_sample.data()
        pooled = np.vstack((X, Y))
        med = util.meddistance(pooled, subsample=1000)

        # Candidate squared widths: 30 multiples of med**2, with factors
        # 2**e for e evenly spaced in [-4, 4].
        exponents = np.linspace(-4, 4, 30)
        list_gwidth = (med**2) * (2.0**exponents)
        list_gwidth.sort()
        candidate_kernels = [kernel.KGauss(width2) for width2 in list_gwidth]

        tester = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=r)
        mmd_result = tester.perform_test(
            full_sample,
            candidate_kernels=candidate_kernels,
            tr_proportion=tr_proportion,
            reg=1e-3,
        )

    # The elapsed time is read once the timed block has been left.
    return {"test_result": mmd_result, "time_secs": timer.secs}
def job_fssdJ1q_med(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD test with a Gaussian kernel whose width is set with the median
    heuristic and whose J test locations are drawn at random from a
    Gaussian fitted to the data. The full sample (tr + te) is used; there
    is no training/testing split.

    p: an UnnormalizedDensity
    data_source: a DataSource (not used by this job)
    tr, te: Data
    r: trial number (positive integer); used to derive the seeds
    J: number of test locations
    null_sim: null-distribution simulator; when None, an
        FSSDH0SimCovObs with n_simulate=2000 and seed=r is created

    The significance level `alpha` is read from the enclosing module scope.

    Return a dictionary with keys "test_result" and "time_secs".
    """
    simulator = null_sim
    if simulator is None:
        simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    # Combined sample on which the test is run.
    full_sample = tr + te
    X = full_sample.data()

    with util.ContextTimer() as timer:
        # Median heuristic for the Gaussian width.
        med = util.meddistance(X, subsample=1000)
        gauss_kernel = kernel.KGauss(med**2)
        # Random test locations.
        locations = util.fit_gaussian_draw(X, J, seed=r + 1)

        tester = gof.FSSD(p, gauss_kernel, locations, null_sim=simulator, alpha=alpha)
        fssd_med_result = tester.perform_test(full_sample)

    # The elapsed time is read once the timed block has been left.
    return {"test_result": fssd_med_result, "time_secs": timer.secs}
def optimize_auto_init(p, dat, J, **ops):
    """
    Optimize parameters by calling optimize_locs_widths(), after
    automatically initializing the test locations and the Gaussian width.

    The initial locations V0 are J points drawn from a Gaussian fitted to
    the data (fixed seed 829). The initial width is the best of 5 candidates
    (the squared median distance times 2**e, e evenly spaced in [-3, 3]),
    as picked by grid_search_gwidth() with the locations fixed at V0.

    p: the model, passed through to the grid search and the optimizer
    dat: the sample; dat.data() must give the data matrix
    J: number of test locations; must be positive
    ops: keyword options forwarded unchanged to optimize_locs_widths()

    Return optimized locations, Gaussian width, optimization info.

    Raise AssertionError if J <= 0 or if the width chosen by the grid
    search is not a positive real number.
    """
    assert J > 0

    # Use grid search to initialize the gwidth
    X = dat.data()
    n_gwidth_cand = 5
    gwidth_factors = 2.0 ** np.linspace(-3, 3, n_gwidth_cand)
    med2 = util.meddistance(X, 1000) ** 2

    # fit a Gaussian to the data and draw to initialize V0
    V0 = util.fit_gaussian_draw(X, J, seed=829, reg=1e-6)
    list_gwidth = med2 * gwidth_factors
    # Only the index of the best width is needed; the objective values are
    # discarded.
    besti, _ = GaussFSSD.grid_search_gwidth(p, dat, V0, list_gwidth)
    gwidth = list_gwidth[besti]
    assert util.is_real_num(gwidth), "gwidth not real. Was %s" % str(gwidth)
    assert gwidth > 0, "gwidth not positive. Was %.3g" % gwidth
    # Pass the argument separately so the message is only formatted when
    # the record is actually emitted.
    logging.info("After grid search, gwidth=%.3g", gwidth)

    V_opt, gwidth_opt, info = GaussFSSD.optimize_locs_widths(
        p, dat, gwidth, V0, **ops
    )
    return V_opt, gwidth_opt, info
def test_ustat_h1_mean_variance(self):
    """
    Check that FSSD.ustat_h1_mean_variance() returns a non-negative mean
    and a non-negative variance when the data are drawn with a shifted
    mean (+2) and a larger variance (+1) than the model.
    """
    seed = 20
    n = 200  # sample size
    alpha = 0.01
    for dim in [1, 4]:
        model_mean = np.zeros(dim)
        model_variance = 1
        isonorm = density.IsotropicNormal(model_mean, model_variance)

        # Draw from a normal that differs from the model.
        draw_mean = model_mean + 2
        draw_variance = model_variance + 1
        X = util.randn(n, dim, seed=seed) * np.sqrt(draw_variance) + draw_mean
        dat = data.Data(X)

        for n_locs in [1, 3]:
            bandwidth2 = util.meddistance(X, subsample=1000) ** 2
            gauss_kernel = kernel.KGauss(bandwidth2)

            # random test locations
            locations = util.fit_gaussian_draw(X, n_locs, seed=seed + 1)
            simulator = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
            fssd = gof.FSSD(
                isonorm, gauss_kernel, locations, null_sim=simulator, alpha=alpha
            )
            features = fssd.feature_tensor(X)
            u_mean, u_variance = gof.FSSD.ustat_h1_mean_variance(features)

            # assertions
            self.assertGreaterEqual(u_variance, 0)
            # should reject H0
            self.assertGreaterEqual(u_mean, 0)
def job_fssdJ1q_opt(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD with optimization on tr. Test on te. Use a Gaussian kernel.

    The initial test locations are J points drawn from a Gaussian fitted to
    the training data. The initial width is chosen by grid search over 5
    multiples of the squared median distance. Locations and width are then
    optimized on tr, and the resulting test is run on te.

    p: the model being tested
    data_source: not used by this job
    tr, te: training and testing samples
    r: trial number; used to derive the seeds
    J: number of test locations
    null_sim: null-distribution simulator; when None, an
        FSSDH0SimCovObs with n_simulate=2000 and seed=r is created

    The significance level `alpha` is read from the enclosing module scope.

    Return a dictionary with keys "test_result", "time_secs", "goftest"
    and "opt_info".

    Raise AssertionError if the width chosen by the grid search is not a
    positive real number.
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    Xtr = tr.data()
    with util.ContextTimer() as t:
        # Use grid search to initialize the gwidth
        n_gwidth_cand = 5
        gwidth_factors = 2.0 ** np.linspace(-3, 3, n_gwidth_cand)
        med2 = util.meddistance(Xtr, 1000) ** 2

        # fit a Gaussian to the data and draw to initialize V0
        V0 = util.fit_gaussian_draw(Xtr, J, seed=r + 1, reg=1e-6)
        list_gwidth = med2 * gwidth_factors
        # Only the index of the best width is needed; the objective values
        # are discarded.
        besti, _ = gof.GaussFSSD.grid_search_gwidth(p, tr, V0, list_gwidth)
        gwidth = list_gwidth[besti]
        assert util.is_real_num(gwidth), "gwidth not real. Was %s" % str(gwidth)
        assert gwidth > 0, "gwidth not positive. Was %.3g" % gwidth
        # Pass the argument separately so the message is only formatted
        # when the record is actually emitted.
        logging.info("After grid search, gwidth=%.3g", gwidth)

        ops = {
            "reg": 1e-2,
            "max_iter": 30,
            "tol_fun": 1e-5,
            "disp": True,
            "locs_bounds_frac": 30.0,
            "gwidth_lb": 1e-1,
            "gwidth_ub": 1e4,
        }
        V_opt, gwidth_opt, info = gof.GaussFSSD.optimize_locs_widths(
            p, tr, gwidth, V0, **ops
        )

        # Use the optimized parameters to construct a test
        k_opt = kernel.KGauss(gwidth_opt)
        fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
        fssd_opt_result = fssd_opt.perform_test(te)

    # The elapsed time is read once the timed block has been left.
    return {
        "test_result": fssd_opt_result,
        "time_secs": t.secs,
        "goftest": fssd_opt,
        "opt_info": info,
    }
def power_criterion(p, dat, gwidth, test_locs, reg=1e-2, use_2terms=False):
    """
    Evaluate FSSD.power_criterion() for a Gaussian kernel of width `gwidth`.

    p, dat, test_locs, reg: forwarded unchanged to FSSD.power_criterion()
    gwidth: the value used to construct the KGauss kernel
    use_2terms: True if the objective should include the first term in the
        power expression. This term carries the test threshold and is
        difficult to compute (depends on the optimized test locations).
        If True, then the objective will be
        -1/(n**0.5*sigma_H1) + n**0.5 FSSD^2/sigma_H1, which ignores the
        test threshold in the first term.

    Return whatever FSSD.power_criterion() returns.
    """
    gauss_kernel = kernel.KGauss(gwidth)
    criterion = FSSD.power_criterion(
        p, dat, gauss_kernel, test_locs, reg, use_2terms=use_2terms
    )
    return criterion
def perform_test(
    self,
    dat,
    candidate_kernels=None,
    return_mmdtest=False,
    tr_proportion=0.2,
    reg=1e-3,
):
    """
    Run the MMD goodness-of-fit test with the kernel picked by grid search.

    A sample of the same size as `dat` is drawn from the model self.p. Both
    samples are split into training and testing parts. The kernel is chosen
    on the training parts and the quadratic MMD test is run on the testing
    parts.

    dat: an instance of Data
    candidate_kernels: a list of Kernel's to choose from. When None,
        10 Gaussian kernels are built from multiples of the squared median
        distance of the stacked training data.
    return_mmdtest: if True, store the QuadMMDTest object in the result
        under the key "mmdtest"
    tr_proportion: proportion of sample to be used to choosing the best
        kernel
    reg: regularization parameter for the test power criterion

    Return the result dictionary of the underlying test, with "time_secs"
    added.
    """
    with util.ContextTimer() as timer:
        base_seed = self.seed
        model = self.p

        # Draw from the model and split both samples. Different seed
        # offsets are used for the draw and for each split.
        model_source = model.get_datasource()
        model_sample = model_source.sample(dat.sample_size(), seed=base_seed + 77)
        xtr, xte = model_sample.split_tr_te(
            tr_proportion=tr_proportion, seed=base_seed + 18
        )
        # ytr, yte are of type data.Data
        ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion, seed=base_seed + 12)

        # training and test data
        train_pair = fdata.TSTData(xtr.data(), ytr.data())
        test_pair = fdata.TSTData(xte.data(), yte.data())

        if candidate_kernels is None:
            # Assume a Gaussian kernel. Construct a list of
            # kernels to try based on multiples of the median heuristic
            med = util.meddistance(train_pair.stack_xy(), 1000)
            multipliers = 2.0 ** np.linspace(-4, 4, 10)
            list_gwidth = np.hstack(((med**2) * multipliers))
            list_gwidth.sort()
            candidate_kernels = [kernel.KGauss(width2) for width2 in list_gwidth]

        level = self.alpha

        # grid search to choose the best Gaussian width
        best_index, powers = tst.QuadMMDTest.grid_search_kernel(
            train_pair, candidate_kernels, level, reg=reg
        )

        # perform test
        chosen_kernel = candidate_kernels[best_index]
        mmdtest = tst.QuadMMDTest(chosen_kernel, self.n_permute, alpha=level)
        results = mmdtest.perform_test(test_pair)
        if return_mmdtest:
            results["mmdtest"] = mmdtest

    # The elapsed time is read once the timed block has been left.
    results["time_secs"] = timer.secs
    return results
def grid_search_gwidth(p, dat, test_locs, list_gwidth):
    """
    Linear search for the best Gaussian width in the list that maximizes
    the test power criterion, fixing the test locations.

    p, dat: forwarded unchanged to FSSD.fssd_grid_search_kernel()
    test_locs: a J x dx np-array for J test locations
    list_gwidth: the candidate widths; one KGauss kernel is built per entry

    return: (best width index, list of test power objectives)
    """
    gauss_kernels = []
    for width in list_gwidth:
        gauss_kernels.append(kernel.KGauss(width))

    best_index, objectives = FSSD.fssd_grid_search_kernel(
        p, dat, test_locs, gauss_kernels
    )
    return best_index, objectives
def test_basic(self):
    """
    Sanity checks on the Gaussian kernel matrix of a random sample: it is
    n x n and its entries lie in [0, 1], up to a tolerance of 1e-6.
    """
    n_points, dim = 10, 3
    tol = 1e-6
    with util.NumpySeedContext(seed=29):
        X = np.random.randn(n_points, dim) * 3
        gauss_kernel = kernel.KGauss(sigma2=1)
        gram = gauss_kernel.eval(X, X)

        self.assertEqual(gram.shape, (n_points, n_points))
        self.assertTrue(np.all(gram >= 0 - tol))
        self.assertTrue(np.all(gram <= 1 + tol), "K not bounded by 1")
def test_pair_gradX_Y(self):
    """
    Check pair_gradX_Y() against an explicit loop that calls gradX_Y() on
    each matching pair of rows (X[i], Y[i]) and each dimension.
    """
    n_points, dim = 11, 3
    with util.NumpySeedContext(seed=20):
        X = np.random.randn(n_points, dim) * 4
        Y = np.random.randn(n_points, dim) * 2
        gauss_kernel = kernel.KGauss(sigma2=2.1)

        # n x d
        pair_grad = gauss_kernel.pair_gradX_Y(X, Y)

        # Reference values, one (row, dimension) entry at a time.
        expected = np.zeros((n_points, dim))
        for row in range(n_points):
            for col in range(dim):
                expected[row, col] = gauss_kernel.gradX_Y(
                    X[[row], :], Y[[row], :], col
                )

        testing.assert_almost_equal(pair_grad, expected)
def job_lin_kstein_med(p, data_source, tr, te, r):
    """
    Linear-time version of the kernel Stein discrepancy test of Liu et al.,
    2016 and Chwialkowski et al., 2016, with a Gaussian kernel whose width
    is set with the median heuristic. The full sample (tr + te) is used.

    p: the model being tested
    data_source: not used by this job
    tr, te: the two parts of the observed sample
    r: trial number; used as the seed of the test

    The significance level `alpha` is read from the enclosing module scope.

    Return a dictionary with keys "test_result" and "time_secs".
    """
    # Combined sample on which the test is run.
    full_sample = tr + te
    X = full_sample.data()

    with util.ContextTimer() as timer:
        # Median heuristic for the Gaussian width.
        med = util.meddistance(X, subsample=1000)
        gauss_kernel = kernel.KGauss(med**2)

        tester = gof.LinearKernelSteinTest(p, gauss_kernel, alpha=alpha, seed=r)
        lin_kstein_result = tester.perform_test(full_sample)

    # The elapsed time is read once the timed block has been left.
    return {"test_result": lin_kstein_result, "time_secs": timer.secs}
def test_basic(self):
    """
    Check that a SteinWitness evaluated at J points returns an array of
    shape (J, d).
    """
    dim = 3
    zero_mean = np.zeros(dim)
    # Model p and data-generating q differ only in their means.
    p = density.IsotropicNormal(mean=zero_mean, variance=3.0)
    q = density.IsotropicNormal(mean=zero_mean + 2, variance=3.0)
    gauss_kernel = kernel.KGauss(2.0)

    n_points = 97
    dat = q.get_datasource().sample(n_points, seed=3)

    witness = gof.SteinWitness(p, gauss_kernel, dat)

    # points to evaluate the witness
    n_locs = 4
    locations = np.random.randn(n_locs, dim) * 2
    evals = witness(locations)

    testing.assert_equal(evals.shape, (n_locs, dim))
def test_gradX_y(self):
    """
    Check gradX_y() of the Gaussian kernel against the closed form
    -K / sigma2 * (X - y), for d = 1 and d = 3.
    """
    n_points = 10
    with util.NumpySeedContext(seed=10):
        for dim in [1, 3]:
            y = np.random.randn(dim) * 2
            X = np.random.rand(n_points, dim) * 3

            sigma2 = 1.3
            gauss_kernel = kernel.KGauss(sigma2=sigma2)
            # n x d
            actual = gauss_kernel.gradX_y(X, y)

            # check correctness
            kernel_vals = gauss_kernel.eval(X, y[np.newaxis, :])
            expected = -kernel_vals / sigma2 * (X - y)

            self.assertEqual(actual.shape, expected.shape)
            testing.assert_almost_equal(actual, expected)
def test_optimized_fssd(self):
    """
    Test FSSD test with parameter optimization.

    The data are drawn with a mean shifted by 4 from the model, the
    locations and width are optimized on a 30% training split, and the
    resulting test on the remaining data must reject H0.
    """
    seed = 4
    n = 179  # sample size
    alpha = 0.01
    for dim in [1, 3]:
        model_mean = np.zeros(dim)
        model_variance = 1.0
        p = density.IsotropicNormal(model_mean, model_variance)

        # Mean difference. obvious reject
        source = data.DSIsotropicNormal(model_mean + 4, model_variance + 0)
        dat = source.sample(n, seed=seed)

        for n_locs in [1, 4]:
            opts = dict(reg=1e-2, max_iter=10, tol_fun=1e-3, disp=False)
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

            # Initial width from the median heuristic on the training data.
            Xtr = tr.X
            gwidth0 = util.meddistance(Xtr, subsample=1000) ** 2
            # random test locations
            V0 = util.fit_gaussian_draw(Xtr, n_locs, seed=seed + 1)
            V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_locs_widths(
                p, tr, gwidth0, V0, **opts
            )

            # construct a test
            k_opt = kernel.KGauss(gw_opt)
            simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
            fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=simulator, alpha=alpha)
            fssd_opt_result = fssd_opt.perform_test(te, return_simulated_stats=True)
            assert fssd_opt_result["h0_rejected"]
def test_auto_init_opt_fssd(self):
    """
    Test FSSD-opt test with automatic parameter initialization.

    The data are drawn with a mean shifted by 4 from the model, the
    parameters are optimized with optimize_auto_init() on a 30% training
    split, and the resulting test on the remaining data must reject H0.
    """
    seed = 5
    n = 191  # sample size
    alpha = 0.01
    for dim in [1, 4]:
        model_mean = np.zeros(dim)
        model_variance = 1.0
        p = density.IsotropicNormal(model_mean, model_variance)

        # Mean difference. obvious reject
        source = data.DSIsotropicNormal(model_mean + 4, model_variance + 0)
        dat = source.sample(n, seed=seed)

        for n_locs in [1, 3]:
            opts = dict(reg=1e-2, max_iter=10, tol_fun=1e-3, disp=False)
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

            V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_auto_init(
                p, tr, n_locs, **opts
            )

            # construct a test
            k_opt = kernel.KGauss(gw_opt)
            simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
            fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=simulator, alpha=alpha)
            fssd_opt_result = fssd_opt.perform_test(te, return_simulated_stats=True)
            assert fssd_opt_result["h0_rejected"]
def test_gradXY_sum(self):
    """
    Check gradXY_sum() of the Gaussian kernel against the closed form
    K[i, j] / sigma2 * (d - ||x_i - x_j||^2 / sigma2), for d = 3 and d = 1.
    """
    n_points = 11
    with util.NumpySeedContext(seed=12):
        for dim in [3, 1]:
            X = np.random.randn(n_points, dim)
            sigma2 = 1.4
            gauss_kernel = kernel.KGauss(sigma2=sigma2)

            # n x n reference values, filled in one pair at a time.
            expected = np.zeros((n_points, n_points))
            gram = gauss_kernel.eval(X, X)
            for row in range(n_points):
                for col in range(n_points):
                    sq_dist = np.sum((X[row, :] - X[col, :]) ** 2)
                    expected[row, col] = (
                        gram[row, col] / sigma2 * (dim - sq_dist / sigma2)
                    )

            # check correctness
            actual = gauss_kernel.gradXY_sum(X, X)
            self.assertEqual(actual.shape, expected.shape)
            testing.assert_almost_equal(actual, expected)
def test_basic(self):
    """
    Basic check of the FSSD test: the p-value it reports lies in [0, 1].

    The data have the same mean as the model but a variance larger by 1.
    """
    seed = 12
    n = 100  # sample size
    alpha = 0.01
    for dim in [1, 4]:
        model_mean = np.zeros(dim)
        model_variance = 1
        isonorm = density.IsotropicNormal(model_mean, model_variance)

        # The mean is not shifted; only the variance differs.
        draw_mean = model_mean + 0
        draw_variance = model_variance + 1
        X = util.randn(n, dim, seed=seed) * np.sqrt(draw_variance) + draw_mean
        dat = data.Data(X)

        for n_locs in [1, 3]:
            bandwidth2 = util.meddistance(X, subsample=1000) ** 2
            gauss_kernel = kernel.KGauss(bandwidth2)

            # random test locations
            locations = util.fit_gaussian_draw(X, n_locs, seed=seed + 1)
            simulator = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
            fssd = gof.FSSD(
                isonorm, gauss_kernel, locations, null_sim=simulator, alpha=alpha
            )

            tresult = fssd.perform_test(dat, return_simulated_stats=True)

            # assertions
            pvalue_key = "pvalue"
            self.assertGreaterEqual(tresult[pvalue_key], 0)
            self.assertLessEqual(tresult[pvalue_key], 1)
def __init__(self, p, sigma2, V, alpha=0.01, n_simulate=3000, seed=10):
    """
    Build the test with a Gaussian kernel and an FSSDH0SimCovObs simulator.

    p: forwarded unchanged to the parent constructor
    sigma2: the value used to construct the KGauss kernel
    V: forwarded unchanged to the parent constructor
    alpha: forwarded unchanged to the parent constructor
    n_simulate, seed: arguments of the FSSDH0SimCovObs simulator
    """
    gauss_kernel = kernel.KGauss(sigma2)
    h0_simulator = FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed)
    super(GaussFSSD, self).__init__(p, gauss_kernel, V, h0_simulator, alpha)