def job_mmd_opt(p, data_source, tr, te, r): """ MMD test of Gretton et al., 2012 used as a goodness-of-fit test. Require the ability to sample from p i.e., the UnnormalizedDensity p has to be able to return a non-None from get_datasource() With optimization. Gaussian kernel. """ data = tr + te X = data.data() with util.ContextTimer() as t: # median heuristic pds = p.get_datasource() datY = pds.sample(data.sample_size(), seed=r+294) Y = datY.data() XY = np.vstack((X, Y)) med = util.meddistance(XY, subsample=1000) # Construct a list of kernels to try based on multiples of the median # heuristic #list_gwidth = np.hstack( (np.linspace(20, 40, 10), (med**2) # *(2.0**np.linspace(-2, 2, 20) ) ) ) list_gwidth = (med**2)*(2.0**np.linspace(-4, 4, 30) ) list_gwidth.sort() candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth] mmd_opt = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=r) mmd_result = mmd_opt.perform_test(data, candidate_kernels=candidate_kernels, tr_proportion=tr_proportion, reg=1e-3) return { 'test_result': mmd_result, 'time_secs': t.secs}
def optimize_auto_init(p, dat, J, **ops): """ Optimize parameters by calling optimize_locs_widths(). Automatically initialize the test locations and the Gaussian width. Return optimized locations, Gaussian width, optimization info """ assert J > 0 # Use grid search to initialize the gwidth X = dat.data() n_gwidth_cand = 5 gwidth_factors = 2.0**np.linspace(-3, 3, n_gwidth_cand) med2 = util.meddistance(X, 1000)**2 k = kernel.KGauss(med2 * 2) # fit a Gaussian to the data and draw to initialize V0 V0 = util.fit_gaussian_draw(X, J, seed=829, reg=1e-6) list_gwidth = np.hstack(((med2) * gwidth_factors)) besti, objs = GaussFSSD.grid_search_gwidth(p, dat, V0, list_gwidth) gwidth = list_gwidth[besti] assert util.is_real_num( gwidth), 'gwidth not real. Was %s' % str(gwidth) assert gwidth > 0, 'gwidth not positive. Was %.3g' % gwidth logging.info('After grid search, gwidth=%.3g' % gwidth) V_opt, gwidth_opt, info = GaussFSSD.optimize_locs_widths( p, dat, gwidth, V0, **ops) # set the width bounds #fac_min = 5e-2 #fac_max = 5e3 #gwidth_lb = fac_min*med2 #gwidth_ub = fac_max*med2 #gwidth_opt = max(gwidth_lb, min(gwidth_opt, gwidth_ub)) return V_opt, gwidth_opt, info
def job_fssdJ1q_med(p, data_source, tr, te, r, J=1, null_sim=None): """ FSSD test with a Gaussian kernel, where the test locations are randomized, and the Gaussian width is set with the median heuristic. Use full sample. No training/testing splits. p: an UnnormalizedDensity data_source: a DataSource tr, te: Data r: trial number (positive integer) """ if null_sim is None: null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r) # full data data = tr + te X = data.data() with util.ContextTimer() as t: # median heuristic med = util.meddistance(X, subsample=1000) k = kernel.KGauss(med**2) V = util.fit_gaussian_draw(X, J, seed=r + 3) fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=alpha) fssd_med_result = fssd_med.perform_test(data) return { 'goftest': fssd_med, 'test_result': fssd_med_result, 'time_secs': t.secs }
def job_mmd_med(p, data_source, tr, te, r): """ MMD test of Gretton et al., 2012 used as a goodness-of-fit test. Require the ability to sample from p i.e., the UnnormalizedDensity p has to be able to return a non-None from get_datasource() """ # full data data = tr + te X = data.data() with util.ContextTimer() as t: # median heuristic pds = p.get_datasource() datY = pds.sample(data.sample_size(), seed=r + 294) Y = datY.data() XY = np.vstack((X, Y)) # If p, q differ very little, the median may be very small, rejecting H0 # when it should not? medx = util.meddistance(X, subsample=1000) medy = util.meddistance(Y, subsample=1000) medxy = util.meddistance(XY, subsample=1000) med_avg = (medx + medy + medxy) / 3.0 k = kernel.KGauss(med_avg**2) mmd_test = mgof.QuadMMDGof(p, k, n_permute=400, alpha=alpha, seed=r) mmd_result = mmd_test.perform_test(data) return {'test_result': mmd_result, 'time_secs': t.secs}
def test_ustat_h1_mean_variance(self): seed = 20 # sample n = 200 alpha = 0.01 for d in [1, 4]: mean = np.zeros(d) variance = 1 isonorm = density.IsotropicNormal(mean, variance) draw_mean = mean + 2 draw_variance = variance + 1 X = util.randn(n, d, seed=seed) * np.sqrt(draw_variance) + draw_mean dat = data.Data(X) # Test for J in [1, 3]: sig2 = util.meddistance(X, subsample=1000)**2 k = kernel.KGauss(sig2) # random test locations V = util.fit_gaussian_draw(X, J, seed=seed + 1) null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3) fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha) fea_tensor = fssd.feature_tensor(X) u_mean, u_variance = gof.FSSD.ustat_h1_mean_variance( fea_tensor) # assertions self.assertGreaterEqual(u_variance, 0) # should reject H0 self.assertGreaterEqual(u_mean, 0)
def numpy(self): """ Return an instance of numpy implementation of itself. """ sigma2 = self.sigma2.item() return gofker.KGauss(sigma2)
def job_fssdJ1q_opt(p, data_source, tr, te, r, J=1, null_sim=None): """ FSSD with optimization on tr. Test on te. Use a Gaussian kernel. """ if null_sim is None: null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r) Xtr = tr.data() with util.ContextTimer() as t: # Use grid search to initialize the gwidth n_gwidth_cand = 5 gwidth_factors = 2.0**np.linspace(-3, 3, n_gwidth_cand) med2 = util.meddistance(Xtr, 1000)**2 k = kernel.KGauss(med2 * 2) # fit a Gaussian to the data and draw to initialize V0 V0 = util.fit_gaussian_draw(Xtr, J, seed=r + 1, reg=1e-6) list_gwidth = np.hstack(((med2) * gwidth_factors)) besti, objs = gof.GaussFSSD.grid_search_gwidth(p, tr, V0, list_gwidth) gwidth = list_gwidth[besti] assert util.is_real_num( gwidth), 'gwidth not real. Was %s' % str(gwidth) assert gwidth > 0, 'gwidth not positive. Was %.3g' % gwidth logging.info('After grid search, gwidth=%.3g' % gwidth) ops = { 'reg': 1e-2, 'max_iter': 40, 'tol_fun': 1e-4, 'disp': True, 'locs_bounds_frac': 10.0, 'gwidth_lb': 1e-1, 'gwidth_ub': 1e4, } V_opt, gwidth_opt, info = gof.GaussFSSD.optimize_locs_widths( p, tr, gwidth, V0, **ops) # Use the optimized parameters to construct a test k_opt = kernel.KGauss(gwidth_opt) fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha) fssd_opt_result = fssd_opt.perform_test(te) return { 'test_result': fssd_opt_result, 'time_secs': t.secs, 'goftest': fssd_opt, 'opt_info': info, }
def perform_test(self, dat, model_sample, candidate_kernels=None, return_mmdtest=False, tr_proportion=0.2, reg=1e-3): """ dat: an instance of Data candidate_kernels: a list of Kernel's to choose from tr_proportion: proportion of sample to be used to choosing the best kernel reg: regularization parameter for the test power criterion """ with util.ContextTimer() as t: seed = self.seed p = self.p #ds = p.get_datasource() #p_sample = ds.sample(dat.sample_size(), seed=seed+77) p_sample = model_sample xtr, xte = p_sample.split_tr_te(tr_proportion=tr_proportion, seed=seed + 18) # ytr, yte are of type data.Data ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion, seed=seed + 12) # training and test data tr_tst_data = fdata.TSTData(xtr.data(), ytr.data()) te_tst_data = fdata.TSTData(xte.data(), yte.data()) if candidate_kernels is None: # Assume a Gaussian kernel. Construct a list of # kernels to try based on multiples of the median heuristic med = util.meddistance(tr_tst_data.stack_xy(), 1000) list_gwidth = np.hstack( ((med**2) * (2.0**np.linspace(-4, 4, 10)))) list_gwidth.sort() candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth] alpha = self.alpha # grid search to choose the best Gaussian width besti, powers = tst.QuadMMDTest.grid_search_kernel( tr_tst_data, candidate_kernels, alpha, reg=reg) # perform test best_ker = candidate_kernels[besti] mmdtest = tst.QuadMMDTest(best_ker, self.n_permute, alpha=alpha) results = mmdtest.perform_test(te_tst_data) if return_mmdtest: results['mmdtest'] = mmdtest results['time_secs'] = t.secs return results
def grid_search_gwidth(p, dat, test_locs, list_gwidth): """ Linear search for the best Gaussian width in the list that maximizes the test power criterion, fixing the test locations. - V: a J x dx np-array for J test locations return: (best width index, list of test power objectives) """ list_gauss_kernel = [kernel.KGauss(gw) for gw in list_gwidth] besti, objs = FSSD.fssd_grid_search_kernel(p, dat, test_locs, list_gauss_kernel) return besti, objs
def test_basic(self): """ Nothing special. Just test basic things. """ # sample n = 10 d = 3 with util.NumpySeedContext(seed=29): X = np.random.randn(n, d) * 3 k = kernel.KGauss(sigma2=1) K = k.eval(X, X) self.assertEqual(K.shape, (n, n)) self.assertTrue(np.all(K >= 0 - 1e-6)) self.assertTrue(np.all(K <= 1 + 1e-6), 'K not bounded by 1')
def power_criterion(p, dat, gwidth, test_locs, reg=1e-2, use_2terms=False): """ use_2terms: True if the objective should include the first term in the power expression. This term carries the test threshold and is difficult to compute (depends on the optimized test locations). If True, then the objective will be -1/(n**0.5*sigma_H1) + n**0.5 FSSD^2/sigma_H1, which ignores the test threshold in the first term. """ k = kernel.KGauss(gwidth) return FSSD.power_criterion(p, dat, k, test_locs, reg, use_2terms=use_2terms)
def test_pair_gradX_Y(self): # sample n = 11 d = 3 with util.NumpySeedContext(seed=20): X = np.random.randn(n, d) * 4 Y = np.random.randn(n, d) * 2 k = kernel.KGauss(sigma2=2.1) # n x d pair_grad = k.pair_gradX_Y(X, Y) loop_grad = np.zeros((n, d)) for i in range(n): for j in range(d): loop_grad[i, j] = k.gradX_Y(X[[i], :], Y[[i], :], j) testing.assert_almost_equal(pair_grad, loop_grad)
def job_lin_kstein_med(p, data_source, tr, te, r): """ Linear-time version of the kernel Stein discrepancy test of Liu et al., 2016 and Chwialkowski et al., 2016. Use full sample. """ # full data data = tr + te X = data.data() with util.ContextTimer() as t: # median heuristic med = util.meddistance(X, subsample=1000) k = kernel.KGauss(med**2) lin_kstein = gof.LinearKernelSteinTest(p, k, alpha=alpha, seed=r) lin_kstein_result = lin_kstein.perform_test(data) return {'test_result': lin_kstein_result, 'time_secs': t.secs}
def job_kstein_med(p, data_source, tr, te, r): """ Kernel Stein discrepancy test of Liu et al., 2016 and Chwialkowski et al., 2016. Use full sample. Use Gaussian kernel. """ # full data data = tr + te X = data.data() with util.ContextTimer() as t: # median heuristic med = util.meddistance(X, subsample=1000) k = kernel.KGauss(med ** 2) kstein = gof.KernelSteinTest(p, k, alpha=args.alpha, n_simulate=1000, seed=r) kstein_result = kstein.perform_test(data) return {'test_result': kstein_result, 'time_secs': t.secs}
def test_gradX_y(self): n = 10 with util.NumpySeedContext(seed=10): for d in [1, 3]: y = np.random.randn(d) * 2 X = np.random.rand(n, d) * 3 sigma2 = 1.3 k = kernel.KGauss(sigma2=sigma2) # n x d G = k.gradX_y(X, y) # check correctness K = k.eval(X, y[np.newaxis, :]) myG = -K / sigma2 * (X - y) self.assertEqual(G.shape, myG.shape) testing.assert_almost_equal(G, myG)
def test_basic(self): d = 3 p = density.IsotropicNormal(mean=np.zeros(d), variance=3.0) q = density.IsotropicNormal(mean=np.zeros(d) + 2, variance=3.0) k = kernel.KGauss(2.0) ds = q.get_datasource() n = 97 dat = ds.sample(n, seed=3) witness = gof.SteinWitness(p, k, dat) # points to evaluate the witness J = 4 V = np.random.randn(J, d) * 2 evals = witness(V) testing.assert_equal(evals.shape, (J, d))
def test_optimized_fssd(self): """ Test FSSD test with parameter optimization. """ seed = 4 # sample size n = 179 alpha = 0.01 for d in [1, 3]: mean = np.zeros(d) variance = 1.0 p = density.IsotropicNormal(mean, variance) # Mean difference. obvious reject ds = data.DSIsotropicNormal(mean + 4, variance + 0) dat = ds.sample(n, seed=seed) # test for J in [1, 4]: opts = { 'reg': 1e-2, 'max_iter': 10, 'tol_fun': 1e-3, 'disp': False } tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1) Xtr = tr.X gwidth0 = util.meddistance(Xtr, subsample=1000)**2 # random test locations V0 = util.fit_gaussian_draw(Xtr, J, seed=seed + 1) V_opt, gw_opt, opt_result = \ gof.GaussFSSD.optimize_locs_widths(p, tr, gwidth0, V0, **opts) # construct a test k_opt = kernel.KGauss(gw_opt) null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10) fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha) fssd_opt_result = fssd_opt.perform_test( te, return_simulated_stats=True) assert fssd_opt_result['h0_rejected']
def test_gradXY_sum(self): n = 11 with util.NumpySeedContext(seed=12): for d in [3, 1]: X = np.random.randn(n, d) sigma2 = 1.4 k = kernel.KGauss(sigma2=sigma2) # n x n myG = np.zeros((n, n)) K = k.eval(X, X) for i in range(n): for j in range(n): diffi2 = np.sum((X[i, :] - X[j, :])**2) #myG[i, j] = -diffi2*K[i, j]/(sigma2**2)+ d*K[i, j]/sigma2 myG[i, j] = K[i, j] / sigma2 * (d - diffi2 / sigma2) # check correctness G = k.gradXY_sum(X, X) self.assertEqual(G.shape, myG.shape) testing.assert_almost_equal(G, myG)
def test_auto_init_opt_fssd(self): """ Test FSSD-opt test with automatic parameter initialization. """ seed = 5 # sample size n = 191 alpha = 0.01 for d in [1, 4]: mean = np.zeros(d) variance = 1.0 p = density.IsotropicNormal(mean, variance) # Mean difference. obvious reject ds = data.DSIsotropicNormal(mean + 4, variance + 0) dat = ds.sample(n, seed=seed) # test for J in [1, 3]: opts = { 'reg': 1e-2, 'max_iter': 10, 'tol_fun': 1e-3, 'disp': False } tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1) V_opt, gw_opt, opt_result = \ gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts) # construct a test k_opt = kernel.KGauss(gw_opt) null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10) fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha) fssd_opt_result = fssd_opt.perform_test( te, return_simulated_stats=True) assert fssd_opt_result['h0_rejected']
def test_basic(self): """ Nothing special. Just test basic things. """ seed = 12 # sample n = 100 alpha = 0.01 for d in [1, 4]: mean = np.zeros(d) variance = 1 isonorm = density.IsotropicNormal(mean, variance) # only one dimension of the mean is shifted #draw_mean = mean + np.hstack((1, np.zeros(d-1))) draw_mean = mean + 0 draw_variance = variance + 1 X = util.randn(n, d, seed=seed) * np.sqrt(draw_variance) + draw_mean dat = data.Data(X) # Test for J in [1, 3]: sig2 = util.meddistance(X, subsample=1000)**2 k = kernel.KGauss(sig2) # random test locations V = util.fit_gaussian_draw(X, J, seed=seed + 1) null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3) fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha) tresult = fssd.perform_test(dat, return_simulated_stats=True) # assertions self.assertGreaterEqual(tresult['pvalue'], 0) self.assertLessEqual(tresult['pvalue'], 1)
def __init__(self, p, sigma2, V, alpha=0.01, n_simulate=3000, seed=10): k = kernel.KGauss(sigma2) null_sim = FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed) super(GaussFSSD, self).__init__(p, k, V, null_sim, alpha)