def job_mmd_med(p, data_source, tr, te, r):
    """
    Quadratic MMD test (Gretton et al., 2012) used as a goodness-of-fit test,
    with a Gaussian kernel whose width comes from the median heuristic.

    The model p must be able to generate samples, i.e. p.get_datasource()
    has to return a non-None data source.

    - p: the model; an UnnormalizedDensity
    - data_source: not used by this function
    - tr, te: Data objects; they are merged and the full sample is tested
    - r: trial number; used to derive the seeds

    alpha is not a parameter: it is resolved from the enclosing scope.

    Return a dictionary with keys "test_result" and "time_secs".
    """
    # No train/test split is needed here: use everything.
    full_sample = tr + te
    X = full_sample.data()
    with util.ContextTimer() as timer:
        # Draw as many points from the model as there are observations.
        model_source = p.get_datasource()
        model_sample = model_source.sample(full_sample.sample_size(), seed=r + 294)
        Y = model_sample.data()
        stacked = np.vstack((X, Y))

        # If p, q differ very little, the median may be very small, rejecting H0
        # when it should not?
        # Average the median distances of X, Y and the stacked sample.
        med_x = util.meddistance(X, subsample=1000)
        med_y = util.meddistance(Y, subsample=1000)
        med_stacked = util.meddistance(stacked, subsample=1000)
        med_avg = (med_x + med_y + med_stacked) / 3.0
        gauss_kernel = kernel.KGauss(med_avg**2)

        tester = mgof.QuadMMDGof(p, gauss_kernel, n_permute=400, alpha=alpha, seed=r)
        outcome = tester.perform_test(full_sample)
    # The timer value is read only after the timed block has been left.
    return {"test_result": outcome, "time_secs": timer.secs}
def _run_ksd_median_heuristic(p, samples, seed):
    """
    Run the quadratic-time KSD test of the density p on samples.

    A Gaussian kernel is used whose squared width is the squared median
    distance of the sample (util.meddistance with subsample=1000). The
    bandwidth and the test result are printed.

    Return the result of KernelSteinTest.perform_test.
    """
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")

    kstein = gof.KernelSteinTest(
        p,
        k,
        bootstrapper=gof.bootstrapper_rademacher,
        alpha=0.01,
        n_simulate=500,
        seed=seed + 1,
    )
    test_result = kstein.perform_test(
        samples, return_simulated_stats=False, return_ustat_gram=False
    )
    print(test_result)
    return test_result


def test_ksd():
    """Test quadratic time KSD

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/gof_kernel_stein.ipynb

    The model is an isotropic normal. H0 must not be rejected for samples
    drawn from that same normal, and must be rejected for Laplace samples.
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Samples from the same density: H0 should be kept.
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)
    test_result = _run_ksd_median_heuristic(p, samples, seed)
    # Truthiness check rather than `== False`.
    assert not test_result["h0_rejected"]

    # Samples from a different density: H0 should be rejected.
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    test_result = _run_ksd_median_heuristic(p, samples, seed)
    assert test_result["h0_rejected"]
def _run_fssd_median_heuristic(p, samples, J, n_simulate, seed):
    """
    Run the FSSD test of the density p on samples with J random test locations.

    The Gaussian kernel's squared width is the squared median distance of the
    sample (util.meddistance with subsample=1000). The J test locations are
    drawn from a Gaussian fitted to the sample (util.fit_gaussian_draw).
    The bandwidth and the test result are printed.

    Return the result of FSSD.perform_test.
    """
    X = samples.data()
    sig2 = util.meddistance(X, subsample=1000) ** 2
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")

    null_sim = gof.FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed)
    # Fit a multivariate normal to the data X (n x d) and draw J points from the fit.
    V = util.fit_gaussian_draw(X, J=J, seed=seed + 1)
    fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
    test_result = fssd_med.perform_test(samples)
    print(test_result)
    return test_result


def test_fssd():
    """Test FSSD with Gaussian kernel (median heuristic) and randomized test locations

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/kgof/ex/ex1_vary_n.py

    The model is an isotropic normal. H0 must not be rejected for samples
    drawn from that same normal, and must be rejected for Laplace samples.
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Samples from the same density: H0 should be kept.
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)
    test_result = _run_fssd_median_heuristic(
        p, samples, J=10, n_simulate=2000, seed=seed
    )
    # Truthiness check rather than `== False`.
    assert not test_result["h0_rejected"]

    # Samples from a different density: H0 should be rejected.
    # Fails with J=8, passes with J=10 (chance).
    # NOTE: Works much better with the bandwidth that was optimized under FSSD:
    # sig2 = 0.3228712361986835
    # TODO: is drawing the test locations from a Gaussian fitted to the
    # samples what we want if the samples come from another distribution ?!
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    test_result = _run_fssd_median_heuristic(
        p, samples, J=10, n_simulate=3000, seed=seed
    )
    assert test_result["h0_rejected"]
def optimize_auto_init(p, dat, J, **ops):
    """
    Optimize parameters by calling optimize_locs_widths().
    Automatically initialize the test locations and the Gaussian width.

    - p: the model density, passed through to GaussFSSD
    - dat: a Data object (only dat.data() is read here)
    - J: number of test locations; must be positive
    - ops: keyword options forwarded unchanged to
      GaussFSSD.optimize_locs_widths()

    Initialization:
    - V0 is drawn from a Gaussian fitted to the data (fixed seed 829).
    - The initial width is chosen by GaussFSSD.grid_search_gwidth() over
      5 candidates: the squared median distance times 2**-3 ... 2**3.

    Return optimized locations, Gaussian width, optimization info
    """
    assert J > 0

    X = dat.data()
    # Squared median pairwise distance: the centre of the candidate grid.
    med2 = util.meddistance(X, 1000) ** 2

    # fit a Gaussian to the data and draw to initialize V0
    V0 = util.fit_gaussian_draw(X, J, seed=829, reg=1e-6)

    # Use grid search to initialize the gwidth
    n_gwidth_cand = 5
    gwidth_factors = 2.0 ** np.linspace(-3, 3, n_gwidth_cand)
    list_gwidth = med2 * gwidth_factors
    besti, _ = GaussFSSD.grid_search_gwidth(p, dat, V0, list_gwidth)
    gwidth = list_gwidth[besti]
    assert util.is_real_num(gwidth), "gwidth not real. Was %s" % str(gwidth)
    assert gwidth > 0, "gwidth not positive. Was %.3g" % gwidth
    # Lazy %-args: the message is only formatted if the record is emitted.
    logging.info("After grid search, gwidth=%.3g", gwidth)

    # No extra clamping of the optimized width is done here; any width bounds
    # come from the options forwarded in ops.
    V_opt, gwidth_opt, info = GaussFSSD.optimize_locs_widths(
        p, dat, gwidth, V0, **ops
    )
    return V_opt, gwidth_opt, info
def job_fssdJ1q_med(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD test with a Gaussian kernel. The J test locations are drawn at
    random from a Gaussian fitted to the data, and the kernel width is set
    with the median heuristic. The full sample is used; there is no
    training/testing split.

    - p: an UnnormalizedDensity
    - data_source: a DataSource (not used by this function)
    - tr, te: Data
    - r: trial number (positive integer); used to derive the seeds
    - J: number of test locations
    - null_sim: null-distribution simulator. If None, an FSSDH0SimCovObs
      with n_simulate=2000 and seed=r is created.

    alpha is not a parameter: it is resolved from the enclosing scope.

    Return a dictionary with keys "test_result" and "time_secs".
    """
    simulator = null_sim
    if simulator is None:
        simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    # Merge both parts: the test runs on all of the data.
    full_sample = tr + te
    X = full_sample.data()
    with util.ContextTimer() as timer:
        # Median heuristic: the squared median distance is the kernel width.
        med = util.meddistance(X, subsample=1000)
        gauss_kernel = kernel.KGauss(med**2)
        test_locs = util.fit_gaussian_draw(X, J, seed=r + 1)

        fssd_test = gof.FSSD(
            p, gauss_kernel, test_locs, null_sim=simulator, alpha=alpha
        )
        outcome = fssd_test.perform_test(full_sample)
    # The timer value is read only after the timed block has been left.
    return {"test_result": outcome, "time_secs": timer.secs}
def job_mmd_opt(p, data_source, tr, te, r):
    """
    MMD test (Gretton et al., 2012) used as a goodness-of-fit test, with the
    Gaussian kernel width chosen by optimization over a candidate list.

    The model p must be able to generate samples, i.e. p.get_datasource()
    has to return a non-None data source.

    - p: the model; an UnnormalizedDensity
    - data_source: not used by this function
    - tr, te: Data objects; they are merged before testing
    - r: trial number; used to derive the seeds

    alpha and tr_proportion are not parameters: they are resolved from the
    enclosing scope.

    Return a dictionary with keys "test_result" and "time_secs".
    """
    full_sample = tr + te
    X = full_sample.data()
    with util.ContextTimer() as timer:
        # Median heuristic on the observed data stacked with a model sample.
        model_source = p.get_datasource()
        model_sample = model_source.sample(full_sample.sample_size(), seed=r + 294)
        Y = model_sample.data()
        stacked = np.vstack((X, Y))
        med = util.meddistance(stacked, subsample=1000)

        # Candidate squared widths: 30 multiples of the squared median
        # distance, with factors from 2**-4 to 2**4.
        width_factors = 2.0 ** np.linspace(-4, 4, 30)
        list_gwidth = (med**2) * width_factors
        list_gwidth.sort()
        candidate_kernels = []
        for gw2 in list_gwidth:
            candidate_kernels.append(kernel.KGauss(gw2))

        tester = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=r)
        outcome = tester.perform_test(
            full_sample,
            candidate_kernels=candidate_kernels,
            tr_proportion=tr_proportion,
            reg=1e-3,
        )
    # The timer value is read only after the timed block has been left.
    return {"test_result": outcome, "time_secs": timer.secs}
def test_ustat_h1_mean_variance(self):
    """
    Check FSSD.ustat_h1_mean_variance on data whose mean and variance both
    differ from the model's: the returned mean and variance must be
    non-negative. Run for d in {1, 4} and J in {1, 3}.
    """
    seed = 20
    # sample size
    n = 200
    alpha = 0.01
    for dim in (1, 4):
        model_mean = np.zeros(dim)
        model_variance = 1
        isonorm = density.IsotropicNormal(model_mean, model_variance)

        # Draw from a normal with shifted mean and larger variance.
        sample_mean = model_mean + 2
        sample_variance = model_variance + 1
        X = util.randn(n, dim, seed=seed) * np.sqrt(sample_variance) + sample_mean
        # Not used below; kept so that the Data object is still constructed.
        dat = data.Data(X)

        for n_locs in (1, 3):
            # Gaussian kernel with the median heuristic.
            sig2 = util.meddistance(X, subsample=1000) ** 2
            gauss_kernel = kernel.KGauss(sig2)

            # random test locations
            V = util.fit_gaussian_draw(X, n_locs, seed=seed + 1)
            null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
            fssd = gof.FSSD(isonorm, gauss_kernel, V, null_sim=null_sim, alpha=alpha)
            features = fssd.feature_tensor(X)

            u_mean, u_variance = gof.FSSD.ustat_h1_mean_variance(features)

            # assertions
            self.assertGreaterEqual(u_variance, 0)
            # should reject H0
            self.assertGreaterEqual(u_mean, 0)
def perform_test(
    self,
    dat,
    candidate_kernels=None,
    return_mmdtest=False,
    tr_proportion=0.2,
    reg=1e-3,
):
    """
    Choose a kernel on a training split, then run a quadratic MMD test on
    the remaining split, comparing a sample drawn from self.p against dat.

    - dat: an instance of Data
    - candidate_kernels: a list of Kernel's to choose from. If None,
      10 Gaussian kernels are built from multiples (2**-4 ... 2**4) of the
      squared median distance of the stacked training data.
    - return_mmdtest: if True, the QuadMMDTest object is added to the
      results under the key "mmdtest"
    - tr_proportion: proportion of the sample used for choosing the best
      kernel
    - reg: regularization parameter for the test power criterion

    Return the results of QuadMMDTest.perform_test with "time_secs" added.
    """
    with util.ContextTimer() as timer:
        base_seed = self.seed

        # Sample from the model as many points as there are observations.
        model_source = self.p.get_datasource()
        model_sample = model_source.sample(dat.sample_size(), seed=base_seed + 77)
        xtr, xte = model_sample.split_tr_te(
            tr_proportion=tr_proportion, seed=base_seed + 18
        )
        # ytr, yte are of type data.Data
        ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion, seed=base_seed + 12)

        # Two-sample data sets: one for kernel selection, one for testing.
        train_pair = fdata.TSTData(xtr.data(), ytr.data())
        test_pair = fdata.TSTData(xte.data(), yte.data())

        if candidate_kernels is None:
            # Assume a Gaussian kernel. Construct a list of kernels to try
            # based on multiples of the median heuristic.
            med = util.meddistance(train_pair.stack_xy(), 1000)
            width_factors = 2.0 ** np.linspace(-4, 4, 10)
            list_gwidth = np.hstack(((med**2) * width_factors))
            list_gwidth.sort()
            candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

        level = self.alpha

        # grid search to choose the best kernel on the training split
        best_index, _ = tst.QuadMMDTest.grid_search_kernel(
            train_pair, candidate_kernels, level, reg=reg
        )

        # perform the test on the held-out split with the selected kernel
        chosen_kernel = candidate_kernels[best_index]
        mmdtest = tst.QuadMMDTest(chosen_kernel, self.n_permute, alpha=level)
        results = mmdtest.perform_test(test_pair)
        if return_mmdtest:
            results["mmdtest"] = mmdtest

    # The timer value is read only after the timed block has been left.
    results["time_secs"] = timer.secs
    return results
def job_lin_kstein_med(p, data_source, tr, te, r):
    """
    Linear-time version of the kernel Stein discrepancy test of
    Liu et al., 2016 and Chwialkowski et al., 2016, with a Gaussian kernel
    whose width is set by the median heuristic. Use full sample.

    - p: the model density
    - data_source: not used by this function
    - tr, te: Data objects; they are merged before testing
    - r: trial number; used as the seed of the test

    alpha is not a parameter: it is resolved from the enclosing scope.

    Return a dictionary with keys "test_result" and "time_secs".
    """
    # Merge both parts: the test runs on all of the data.
    full_sample = tr + te
    X = full_sample.data()
    with util.ContextTimer() as timer:
        # Median heuristic: the squared median distance is the kernel width.
        med = util.meddistance(X, subsample=1000)
        gauss_kernel = kernel.KGauss(med**2)

        tester = gof.LinearKernelSteinTest(p, gauss_kernel, alpha=alpha, seed=r)
        outcome = tester.perform_test(full_sample)
    # The timer value is read only after the timed block has been left.
    return {"test_result": outcome, "time_secs": timer.secs}
def job_fssdJ1q_opt(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD with optimization on tr. Test on te. Use a Gaussian kernel.

    - p: the model density
    - data_source: not used by this function
    - tr: Data used to optimize the test locations and the Gaussian width
    - te: Data on which the test is performed
    - r: trial number; used to derive the seeds
    - J: number of test locations
    - null_sim: null-distribution simulator. If None, an FSSDH0SimCovObs
      with n_simulate=2000 and seed=r is created.

    alpha is not a parameter: it is resolved from the enclosing scope.

    Return a dictionary with keys "test_result", "time_secs", "goftest"
    (the constructed FSSD test) and "opt_info" (the optimization info).
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    Xtr = tr.data()
    with util.ContextTimer() as t:
        # Use grid search to initialize the gwidth: 5 candidates, the squared
        # median distance times 2**-3 ... 2**3.
        n_gwidth_cand = 5
        gwidth_factors = 2.0 ** np.linspace(-3, 3, n_gwidth_cand)
        med2 = util.meddistance(Xtr, 1000) ** 2

        # fit a Gaussian to the data and draw to initialize V0
        V0 = util.fit_gaussian_draw(Xtr, J, seed=r + 1, reg=1e-6)
        list_gwidth = med2 * gwidth_factors
        besti, _ = gof.GaussFSSD.grid_search_gwidth(p, tr, V0, list_gwidth)
        gwidth = list_gwidth[besti]
        assert util.is_real_num(gwidth), "gwidth not real. Was %s" % str(gwidth)
        assert gwidth > 0, "gwidth not positive. Was %.3g" % gwidth
        # Lazy %-args: the message is only formatted if the record is emitted.
        logging.info("After grid search, gwidth=%.3g", gwidth)

        ops = {
            "reg": 1e-2,
            "max_iter": 30,
            "tol_fun": 1e-5,
            "disp": True,
            "locs_bounds_frac": 30.0,
            "gwidth_lb": 1e-1,
            "gwidth_ub": 1e4,
        }

        V_opt, gwidth_opt, info = gof.GaussFSSD.optimize_locs_widths(
            p, tr, gwidth, V0, **ops
        )

        # Use the optimized parameters to construct a test
        k_opt = kernel.KGauss(gwidth_opt)
        fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
        fssd_opt_result = fssd_opt.perform_test(te)
    # The timer value is read only after the timed block has been left.
    return {
        "test_result": fssd_opt_result,
        "time_secs": t.secs,
        "goftest": fssd_opt,
        "opt_info": info,
    }
def job_mmd_dgauss_opt(p, data_source, tr, te, r):
    """
    MMD test (Gretton et al., 2012) used as a goodness-of-fit test, with
    optimization over diagonal Gaussian kernels, i.e. kernels with one
    Gaussian width for each dimension.

    The model p must be able to generate samples, i.e. p.get_datasource()
    has to return a non-None data source.

    - p: the model; an UnnormalizedDensity
    - data_source: not used by this function
    - tr, te: Data objects; they are merged before testing
    - r: trial number; used to derive the seeds

    alpha and tr_proportion are not parameters: they are resolved from the
    enclosing scope.

    Return a dictionary with keys "test_result" and "time_secs".
    """
    full_sample = tr + te
    X = full_sample.data()
    n_dims = X.shape[1]
    with util.ContextTimer() as timer:
        # Stack the observed data with an equally sized model sample.
        model_source = p.get_datasource()
        model_sample = model_source.sample(full_sample.sample_size(), seed=r + 294)
        Y = model_sample.data()
        stacked = np.vstack((X, Y))

        # Median heuristic computed separately on each coordinate.
        meds = np.zeros(n_dims)
        for dim in range(n_dims):
            meds[dim] = util.meddistance(stacked[:, [dim]], subsample=1000)

        # Candidate kernels: the per-dimension squared medians scaled by
        # 20 factors from 2**-4 to 2**4.
        med_factors = 2.0 ** np.linspace(-4, 4, 20)
        candidate_kernels = [
            kernel.KDiagGauss((meds**2) * factor) for factor in med_factors
        ]

        tester = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=r + 56)
        outcome = tester.perform_test(
            full_sample,
            candidate_kernels=candidate_kernels,
            tr_proportion=tr_proportion,
            reg=1e-3,
        )
    # The timer value is read only after the timed block has been left.
    return {"test_result": outcome, "time_secs": timer.secs}
def test_optimized_fssd(self):
    """
    Test the FSSD test with parameter optimization.

    The model is a zero-mean isotropic normal and the data come from a
    normal whose mean is shifted by 4, so H0 is expected to be rejected.
    Run for d in {1, 3} and J in {1, 4}.
    """
    seed = 4
    # sample size
    n = 179
    alpha = 0.01
    for dim in (1, 3):
        mean = np.zeros(dim)
        variance = 1.0
        p = density.IsotropicNormal(mean, variance)

        # Mean difference. obvious reject
        ds = data.DSIsotropicNormal(mean + 4, variance + 0)
        dat = ds.sample(n, seed=seed)

        for n_locs in (1, 4):
            opts = {"reg": 1e-2, "max_iter": 10, "tol_fun": 1e-3, "disp": False}

            # Optimize on the training part, test on the held-out part.
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)
            Xtr = tr.X
            initial_gwidth = util.meddistance(Xtr, subsample=1000) ** 2

            # random initial test locations
            initial_locs = util.fit_gaussian_draw(Xtr, n_locs, seed=seed + 1)
            V_opt, gw_opt, _ = gof.GaussFSSD.optimize_locs_widths(
                p, tr, initial_gwidth, initial_locs, **opts
            )

            # construct a test from the optimized parameters
            optimized_kernel = kernel.KGauss(gw_opt)
            null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
            fssd_opt = gof.FSSD(
                p, optimized_kernel, V_opt, null_sim=null_sim, alpha=alpha
            )
            outcome = fssd_opt.perform_test(te, return_simulated_stats=True)
            assert outcome["h0_rejected"]
def ksd_gaussian_kernel(
    log_prob_grad_fn: Callable,
    samples: torch.Tensor,
    sig2: Optional[float] = None,
    seed: int = 101,
) -> torch.Tensor:
    """KSD test statistic with a Gaussian kernel, computed with the `kgof` package

    Args:
        log_prob_grad_fn: Function returning the gradient of the log
            probability. It receives torch.Tensors as inputs and should
            output a torch.Tensor as well.
        samples: Samples for the test as a torch.Tensor
        sig2: Squared bandwidth of the Gaussian kernel. If None, the squared
            median distance of the samples is used (median heuristic).
        seed: Base seed; the test itself is run with `seed + 1`.

    Returns:
        The test statistic (`test_result["test_stat"]`) as a torch.Tensor.
        Whether H0 was rejected is not returned.
    """
    wrapped_density = UnnormalizedDensityWrapped(log_prob_grad=log_prob_grad_fn)
    wrapped_samples = DataWrapped(samples)

    # Fall back to the median heuristic when no bandwidth was given.
    if sig2 is not None:
        bandwidth = float(sig2)
    else:
        bandwidth = meddistance(wrapped_samples.data(), subsample=1000) ** 2

    stein_test = KernelSteinTest(
        wrapped_density,
        KGauss(bandwidth),
        bootstrapper=bootstrapper_rademacher,
        alpha=0.01,
        n_simulate=500,
        seed=seed + 1,
    )
    outcome = stein_test.perform_test(
        wrapped_samples, return_simulated_stats=False, return_ustat_gram=False
    )

    # log.info(f"H0 rejected: {test_result['h0_rejected']}")
    # TODO: normalize by sample size?
    return torch.tensor(outcome["test_stat"])
def test_basic(self):
    """
    Basic sanity check of the FSSD test: the p-value must lie in [0, 1].

    The data have the model's mean but a larger variance.
    Run for d in {1, 4} and J in {1, 3}.
    """
    seed = 12
    # sample size
    n = 100
    alpha = 0.01
    for dim in (1, 4):
        model_mean = np.zeros(dim)
        model_variance = 1
        isonorm = density.IsotropicNormal(model_mean, model_variance)

        # The mean is left unshifted; only the variance differs.
        sample_mean = model_mean + 0
        sample_variance = model_variance + 1
        X = util.randn(n, dim, seed=seed) * np.sqrt(sample_variance) + sample_mean
        dat = data.Data(X)

        for n_locs in (1, 3):
            # Gaussian kernel with the median heuristic.
            sig2 = util.meddistance(X, subsample=1000) ** 2
            gauss_kernel = kernel.KGauss(sig2)

            # random test locations
            V = util.fit_gaussian_draw(X, n_locs, seed=seed + 1)
            null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
            fssd = gof.FSSD(isonorm, gauss_kernel, V, null_sim=null_sim, alpha=alpha)

            outcome = fssd.perform_test(dat, return_simulated_stats=True)

            # assertions
            pvalue = outcome["pvalue"]
            self.assertGreaterEqual(pvalue, 0)
            self.assertLessEqual(outcome["pvalue"], 1)
def optimize_locs_widths(
    p,
    dat,
    gwidth0,
    test_locs0,
    reg=1e-2,
    max_iter=100,
    tol_fun=1e-5,
    disp=False,
    locs_bounds_frac=100,
    gwidth_lb=None,
    gwidth_ub=None,
    use_2terms=False,
):
    """
    Optimize the test locations and the Gaussian kernel width by
    maximizing a test power criterion (GaussFSSD.power_criterion), using
    bounded L-BFGS-B. dat should not be the same data as used in the actual
    test (i.e., should be a held-out set).

    - p: the model density, passed to GaussFSSD.power_criterion
    - dat: a Data object
    - gwidth0: initial value of the Gaussian width^2
    - test_locs0: Jxd numpy array. Initial V.
    - reg: reg to add to the mean/sqrt(variance) criterion to become
        mean/sqrt(variance + reg)
    - max_iter: maximum number of optimizer iterations
    - tol_fun: termination tolerance of the objective value
    - disp: True to print convergence messages
    - locs_bounds_frac: When making box bounds for the test_locs, extend
        the box defined by coordinate-wise min-max by std of each
        coordinate multiplied by this number.
    - gwidth_lb: absolute lower bound on the Gaussian width^2. If None,
        max(1e-2 * med2, 1e-3) is used, where med2 is the squared median
        distance of the data.
    - gwidth_ub: absolute upper bound on the Gaussian width^2. If None,
        min(1e2 * med2, 1e5) is used.
    - use_2terms: If True, then besides the signal-to-noise ratio
        criterion, the objective function will also include the first term
        that is dropped.

    Return (V test_locs, gaussian width, optimization info log).
    The info log is the optimizer result converted to a dict, with the
    optimization time added under "time_secs".
    """
    J = test_locs0.shape[0]
    X = dat.data()
    # Only the dimension is needed; unpacking still requires X to be 2-D.
    _, d = X.shape

    # Parameterize the Gaussian width with its square root (then square later)
    # to automatically enforce the positivity.
    def obj(sqrt_gwidth, V):
        # Negated because the optimizer minimizes.
        return -GaussFSSD.power_criterion(
            p, dat, sqrt_gwidth ** 2, V, reg=reg, use_2terms=use_2terms
        )

    def flatten(gwidth, V):
        # Pack the width and the J x d locations into one flat vector.
        return np.hstack((gwidth, V.reshape(-1)))

    def unflatten(x):
        # Inverse of flatten(): first entry is the width, the rest is V.
        sqrt_gwidth = x[0]
        V = np.reshape(x[1:], (J, d))
        return sqrt_gwidth, V

    def flat_obj(x):
        sqrt_gwidth, V = unflatten(x)
        return obj(sqrt_gwidth, V)

    # Initial point
    x0 = flatten(np.sqrt(gwidth0), test_locs0)

    # make sure that the optimized gwidth is not too small or too large.
    fac_min = 1e-2
    fac_max = 1e2
    med2 = util.meddistance(X, subsample=1000) ** 2
    if gwidth_lb is None:
        gwidth_lb = max(fac_min * med2, 1e-3)
    if gwidth_ub is None:
        gwidth_ub = min(fac_max * med2, 1e5)

    # Make a box to bound test locations
    X_std = np.std(X, axis=0)
    # X_min: length-d array
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)
    # V_lb: J x d
    V_lb = np.tile(X_min - locs_bounds_frac * X_std, (J, 1))
    V_ub = np.tile(X_max + locs_bounds_frac * X_std, (J, 1))
    # (J*d+1) x 2. Take square root because we parameterize with the square
    # root
    x0_lb = np.hstack((np.sqrt(gwidth_lb), np.reshape(V_lb, -1)))
    x0_ub = np.hstack((np.sqrt(gwidth_ub), np.reshape(V_ub, -1)))
    x0_bounds = list(zip(x0_lb, x0_ub))

    # optimize. Time the optimization as well.
    # https://docs.scipy.org/doc/scipy/reference/optimize.minimize-lbfgsb.html
    grad_obj = autograd.elementwise_grad(flat_obj)
    with util.ContextTimer() as timer:
        opt_result = scipy.optimize.minimize(
            flat_obj,
            x0,
            method="L-BFGS-B",
            bounds=x0_bounds,
            tol=tol_fun,
            options={
                "maxiter": max_iter,
                "ftol": tol_fun,
                "disp": disp,
                "gtol": 1.0e-07,
            },
            jac=grad_obj,
        )

    opt_result = dict(opt_result)
    opt_result["time_secs"] = timer.secs
    x_opt = opt_result["x"]
    sq_gw_opt, V_opt = unflatten(x_opt)
    # Undo the square-root parameterization.
    gw_opt = sq_gw_opt ** 2

    assert util.is_real_num(gw_opt), "gw_opt is not real. Was %s" % str(gw_opt)

    return V_opt, gw_opt, opt_result