def _get_metest_opt(self, dat, op=None):
    seed = self.seed
    if op is None:
        op = {
            'n_test_locs': self.n_locs,
            'seed': seed + 5,
            'max_iter': 100,
            'batch_proportion': 1.0,
            'locs_step_size': 1.0,
            'gwidth_step_size': 0.1,
            'tol_fun': 1e-4,
            'reg': 1e-6,
        }
    alpha = self.alpha
    p = self.p
    # Draw a sample from p. The number of points to draw is the same as in dat.
    ds = p.get_datasource()
    p_sample = ds.sample(dat.sample_size(), seed=seed)
    xtr, xte = p_sample.split_tr_te(tr_proportion=self.tr_proportion,
                                    seed=seed + 18)
    # ytr, yte are of type data.Data
    ytr, yte = dat.split_tr_te(tr_proportion=self.tr_proportion, seed=seed + 12)

    # training and test data
    tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
    te_tst_data = fdata.TSTData(xte.data(), yte.data())

    # Train the ME test
    V_opt, gw2_opt, _ = tst.MeanEmbeddingTest.optimize_locs_width(
        tr_tst_data, alpha, **op)
    metest = tst.MeanEmbeddingTest(V_opt, gw2_opt, alpha)
    return metest, tr_tst_data, te_tst_data

def get_H1_mean_variance(self, dat, return_variance=True):
    """
    Return the mean and variance under H1 of the test statistic
    = \sqrt{n}(UME(P, R)^2 - UME(Q, R)^2).
    The estimator of the mean is unbiased (can be negative).
    The variance is also valid under H0.

    :returns: (mean, variance)

    If return_variance is False,
    :returns: mean
    """
    umep = self.umep
    umeq = self.umeq
    # form a two-sample test dataset between datap and dat (data from R)
    Z = dat.data()
    datapr = tstdata.TSTData(self.datap.data(), Z)
    dataqr = tstdata.TSTData(self.dataq.data(), Z)

    # get the feature matrices (correlated)
    fea_pr = umep.feature_matrix(datapr)  # n x Jp
    fea_qr = umeq.feature_matrix(dataqr)  # n x Jq
    assert fea_pr.shape[1] == self.V.shape[0]
    assert fea_qr.shape[1] == self.W.shape[0]

    # umehp = ume_hat(p, r)
    umehp, var_pr = tst.UMETest.ustat_h1_mean_variance(
        fea_pr, return_variance=True, use_unbiased=True)
    umehq, var_qr = tst.UMETest.ustat_h1_mean_variance(
        fea_qr, return_variance=True, use_unbiased=True)

    if var_pr <= 0:
        log.l().warning('Non-positive var_pr detected. Was {}'.format(var_pr))
    if var_qr <= 0:
        log.l().warning('Non-positive var_qr detected. Was {}'.format(var_qr))
    #assert var_pr > 0, 'var_pr was {}'.format(var_pr)
    #assert var_qr > 0, 'var_qr was {}'.format(var_qr)

    mean_h1 = umehp - umehq
    if not return_variance:
        return mean_h1

    # mean features
    mean_pr = np.mean(fea_pr, axis=0)
    mean_qr = np.mean(fea_qr, axis=0)
    t1 = 4.0 * np.mean(np.dot(fea_pr, mean_pr) * np.dot(fea_qr, mean_qr))
    t2 = 4.0 * np.sum(mean_pr**2) * np.sum(mean_qr**2)

    # compute the cross-covariance
    var_pqr = t1 - t2
    var_h1 = var_pr - 2.0 * var_pqr + var_qr
    return mean_h1, var_h1

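# Added note (interpretation of the last few lines above, not from the original
# code): var_h1 follows the standard decomposition for the difference of two
# correlated estimators,
#   Var(umehp - umehq) = Var(umehp) + Var(umehq) - 2*Cov(umehp, umehq),
# where var_pqr = t1 - t2 estimates the cross-covariance term that arises
# because both feature matrices are built from the same sample Z from R.
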
def perform_test(self, dat, candidate_kernels=None, return_mmdtest=False,
                 tr_proportion=0.2, reg=1e-3):
    """
    dat: an instance of Data
    candidate_kernels: a list of Kernel's to choose from
    tr_proportion: proportion of the sample used for choosing the best kernel
    reg: regularization parameter for the test power criterion
    """
    with util.ContextTimer() as t:
        seed = self.seed
        p = self.p
        ds = p.get_datasource()
        p_sample = ds.sample(dat.sample_size(), seed=seed + 77)
        xtr, xte = p_sample.split_tr_te(tr_proportion=tr_proportion,
                                        seed=seed + 18)
        # ytr, yte are of type data.Data
        ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion, seed=seed + 12)

        # training and test data
        tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
        te_tst_data = fdata.TSTData(xte.data(), yte.data())

        if candidate_kernels is None:
            # Assume a Gaussian kernel. Construct a list of
            # kernels to try based on multiples of the median heuristic.
            med = util.meddistance(tr_tst_data.stack_xy(), 1000)
            list_gwidth = np.hstack(((med**2) * (2.0**np.linspace(-4, 4, 10))))
            list_gwidth.sort()
            candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

        alpha = self.alpha

        # grid search to choose the best Gaussian width
        besti, powers = tst.QuadMMDTest.grid_search_kernel(
            tr_tst_data, candidate_kernels, alpha, reg=reg)
        # perform the test with the selected kernel
        best_ker = candidate_kernels[besti]
        mmdtest = tst.QuadMMDTest(best_ker, self.n_permute, alpha=alpha)
        results = mmdtest.perform_test(te_tst_data)
        if return_mmdtest:
            results["mmdtest"] = mmdtest

    results["time_secs"] = t.secs
    return results

def TST_ME(Fea, N1, alpha, is_train, test_locs, gwidth, J=1, seed=15):
    """Run the ME test."""
    Fea = get_item(Fea, is_cuda)
    tst_data = data.TSTData(Fea[0:N1, :], Fea[N1:, :])
    h = 0
    if is_train:
        op = {
            'n_test_locs': J,         # number of test locations to optimize
            'max_iter': 300,          # maximum number of gradient ascent iterations
            'locs_step_size': 1.0,    # step size for the test locations (features)
            'gwidth_step_size': 0.1,  # step size for the Gaussian width
            'tol_fun': 1e-4,          # stop if the objective does not increase more than this
            'seed': seed + 5,         # random seed
        }
        test_locs, gwidth, info = tst.MeanEmbeddingTest.optimize_locs_width(
            tst_data, alpha, **op)
        return test_locs, gwidth
    else:
        met_opt = tst.MeanEmbeddingTest(test_locs, gwidth, alpha)
        test_result = met_opt.perform_test(tst_data)
        if test_result['h0_rejected']:
            h = 1
        return h

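# Hedged usage sketch for TST_ME (illustrative only; names such as Fea_tr,
# Fea_te, N1_tr, N1_te are not from the original code). `Fea` is assumed to
# stack the two samples row-wise: the first N1 rows from P and the rest from Q.
#
#   # training phase: optimize test locations and Gaussian width
#   test_locs, gwidth = TST_ME(Fea_tr, N1_tr, alpha=0.05, is_train=True,
#                              test_locs=None, gwidth=None, J=5, seed=15)
#   # testing phase: returns 1 if H0 (same distribution) is rejected, else 0
#   h = TST_ME(Fea_te, N1_te, alpha=0.05, is_train=False,
#              test_locs=test_locs, gwidth=gwidth)
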
def perform_test(self, X, Y):
    import freqopttest.data as fdata
    ds_p = self.ds_p
    mmdtest = self.mmdtest
    seed = self.seed

    with util.ContextTimer() as t:
        # split the data into two halves
        X1, Y1, X2, Y2 = MMDSplitTest._split_half(X, Y, seed=self.seed + 330)
        # Draw a sample from p conditioned on X2
        Y2_ = ds_p.cond_pair_sample(X2, seed=seed + 13)
        real_data = torch.cat([X1, Y1], dim=1).numpy()
        model_data = torch.cat([X2, Y2_], dim=1).numpy()

        # Run the two-sample test on the real sample and the model sample.
        # Make a two-sample test dataset
        tst_data = fdata.TSTData(real_data, model_data)
        # Test
        results = mmdtest.perform_test(tst_data)

    results['time_secs'] = t.secs
    return results

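# Added note (interpretive, not from the original code): the half-split keeps
# the real pairs (X1, Y1) and the model-sampled pairs (X2, Y2_) on disjoint
# halves of the data, so the two samples passed to the MMD two-sample test
# share no X's; Y2_ is drawn from the model's conditional distribution given
# the held-out covariates X2.
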
def compute_stat(self, dat):
    mmdtest = self.mmdtest
    p = self.p
    # Draw a sample from p. The number of points to draw is the same as in dat.
    ds = p.get_datasource()
    p_sample = ds.sample(dat.sample_size(), seed=self.seed)

    # Make a two-sample test dataset
    tst_data = fdata.TSTData(p_sample.data(), dat.data())
    s = mmdtest.compute_stat(tst_data)
    return s

def mmd(p, q, alpha=0.05):
    if p.ndim == 1:
        p = p[:, np.newaxis]
    if q.ndim == 1:
        q = q[:, np.newaxis]
    d = data.TSTData(p, q)
    d_tr, d_te = d.split_tr_te(tr_proportion=0.5)
    med = util.meddistance(d_tr.stack_xy())
    widths = [(med * f) for f in 2.0**np.linspace(-1, 4, 20)]
    list_kernels = [kernel.KGauss(w**2) for w in widths]
    besti, powers = tst.LinearMMDTest.grid_search_kernel(
        d_tr, list_kernels, alpha)
    best_ker = list_kernels[besti]
    lin_mmd_test = tst.LinearMMDTest(best_ker, alpha)
    r = lin_mmd_test.perform_test(d_te)
    return r['test_stat'], r['pvalue']

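# A minimal usage sketch (not part of the original code). The sample sizes and
# the mean shift are illustrative; mmd() only needs two numpy arrays plus the
# module-level freqopttest imports (data, util, kernel, tst) it already uses.
def _demo_mmd(seed=0):
    rng = np.random.RandomState(seed)
    x = rng.randn(500)            # sample from P
    y = rng.randn(500) + 0.5      # sample from Q, mean-shifted
    stat, pval = mmd(x, y, alpha=0.05)
    return stat, pval
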
def TST_SCF(Fea, N1, alpha, is_train, test_freqs, gwidth, J=1, seed=15):
    """Run the SCF test."""
    Fea = get_item(Fea, is_cuda)
    tst_data = data.TSTData(Fea[0:N1, :], Fea[N1:, :])
    h = 0
    if is_train:
        op = {
            'n_test_freqs': J,
            'seed': seed,
            'max_iter': 300,
            'batch_proportion': 1.0,
            'freqs_step_size': 0.1,
            'gwidth_step_size': 0.01,
            'tol_fun': 1e-4,
        }
        test_freqs, gwidth, info = tst.SmoothCFTest.optimize_freqs_width(
            tst_data, alpha, **op)
        return test_freqs, gwidth
    else:
        scf_opt = tst.SmoothCFTest(test_freqs, gwidth, alpha=alpha)
        test_result = scf_opt.perform_test(tst_data)
        if test_result['h0_rejected']:
            h = 1
        return h

def load_nips_TSTData(fname):
    if fname in cache_loaded:
        return cache_loaded[fname]
    fpath = glo.data_file(fname)
    # pickle files must be opened in binary mode
    with open(fpath, 'rb') as f:
        loaded = pickle.load(f)
    X = loaded['P']
    Y = loaded['Q']
    n_min = min(X.shape[0], Y.shape[0])
    X = X[:n_min, :]
    Y = Y[:n_min, :]
    assert X.shape[0] == Y.shape[0]
    tst_data = data.TSTData(X, Y)
    cache_loaded[fname] = (tst_data, n_min)
    return tst_data, n_min

def compute_stat(self, X, Y):
    """
    X: Torch tensor of size n x dx
    Y: Torch tensor of size n x dy

    Return a test statistic.
    """
    import freqopttest.data as fdata
    seed = self.seed
    ds_p = self.ds_p
    mmdtest = self.mmdtest

    # Draw a sample from p conditioned on X
    Y_ = ds_p.cond_pair_sample(X, seed=seed + 13)
    real_data = torch.cat([X, Y], dim=1).numpy()
    model_data = torch.cat([X, Y_], dim=1).numpy()

    # Make a two-sample test dataset
    tst_data = fdata.TSTData(real_data, model_data)
    stat = mmdtest.compute_stat(tst_data)
    return stat

def perform_test(self, dat):
    """
    dat: an instance of Data
    """
    with util.ContextTimer() as t:
        seed = self.seed
        mmdtest = self.mmdtest
        p = self.p

        # Draw a sample from p. The number of points to draw is the same as in dat.
        ds = p.get_datasource()
        p_sample = ds.sample(dat.sample_size(), seed=seed + 12)

        # Run the two-sample test on p_sample and dat.
        # Make a two-sample test dataset
        tst_data = fdata.TSTData(p_sample.data(), dat.data())
        # Test
        results = mmdtest.perform_test(tst_data)

    results['time_secs'] = t.secs
    return results

def wtest(p, q, alpha=0.05):
    op = {
        'n_test_locs': 2,
        'seed': 0,
        'max_iter': 200,
        'batch_proportion': 1.0,
        'locs_step_size': 1.0,
        'gwidth_step_size': 0.1,
        'tol_fun': 1e-4,
    }
    if p.ndim == 1:
        p = p[:, np.newaxis]
    if q.ndim == 1:
        q = q[:, np.newaxis]
    d = data.TSTData(p, q)
    d_tr, d_te = d.split_tr_te(tr_proportion=0.5)
    test_locs, gwidth, info = tst.MeanEmbeddingTest.optimize_locs_width(
        d_tr, alpha, **op)
    met_opt = tst.MeanEmbeddingTest(test_locs, gwidth, alpha)
    r = met_opt.perform_test(d_te)
    if r['test_stat'] == -1:
        r['test_stat'] = np.nan
        r['pvalue'] = np.nan
    return r['test_stat'], r['pvalue']

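# A minimal usage sketch (not part of the original code; the samples are
# illustrative 1-d draws, which wtest() reshapes to column vectors itself).
def _demo_wtest(seed=0):
    rng = np.random.RandomState(seed)
    p = rng.randn(400)            # sample from P
    q = rng.randn(400) + 1.0      # sample from Q, mean-shifted
    stat, pval = wtest(p, q, alpha=0.05)
    return stat, pval
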
def compute_stat(self, X, Y):
    """
    X: Torch tensor of size n x dx
    Y: Torch tensor of size n x dy

    Return a test statistic.
    """
    import freqopttest.data as fdata
    seed = self.seed
    ds_p = self.ds_p
    mmdtest = self.mmdtest

    # split the data into two halves
    X1, Y1, X2, Y2 = MMDSplitTest._split_half(X, Y, seed=self.seed + 330)
    # Draw a sample from p conditioned on X2
    Y2_ = ds_p.cond_pair_sample(X2, seed=seed + 13)
    real_data = torch.cat([X1, Y1], dim=1).numpy()
    model_data = torch.cat([X2, Y2_], dim=1).numpy()

    # Make a two-sample test dataset
    tst_data = fdata.TSTData(real_data, model_data)
    stat = mmdtest.compute_stat(tst_data)
    return stat

def preprocess(self, X, Y):
    if len(X.shape) > 2:
        X = X.reshape(len(X), -1)
        Y = Y.reshape(len(Y), -1)
    XY = fot_data.TSTData(X, Y)
    return XY

def optimize_2sets_locs_widths(datap, dataq, datar, V0, W0, gwidth0p,
                               gwidth0q, reg=1e-3, max_iter=100, tol_fun=1e-6,
                               disp=False, locs_bounds_frac=100,
                               gwidth_lb=None, gwidth_ub=None):
    """
    Optimize two sets of test locations and the Gaussian kernel widths by
    maximizing the test power criterion of the UME two-sample test (not the
    three-sample test). Briefly,

    1. Optimize the set V of test locations for UME(P, R) by maximizing its
       two-sample test power criterion.
    2. Optimize the set W for UME(Q, R) in the same way.

    The two optimization problems are independent. The only dependency is the
    data from R. This optimization function is deterministic.

    - datap: a kgof.data.Data from P (model 1)
    - dataq: a kgof.data.Data from Q (model 2)
    - datar: a kgof.data.Data from R (data generating distribution)
    - V0: a Jp x d numpy array. Initial V.
    - W0: a Jq x d numpy array. Initial W.
    - gwidth0p: initial value of the Gaussian width^2 for UME(P, R)
    - gwidth0q: initial value of the Gaussian width^2 for UME(Q, R)
    - reg: regularization added to the mean/sqrt(variance) criterion so that
      it becomes mean/sqrt(variance + reg)
    - max_iter: number of gradient descent iterations
    - tol_fun: termination tolerance of the objective value
    - disp: True to print convergence messages
    - locs_bounds_frac: when making box bounds for the test_locs, extend the
      box defined by the coordinate-wise min-max by the standard deviation of
      each coordinate (of the aggregated data) multiplied by this number
    - gwidth_lb: absolute lower bound on both Gaussian width^2's
    - gwidth_ub: absolute upper bound on both Gaussian width^2's

    If the lb, ub bounds are None, use a fraction of the median heuristic to
    set the bounds automatically.

    Return (
        (V test_locs, Gaussian width^2 for UME(P, R), optimization info log),
        (W test_locs, Gaussian width^2 for UME(Q, R), optimization info log),
    )
    """
    Z = datar.data()
    datapr = tstdata.TSTData(datap.data(), Z)
    dataqr = tstdata.TSTData(dataq.data(), Z)

    # optimization for UME(P, R)
    V_opt, gw2p_opt, opt_infop = tst.GaussUMETest.optimize_locs_width(
        datapr, V0, gwidth0p, reg=reg, max_iter=max_iter, tol_fun=tol_fun,
        disp=disp, locs_bounds_frac=locs_bounds_frac, gwidth_lb=gwidth_lb,
        gwidth_ub=gwidth_ub)

    # optimization for UME(Q, R)
    W_opt, gw2q_opt, opt_infoq = tst.GaussUMETest.optimize_locs_width(
        dataqr, W0, gwidth0q, reg=reg, max_iter=max_iter, tol_fun=tol_fun,
        disp=disp, locs_bounds_frac=locs_bounds_frac, gwidth_lb=gwidth_lb,
        gwidth_ub=gwidth_ub)

    return ((V_opt, gw2p_opt, opt_infop), (W_opt, gw2q_opt, opt_infoq))
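
# Hedged usage sketch (illustrative; Xp, Xq, Xr, J and the median-heuristic
# call are assumptions, not taken from the original code; assumes kgof.data
# and kgof.util are importable):
#
#   datap, dataq, datar = [kgof.data.Data(X) for X in (Xp, Xq, Xr)]
#   V0 = Xp[:J].copy()      # initial test locations for UME(P, R)
#   W0 = Xq[:J].copy()      # initial test locations for UME(Q, R)
#   gw0 = kgof.util.meddistance(Xr, subsample=1000)**2   # median heuristic width^2
#   (V, gwp, _), (W, gwq, _) = optimize_2sets_locs_widths(
#       datap, dataq, datar, V0, W0, gw0, gw0)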