def perform_test(self, dat, return_simulated_stats=False):
    """
    dat: an instance of Data
    """
    with util.ContextTimer() as t:
        alpha = self.alpha
        null_sim = self.null_sim
        n_simulate = null_sim.n_simulate
        X = dat.data()
        n = X.shape[0]
        J = self.V.shape[0]

        nfssd, fea_tensor = self.compute_stat(dat, return_feature_tensor=True)
        sim_results = null_sim.simulate(self, dat, fea_tensor)
        arr_nfssd = sim_results['sim_stats']

        # approximate the p-value with the simulated null statistics
        pvalue = np.mean(arr_nfssd > nfssd)

    results = {
        'alpha': self.alpha,
        'pvalue': pvalue,
        'test_stat': nfssd,
        'h0_rejected': pvalue < alpha,
        'n_simulate': n_simulate,
        'time_secs': t.secs,
    }
    if return_simulated_stats:
        results['sim_stats'] = arr_nfssd
    return results
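# --- Illustrative sketch (not part of the original source) -----------------
# The p-value above is the fraction of simulated null statistics that exceed
# the observed statistic. A minimal stand-alone illustration with made-up
# numbers (no FSSD objects involved):
import numpy as np

sim_stats_demo = np.array([0.8, 1.5, 2.3, 0.2, 3.1])  # simulated null statistics
observed_demo = 2.0                                    # observed test statistic
pvalue_demo = np.mean(sim_stats_demo > observed_demo)  # -> 0.4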
def perform_test(self,
                 dat,
                 candidate_kernels=None,
                 return_mmdtest=False,
                 tr_proportion=0.2,
                 reg=1e-3):
    """
    dat: an instance of Data
    candidate_kernels: a list of Kernel objects to choose from
    tr_proportion: proportion of the sample used for choosing the best kernel
    reg: regularization parameter for the test power criterion
    """
    with util.ContextTimer() as t:
        seed = self.seed
        p = self.p
        ds = p.get_datasource()
        p_sample = ds.sample(dat.sample_size(), seed=seed + 77)
        xtr, xte = p_sample.split_tr_te(tr_proportion=tr_proportion, seed=seed + 18)
        # ytr, yte are of type data.Data
        ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion, seed=seed + 12)

        # training and test data
        tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
        te_tst_data = fdata.TSTData(xte.data(), yte.data())

        if candidate_kernels is None:
            # Assume a Gaussian kernel. Construct a list of
            # kernels to try based on multiples of the median heuristic.
            med = util.meddistance(tr_tst_data.stack_xy(), 1000)
            list_gwidth = np.hstack(((med**2) * (2.0**np.linspace(-4, 4, 10))))
            list_gwidth.sort()
            candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

        alpha = self.alpha

        # grid search to choose the best Gaussian width
        besti, powers = tst.QuadMMDTest.grid_search_kernel(
            tr_tst_data, candidate_kernels, alpha, reg=reg)
        # perform the test with the selected kernel
        best_ker = candidate_kernels[besti]
        mmdtest = tst.QuadMMDTest(best_ker, self.n_permute, alpha=alpha)
        results = mmdtest.perform_test(te_tst_data)
        if return_mmdtest:
            results['mmdtest'] = mmdtest

    results['time_secs'] = t.secs
    return results
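# --- Illustrative sketch (not part of the original source) -----------------
# The default candidate kernels above are Gaussian kernels whose squared
# widths are the squared median pairwise distance scaled by powers of two in
# [-4, 4]. A plain-numpy illustration with an assumed median distance
# (standing in for util.meddistance / kernel.KGauss, which are not redefined
# here):
import numpy as np

med_demo = 1.7                                             # assumed median distance
list_gwidth_demo = np.sort((med_demo**2) * (2.0**np.linspace(-4, 4, 10)))
# one kernel.KGauss(gw2) would then be constructed per entry of list_gwidth_demo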
def perform_test(self,
                 dat,
                 return_simulated_stats=False,
                 return_ustat_gram=False):
    """
    dat: an instance of Data
    """
    with util.ContextTimer() as t:
        alpha = self.alpha
        n_simulate = self.n_simulate
        X = dat.data()
        n = X.shape[0]

        _, H = self.compute_stat(dat, return_ustat_gram=True)
        test_stat = n * np.mean(H)
        # bootstrapping
        sim_stats = np.zeros(n_simulate)
        with util.NumpySeedContext(seed=self.seed):
            for i in range(n_simulate):
                W = self.bootstrapper(n)
                # n * [ (1/n^2) * \sum_i \sum_j h(x_i, x_j) w_i w_j ]
                boot_stat = W.dot(H.dot(old_div(W, float(n))))
                # This is a bootstrap version of n*V_n
                sim_stats[i] = boot_stat

        # approximate the p-value with the bootstrapped statistics
        pvalue = np.mean(sim_stats > test_stat)

    results = {
        'alpha': self.alpha,
        'pvalue': pvalue,
        'test_stat': test_stat,
        'h0_rejected': pvalue < alpha,
        'n_simulate': n_simulate,
        'time_secs': t.secs,
    }
    if return_simulated_stats:
        results['sim_stats'] = sim_stats
    if return_ustat_gram:
        results['H'] = H
    return results
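# --- Illustrative sketch (not part of the original source) -----------------
# One bootstrap replicate above computes
#     n * (1/n^2) * sum_ij H_ij w_i w_j  =  w^T H w / n,
# where H is the U-statistic Gram matrix and w are multiplier weights drawn by
# self.bootstrapper (e.g. Rademacher weights). Synthetic H and w:
import numpy as np

n_demo = 4
rng_demo = np.random.default_rng(0)
A_demo = rng_demo.normal(size=(n_demo, n_demo))
H_demo = A_demo + A_demo.T                         # symmetric stand-in for the Gram matrix
W_demo = rng_demo.choice([-1.0, 1.0], size=n_demo) # e.g. Rademacher multiplier weights
boot_stat_demo = W_demo.dot(H_demo.dot(W_demo / float(n_demo)))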
def perform_test(self, dat):
    """
    dat: an instance of Data
    """
    with util.ContextTimer() as t:
        seed = self.seed
        mmdtest = self.mmdtest
        p = self.p

        # Draw a sample from p. The number of points to draw equals the
        # sample size of dat.
        ds = p.get_datasource()
        p_sample = ds.sample(dat.sample_size(), seed=seed + 12)

        # Run the two-sample test on p_sample and dat.
        # Make a two-sample test dataset.
        tst_data = fdata.TSTData(p_sample.data(), dat.data())
        # Test
        results = mmdtest.perform_test(tst_data)

    results['time_secs'] = t.secs
    return results
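# --- Illustrative sketch (not part of the original source) -----------------
# The goodness-of-fit problem is reduced to a two-sample problem: a sample is
# drawn from the model p with the same size as the observed data, and both are
# handed to the MMD two-sample test. Synthetic arrays standing in for
# ds.sample(...).data() and dat.data():
import numpy as np

X_demo = np.random.randn(150, 3)                       # observed sample (from q)
model_sample_demo = np.random.randn(X_demo.shape[0], 3)  # same-size sample from the model p
assert model_sample_demo.shape[0] == X_demo.shape[0]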
def perform_test(self, dat):
    """
    dat: an instance of Data
    """
    with util.ContextTimer() as t:
        alpha = self.alpha
        X = dat.data()
        n = X.shape[0]

        # H: length-n vector
        _, H = self.compute_stat(dat, return_pointwise_stats=True)
        test_stat = np.sqrt(old_div(n, 2)) * np.mean(H)
        stat_var = np.mean(H**2)
        pvalue = stats.norm.sf(test_stat, loc=0, scale=np.sqrt(stat_var))

    results = {
        'alpha': self.alpha,
        'pvalue': pvalue,
        'test_stat': test_stat,
        'h0_rejected': pvalue < alpha,
        'time_secs': t.secs,
    }
    return results
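# --- Illustrative sketch (not part of the original source) -----------------
# The p-value above comes from a normal approximation: the statistic
# sqrt(n/2) * mean(H) is compared against N(0, mean(H^2)) via the survival
# function. A stand-alone illustration with synthetic pointwise values H:
import numpy as np
import scipy.stats as stats

n_demo = 200
H_demo = np.random.randn(n_demo) * 0.5 + 0.1      # stand-in for the pointwise stats
test_stat_demo = np.sqrt(n_demo / 2.0) * np.mean(H_demo)
stat_var_demo = np.mean(H_demo**2)
pvalue_demo = stats.norm.sf(test_stat_demo, loc=0, scale=np.sqrt(stat_var_demo))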
def optimize_locs_params(
        p,
        dat,
        b0,
        c0,
        test_locs0,
        reg=1e-2,
        max_iter=100,
        tol_fun=1e-5,
        disp=False,
        locs_bounds_frac=100,
        b_lb=-20.0,
        b_ub=-1e-4,
        c_lb=1e-6,
        c_ub=1e3,
):
    """
    Optimize the test locations and the two parameters (b and c) of the IMQ
    kernel by maximizing the test power criterion.

        k(x,y) = (c^2 + ||x-y||^2)^b

    where c > 0 and b < 0.

    dat should not be the same data as that used in the actual test (i.e., it
    should be a held-out set). This function is deterministic.

    - p: UnnormalizedDensity specifying the problem.
    - b0: initial parameter value for b (in the kernel)
    - c0: initial parameter value for c (in the kernel)
    - dat: a Data object (training set)
    - test_locs0: J x d numpy array. Initial V.
    - reg: reg to add to the mean/sqrt(variance) criterion to become
        mean/sqrt(variance + reg)
    - max_iter: #gradient descent iterations
    - tol_fun: termination tolerance of the objective value
    - disp: True to print convergence messages
    - locs_bounds_frac: When making box bounds for the test_locs, extend
        the box defined by coordinate-wise min-max by std of each
        coordinate multiplied by this number.
    - b_lb: absolute lower bound on b. b is always < 0.
    - b_ub: absolute upper bound on b
    - c_lb: absolute lower bound on c. c is always > 0.
    - c_ub: absolute upper bound on c
    #- If the lb, ub bounds are None

    Return (V test_locs, b, c, optimization info log)
    """
    """
    In the optimization, we will parameterize b with its square root.
    Square back and negate to form b. c is not parameterized in any special
    way since it enters the kernel as c^2. The absolute value of c will be
    taken to make sure it is positive.
    """
    J = test_locs0.shape[0]
    X = dat.data()
    n, d = X.shape

    def obj(sqrt_neg_b, c, V):
        b = -sqrt_neg_b**2
        return -IMQFSSD.power_criterion(p, dat, b, c, V, reg=reg)

    flatten = lambda sqrt_neg_b, c, V: np.hstack((sqrt_neg_b, c, V.reshape(-1)))

    def unflatten(x):
        sqrt_neg_b = x[0]
        c = x[1]
        V = np.reshape(x[2:], (J, d))
        return sqrt_neg_b, c, V

    def flat_obj(x):
        sqrt_neg_b, c, V = unflatten(x)
        return obj(sqrt_neg_b, c, V)

    # gradient
    #grad_obj = autograd.elementwise_grad(flat_obj)

    # Initial point
    b02 = np.sqrt(-b0)
    x0 = flatten(b02, c0, test_locs0)

    # Make a box to bound test locations
    X_std = np.std(X, axis=0)
    # X_min: length-d array
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)
    # V_lb: J x d
    V_lb = np.tile(X_min - locs_bounds_frac * X_std, (J, 1))
    V_ub = np.tile(X_max + locs_bounds_frac * X_std, (J, 1))
    # (J*d+2) x 2. Make sure to bound the reparameterized values (not the original).
    """
    For b, b2 := sqrt(-b)
        lb <= b <= ub < 0 means sqrt(-ub) <= b2 <= sqrt(-lb)
    Note the positions of ub, lb.
    """
    x0_lb = np.hstack((np.sqrt(-b_ub), c_lb, np.reshape(V_lb, -1)))
    x0_ub = np.hstack((np.sqrt(-b_lb), c_ub, np.reshape(V_ub, -1)))
    x0_bounds = list(zip(x0_lb, x0_ub))

    # Optimize. Time the optimization as well.
    # https://docs.scipy.org/doc/scipy/reference/optimize.minimize-lbfgsb.html
    grad_obj = autograd.elementwise_grad(flat_obj)
    with util.ContextTimer() as timer:
        opt_result = scipy.optimize.minimize(
            flat_obj,
            x0,
            method='L-BFGS-B',
            bounds=x0_bounds,
            tol=tol_fun,
            options={
                'maxiter': max_iter,
                'ftol': tol_fun,
                'disp': disp,
                'gtol': 1.0e-06,
            },
            jac=grad_obj,
        )

    opt_result = dict(opt_result)
    opt_result['time_secs'] = timer.secs
    x_opt = opt_result['x']
    sqrt_neg_b, c, V_opt = unflatten(x_opt)
    b = -sqrt_neg_b**2
    assert util.is_real_num(b), 'b is not real. Was {}'.format(b)
    assert b < 0
    assert util.is_real_num(c), 'c is not real. Was {}'.format(c)
    assert c > 0

    return V_opt, b, c, opt_result
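# --- Illustrative sketch (not part of the original source) -----------------
# The IMQ exponent b < 0 is optimized through s = sqrt(-b), so b = -s**2 is
# automatically negative, and the bounds b_lb <= b <= b_ub (both negative)
# become sqrt(-b_ub) <= s <= sqrt(-b_lb) in the reparameterized space:
import numpy as np

b_lb_demo, b_ub_demo = -20.0, -1e-4
s_lb_demo, s_ub_demo = np.sqrt(-b_ub_demo), np.sqrt(-b_lb_demo)
b0_demo = -0.5
s0_demo = np.sqrt(-b0_demo)            # initial point in the reparameterized space
assert s_lb_demo <= s0_demo <= s_ub_demo
assert np.isclose(-s0_demo**2, b0_demo)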
def optimize_locs(p,
                  dat,
                  b,
                  c,
                  test_locs0,
                  reg=1e-5,
                  max_iter=100,
                  tol_fun=1e-5,
                  disp=False,
                  locs_bounds_frac=100):
    """
    Optimize just the test locations by maximizing a test power criterion,
    keeping the kernel parameters b, c fixed to the specified values.

    dat should not be the same data as that used in the actual test (i.e., it
    should be a held-out set). This function is deterministic.

    - p: an UnnormalizedDensity specifying the problem
    - dat: a Data object
    - b, c: kernel parameters of the IMQ kernel. Not optimized.
    - test_locs0: J x d numpy array. Initial V.
    - reg: reg to add to the mean/sqrt(variance) criterion to become
        mean/sqrt(variance + reg)
    - max_iter: #gradient descent iterations
    - tol_fun: termination tolerance of the objective value
    - disp: True to print convergence messages
    - locs_bounds_frac: When making box bounds for the test_locs, extend
        the box defined by coordinate-wise min-max by std of each
        coordinate multiplied by this number.

    Return (V test_locs, optimization info log)
    """
    J = test_locs0.shape[0]
    X = dat.data()
    n, d = X.shape

    def obj(V):
        return -IMQFSSD.power_criterion(p, dat, b, c, V, reg=reg)

    flatten = lambda V: np.reshape(V, -1)

    def unflatten(x):
        V = np.reshape(x, (J, d))
        return V

    def flat_obj(x):
        V = unflatten(x)
        return obj(V)

    # Initial point
    x0 = flatten(test_locs0)

    # Make a box to bound test locations
    X_std = np.std(X, axis=0)
    # X_min: length-d array
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)
    # V_lb: J x d
    V_lb = np.tile(X_min - locs_bounds_frac * X_std, (J, 1))
    V_ub = np.tile(X_max + locs_bounds_frac * X_std, (J, 1))
    # (J*d) x 2
    x0_bounds = list(
        zip(V_lb.reshape(-1)[:, np.newaxis], V_ub.reshape(-1)[:, np.newaxis]))

    # Optimize. Time the optimization as well.
    # https://docs.scipy.org/doc/scipy/reference/optimize.minimize-lbfgsb.html
    grad_obj = autograd.elementwise_grad(flat_obj)
    with util.ContextTimer() as timer:
        opt_result = scipy.optimize.minimize(
            flat_obj,
            x0,
            method='L-BFGS-B',
            bounds=x0_bounds,
            tol=tol_fun,
            options={
                'maxiter': max_iter,
                'ftol': tol_fun,
                'disp': disp,
                'gtol': 1.0e-06,
            },
            jac=grad_obj,
        )

    opt_result = dict(opt_result)
    opt_result['time_secs'] = timer.secs
    x_opt = opt_result['x']
    V_opt = unflatten(x_opt)
    return V_opt, opt_result
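# --- Illustrative sketch (not part of the original source) -----------------
# The J x d location matrix V is optimized as a flat length-(J*d) vector;
# flatten/unflatten above are exact inverses of each other:
import numpy as np

J_demo, d_demo = 3, 2
V0_demo = np.arange(J_demo * d_demo, dtype=float).reshape(J_demo, d_demo)
x0_demo = np.reshape(V0_demo, -1)               # flattened vector passed to L-BFGS-B
V_back_demo = np.reshape(x0_demo, (J_demo, d_demo))  # unflattened inside the objective
assert np.array_equal(V0_demo, V_back_demo)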
def optimize_locs_widths(
        p,
        dat,
        gwidth0,
        test_locs0,
        reg=1e-2,
        max_iter=100,
        tol_fun=1e-5,
        disp=False,
        locs_bounds_frac=100,
        gwidth_lb=None,
        gwidth_ub=None,
        use_2terms=False,
):
    """
    Optimize the test locations and the Gaussian kernel width by
    maximizing a test power criterion.

    dat should not be the same data as that used in the actual test (i.e., it
    should be a held-out set). This function is deterministic.

    - dat: a Data object
    - test_locs0: J x d numpy array. Initial V.
    - reg: reg to add to the mean/sqrt(variance) criterion to become
        mean/sqrt(variance + reg)
    - gwidth0: initial value of the Gaussian width^2
    - max_iter: #gradient descent iterations
    - tol_fun: termination tolerance of the objective value
    - disp: True to print convergence messages
    - locs_bounds_frac: When making box bounds for the test_locs, extend
        the box defined by coordinate-wise min-max by std of each
        coordinate multiplied by this number.
    - gwidth_lb: absolute lower bound on the Gaussian width^2
    - gwidth_ub: absolute upper bound on the Gaussian width^2
    - use_2terms: If True, then besides the signal-to-noise ratio criterion,
        the objective function will also include the first term that is
        dropped.
    #- If the lb, ub bounds are None, use fraction of the median heuristics
    #   to automatically set the bounds.

    Return (V test_locs, gaussian width, optimization info log)
    """
    J = test_locs0.shape[0]
    X = dat.data()
    n, d = X.shape

    # Parameterize the Gaussian width with its square root (then square later)
    # to automatically enforce the positivity.
    def obj(sqrt_gwidth, V):
        return -GaussFSSD.power_criterion(
            p, dat, sqrt_gwidth**2, V, reg=reg, use_2terms=use_2terms)

    flatten = lambda gwidth, V: np.hstack((gwidth, V.reshape(-1)))

    def unflatten(x):
        sqrt_gwidth = x[0]
        V = np.reshape(x[1:], (J, d))
        return sqrt_gwidth, V

    def flat_obj(x):
        sqrt_gwidth, V = unflatten(x)
        return obj(sqrt_gwidth, V)

    # gradient
    #grad_obj = autograd.elementwise_grad(flat_obj)

    # Initial point
    x0 = flatten(np.sqrt(gwidth0), test_locs0)

    # Make sure that the optimized gwidth is not too small or too large.
    fac_min = 1e-2
    fac_max = 1e2
    med2 = util.meddistance(X, subsample=1000)**2
    if gwidth_lb is None:
        gwidth_lb = max(fac_min * med2, 1e-3)
    if gwidth_ub is None:
        gwidth_ub = min(fac_max * med2, 1e5)

    # Make a box to bound test locations
    X_std = np.std(X, axis=0)
    # X_min: length-d array
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)
    # V_lb: J x d
    V_lb = np.tile(X_min - locs_bounds_frac * X_std, (J, 1))
    V_ub = np.tile(X_max + locs_bounds_frac * X_std, (J, 1))
    # (J*d+1) x 2. Take the square root because we parameterize with the
    # square root.
    x0_lb = np.hstack((np.sqrt(gwidth_lb), np.reshape(V_lb, -1)))
    x0_ub = np.hstack((np.sqrt(gwidth_ub), np.reshape(V_ub, -1)))
    x0_bounds = list(zip(x0_lb, x0_ub))

    # Optimize. Time the optimization as well.
    # https://docs.scipy.org/doc/scipy/reference/optimize.minimize-lbfgsb.html
    grad_obj = autograd.elementwise_grad(flat_obj)
    with util.ContextTimer() as timer:
        opt_result = scipy.optimize.minimize(
            flat_obj,
            x0,
            method='L-BFGS-B',
            bounds=x0_bounds,
            tol=tol_fun,
            options={
                'maxiter': max_iter,
                'ftol': tol_fun,
                'disp': disp,
                'gtol': 1.0e-07,
            },
            jac=grad_obj,
        )

    opt_result = dict(opt_result)
    opt_result['time_secs'] = timer.secs
    x_opt = opt_result['x']
    sq_gw_opt, V_opt = unflatten(x_opt)
    gw_opt = sq_gw_opt**2

    assert util.is_real_num(gw_opt), 'gw_opt is not real. Was %s' % str(gw_opt)

    return V_opt, gw_opt, opt_result
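# --- Illustrative sketch (not part of the original source) -----------------
# The squared Gaussian width is optimized through its square root so that
# positivity is automatic, and when no bounds are given they default to
# [1e-2, 1e2] times the squared median distance (clipped to [1e-3, 1e5]).
# Synthetic med2 standing in for util.meddistance(X, subsample=1000)**2:
import numpy as np

med2_demo = 2.5
gwidth_lb_demo = max(1e-2 * med2_demo, 1e-3)
gwidth_ub_demo = min(1e2 * med2_demo, 1e5)
gwidth0_demo = med2_demo
x0_first_demo = np.sqrt(gwidth0_demo)   # first coordinate of the optimization variable
gw_opt_demo = x0_first_demo**2          # squared back after optimization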