def optimize_auto_init(p, dat, J, **ops):
    """
    Optimize parameters by calling optimize_locs_widths(). Automatically
    initialize the test locations and the Gaussian width.

    p: the model density to test against
    dat: a Data instance holding the observed sample
    J: number of test locations to use (must be positive)
    ops: keyword arguments forwarded to GaussFSSD.optimize_locs_widths()

    Return optimized locations, Gaussian width, optimization info
    """
    assert J > 0
    # Initialize the Gaussian width with a grid search over multiples of
    # the median-heuristic squared distance.
    X = dat.data()
    n_gwidth_cand = 5
    gwidth_factors = 2.0**np.linspace(-3, 3, n_gwidth_cand)
    med2 = util.meddistance(X, 1000)**2

    # Fit a Gaussian to the data and draw J points from it to initialize
    # the test locations V0.
    V0 = util.fit_gaussian_draw(X, J, seed=829, reg=1e-6)
    list_gwidth = np.hstack((med2 * gwidth_factors))
    besti, objs = GaussFSSD.grid_search_gwidth(p, dat, V0, list_gwidth)
    gwidth = list_gwidth[besti]
    assert util.is_real_num(
        gwidth), 'gwidth not real. Was %s' % str(gwidth)
    assert gwidth > 0, 'gwidth not positive. Was %.3g' % gwidth
    logging.info('After grid search, gwidth=%.3g' % gwidth)

    # Jointly optimize the test locations and the Gaussian width,
    # starting from the grid-search initialization.
    V_opt, gwidth_opt, info = GaussFSSD.optimize_locs_widths(
        p, dat, gwidth, V0, **ops)
    return V_opt, gwidth_opt, info
def grid_search_gwidth(p, dat, test_locs, list_gwidth):
    """
    Linear search over the candidate Gaussian widths for the one that
    maximizes the test power criterion, keeping the test locations fixed.

    - test_locs: a J x dx np-array of J test locations

    return: (best width index, list of test power objectives)
    """
    candidate_kernels = [kernel.KGauss(width) for width in list_gwidth]
    return FSSD.fssd_grid_search_kernel(
        p, dat, test_locs, candidate_kernels)
def perform_test(self, dat, candidate_kernels=None, return_mmdtest=False, tr_proportion=0.2, reg=1e-3):
    """
    Run a two-sample MMD goodness-of-fit test of self.p against dat.

    dat: an instance of Data
    candidate_kernels: a list of Kernel's to choose from; if None, a set
        of Gaussian kernels around the median heuristic is used
    return_mmdtest: if True, include the fitted QuadMMDTest in the results
    tr_proportion: proportion of sample used for choosing the best kernel
    reg: regularization parameter for the test power criterion

    Return a dictionary of test results (with timing in 'time_secs').
    """
    with util.ContextTimer() as timer:
        seed = self.seed
        p = self.p

        # Draw a sample from the model p of the same size as dat.
        ds = p.get_datasource()
        p_sample = ds.sample(dat.sample_size(), seed=seed + 77)

        # Split both samples into training (kernel selection) and
        # test portions. ytr, yte are of type data.Data.
        xtr, xte = p_sample.split_tr_te(
            tr_proportion=tr_proportion, seed=seed + 18)
        ytr, yte = dat.split_tr_te(
            tr_proportion=tr_proportion, seed=seed + 12)

        # Pack into two-sample-test data objects.
        tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
        te_tst_data = fdata.TSTData(xte.data(), yte.data())

        if candidate_kernels is None:
            # Assume a Gaussian kernel. Build candidates from multiples
            # of the median heuristic.
            med = util.meddistance(tr_tst_data.stack_xy(), 1000)
            list_gwidth = np.hstack(
                ((med**2) * (2.0**np.linspace(-4, 4, 10))))
            list_gwidth.sort()
            candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

        # Grid search on the training split to pick the best kernel.
        alpha = self.alpha
        besti, powers = tst.QuadMMDTest.grid_search_kernel(
            tr_tst_data, candidate_kernels, alpha, reg=reg)

        # Run the actual test on the held-out split.
        best_ker = candidate_kernels[besti]
        mmdtest = tst.QuadMMDTest(best_ker, self.n_permute, alpha=alpha)
        results = mmdtest.perform_test(te_tst_data)
        if return_mmdtest:
            results['mmdtest'] = mmdtest

    results['time_secs'] = timer.secs
    return results
def power_criterion(p, dat, gwidth, test_locs, reg=1e-2, use_2terms=False):
    """
    Evaluate the FSSD test power criterion with a Gaussian kernel of
    width gwidth at the given test locations.

    use_2terms: True if the objective should include the first term in
        the power expression. This term carries the test threshold and is
        difficult to compute (depends on the optimized test locations).
        If True, then the objective will be
        -1/(n**0.5*sigma_H1) + n**0.5 FSSD^2/sigma_H1,
        which ignores the test threshold in the first term.
    """
    gauss_kernel = kernel.KGauss(gwidth)
    obj = FSSD.power_criterion(
        p, dat, gauss_kernel, test_locs, reg, use_2terms=use_2terms)
    return obj
def __init__(self, p, sigma2, V, alpha=0.01, n_simulate=3000, seed=10):
    """
    Construct a GaussFSSD: an FSSD test with a Gaussian kernel of width
    sigma2, using the covariance-based null simulator.

    p: the model density
    sigma2: Gaussian kernel width (squared bandwidth)
    V: J x dx np-array of test locations
    alpha: significance level
    n_simulate: number of null simulations
    seed: random seed for the null simulator
    """
    gauss_kernel = kernel.KGauss(sigma2)
    h0_simulator = FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed)
    super(GaussFSSD, self).__init__(p, gauss_kernel, V, h0_simulator, alpha)
# Script fragment: test the model p against a sample drawn from a shifted
# Gaussian using the quadratic-time MMD goodness-of-fit test, then plot
# the simulated (permutation) null distribution against the observed stat.
# NOTE(review): mean, variance, n, seed, alpha appear to be defined
# earlier in the file — confirm before running this fragment standalone.
p = density.IsotropicNormal(mean, variance)
q_mean = mean.copy()
q_variance = variance
# q_mean[0] = 1

# Data source: an isotropic Gaussian whose mean is shifted by 1 in every
# coordinate, so H0 (data follows p) should be rejected for large n.
ds = data.DSIsotropicNormal(q_mean + 1, q_variance)
# q_means = np.array([ [0], [0]])
# q_variances = np.array([0.01, 1])
# ds = data.DSIsoGaussianMixture(q_means, q_variances, pmix=[0.2, 0.8])

# Test
dat = ds.sample(n, seed=seed + 2)
X = dat.data()

# Use median heuristic to determine the Gaussian kernel width
sig2 = util.meddistance(X, subsample=1000)**2
k = ker.KGauss(sig2)

mmd_test = mgof.QuadMMDGof(p, k, n_permute=300, alpha=alpha, seed=seed)
mmd_result = mmd_test.perform_test(dat)
# NOTE(review): bare expression below is a notebook display leftover;
# it has no effect when run as a script.
mmd_result
print('Reject H0?: {0}'.format(mmd_result['h0_rejected']))

# Plot the permutation null distribution (normalized to a probability
# histogram) with the observed test statistic marked at y=0.
sim_stats = mmd_result['list_permuted_mmd2']
stat = mmd_result['test_stat']
unif_weights = np.ones_like(sim_stats) / float(len(sim_stats))
plt.hist(sim_stats, label='Simulated', weights=unif_weights)
plt.plot([stat, stat], [0, 0], 'r*', markersize=30, label='Stat')
plt.legend(loc='best')