Example #1
    def optimize_auto_init(p, dat, J, **ops):
        """
        Optimize parameters by calling optimize_locs_widths(). Automatically 
        initialize the test locations and the Gaussian width.

        Return optimized locations, Gaussian width, optimization info
        """
        assert J > 0
        # Use grid search to initialize the gwidth
        X = dat.data()
        n_gwidth_cand = 5
        gwidth_factors = 2.0**np.linspace(-3, 3, n_gwidth_cand)
        med2 = util.meddistance(X, 1000)**2

        k = kernel.KGauss(med2 * 2)
        # fit a Gaussian to the data and draw to initialize V0
        V0 = util.fit_gaussian_draw(X, J, seed=829, reg=1e-6)
        list_gwidth = np.hstack(((med2) * gwidth_factors))
        besti, objs = GaussFSSD.grid_search_gwidth(p, dat, V0, list_gwidth)
        gwidth = list_gwidth[besti]
        assert util.is_real_num(
            gwidth), 'gwidth not real. Was %s' % str(gwidth)
        assert gwidth > 0, 'gwidth not positive. Was %.3g' % gwidth
        logging.info('After grid search, gwidth=%.3g' % gwidth)

        V_opt, gwidth_opt, info = GaussFSSD.optimize_locs_widths(
            p, dat, gwidth, V0, **ops)

        # set the width bounds
        #fac_min = 5e-2
        #fac_max = 5e3
        #gwidth_lb = fac_min*med2
        #gwidth_ub = fac_max*med2
        #gwidth_opt = max(gwidth_lb, min(gwidth_opt, gwidth_ub))
        return V_opt, gwidth_opt, info
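A minimal usage sketch for the method above, assuming the kgof package layout (kgof.density, kgof.data, kgof.goftest); the toy model and sample size are illustrative, not part of the original:

    import numpy as np
    import kgof.data as data
    import kgof.density as density
    import kgof.goftest as gof

    # Toy problem: model p = N(0, I) in 2 dimensions; data drawn from a
    # slightly shifted normal, so H0 is false.
    p = density.IsotropicNormal(np.zeros(2), 1.0)
    ds = data.DSIsotropicNormal(np.ones(2) * 0.5, 1.0)
    dat = ds.sample(500, seed=4)

    # J=5 test locations; any further keyword options would be forwarded
    # to optimize_locs_widths().
    V_opt, gw_opt, info = gof.GaussFSSD.optimize_auto_init(p, dat, J=5)

The returned V_opt and gw_opt can be passed directly to the GaussFSSD constructor shown in Example #5.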
Example #2
    def grid_search_gwidth(p, dat, test_locs, list_gwidth):
        """
        Linear search for the best Gaussian width in the list that maximizes 
        the test power criterion, fixing the test locations. 

        - test_locs: a J x dx numpy array of J test locations

        return: (best width index, list of test power objectives)
        """
        list_gauss_kernel = [kernel.KGauss(gw) for gw in list_gwidth]
        besti, objs = FSSD.fssd_grid_search_kernel(p, dat, test_locs,
                                                   list_gauss_kernel)
        return besti, objs
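A sketch of calling the search directly, reusing p and dat from the sketch above; the candidate-width grid and the choice of J are assumptions:

    import numpy as np
    import kgof.goftest as gof
    import kgof.util as util

    X = dat.data()
    # Fix J=5 test locations by fitting a Gaussian to the data and drawing from it.
    V0 = util.fit_gaussian_draw(X, 5, seed=829, reg=1e-6)

    # Candidate widths: multiples of the squared median pairwise distance.
    med2 = util.meddistance(X, subsample=1000)**2
    list_gwidth = med2 * (2.0**np.linspace(-3, 3, 5))

    besti, objs = gof.GaussFSSD.grid_search_gwidth(p, dat, V0, list_gwidth)
    gwidth0 = list_gwidth[besti]  # width maximizing the power criterion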
Example #3
    def perform_test(self,
                     dat,
                     candidate_kernels=None,
                     return_mmdtest=False,
                     tr_proportion=0.2,
                     reg=1e-3):
        """
        dat: an instance of Data
        candidate_kernels: a list of Kernel's to choose from
        tr_proportion: proportion of sample to be used to choosing the best
            kernel
        reg: regularization parameter for the test power criterion 
        """
        with util.ContextTimer() as t:
            seed = self.seed
            p = self.p
            ds = p.get_datasource()
            p_sample = ds.sample(dat.sample_size(), seed=seed + 77)
            xtr, xte = p_sample.split_tr_te(tr_proportion=tr_proportion,
                                            seed=seed + 18)
            # ytr, yte are of type data.Data
            ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion,
                                       seed=seed + 12)

            # training and test data
            tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
            te_tst_data = fdata.TSTData(xte.data(), yte.data())

            if candidate_kernels is None:
                # Assume a Gaussian kernel. Construct a list of
                # kernels to try based on multiples of the median heuristic
                med = util.meddistance(tr_tst_data.stack_xy(), 1000)
                list_gwidth = np.hstack(
                    ((med**2) * (2.0**np.linspace(-4, 4, 10))))
                list_gwidth.sort()
                candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

            alpha = self.alpha

            # grid search to choose the best Gaussian width
            besti, powers = tst.QuadMMDTest.grid_search_kernel(
                tr_tst_data, candidate_kernels, alpha, reg=reg)
            # perform test
            best_ker = candidate_kernels[besti]
            mmdtest = tst.QuadMMDTest(best_ker, self.n_permute, alpha=alpha)
            results = mmdtest.perform_test(te_tst_data)
            if return_mmdtest:
                results['mmdtest'] = mmdtest

        results['time_secs'] = t.secs
        return results
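A usage sketch for this kernel-selecting MMD goodness-of-fit test. In the kgof package this method matches mgof.QuadMMDGofOpt; the class name and its constructor arguments below are assumptions made for illustration:

    import kgof.mgof as mgof

    # p and dat as in the earlier sketches. Constructor signature is assumed.
    mmd_opt = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=0.05, seed=87)

    # With candidate_kernels=None, Gaussian kernels are constructed from the
    # median heuristic; 20% of the sample is held out for kernel selection.
    results = mmd_opt.perform_test(dat, candidate_kernels=None,
                                   tr_proportion=0.2, reg=1e-3)
    print(results['h0_rejected'])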
Example #4
    def power_criterion(p, dat, gwidth, test_locs, reg=1e-2, use_2terms=False):
        """
        use_2terms: True if the objective should include the first term in the power
            expression. This term carries the test threshold and is difficult to
            compute (depends on the optimized test locations). If True, then
            the objective will be -1/(n**0.5*sigma_H1) + n**0.5 FSSD^2/sigma_H1,
            which ignores the test threshold in the first term.
        """
        k = kernel.KGauss(gwidth)
        return FSSD.power_criterion(p, dat, k, test_locs, reg, use_2terms=use_2terms)
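The criterion can be evaluated directly as an objective for width selection, which is what grid_search_gwidth in Example #2 does internally. A short sketch, reusing p, dat, V0, and list_gwidth from the earlier sketches:

    import numpy as np
    import kgof.goftest as gof

    # Evaluate the power criterion at each candidate Gaussian width, keeping
    # the test locations V0 fixed, and keep the width with the largest value.
    objs = np.array([gof.GaussFSSD.power_criterion(p, dat, gw, V0)
                     for gw in list_gwidth])
    best_gwidth = list_gwidth[np.argmax(objs)]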
Example #5
    def __init__(self, p, sigma2, V, alpha=0.01, n_simulate=3000, seed=10):
        # p: the model density; sigma2: Gaussian kernel width; V: J x d test locations
        k = kernel.KGauss(sigma2)
        # Simulate the null distribution using the covariance of the observed features
        null_sim = FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed)
        super(GaussFSSD, self).__init__(p, k, V, null_sim, alpha)
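Putting the pieces together: construct the test with the optimized parameters from Example #1 and run it. This assumes that perform_test(dat) is inherited from FSSD and returns a dictionary of results, analogous to the MMD test in Example #6:

    import kgof.goftest as gof

    # V_opt and gw_opt as returned by optimize_auto_init in Example #1.
    # In practice one would optimize on a training split and test on held-out data.
    fssd = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.01, n_simulate=3000, seed=10)
    results = fssd.perform_test(dat)
    print(results['h0_rejected'])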
Example #6
import matplotlib.pyplot as plt
import numpy as np
import kgof.data as data
import kgof.density as density
import kgof.kernel as ker
import kgof.mgof as mgof
import kgof.util as util

# Problem setup. The original snippet defines these earlier; the values
# below are assumed for illustration.
seed = 13
alpha = 0.05
n = 800
d = 2
mean = np.zeros(d)
variance = 1.0

p = density.IsotropicNormal(mean, variance)
q_mean = mean.copy()
q_variance = variance
# q_mean[0] = 1

ds = data.DSIsotropicNormal(q_mean + 1, q_variance)
# q_means = np.array([ [0], [0]])
# q_variances = np.array([0.01, 1])
# ds = data.DSIsoGaussianMixture(q_means, q_variances, pmix=[0.2, 0.8])

# Test
dat = ds.sample(n, seed=seed + 2)
X = dat.data()
# Use median heuristic to determine the Gaussian kernel width
sig2 = util.meddistance(X, subsample=1000)**2
k = ker.KGauss(sig2)

mmd_test = mgof.QuadMMDGof(p, k, n_permute=300, alpha=alpha, seed=seed)
mmd_result = mmd_test.perform_test(dat)

print('Reject H0?: {0}'.format(mmd_result['h0_rejected']))

sim_stats = mmd_result['list_permuted_mmd2']
stat = mmd_result['test_stat']
unif_weights = np.ones_like(sim_stats) / float(len(sim_stats))
plt.hist(sim_stats, label='Simulated', weights=unif_weights)
plt.plot([stat, stat], [0, 0], 'r*', markersize=30, label='Stat')
plt.legend(loc='best')
plt.show()