Ejemplo n.º 1
0
def job_mmd_opt(p, data_source, tr, te, r):
    """
    MMD test of Gretton et al., 2012 used as a goodness-of-fit test.
    Require the ability to sample from p i.e., the UnnormalizedDensity p has 
    to be able to return a non-None from get_datasource()

    With optimization. Gaussian kernel.
    """
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # median heuristic 
        pds = p.get_datasource()
        datY = pds.sample(data.sample_size(), seed=r+294)
        Y = datY.data()
        XY = np.vstack((X, Y))

        med = util.meddistance(XY, subsample=1000)

        # Construct a list of kernels to try based on multiples of the median
        # heuristic
        #list_gwidth = np.hstack( (np.linspace(20, 40, 10), (med**2)
        #    *(2.0**np.linspace(-2, 2, 20) ) ) )
        list_gwidth = (med**2)*(2.0**np.linspace(-4, 4, 30) ) 
        list_gwidth.sort()
        candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

        mmd_opt = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=r)
        mmd_result = mmd_opt.perform_test(data,
                candidate_kernels=candidate_kernels,
                tr_proportion=tr_proportion, reg=1e-3)
    return { 'test_result': mmd_result, 'time_secs': t.secs}
Ejemplo n.º 2
0
    def optimize_auto_init(p, dat, J, **ops):
        """
        Optimize parameters by calling optimize_locs_widths(). Automatically 
        initialize the test locations and the Gaussian width.

        Return optimized locations, Gaussian width, optimization info
        """
        assert J > 0
        # Use grid search to initialize the gwidth
        X = dat.data()
        n_gwidth_cand = 5
        gwidth_factors = 2.0**np.linspace(-3, 3, n_gwidth_cand)
        med2 = util.meddistance(X, 1000)**2

        k = kernel.KGauss(med2 * 2)
        # fit a Gaussian to the data and draw to initialize V0
        V0 = util.fit_gaussian_draw(X, J, seed=829, reg=1e-6)
        list_gwidth = np.hstack(((med2) * gwidth_factors))
        besti, objs = GaussFSSD.grid_search_gwidth(p, dat, V0, list_gwidth)
        gwidth = list_gwidth[besti]
        assert util.is_real_num(
            gwidth), 'gwidth not real. Was %s' % str(gwidth)
        assert gwidth > 0, 'gwidth not positive. Was %.3g' % gwidth
        logging.info('After grid search, gwidth=%.3g' % gwidth)

        V_opt, gwidth_opt, info = GaussFSSD.optimize_locs_widths(
            p, dat, gwidth, V0, **ops)

        # set the width bounds
        #fac_min = 5e-2
        #fac_max = 5e3
        #gwidth_lb = fac_min*med2
        #gwidth_ub = fac_max*med2
        #gwidth_opt = max(gwidth_lb, min(gwidth_opt, gwidth_ub))
        return V_opt, gwidth_opt, info
Ejemplo n.º 3
0
def job_fssdJ1q_med(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD test with a Gaussian kernel, where the test locations are randomized,
    and the Gaussian width is set with the median heuristic. Use full sample.
    No training/testing splits.

    p: an UnnormalizedDensity
    data_source: a DataSource
    tr, te: Data
    r: trial number (positive integer)
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    # full data
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # median heuristic
        med = util.meddistance(X, subsample=1000)
        k = kernel.KGauss(med**2)
        V = util.fit_gaussian_draw(X, J, seed=r + 3)

        fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=alpha)
        fssd_med_result = fssd_med.perform_test(data)
    return {
        'goftest': fssd_med,
        'test_result': fssd_med_result,
        'time_secs': t.secs
    }
Ejemplo n.º 4
0
def job_mmd_med(p, data_source, tr, te, r):
    """
    MMD test of Gretton et al., 2012 used as a goodness-of-fit test.
    Require the ability to sample from p i.e., the UnnormalizedDensity p has 
    to be able to return a non-None from get_datasource()
    """
    # full data
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # median heuristic
        pds = p.get_datasource()
        datY = pds.sample(data.sample_size(), seed=r + 294)
        Y = datY.data()
        XY = np.vstack((X, Y))

        # If p, q differ very little, the median may be very small, rejecting H0
        # when it should not?
        medx = util.meddistance(X, subsample=1000)
        medy = util.meddistance(Y, subsample=1000)
        medxy = util.meddistance(XY, subsample=1000)
        med_avg = (medx + medy + medxy) / 3.0
        k = kernel.KGauss(med_avg**2)

        mmd_test = mgof.QuadMMDGof(p, k, n_permute=400, alpha=alpha, seed=r)
        mmd_result = mmd_test.perform_test(data)
    return {'test_result': mmd_result, 'time_secs': t.secs}
Ejemplo n.º 5
0
    def test_ustat_h1_mean_variance(self):
        seed = 20
        # sample
        n = 200
        alpha = 0.01
        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1
            isonorm = density.IsotropicNormal(mean, variance)

            draw_mean = mean + 2
            draw_variance = variance + 1
            X = util.randn(n, d,
                           seed=seed) * np.sqrt(draw_variance) + draw_mean
            dat = data.Data(X)

            # Test
            for J in [1, 3]:
                sig2 = util.meddistance(X, subsample=1000)**2
                k = kernel.KGauss(sig2)

                # random test locations
                V = util.fit_gaussian_draw(X, J, seed=seed + 1)

                null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
                fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha)
                fea_tensor = fssd.feature_tensor(X)

                u_mean, u_variance = gof.FSSD.ustat_h1_mean_variance(
                    fea_tensor)

                # assertions
                self.assertGreaterEqual(u_variance, 0)
                # should reject H0
                self.assertGreaterEqual(u_mean, 0)
Ejemplo n.º 6
0
    def numpy(self):
        """
        Return an instance of numpy implementation of itself. 
        """
        sigma2 = self.sigma2.item()

        return gofker.KGauss(sigma2)
Ejemplo n.º 7
0
def job_fssdJ1q_opt(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD with optimization on tr. Test on te. Use a Gaussian kernel.
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    Xtr = tr.data()
    with util.ContextTimer() as t:
        # Use grid search to initialize the gwidth
        n_gwidth_cand = 5
        gwidth_factors = 2.0**np.linspace(-3, 3, n_gwidth_cand)
        med2 = util.meddistance(Xtr, 1000)**2

        k = kernel.KGauss(med2 * 2)
        # fit a Gaussian to the data and draw to initialize V0
        V0 = util.fit_gaussian_draw(Xtr, J, seed=r + 1, reg=1e-6)
        list_gwidth = np.hstack(((med2) * gwidth_factors))
        besti, objs = gof.GaussFSSD.grid_search_gwidth(p, tr, V0, list_gwidth)
        gwidth = list_gwidth[besti]
        assert util.is_real_num(
            gwidth), 'gwidth not real. Was %s' % str(gwidth)
        assert gwidth > 0, 'gwidth not positive. Was %.3g' % gwidth
        logging.info('After grid search, gwidth=%.3g' % gwidth)

        ops = {
            'reg': 1e-2,
            'max_iter': 40,
            'tol_fun': 1e-4,
            'disp': True,
            'locs_bounds_frac': 10.0,
            'gwidth_lb': 1e-1,
            'gwidth_ub': 1e4,
        }

        V_opt, gwidth_opt, info = gof.GaussFSSD.optimize_locs_widths(
            p, tr, gwidth, V0, **ops)
        # Use the optimized parameters to construct a test
        k_opt = kernel.KGauss(gwidth_opt)
        fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
        fssd_opt_result = fssd_opt.perform_test(te)
    return {
        'test_result': fssd_opt_result,
        'time_secs': t.secs,
        'goftest': fssd_opt,
        'opt_info': info,
    }
Ejemplo n.º 8
0
    def perform_test(self,
                     dat,
                     model_sample,
                     candidate_kernels=None,
                     return_mmdtest=False,
                     tr_proportion=0.2,
                     reg=1e-3):
        """
        dat: an instance of Data
        candidate_kernels: a list of Kernel's to choose from
        tr_proportion: proportion of sample to be used to choosing the best
            kernel
        reg: regularization parameter for the test power criterion
        """
        with util.ContextTimer() as t:
            seed = self.seed
            p = self.p
            #ds = p.get_datasource()
            #p_sample = ds.sample(dat.sample_size(), seed=seed+77)
            p_sample = model_sample
            xtr, xte = p_sample.split_tr_te(tr_proportion=tr_proportion,
                                            seed=seed + 18)
            # ytr, yte are of type data.Data
            ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion,
                                       seed=seed + 12)

            # training and test data
            tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
            te_tst_data = fdata.TSTData(xte.data(), yte.data())

            if candidate_kernels is None:
                # Assume a Gaussian kernel. Construct a list of
                # kernels to try based on multiples of the median heuristic
                med = util.meddistance(tr_tst_data.stack_xy(), 1000)
                list_gwidth = np.hstack(
                    ((med**2) * (2.0**np.linspace(-4, 4, 10))))
                list_gwidth.sort()
                candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

            alpha = self.alpha

            # grid search to choose the best Gaussian width
            besti, powers = tst.QuadMMDTest.grid_search_kernel(
                tr_tst_data, candidate_kernels, alpha, reg=reg)
            # perform test
            best_ker = candidate_kernels[besti]
            mmdtest = tst.QuadMMDTest(best_ker, self.n_permute, alpha=alpha)
            results = mmdtest.perform_test(te_tst_data)
            if return_mmdtest:
                results['mmdtest'] = mmdtest

        results['time_secs'] = t.secs
        return results
Ejemplo n.º 9
0
    def grid_search_gwidth(p, dat, test_locs, list_gwidth):
        """
        Linear search for the best Gaussian width in the list that maximizes 
        the test power criterion, fixing the test locations. 

        - V: a J x dx np-array for J test locations 

        return: (best width index, list of test power objectives)
        """
        list_gauss_kernel = [kernel.KGauss(gw) for gw in list_gwidth]
        besti, objs = FSSD.fssd_grid_search_kernel(p, dat, test_locs,
                                                   list_gauss_kernel)
        return besti, objs
Ejemplo n.º 10
0
    def test_basic(self):
        """
        Nothing special. Just test basic things.
        """
        # sample
        n = 10
        d = 3
        with util.NumpySeedContext(seed=29):
            X = np.random.randn(n, d) * 3
            k = kernel.KGauss(sigma2=1)
            K = k.eval(X, X)

            self.assertEqual(K.shape, (n, n))
            self.assertTrue(np.all(K >= 0 - 1e-6))
            self.assertTrue(np.all(K <= 1 + 1e-6), 'K not bounded by 1')
Ejemplo n.º 11
0
 def power_criterion(p, dat, gwidth, test_locs, reg=1e-2, use_2terms=False):
     """
     use_2terms: True if the objective should include the first term in the power 
         expression. This term carries the test threshold and is difficult to 
         compute (depends on the optimized test locations). If True, then 
         the objective will be -1/(n**0.5*sigma_H1) + n**0.5 FSSD^2/sigma_H1, 
         which ignores the test threshold in the first term.
     """
     k = kernel.KGauss(gwidth)
     return FSSD.power_criterion(p,
                                 dat,
                                 k,
                                 test_locs,
                                 reg,
                                 use_2terms=use_2terms)
Ejemplo n.º 12
0
    def test_pair_gradX_Y(self):
        # sample
        n = 11
        d = 3
        with util.NumpySeedContext(seed=20):
            X = np.random.randn(n, d) * 4
            Y = np.random.randn(n, d) * 2
            k = kernel.KGauss(sigma2=2.1)
            # n x d
            pair_grad = k.pair_gradX_Y(X, Y)
            loop_grad = np.zeros((n, d))
            for i in range(n):
                for j in range(d):
                    loop_grad[i, j] = k.gradX_Y(X[[i], :], Y[[i], :], j)

            testing.assert_almost_equal(pair_grad, loop_grad)
Ejemplo n.º 13
0
def job_lin_kstein_med(p, data_source, tr, te, r):
    """
    Linear-time version of the kernel Stein discrepancy test of Liu et al.,
    2016 and Chwialkowski et al., 2016. Use full sample.
    """
    # full data
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # median heuristic
        med = util.meddistance(X, subsample=1000)
        k = kernel.KGauss(med**2)

        lin_kstein = gof.LinearKernelSteinTest(p, k, alpha=alpha, seed=r)
        lin_kstein_result = lin_kstein.perform_test(data)
    return {'test_result': lin_kstein_result, 'time_secs': t.secs}
Ejemplo n.º 14
0
        def job_kstein_med(p, data_source, tr, te, r):
            """
            Kernel Stein discrepancy test of Liu et al., 2016 and Chwialkowski et al.,
            2016. Use full sample. Use Gaussian kernel.
            """
            # full data
            data = tr + te
            X = data.data()
            with util.ContextTimer() as t:
                # median heuristic
                med = util.meddistance(X, subsample=1000)
                k = kernel.KGauss(med ** 2)

                kstein = gof.KernelSteinTest(p, k, alpha=args.alpha, n_simulate=1000, seed=r)
                kstein_result = kstein.perform_test(data)
            return {'test_result': kstein_result, 'time_secs': t.secs}
Ejemplo n.º 15
0
    def test_gradX_y(self):
        n = 10
        with util.NumpySeedContext(seed=10):
            for d in [1, 3]:
                y = np.random.randn(d) * 2
                X = np.random.rand(n, d) * 3

                sigma2 = 1.3
                k = kernel.KGauss(sigma2=sigma2)
                # n x d
                G = k.gradX_y(X, y)
                # check correctness
                K = k.eval(X, y[np.newaxis, :])
                myG = -K / sigma2 * (X - y)

                self.assertEqual(G.shape, myG.shape)
                testing.assert_almost_equal(G, myG)
Ejemplo n.º 16
0
    def test_basic(self):
        d = 3
        p = density.IsotropicNormal(mean=np.zeros(d), variance=3.0)
        q = density.IsotropicNormal(mean=np.zeros(d) + 2, variance=3.0)
        k = kernel.KGauss(2.0)

        ds = q.get_datasource()
        n = 97
        dat = ds.sample(n, seed=3)

        witness = gof.SteinWitness(p, k, dat)
        # points to evaluate the witness
        J = 4
        V = np.random.randn(J, d) * 2
        evals = witness(V)

        testing.assert_equal(evals.shape, (J, d))
Ejemplo n.º 17
0
    def test_optimized_fssd(self):
        """
        Test FSSD test with parameter optimization.
        """
        seed = 4
        # sample size
        n = 179
        alpha = 0.01
        for d in [1, 3]:
            mean = np.zeros(d)
            variance = 1.0
            p = density.IsotropicNormal(mean, variance)
            # Mean difference. obvious reject
            ds = data.DSIsotropicNormal(mean + 4, variance + 0)
            dat = ds.sample(n, seed=seed)
            # test
            for J in [1, 4]:
                opts = {
                    'reg': 1e-2,
                    'max_iter': 10,
                    'tol_fun': 1e-3,
                    'disp': False
                }
                tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

                Xtr = tr.X
                gwidth0 = util.meddistance(Xtr, subsample=1000)**2
                # random test locations
                V0 = util.fit_gaussian_draw(Xtr, J, seed=seed + 1)
                V_opt, gw_opt, opt_result = \
                gof.GaussFSSD.optimize_locs_widths(p, tr, gwidth0, V0, **opts)

                # construct a test
                k_opt = kernel.KGauss(gw_opt)
                null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
                fssd_opt = gof.FSSD(p,
                                    k_opt,
                                    V_opt,
                                    null_sim=null_sim,
                                    alpha=alpha)
                fssd_opt_result = fssd_opt.perform_test(
                    te, return_simulated_stats=True)
                assert fssd_opt_result['h0_rejected']
Ejemplo n.º 18
0
    def test_gradXY_sum(self):
        n = 11
        with util.NumpySeedContext(seed=12):
            for d in [3, 1]:
                X = np.random.randn(n, d)
                sigma2 = 1.4
                k = kernel.KGauss(sigma2=sigma2)

                # n x n
                myG = np.zeros((n, n))
                K = k.eval(X, X)
                for i in range(n):
                    for j in range(n):
                        diffi2 = np.sum((X[i, :] - X[j, :])**2)
                        #myG[i, j] = -diffi2*K[i, j]/(sigma2**2)+ d*K[i, j]/sigma2
                        myG[i, j] = K[i, j] / sigma2 * (d - diffi2 / sigma2)

                # check correctness
                G = k.gradXY_sum(X, X)

                self.assertEqual(G.shape, myG.shape)
                testing.assert_almost_equal(G, myG)
Ejemplo n.º 19
0
    def test_auto_init_opt_fssd(self):
        """
        Test FSSD-opt test with automatic parameter initialization.
        """
        seed = 5
        # sample size
        n = 191
        alpha = 0.01
        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1.0
            p = density.IsotropicNormal(mean, variance)
            # Mean difference. obvious reject
            ds = data.DSIsotropicNormal(mean + 4, variance + 0)
            dat = ds.sample(n, seed=seed)
            # test
            for J in [1, 3]:
                opts = {
                    'reg': 1e-2,
                    'max_iter': 10,
                    'tol_fun': 1e-3,
                    'disp': False
                }
                tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

                V_opt, gw_opt, opt_result = \
                gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)

                # construct a test
                k_opt = kernel.KGauss(gw_opt)
                null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
                fssd_opt = gof.FSSD(p,
                                    k_opt,
                                    V_opt,
                                    null_sim=null_sim,
                                    alpha=alpha)
                fssd_opt_result = fssd_opt.perform_test(
                    te, return_simulated_stats=True)
                assert fssd_opt_result['h0_rejected']
Ejemplo n.º 20
0
    def test_basic(self):
        """
        Nothing special. Just test basic things.
        """
        seed = 12
        # sample
        n = 100
        alpha = 0.01
        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1
            isonorm = density.IsotropicNormal(mean, variance)

            # only one dimension of the mean is shifted
            #draw_mean = mean + np.hstack((1, np.zeros(d-1)))
            draw_mean = mean + 0
            draw_variance = variance + 1
            X = util.randn(n, d,
                           seed=seed) * np.sqrt(draw_variance) + draw_mean
            dat = data.Data(X)

            # Test
            for J in [1, 3]:
                sig2 = util.meddistance(X, subsample=1000)**2
                k = kernel.KGauss(sig2)

                # random test locations
                V = util.fit_gaussian_draw(X, J, seed=seed + 1)
                null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
                fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha)

                tresult = fssd.perform_test(dat, return_simulated_stats=True)

                # assertions
                self.assertGreaterEqual(tresult['pvalue'], 0)
                self.assertLessEqual(tresult['pvalue'], 1)
Ejemplo n.º 21
0
 def __init__(self, p, sigma2, V, alpha=0.01, n_simulate=3000, seed=10):
     k = kernel.KGauss(sigma2)
     null_sim = FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed)
     super(GaussFSSD, self).__init__(p, k, V, null_sim, alpha)