Example #1
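All of the job functions below rely on module-level imports and experiment settings defined elsewhere in the original script. A minimal sketch of that surrounding context, assuming the kgof package layout; the values of alpha and tr_proportion are placeholders, not the original settings:

import numpy as np
import kgof.goftest as gof
import kgof.intertst as tgof
import kgof.kernel as kernel
import kgof.mmd as mgof
import kgof.util as util

# Assumed experiment-level settings (placeholder values).
alpha = 0.05          # significance level of each test
tr_proportion = 0.5   # fraction of data used for hyperparameter optimization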
def job_fssdJ1q_med(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD test with a Gaussian kernel, where the test locations are randomized,
    and the Gaussian width is set with the median heuristic. Use full sample.
    No training/testing splits.

    p: an UnnormalizedDensity
    data_source: a DataSource
    tr, te: Data objects (merged into one full sample here)
    r: trial number (positive integer)
    J: number of random test locations
    null_sim: a simulator of the null distribution; defaults to FSSDH0SimCovObs
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    # full data
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # median heuristic
        med = util.meddistance(X, subsample=1000)
        k = kernel.KGauss(med**2)
        V = util.fit_gaussian_draw(X, J, seed=r + 3)

        fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=alpha)
        fssd_med_result = fssd_med.perform_test(data)
    return {
        'goftest': fssd_med,
        'test_result': fssd_med_result,
        'time_secs': t.secs
    }
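For context, a minimal sketch of how a job function such as job_fssdJ1q_med might be driven, assuming kgof's IsotropicNormal density, DSIsotropicNormal data source, and Data.split_tr_te; the model, dimension, and sample size here are hypothetical:

import numpy as np
import kgof.data as data
import kgof.density as density

d = 10
# Model p: a d-dimensional standard normal. Data are drawn from the same
# model, so H0 (goodness of fit) holds in this toy setup.
p = density.IsotropicNormal(np.zeros(d), 1.0)
ds = data.DSIsotropicNormal(np.zeros(d), 1.0)

# Draw one sample and split it, mirroring how the experiment driver passes tr, te.
dat = ds.sample(1000, seed=8)
tr, te = dat.split_tr_te(tr_proportion=0.5, seed=8)

result = job_fssdJ1q_med(p, ds, tr, te, r=1)
print(result['test_result'])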
Example #2
def job_mmd_med(p, data_source, tr, te, r):
    """
    MMD test of Gretton et al., 2012 used as a goodness-of-fit test.
    Require the ability to sample from p i.e., the UnnormalizedDensity p has 
    to be able to return a non-None from get_datasource()
    """
    # full data
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # median heuristic
        pds = p.get_datasource()
        datY = pds.sample(data.sample_size(), seed=r + 294)
        Y = datY.data()
        XY = np.vstack((X, Y))

        # If p and q differ very little, the median distance may be very
        # small, which might lead to rejecting H0 when it should not.
        medx = util.meddistance(X, subsample=1000)
        medy = util.meddistance(Y, subsample=1000)
        medxy = util.meddistance(XY, subsample=1000)
        med_avg = (medx + medy + medxy) / 3.0
        k = kernel.KGauss(med_avg**2)

        mmd_test = mgof.QuadMMDGof(p, k, n_permute=400, alpha=alpha, seed=r)
        mmd_result = mmd_test.perform_test(data)
    return {'test_result': mmd_result, 'time_secs': t.secs}
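The median heuristic above sets the squared bandwidth of the Gaussian kernel to med_avg**2. For reference, a direct NumPy sketch of a Gaussian (RBF) kernel under the usual k(x, y) = exp(-||x - y||**2 / (2*sigma2)) convention, which is what kernel.KGauss is assumed to implement here:

import numpy as np

def gauss_kernel(X, Y, sigma2):
    # Pairwise squared Euclidean distances between rows of X and rows of Y.
    D2 = (np.sum(X**2, axis=1)[:, np.newaxis]
          - 2.0*X.dot(Y.T)
          + np.sum(Y**2, axis=1)[np.newaxis, :])
    return np.exp(-D2/(2.0*sigma2))

# With the averaged median heuristic above, sigma2 = med_avg**2.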
Example #3
def job_me_opt(p, data_source, tr, te, r, J=5):
    """
    ME test of Jitkrittum et al., 2016 used as a goodness-of-fit test.
    Gaussian kernel. Optimize test locations and Gaussian width.
    """
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # No median heuristic is needed here; the test locations and the
        # Gaussian width are optimized with the settings below.
        op = {
            'n_test_locs': J,          # number of test locations
            'seed': r + 5,
            'max_iter': 40,            # maximum gradient-ascent iterations
            'batch_proportion': 1.0,   # fraction of data used per iteration
            'locs_step_size': 1.0,     # step size for the test locations
            'gwidth_step_size': 0.1,   # step size for the Gaussian width
            'tol_fun': 1e-4,           # stopping tolerance on the objective
            'reg': 1e-4                # regularization in the objective
        }
        # Optimize the test locations and Gaussian width on an internal
        # training split (tr_proportion); test on the remainder.
        me_opt = tgof.GaussMETestOpt(p,
                                     n_locs=J,
                                     tr_proportion=tr_proportion,
                                     alpha=alpha,
                                     seed=r + 111)

        me_result = me_opt.perform_test(data, op)
    return {'test_result': me_result, 'time_secs': t.secs}
Example #4
def job_mmd_opt(p, data_source, tr, te, r):
    """
    MMD test of Gretton et al., 2012 used as a goodness-of-fit test.
    Require the ability to sample from p i.e., the UnnormalizedDensity p has 
    to be able to return a non-None from get_datasource()

    With optimization. Gaussian kernel.
    """
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # median heuristic 
        pds = p.get_datasource()
        datY = pds.sample(data.sample_size(), seed=r+294)
        Y = datY.data()
        XY = np.vstack((X, Y))

        med = util.meddistance(XY, subsample=1000)

        # Construct a list of candidate kernels based on multiples of the
        # median heuristic.
        list_gwidth = (med**2) * (2.0**np.linspace(-4, 4, 30))
        list_gwidth.sort()
        candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

        mmd_opt = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=r)
        mmd_result = mmd_opt.perform_test(
            data,
            candidate_kernels=candidate_kernels,
            tr_proportion=tr_proportion,
            reg=1e-3,
        )
    return {'test_result': mmd_result, 'time_secs': t.secs}
Example #5
def job_lin_kstein_med(p, data_source, tr, te, r):
    """
    Linear-time version of the kernel Stein discrepancy test of Liu et al.,
    2016 and Chwialkowski et al., 2016. Use full sample.
    """
    # full data
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # median heuristic
        med = util.meddistance(X, subsample=1000)
        k = kernel.KGauss(med**2)

        lin_kstein = gof.LinearKernelSteinTest(p, k, alpha=alpha, seed=r)
        lin_kstein_result = lin_kstein.perform_test(data)
    return {'test_result': lin_kstein_result, 'time_secs': t.secs}
Example #6
def job_kstein_imq(p, data_source, tr, te, r):
    """
    Kernel Stein discrepancy test of Liu et al., 2016 and Chwialkowski et al.,
    2016. Use full sample. Use the inverse multiquadric kernel (IMQ) studied 
    in 

    Measuring Sample Quality with Kernels
    Gorham and Mackey 2017. 

    Parameters are fixed to the recommented values: beta = b = -0.5, c = 1. 
    """
    # full data
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        k = kernel.KIMQ(b=-0.5, c=1.0)

        kstein = gof.KernelSteinTest(p, k, alpha=alpha, n_simulate=1000, seed=r)
        kstein_result = kstein.perform_test(data)
    return {'test_result': kstein_result, 'time_secs': t.secs}
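For reference, the IMQ kernel of Gorham and Mackey (2017) has the form k(x, y) = (c**2 + ||x - y||**2)**b with c > 0 and b in (-1, 0). A direct NumPy sketch of that form with the recommended parameters (illustrative; independent of kernel.KIMQ's actual implementation):

import numpy as np

def imq_kernel(X, Y, b=-0.5, c=1.0):
    # Pairwise squared Euclidean distances between rows of X and rows of Y.
    D2 = (np.sum(X**2, axis=1)[:, np.newaxis]
          - 2.0*X.dot(Y.T)
          + np.sum(Y**2, axis=1)[np.newaxis, :])
    return (c**2 + D2)**b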
Example #7
def job_mmd_dgauss_opt(p, data_source, tr, te, r):
    """
    MMD test of Gretton et al., 2012 used as a goodness-of-fit test.
    Require the ability to sample from p i.e., the UnnormalizedDensity p has 
    to be able to return a non-None from get_datasource()

    With optimization. Diagonal Gaussian kernel where there is one Gaussian width
    for each dimension.
    """
    data = tr + te
    X = data.data()
    d = X.shape[1]
    with util.ContextTimer() as t:
        # median heuristic 
        pds = p.get_datasource()
        datY = pds.sample(data.sample_size(), seed=r+294)
        Y = datY.data()
        XY = np.vstack((X, Y))

        # Get the median heuristic for each dimension
        meds = np.zeros(d)
        for i in range(d):
            medi = util.meddistance(XY[:, [i]], subsample=1000)
            meds[i] = medi

        # Construct a list of candidate kernels based on multiples of the
        # per-dimension median heuristic.
        med_factors = 2.0**np.linspace(-4, 4, 20)
        candidate_kernels = [kernel.KDiagGauss((meds**2)*f) for f in med_factors]

        mmd_opt = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=r+56)
        mmd_result = mmd_opt.perform_test(
            data,
            candidate_kernels=candidate_kernels,
            tr_proportion=tr_proportion,
            reg=1e-3,
        )
    return {'test_result': mmd_result, 'time_secs': t.secs}
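For reference, a diagonal Gaussian kernel of the kind kernel.KDiagGauss is given here can be written as k(x, y) = exp(-sum_i (x_i - y_i)**2 / (2*sigma2_i)), with one bandwidth per coordinate. A conceptual NumPy sketch of that form (not necessarily the library's exact implementation):

import numpy as np

def diag_gauss_kernel(X, Y, sigma2s):
    # sigma2s: per-dimension squared bandwidths, shape (d,).
    # Rescaling each dimension by its own bandwidth reduces the kernel to a
    # unit-width Gaussian on the scaled points.
    Xs = X / np.sqrt(2.0*sigma2s)
    Ys = Y / np.sqrt(2.0*sigma2s)
    D2 = (np.sum(Xs**2, axis=1)[:, np.newaxis]
          - 2.0*Xs.dot(Ys.T)
          + np.sum(Ys**2, axis=1)[np.newaxis, :])
    return np.exp(-D2)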