Exemple #1
0
def perform_mmd_test(train_miss_impute,
                     test_miss_impute,
                     train_full,
                     test_full,
                     alpha,
                     mmd_miss_impute=None,
                     mmd_full=None):
    """Run a quadratic-time MMD test on the imputed pair and on the full pair.

    Each (train, test) pair is wrapped in a ``TSTData`` object and handed to
    a ``tst.QuadMMDTest``. When no tester is supplied for a pair, one is
    built with a ``kernel.KGauss`` whose parameter is the standard deviation
    of the pairwise distances between the two samples of that pair.

    Args:
        train_miss_impute: first sample of the imputed pair.
        test_miss_impute: second sample of the imputed pair.
        train_full: first sample of the full pair.
        test_full: second sample of the full pair.
        alpha: significance level passed to any newly built tester.
        mmd_miss_impute: optional pre-built tester for the imputed pair.
        mmd_full: optional pre-built tester for the full pair.

    Returns:
        A tuple ``(flags, mmd_miss_impute, mmd_full)``. ``flags`` is a
        length-2 array: entry 0 is 1 when the imputed-pair test reports
        ``h0_rejected``, entry 1 likewise for the full pair. The two testers
        are returned so that callers can pass them back in on later calls.
    """
    flags = np.zeros(2)

    # Imputed data.
    imputed_pair = TSTData(train_miss_impute, test_miss_impute)
    if mmd_miss_impute is None:
        print('ini')
        sample_a, sample_b = imputed_pair.xy()
        spread = metrics.pairwise_distances(sample_a, sample_b).std()
        mmd_miss_impute = tst.QuadMMDTest(kernel.KGauss(spread), alpha=alpha)
    if mmd_miss_impute.perform_test(imputed_pair)['h0_rejected']:
        flags[0] = 1

    # Full data.
    full_pair = TSTData(train_full, test_full)
    if mmd_full is None:
        sample_a, sample_b = full_pair.xy()
        spread = metrics.pairwise_distances(sample_a, sample_b).std()
        mmd_full = tst.QuadMMDTest(kernel.KGauss(spread), alpha=alpha)
    if mmd_full.perform_test(full_pair)['h0_rejected']:
        flags[1] = 1

    return flags, mmd_miss_impute, mmd_full
Exemple #2
0
def perform_mmd_test(train_miss_impute,
                     test_miss_impute,
                     train_full,
                     test_full,
                     alpha,
                     mmd_miss_impute=None,
                     mmd_full=None):
    """Run a quadratic-time MMD test on the imputed pair and on the full pair.

    Each (train, test) pair is wrapped in a ``TSTData`` object and handed to
    a ``tst.QuadMMDTest``. When no tester is supplied for a pair, one is
    built with a ``kernel.KGauss`` parameterised by that pair's
    ``TSTData.mean_std()`` value.

    Args:
        train_miss_impute: first sample of the imputed pair.
        test_miss_impute: second sample of the imputed pair.
        train_full: first sample of the full pair.
        test_full: second sample of the full pair.
        alpha: significance level passed to any newly built tester.
        mmd_miss_impute: optional pre-built tester for the imputed pair.
        mmd_full: optional pre-built tester for the full pair.

    Returns:
        A tuple ``(flags, mmd_miss_impute, mmd_full)``. ``flags`` is a
        length-2 array: entry 0 is 1 when the imputed-pair test reports
        ``h0_rejected``, entry 1 likewise for the full pair. The two testers
        are returned so that callers can pass them back in on later calls.
    """
    flags = np.zeros(2)

    # Imputed data.
    imputed_pair = TSTData(train_miss_impute, test_miss_impute)
    if mmd_miss_impute is None:
        print('ini')
        mmd_miss_impute = tst.QuadMMDTest(
            kernel.KGauss(imputed_pair.mean_std()), alpha=alpha)
    if mmd_miss_impute.perform_test(imputed_pair)['h0_rejected']:
        flags[0] = 1

    # Full data.
    full_pair = TSTData(train_full, test_full)
    if mmd_full is None:
        mmd_full = tst.QuadMMDTest(
            kernel.KGauss(full_pair.mean_std()), alpha=alpha)
    if mmd_full.perform_test(full_pair)['h0_rejected']:
        flags[1] = 1

    return flags, mmd_miss_impute, mmd_full
Exemple #3
0
def job_quad_mmd_2U(sample_source, tr, te, r):
    """Quadratic MMD test with a grid search over Gaussian widths.

    The tester is built with ``use_1sample_U=False``, i.e. two-sample
    U-statistics are used to compute k(X,Y).

    ``sample_source`` and ``r`` are accepted but not used here. ``alpha`` is
    not a parameter; it is resolved from the enclosing scope.

    Returns:
        A dict with keys ``'test_method'`` (the tester that was built),
        ``'test_result'`` (what its ``perform_test`` returned on ``te``) and
        ``'time_secs'`` (elapsed time reported by the context timer, covering
        both kernel selection and the test).
    """
    with util.ContextTimer() as timer:
        # The original note warns that the pairwise median computation can
        # cause a memory error when n is too large; 1000 is passed as the
        # second argument to meddistance (presumably a subsample cap).
        median_dist = util.meddistance(tr.stack_xy(), 1000)

        # Candidate squared widths: median^2 scaled by 40 powers of two
        # with exponents evenly spaced in [-4, 4].
        scales = 2.0**np.linspace(-4, 4, 40)
        widths = np.hstack((median_dist**2) * scales)
        widths.sort()
        candidates = [kernel.KGauss(width) for width in widths]

        # Kernel selection is done on the training split only.
        best_index, _ = tst.QuadMMDTest.grid_search_kernel(
            tr, candidates, alpha)

        # The actual test runs on the held-out split with the chosen kernel.
        tester = tst.QuadMMDTest(candidates[best_index],
                                 n_permute=1000,
                                 alpha=alpha,
                                 use_1sample_U=False)
        outcome = tester.perform_test(te)

    return {
        'test_method': tester,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
Exemple #4
0
    def perform_test(
        self,
        dat,
        candidate_kernels=None,
        return_mmdtest=False,
        tr_proportion=0.2,
        reg=1e-3,
    ):
        """
        Draw a sample from the model's data source, pick a kernel on a
        training split, and run a quadratic MMD test on the remaining split.

        dat: an instance of Data
        candidate_kernels: a list of Kernel's to choose from. If None, a
            list of 10 Gaussian kernels is built from multiples of the
            squared median distance of the training split.
        return_mmdtest: if True, the QuadMMDTest object used is stored in
            the returned results under the key "mmdtest"
        tr_proportion: proportion of sample to be used to choosing the best
            kernel
        reg: regularization parameter for the test power criterion

        Returns the results of QuadMMDTest.perform_test on the test split,
        with the elapsed time added under the key "time_secs".
        """
        with util.ContextTimer() as timer:
            base_seed = self.seed
            model_source = self.p.get_datasource()

            # Sample from the model, matching the size of the observed data.
            model_sample = model_source.sample(dat.sample_size(),
                                               seed=base_seed + 77)
            model_tr, model_te = model_sample.split_tr_te(
                tr_proportion=tr_proportion, seed=base_seed + 18)
            # obs_tr, obs_te are of type data.Data
            obs_tr, obs_te = dat.split_tr_te(tr_proportion=tr_proportion,
                                             seed=base_seed + 12)

            # Pair model and observed splits into two-sample test data.
            train_pair = fdata.TSTData(model_tr.data(), obs_tr.data())
            test_pair = fdata.TSTData(model_te.data(), obs_te.data())

            if candidate_kernels is None:
                # Assume a Gaussian kernel. Candidates are multiples of the
                # median heuristic computed on the training pair.
                median_dist = util.meddistance(train_pair.stack_xy(), 1000)
                scales = 2.0**np.linspace(-4, 4, 10)
                widths = np.hstack((median_dist**2) * scales)
                widths.sort()
                candidate_kernels = [kernel.KGauss(width) for width in widths]

            level = self.alpha

            # Grid search on the training pair to choose the best kernel.
            best_index, _ = tst.QuadMMDTest.grid_search_kernel(
                train_pair, candidate_kernels, level, reg=reg)

            # Run the test on the held-out pair with the chosen kernel.
            mmdtest = tst.QuadMMDTest(candidate_kernels[best_index],
                                      self.n_permute,
                                      alpha=level)
            results = mmdtest.perform_test(test_pair)
            if return_mmdtest:
                results["mmdtest"] = mmdtest

        # The timer's elapsed time is read after the with-block has exited.
        results["time_secs"] = timer.secs
        return results
    def test(self, X, Y):
        """Run a quadratic MMD test with a median-heuristic Gaussian kernel.

        X and Y are passed through ``self.preprocess`` to obtain the paired
        two-sample data object; the kernel width is derived from the median
        pairwise distance of the stacked samples.

        Returns:
            The ``'pvalue'`` entry of the result returned by
            ``QuadMMDTest.perform_test``.
        """
        XY = self.preprocess(X, Y)

        med = fot_util.meddistance(XY.stack_xy(), 1000)
        # KGauss is parameterised by the *squared* Gaussian width, whereas
        # meddistance returns a plain (unsquared) distance, so the median
        # heuristic width must be squared before it is handed to the kernel.
        gauss_kernel = fot_kernel.KGauss(med**2)

        mmd = fot_tst.QuadMMDTest(gauss_kernel,
                                  n_permute=self.n_permute,
                                  alpha=self.alpha)

        return mmd.perform_test(XY)['pvalue']
def job_quad_mmd(sample_source, tr, te, r):
    """Quadratic MMD test with a grid search over Gaussian widths.

    ``sample_source`` and ``r`` are accepted but not used here. ``alpha`` is
    not a parameter; it is resolved from the enclosing scope.

    Returns:
        Whatever ``QuadMMDTest.perform_test`` returns for the split ``te``.
    """
    # The original note warns that the pairwise median computation can cause
    # a memory error when n is too large; 1000 is passed as the second
    # argument to meddistance (presumably a subsample cap).
    median_dist = util.meddistance(tr.stack_xy(), 1000)

    # Candidate squared widths: median^2 scaled by 30 powers of two with
    # exponents evenly spaced in [-4, 4].
    scales = 2.0**np.linspace(-4, 4, 30)
    widths = np.hstack((median_dist**2) * scales)
    widths.sort()
    candidates = [kernel.KGauss(width) for width in widths]

    # Kernel selection is done on the training split only.
    best_index, _ = tst.QuadMMDTest.grid_search_kernel(tr, candidates, alpha)

    # The actual test runs on the held-out split with the chosen kernel.
    tester = tst.QuadMMDTest(candidates[best_index], n_permute=400,
                             alpha=alpha)
    return tester.perform_test(te)
Exemple #7
0
 def __init__(self, p, k, n_permute=400, alpha=0.01, seed=28):
     """
     Build the goodness-of-fit test around a quadratic MMD two-sample test.

     p: an instance of UnnormalizedDensity
     k: an instance of Kernel
     n_permute: number of times to permute the samples to simulate from the
         null distribution (permutation test)
     alpha: significance level
     seed: random seed

     Raises ValueError if p.get_datasource() returns None.
     """
     super(QuadMMDGof, self).__init__(p, alpha)
     # Construct the MMD test
     self.mmdtest = tst.QuadMMDTest(k, n_permute=n_permute, alpha=alpha)
     self.k = k
     self.seed = seed
     ds = p.get_datasource()
     if ds is None:
         # Interpolate the class name into the message; passing it as a
         # second argument to ValueError would leave the '%s' unformatted.
         raise ValueError(
             '%s test requires a density p which implements '
             'get_datasource()' % type(self).__name__)
Exemple #8
0
    def __init__(self, p, k, l, n_permute=400, alpha=0.01, seed=11):
        """
        Set up an MMD test that uses the product of the kernels k and l.

        p: the model; it must provide get_condsource(), dx() and dy()
        k, l: the two kernels combined through ker.KTwoProduct
        n_permute: passed on to the underlying QuadMMDTest
        alpha: significance level
        seed: random seed stored on the instance

        Raises ValueError if p.get_condsource() returns None.

        NOTE: a disabled warning in the original code states that this test
        does not accept Pytorch kernels (those whose names start with PT).
        """
        # Imported locally, as in the original implementation.
        import freqopttest.tst as tst

        super(MMDTest, self).__init__(p, alpha)
        self.p, self.k, self.l = p, k, l

        # The test needs to draw samples from p.
        self.ds_p = self.p.get_condsource()
        if self.ds_p is None:
            raise ValueError(
                'The test requires that p can be sampled. Must implement p.get_condsource().'
            )

        self.alpha = alpha
        self.seed = seed
        self.n_permute = n_permute

        # Combine the two kernels, using the dimensions reported by p.
        product_kernel = ker.KTwoProduct(k, l, p.dx(), p.dy())
        self.mmdtest = tst.QuadMMDTest(product_kernel, n_permute, alpha=alpha)
    def test(self, X, Y):
        """Run a quadratic MMD test with a kernel chosen by grid search.

        X and Y are passed through ``self.preprocess``; the result is split
        into a training part (proportion ``self.split_ratio``) used to select
        the Gaussian width and a test part on which the test is performed.

        Returns:
            The ``'pvalue'`` entry of the result returned by
            ``QuadMMDTest.perform_test`` on the test part.
        """
        paired = self.preprocess(X, Y)
        train_part, test_part = paired.split_tr_te(
            tr_proportion=self.split_ratio)

        # Candidate squared widths: median^2 scaled by 20 powers of two with
        # exponents evenly spaced in [-4, 4].
        median_dist = fot_util.meddistance(train_part.stack_xy(), 1000)
        exponents = np.linspace(-4, 4, 20)
        widths = (median_dist**2) * (2.**exponents)
        candidates = [fot_kernel.KGauss(width) for width in widths]

        # Anything the grid search prints is discarded.
        with contextlib.redirect_stdout(None):
            chosen, _ = fot_tst.QuadMMDTest.grid_search_kernel(
                train_part, candidates, alpha=self.alpha)

        tester = fot_tst.QuadMMDTest(candidates[chosen],
                                     n_permute=self.n_permute,
                                     alpha=self.alpha)

        return tester.perform_test(test_part)['pvalue']