Ejemplo n.º 1
0
    def compute(self):
        """
        Run one job: call self.met_func on the problem identified by
        self.prob_label, submit the outcome to self.aggregator, and save it
        with glo.ex_save_result.

        Reads self.prob_label, self.rep, self.n, self.met_func and
        self.aggregator. Also relies on the names `alpha` and `ex` being
        defined outside this method (presumably module-level experiment
        settings -- TODO confirm).
        """
        # from prob_label, get p, rx, cs. The first returned item is not
        # needed here; the sample size comes from self.n.
        _ns, p, rx, cs = get_ns_model_source(self.prob_label)
        r = self.rep
        n = self.n
        met_func = self.met_func
        prob_label = self.prob_label
        func_name = met_func.__name__

        # Keep the message as one plain literal. A backslash continuation
        # inside the string would put the next line's indentation into the
        # emitted log line.
        logger.info("computing. %s. prob=%s, r=%d, n=%d" %
                    (func_name, prob_label, r, n))
        with util.ContextTimer() as t:
            job_result = met_func(p, rx, cs, n, r)

            # wrap the raw output in a SingleResult instance
            result = SingleResult(job_result)
            # submit the result to my own aggregator
            self.aggregator.submit_result(result)

        logger.info("done. ex1: %s, prob=%s, r=%d, n=%d. Took: %.3g s " %
                    (func_name, prob_label, r, n, t.secs))

        # save result
        fname = '%s-%s-n%d_r%d_a%.3f.p' \
                % (prob_label, func_name, n, r, alpha)
        glo.ex_save_result(ex, job_result, prob_label, fname)
Ejemplo n.º 2
0
def met_gkcsd_med(p, rx, cond_source, n, r):
    """
    KCSD test where both kernels are Gaussian (the "g" prefix).
    "med" = each bandwidth is picked by the median heuristic, computed
    separately on X and on Y.

    p: model handed to cgof.KCSDTest
    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).
        r is also used as an offset for the seeds below.

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # everything inside this context is timed
    with util.ContextTimer() as timer:
        # median heuristic, one bandwidth per variable
        med_x = util.pt_meddistance(X, subsample=600, seed=r + 3)
        med_y = util.pt_meddistance(Y, subsample=600, seed=r + 38)

        # Gaussian kernel on X, then Gaussian kernel on Y
        kernel_x = ker.PTKGauss(sigma2=med_x**2)
        kernel_y = ker.PTKGauss(sigma2=med_y**2)

        # build the KCSD test and run it on the full sample
        tester = cgof.KCSDTest(
            p, kernel_x, kernel_y, alpha=alpha, n_bootstrap=400, seed=r + 88)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 3
0
    def perform_test(self, X, Y):
        """
        Split (X, Y) into two halves, replace the Y of the second half by a
        sample drawn from self.ds_p given X2, and run self.mmdtest on the
        two joint samples [X1, Y1] and [X2, Y2_].

        X, Y: presumably torch tensors with one sample per row (the halves
            are joined with torch.cat along dim=1) -- TODO confirm.

        Return whatever self.mmdtest.perform_test returns, with the elapsed
        time stored under the key 'time_secs'.
        """
        import freqopttest.data as fdata

        model_source = self.ds_p
        two_sample_test = self.mmdtest
        base_seed = self.seed

        with util.ContextTimer() as timer:
            # halve the data; the observed Y2 is not used afterwards
            X1, Y1, X2, Y2 = MMDSplitTest._split_half(
                X, Y, seed=base_seed + 330)

            # draw the model's Y given X2
            Y2_model = model_source.cond_pair_sample(X2, seed=base_seed + 13)

            # joint samples as numpy arrays
            observed_joint = torch.cat([X1, Y1], dim=1).numpy()
            model_joint = torch.cat([X2, Y2_model], dim=1).numpy()

            # two-sample test between the observed and the model joint sample
            results = two_sample_test.perform_test(
                fdata.TSTData(observed_joint, model_joint))

        results['time_secs'] = timer.secs
        return results
Ejemplo n.º 4
0
def met_gmmd_med(p, rx, cond_source, n, r):
    """
    Naive baseline: draw from the conditional density model p to form a new
    joint sample, then compare the two joint samples with a two-sample MMD
    test. Gaussian kernels are used on both X and Y, with bandwidths set by
    the median heuristic.

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).
        r is also used as an offset for the seeds below.

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # everything inside this context is timed
    with util.ContextTimer() as timer:
        # median heuristic, one bandwidth per variable
        med_x = util.pt_meddistance(X, subsample=600, seed=r + 3)
        med_y = util.pt_meddistance(Y, subsample=600, seed=r + 38)

        # These kernels have to work on numpy arrays, hence kgof kernels.
        kernel_x = kgof.kernel.KGauss(sigma2=med_x**2)
        kernel_y = kgof.kernel.KGauss(sigma2=med_y**2)

        # MMD test object (needs the freqopttest package)
        tester = cgof.MMDTest(
            p, kernel_x, kernel_y, n_permute=400, alpha=alpha, seed=r + 37)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 5
0
    def perform_test(self, X, Y):
        """
        Run the test on (X, Y). The null distribution of the statistic is
        simulated with self.n_bootstrap bootstrap draws: rows of X are
        resampled with replacement and the matching Y is drawn from the
        conditional source of the model self.p.

        X, Y: indexed along their first axis; presumably torch tensors with
            one sample per row -- TODO confirm.

        Return a dictionary with keys alpha, pvalue, test_stat, h0_rejected,
        n_simulate and time_secs.
        """
        with util.ContextTimer() as timer:
            alpha = self.alpha
            n_draws = self.n_bootstrap
            n = X.shape[0]
            cond_source = self.p.get_condsource()

            observed = self.compute_stat(X, Y)

            # simulate the statistic n_draws times
            boot_stats = torch.zeros(n_draws)
            with torch.no_grad(), util.TorchSeedContext(seed=self.seed):
                for b in range(n_draws):
                    rows = torch.randint(0, n, [n])
                    X_boot = X[rows]
                    Y_boot = cond_source.cond_pair_sample(
                        X_boot, self.seed + b)
                    # statistic of this bootstrap draw
                    emp = CramerVonMisesTest.Hn(X_boot, Y_boot, X, Y)
                    emp_null = self.Hn0(X_boot, Y_boot, X, Y)
                    boot_stats[b] = torch.sum((emp - emp_null)**2)

            # p-value = fraction of simulated statistics strictly above the
            # observed one
            exceeds = boot_stats > observed
            pvalue = torch.mean(exceeds.type(torch.float)).item()

        return {
            'alpha': self.alpha,
            'pvalue': pvalue,
            'test_stat': observed.item(),
            'h0_rejected': pvalue < alpha,
            'n_simulate': n_draws,
            'time_secs': timer.secs,
        }
Ejemplo n.º 6
0
    def perform_test(self, X, Y):
        """
        Perform the goodness-of-fit test on the data (X, Y).

        X: Torch tensor of size n x dx
        Y: Torch tensor of size n x dy

        The p-value is the upper tail of a standard normal evaluated at the
        statistic returned by self.compute_stat(X, Y).

        Return a dictionary of the form
        {
            alpha: 0.01,
            pvalue: 0.0002,
            test_stat: 2.3,
            h0_rejected: True,
            time_secs: ...
        }
        """
        with util.ContextTimer() as timer:
            level = self.alpha
            statistic = self.compute_stat(X, Y)
            std_normal = dists.Normal(0, 1)
            upper_tail = 1 - std_normal.cdf(statistic)
            pvalue = upper_tail.item()

        return {
            'alpha': self.alpha,
            'pvalue': pvalue,
            'test_stat': statistic.item(),
            'h0_rejected': pvalue < level,
            'time_secs': timer.secs,
        }
Ejemplo n.º 7
0
def met_gmmd_split_med(p, rx, cond_source, n, r):
    """
    Like met_gmmd_med, except that the data are split so that the two sets
    of samples are guaranteed to be independent. The effective sample size
    is therefore n/2.

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).
        r is also used as an offset for the seeds below.

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # everything inside this context is timed
    with util.ContextTimer() as timer:
        # median heuristic, one bandwidth per variable
        med_x = util.pt_meddistance(X, subsample=600, seed=r + 4)
        med_y = util.pt_meddistance(Y, subsample=600, seed=r + 39)

        # These kernels have to work on numpy arrays, hence kgof kernels.
        kernel_x = kgof.kernel.KGauss(sigma2=med_x**2)
        kernel_y = kgof.kernel.KGauss(sigma2=med_y**2)

        # data-splitting MMD test object (needs the freqopttest package)
        tester = cgof.MMDSplitTest(
            p, kernel_x, kernel_y, n_permute=400, alpha=alpha, seed=r + 47)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 8
0
 def perform_test(self, X, Y):
     """
     Compute the statistic with self.compute_stat(X, Y) and turn it into
     a p-value using the upper tail of a standard normal distribution.

     Return a dictionary with keys alpha, pvalue, test_stat, h0_rejected
     and time_secs.
     """
     with util.ContextTimer() as timer:
         level = self.alpha
         statistic = self.compute_stat(X, Y)
         # P(Z > statistic) for Z ~ N(0, 1)
         tail_prob = 1 - dists.Normal(0, 1).cdf(statistic)
         pvalue = tail_prob.item()
     reject = pvalue < level
     return {
         'alpha': self.alpha,
         'pvalue': pvalue,
         'test_stat': statistic.item(),
         'h0_rejected': reject,
         'time_secs': timer.secs,
     }
Ejemplo n.º 9
0
def met_zheng_cdf(p, rx, cond_source, n, r):
    """
    Run cgof.ZhengCDFTest for the model p on data drawn with sample_xy.

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # only the construction and the run of the test are timed
    with util.ContextTimer() as timer:
        tester = cgof.ZhengCDFTest(p, alpha)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 10
0
    def perform_test(self, X, Y, return_simulated_stats=False,
                     return_ustat_gram=False):
        """
        Run the test on (X, Y). The null distribution is simulated with
        self.n_bootstrap draws of multinomial weights applied to the matrix
        H returned by self.compute_stat.

        X,Y: torch tensors.
        return_simulated_stats: If True, also include the bootstrapped
            statistics in the returned dictionary (key 'sim_stats', as a
            numpy array).
        return_ustat_gram: If True, also include the matrix H in the
            returned dictionary (key 'H').

        Return a dictionary with keys alpha, pvalue, test_stat, h0_rejected,
        n_simulate and time_secs, plus the optional entries above.
        """
        with util.ContextTimer() as timer:
            alpha = self.alpha
            n_draws = self.n_bootstrap
            n = X.shape[0]

            observed, H = self.compute_stat(X, Y, return_ustat_gram=True)

            boot_stats = torch.zeros(n_draws)
            # n trials over n equally likely categories
            uniform_probs = torch.ones(n) / n
            resampler = dists.multinomial.Multinomial(total_count=n,
                                                      probs=uniform_probs)
            with torch.no_grad(), util.TorchSeedContext(seed=self.seed):
                for b in range(n_draws):
                    counts = resampler.sample()
                    weights = (counts - 1.0) / n
                    # statistic of this bootstrap draw
                    quad_form = H.matmul(weights).dot(weights)
                    diag_part = torch.diag(H).dot(weights**2)
                    boot_stats[b] = n * (quad_form - diag_part)

            # p-value = fraction of simulated statistics strictly above the
            # observed one
            exceeds = boot_stats > observed
            pvalue = torch.mean(exceeds.type(torch.float)).item()

        results = {
            'alpha': self.alpha,
            'pvalue': pvalue,
            'test_stat': observed.item(),
            'h0_rejected': pvalue < alpha,
            'n_simulate': n_draws,
            'time_secs': timer.secs,
        }
        if return_simulated_stats:
            results['sim_stats'] = boot_stats.detach().numpy()
        if return_ustat_gram:
            results['H'] = H

        return results
Ejemplo n.º 11
0
def met_zhengkl_gh(p, rx, cond_source, n, r):
    """
    Zheng 2000 test, implemented with Gauss-Hermite quadrature
    (cgof.ZhengKLTestGaussHerm).

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).
    The `rate` argument of the test is set to (dx + dy) * 4/5, where dx and
    dy come from cond_source.

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)
    total_dim = cond_source.dx() + cond_source.dy()
    rate = total_dim * 4. / 5

    # only the construction and the run of the test are timed
    with util.ContextTimer() as timer:
        tester = cgof.ZhengKLTestGaussHerm(p, alpha, rate=rate)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 12
0
def met_zhengkl_mc(p, rx, cond_source, n, r):
    """
    Zheng 2000 test, implemented with Monte Carlo integration
    (cgof.ZhengKLTestMC) using 10000 Monte Carlo particles.

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # only the construction and the run of the test are timed
    with util.ContextTimer() as timer:
        n_particles = 10000
        tester = cgof.ZhengKLTestMC(p, alpha, n_mc=n_particles)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 13
0
def met_gfscd_J1_rand(p, rx, cond_source, n, r, J=1):
    """
    FSCD test with a Gaussian kernel on X and a Gaussian kernel on Y.
    * J random test locations are used (J=1 by default).
    * The locations are drawn from a Gaussian fitted to the X part of a
        30% training split of the data.
    * The bandwidths of both kernels come from the median heuristic,
        computed on the full X and the full Y.
    * The test itself is run on the full sample.

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).
        r is also used as an offset for the seeds below.

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # everything inside this context is timed
    with util.ContextTimer() as timer:
        # only the X part of the training split is needed
        train_part, _ = cdat.CondData(X, Y).split_tr_te(tr_proportion=0.3)
        Xtr, _ = train_part.xy()

        # fit a Gaussian to Xtr and draw the J test locations from it
        locations_np = util.fit_gaussian_sample(
            Xtr.detach().numpy(), J, seed=r + 750)
        V = torch.tensor(locations_np, dtype=torch.float)

        # median heuristic, one bandwidth per variable
        med_x = util.pt_meddistance(X, subsample=600, seed=2 + r)
        med_y = util.pt_meddistance(Y, subsample=600, seed=93 + r)

        # Gaussian kernel on X, then Gaussian kernel on Y
        kernel_x = ker.PTKGauss(sigma2=med_x**2)
        kernel_y = ker.PTKGauss(sigma2=med_y**2)

        # build the FSCD test and run it on the full samples
        tester = cgof.FSCDTest(
            p, kernel_x, kernel_y, V, alpha=alpha, n_bootstrap=400,
            seed=r + 8)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 14
0
def met_cramer_vm(p, rx, cond_source, n, r):
    """
    Run cgof.CramerVonMisesTest for the model p, with 200 bootstrap draws.

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).
        r is also used as an offset for the seed of the test.

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # only the construction and the run of the test are timed
    with util.ContextTimer() as timer:
        tester = cgof.CramerVonMisesTest(
            p, alpha=alpha, n_bootstrap=200, seed=r + 88)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 15
0
def met_zhengkl(p, rx, cond_source, n, r):
    """
    The test of "Zheng 2000, A CONSISTENT TEST OF CONDITIONAL PARAMETRIC
    DISTRIBUTIONS", whose decision criterion is the first order
    approximation of the KL divergence. Implemented by cgof.ZhengKLTest.

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # only the construction and the run of the test are timed
    with util.ContextTimer() as timer:
        tester = cgof.ZhengKLTest(p, alpha)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 16
0
def met_gkcsd_opt_tr50(p, rx, cond_source, n, r, tr_proportion=0.5):
    """
    KCSD test with Gaussian kernels (for both kernels), where the two
    bandwidths are optimized by maximizing the power criterion of the KCSD
    test on a training split of the data.

    * The bandwidths are initialized with the median heuristic, computed
        separately on the full X and the full Y.
    * tr_proportion controls the proportion of the data used for the
        optimization. The test is run on the remaining (test) split.

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).
        r is also used as an offset for the seeds below.

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # everything inside this context is timed
    with util.ContextTimer() as timer:
        # median heuristic gives the starting bandwidths
        sigx = util.pt_meddistance(X, subsample=600, seed=r + 7)
        sigy = util.pt_meddistance(Y, subsample=600, seed=r + 99)

        # Gaussian kernel on X, then Gaussian kernel on Y
        k = ker.PTKGauss(sigma2=sigx**2)
        l = ker.PTKGauss(sigma2=sigy**2)

        # training / test split
        train_part, test_part = cdat.CondData(X, Y).split_tr_te(
            tr_proportion=tr_proportion)
        Xtr, Ytr = train_part.xy()

        power_criterion = cgof.KCSDPowerCriterion(p, k, l, Xtr, Ytr)

        def clamp_bandwidths(params):
            # keep both squared bandwidths within
            # [0.1, 10 x the median-heuristic value]
            k_sigma2, l_sigma2 = params[0], params[1]
            k_sigma2.data.clamp_(min=1e-1, max=10 * sigx**2)
            l_sigma2.data.clamp_(min=1e-1, max=10 * sigy**2)

        # lr = learning rate, reg = regularization in the power criterion
        power_criterion.optimize_params([k.sigma2, l.sigma2],
                                        constraint_f=clamp_bandwidths,
                                        lr=1e-3,
                                        reg=1e-3,
                                        max_iter=100)

        # KCSD test built from the tuned kernels, run on the test split only
        tester = cgof.KCSDTest(
            p, k, l, alpha=alpha, n_bootstrap=400, seed=r + 88)
        Xte, Yte = test_part.xy()
        outcome = tester.perform_test(Xte, Yte)

    return {'test_result': outcome, 'time_secs': timer.secs}
Ejemplo n.º 17
0
def met_gfscd_J1_opt_tr50(p, rx, cond_source, n, r, J=1, tr_proportion=0.5):
    """
    FSCD test with a Gaussian kernel on X and a Gaussian kernel on Y.
    Both Gaussian bandwidths and the J test locations are optimized by
    maximizing the test power on a training split of the data.

    * tr_proportion controls the proportion of the data used for the
        optimization. The test is run on the remaining (test) split.
    * The test locations are initialized from a Gaussian fitted to the
        training X. The bandwidths are initialized with the median
        heuristic on the full X and the full Y.

    rx, cond_source, n, r: forwarded to sample_xy to draw the data (X, Y).
        r is also used as an offset for the seeds below.

    The significance level `alpha` is read from outside this function.
    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # everything inside this context is timed
    with util.ContextTimer() as timer:
        # training / test split
        train_part, test_part = cdat.CondData(X, Y).split_tr_te(
            tr_proportion=tr_proportion)
        Xtr, Ytr = train_part.xy()

        # starting point for V: J draws from a Gaussian fitted to Xtr
        init_locations = util.fit_gaussian_sample(
            Xtr.detach().numpy(), J, seed=r + 75)
        V = torch.tensor(init_locations, dtype=torch.float)

        # median heuristic gives the starting bandwidths
        sigx = util.pt_meddistance(X, subsample=600, seed=30 + r)
        sigy = util.pt_meddistance(Y, subsample=600, seed=40 + r)

        # Gaussian kernel on X, then Gaussian kernel on Y
        k = ker.PTKGauss(sigma2=sigx**2)
        l = ker.PTKGauss(sigma2=sigy**2)

        # range of the training X, used to box in the test locations
        xtr_min = torch.min(Xtr).item()
        xtr_max = torch.max(Xtr).item()
        xtr_std = torch.std(Xtr).item()

        power_criterion = cgof.FSCDPowerCriterion(p, k, l, Xtr, Ytr)

        def clamp_params(params, V):
            # keep both squared bandwidths within
            # [0.1, 10 x the median-heuristic value], and the locations
            # within 2 standard deviations of the range of Xtr
            k_sigma2, l_sigma2 = params[0], params[1]
            k_sigma2.data.clamp_(min=1e-1, max=10 * sigx**2)
            l_sigma2.data.clamp_(min=1e-1, max=10 * sigy**2)
            V.data.clamp_(min=xtr_min - 2.0 * xtr_std,
                          max=xtr_max + 2.0 * xtr_std)

        # Parameters are optimized in-place. lr = learning rate,
        # reg = regularization used when forming the power criterion.
        power_criterion.optimize_params([k.sigma2, l.sigma2],
                                        V,
                                        constraint_f=clamp_params,
                                        lr=1e-2,
                                        reg=1e-4,
                                        max_iter=200)

        # FSCD test built from the tuned k, l and V, run on the test split
        # only
        tester = cgof.FSCDTest(
            p, k, l, V, alpha=alpha, n_bootstrap=400, seed=r + 8)
        Xte, Yte = test_part.xy()
        outcome = tester.perform_test(Xte, Yte)

    return {'test_result': outcome, 'time_secs': timer.secs}