Beispiel #1
0
def job_rdcperm_med(paired_source, tr, te, r, n_features=10):
    """
    Run the Randomized Dependence Coefficient test with permutations.

    - The training and test parts are merged (tr + te) and the merged
      sample is what gets tested.
    - Both X and Y go through a marginal CDF (copula) map; the median
      distance of each transformed sample, squared, is the first argument
      given to the random Fourier feature maps.
    - n_features random features are requested for each of X and Y.
    - 500 permutations are used.

    paired_source is not used in this function. alpha is not a parameter;
    it must be defined in the enclosing scope (not shown in this block).
    r offsets the seeds of the feature maps and of the test.

    Return a dictionary with keys 'indtest' (the test object),
    'test_result' (the output of perform_test) and 'time_secs'.
    """
    full_sample = tr + te
    with util.ContextTimer() as timer:
        x_raw, y_raw = full_sample.xy()

        # Copula transform applied to both X and Y with the same map object.
        cdf_map = fea.MarginalCDFMap()
        x_cdf = cdf_map.gen_features(x_raw)
        y_cdf = cdf_map.gen_features(y_raw)

        # Median heuristic, computed on the transformed data.
        width2_x = util.meddistance(x_cdf, subsample=1000) ** 2
        width2_y = util.meddistance(y_cdf, subsample=1000) ** 2

        # The default n_features=10 follows Lopez-Paz et al., 2013.
        feat_x = fea.RFFKGauss(width2_x, n_features=n_features, seed=r + 19)
        feat_y = fea.RFFKGauss(width2_y, n_features=n_features, seed=r + 220)
        test = it.RDCPerm(
            feat_x, feat_y, n_permute=500, alpha=alpha, seed=r + 100
        )
        outcome = test.perform_test(full_sample)

    results = {}
    results['indtest'] = test
    results['test_result'] = outcome
    results['time_secs'] = timer.secs
    return results
Beispiel #2
0
def job_rdc_med(paired_source, tr, te, r, n_features=10):
    """
    Run the Randomized Dependence Coefficient test.

    - Gaussian width = median heuristic on the copula-transformed data.
    - n_features random features (10 by default) for each of X and Y.
    - The full dataset (tr + te) is used for testing.

    paired_source is not used in this function. alpha is not a parameter;
    it must be defined in the enclosing scope (not shown in this block).
    r offsets the seeds of the two feature maps.

    Return a dictionary with keys 'indtest' (the test object),
    'test_result' (the output of perform_test) and 'time_secs'.
    """
    full_sample = tr + te
    with util.ContextTimer() as timer:
        x_raw, y_raw = full_sample.xy()

        # Copula transform applied to both X and Y with the same map object.
        cdf_map = fea.MarginalCDFMap()
        x_cdf = cdf_map.gen_features(x_raw)
        y_cdf = cdf_map.gen_features(y_raw)

        # Squared median distances of the transformed samples.
        width2_x = util.meddistance(x_cdf, subsample=1000) ** 2
        width2_y = util.meddistance(y_cdf, subsample=1000) ** 2

        # The default n_features=10 follows Lopez-Paz et al., 2013.
        feat_x = fea.RFFKGauss(width2_x, n_features=n_features, seed=r + 19)
        feat_y = fea.RFFKGauss(width2_y, n_features=n_features, seed=r + 220)
        test = it.RDC(feat_x, feat_y, alpha=alpha)
        outcome = test.perform_test(full_sample)

    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
Beispiel #3
0
def job_fhsic_med(paired_source, tr, te, r, n_features=10, n_simulate=2000):
    """
    HSIC with random Fourier features. Simulate the null distribution
    with the spectrums of the empirical cross covariance operators.
    - Gaussian kernels.
    - No parameter selection procedure. Use the median heuristic for both
    X and Y.
    - Use full sample for testing.

    Parameters
    - paired_source: not used in this function.
    - tr, te: training and test data; they are merged with tr + te.
    - r: value added to the fixed seed offsets of the feature maps and of
      the test.
    - n_features: number of random features requested for each of X and Y.
      Defaults to 10, the value that used to be hard-coded here; exposed as
      a keyword so this job matches the sibling RDC jobs.
    - n_simulate: forwarded to FiniteFeatureHSIC as n_simulate. Defaults to
      2000, the value that used to be hard-coded here.

    alpha is not a parameter; it must be defined in the enclosing scope
    (not shown in this block).

    Return a dictionary with keys 'indtest' (the test object),
    'test_result' (the output of perform_test) and 'time_secs'.
    """
    # use full sample for testing. Merge training and test sets
    pdata = tr + te
    with util.ContextTimer() as t:
        X, Y = pdata.xy()
        # Median heuristic on the raw data (no copula transform here).
        medx = util.meddistance(X, subsample=1000)
        medy = util.meddistance(Y, subsample=1000)
        sigmax2 = medx**2
        sigmay2 = medy**2

        fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=r + 1)
        fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=r + 2)
        ffhsic = it.FiniteFeatureHSIC(fmx,
                                      fmy,
                                      n_simulate=n_simulate,
                                      alpha=alpha,
                                      seed=r + 89)
        ffhsic_result = ffhsic.perform_test(pdata)
    return {
        'indtest': ffhsic,
        'test_result': ffhsic_result,
        'time_secs': t.secs
    }
Beispiel #4
0
def job_rdcperm_nc_med(paired_source, tr, te, r, n_features=10):
    """
    Run the Randomized Dependence Coefficient test with permutations,
    without the copula transformation (use_copula=False).

    - The training and test parts are merged (tr + te) and the merged
      sample is what gets tested.
    - The median heuristic is applied to the data as returned by xy().
    - n_features random features are requested for each of X and Y.
    - 500 permutations are used.

    paired_source is not used in this function. alpha is not a parameter;
    it must be defined in the enclosing scope (not shown in this block).
    r offsets the seeds of the feature maps and of the test.

    Return a dictionary with keys 'indtest' (the test object),
    'test_result' (the output of perform_test) and 'time_secs'.
    """
    full_sample = tr + te
    with util.ContextTimer() as timer:
        x_raw, y_raw = full_sample.xy()

        # Squared median distances of the untransformed samples.
        width2_x = util.meddistance(x_raw, subsample=1000) ** 2
        width2_y = util.meddistance(y_raw, subsample=1000) ** 2

        # The default n_features=10 follows Lopez-Paz et al., 2013.
        feat_x = fea.RFFKGauss(width2_x, n_features=n_features, seed=r + 19)
        feat_y = fea.RFFKGauss(width2_y, n_features=n_features, seed=r + 220)
        test = it.RDCPerm(
            feat_x,
            feat_y,
            n_permute=500,
            alpha=alpha,
            seed=r + 100,
            use_copula=False,
        )
        outcome = test.perform_test(full_sample)

    results = {}
    results['indtest'] = test
    results['test_result'] = outcome
    results['time_secs'] = timer.secs
    return results
Beispiel #5
0
    def test_rdc(self):
        """
        For several seeds, check that the RDC statistic is non-negative and
        that the returned eigenvalues have absolute value in [0, 1].
        """
        n_pairs = 10
        sample_size = 30
        for offset in range(1, 7):
            source = data.PS2DSinFreq(freq=1)
            sample = source.sample(sample_size, seed=offset + 4)
            # The two feature maps deliberately get different widths and
            # different feature counts (n_pairs vs. n_pairs + 1).
            feat_x = fea.RFFKGauss(1, n_pairs, seed=offset + 10)
            feat_y = fea.RFFKGauss(2.0, n_pairs + 1, seed=offset + 9)
            test = it.RDC(feat_x, feat_y, alpha=0.01)
            stat, evals = test.compute_stat(sample, return_eigvals=True)

            self.assertGreaterEqual(stat, 0)
            magnitudes = np.abs(evals)
            self.assertTrue((magnitudes >= 0).all())
            self.assertTrue((magnitudes <= 1).all())
Beispiel #6
0
    def test_list_permute_spectral(self):
        """
        Make sure that simulating from the spectral approach gives roughly
        the same values as doing permutations.

        For each seed, both lists are histogrammed and the normalized bin
        counts are required to differ by at most 0.2 in every bin.

        NOTE(review): each list is passed to np.histogram on its own with
        default arguments, so the two sets of bin edges are derived from
        each list separately and need not coincide - confirm this
        bin-by-bin comparison is what is intended.
        """
        source = data.PS2DSinFreq(freq=2)
        n_features = 5
        n_simulate = 3000
        n_permute = 3000
        for seed in (283, 2):
            with util.NumpySeedContext(seed=seed):
                sample = source.sample(n=200, seed=seed + 1)
                X, Y = sample.xy()

                # Fixed widths: 1 for X and 0.8 for Y.
                feat_x = feature.RFFKGauss(
                    1, n_features=n_features, seed=seed + 3
                )
                feat_y = feature.RFFKGauss(
                    0.8, n_features=n_features, seed=seed + 23
                )
                Zx = feat_x.gen_features(X)
                Zy = feat_y.gen_features(Y)

                hsic_cls = indtest.FiniteFeatureHSIC
                perm_values = hsic_cls.list_permute(
                    X, Y, feat_x, feat_y, n_permute=n_permute, seed=seed + 82
                )
                # Only the first of the three returned values is needed.
                spectral_values, _, _ = hsic_cls.list_permute_spectral(
                    Zx, Zy, n_simulate=n_simulate, seed=seed + 119
                )

                # Relative frequencies of the two histograms should not
                # differ much.
                counts_perm = np.histogram(perm_values)[0]
                counts_spec = np.histogram(spectral_values)[0]
                rel_perm = counts_perm / np.sum(counts_perm)
                rel_spec = counts_spec / np.sum(counts_spec)
                gap = np.abs(rel_perm - rel_spec)
                self.assertTrue(np.all(gap <= 0.2))
Beispiel #7
0
    def test_approximation(self):
        """
        Check that the Gram matrix built from random Fourier features,
        Z.dot(Z.T), is close to the one returned by KGauss with the same
        sigma2.

        The Frobenius norm of the difference, divided by n ** 2, must not
        exceed 0.5.
        """
        n = 100
        d = 3
        # Draw the input from a local, fixed-seed generator so that the test
        # data is reproducible and the global numpy random state is left
        # untouched.
        rng = np.random.RandomState(10)
        X = rng.rand(n, d) * 2 - 4

        sigma2 = 2.7
        feature_pairs = 50
        rff = feature.RFFKGauss(sigma2, feature_pairs, seed=2)
        Z = rff.gen_features(X)
        Krff = Z.dot(Z.T)

        # check approximation quality
        k = kernel.KGauss(sigma2)
        K = k.eval(X, X)
        diff = np.linalg.norm((Krff - K), "fro")
        self.assertLessEqual(diff / n ** 2, 0.5)
Beispiel #8
0
    def test_general(self):
        """
        Check the shape of the features produced by RFFKGauss and that
        generating features twice from the same map and input gives
        (almost) equal arrays.
        """
        n_points, dim = 31, 3
        n_pairs = 51
        X = np.random.rand(n_points, dim) * 2 - 4

        rff = feature.RFFKGauss(3.7, n_pairs, seed=2)
        first = rff.gen_features(X)
        second = rff.gen_features(X)

        # One row per input point, two columns per feature pair.
        n_rows = first.shape[0]
        n_cols = first.shape[1]
        self.assertEqual(n_rows, n_points)
        self.assertEqual(n_cols, 2 * n_pairs)

        # Same map + same input => same features.
        np.testing.assert_array_almost_equal(first, second)