def job_rdcperm_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test with permutations.

    Gaussian widths are set by the median heuristic on the
    copula-transformed data. The full sample (train + test merged) is
    used for testing.
    """
    # Merge training and test sets; the whole sample is used for the test.
    pdata = tr + te
    n_permute = 500
    # n_features=10 from Lopez-Paz et al., 2013 paper.
    with util.ContextTimer() as t:
        X, Y = pdata.xy()
        # Copula transform both X and Y before the median heuristic.
        cdf_map = fea.MarginalCDFMap()
        Xc = cdf_map.gen_features(X)
        Yc = cdf_map.gen_features(Y)
        # Squared median pairwise distances as Gaussian widths.
        sigmax2 = util.meddistance(Xc, subsample=1000) ** 2
        sigmay2 = util.meddistance(Yc, subsample=1000) ** 2
        fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=r + 19)
        fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=r + 220)
        rdcperm = it.RDCPerm(
            fmx, fmy, n_permute=n_permute, alpha=alpha, seed=r + 100
        )
        rdcperm_result = rdcperm.perform_test(pdata)
    return {
        'indtest': rdcperm,
        'test_result': rdcperm_result,
        'time_secs': t.secs,
    }
def job_rdc_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test.

    - Gaussian width = median heuristic on the copula-transformed data.
    - 10 random features for each of X and Y.
    - Use the full dataset (train + test merged) for testing.
    """
    pdata = tr + te
    # n_features=10 from Lopez-Paz et al., 2013 paper.
    with util.ContextTimer() as t:
        X, Y = pdata.xy()
        # Copula transform both X and Y before the median heuristic.
        cdf_map = fea.MarginalCDFMap()
        Xc = cdf_map.gen_features(X)
        Yc = cdf_map.gen_features(Y)
        # Squared median pairwise distances as Gaussian widths.
        sigmax2 = util.meddistance(Xc, subsample=1000) ** 2
        sigmay2 = util.meddistance(Yc, subsample=1000) ** 2
        fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=r + 19)
        fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=r + 220)
        rdc = it.RDC(fmx, fmy, alpha=alpha)
        rdc_result = rdc.perform_test(pdata)
    return {'indtest': rdc, 'test_result': rdc_result, 'time_secs': t.secs}
def job_fhsic_med(paired_source, tr, te, r):
    """
    HSIC with random Fourier features. Simulate the null distribution with
    the spectrums of the empirical cross covariance operators.

    - Gaussian kernels.
    - No parameter selection procedure. Use the median heuristic for both
      X and Y.
    - Use full sample for testing.
    """
    n_simulate = 2000
    # Number of random Fourier features for each of X and Y.
    n_features = 10
    # Use full sample for testing. Merge training and test sets.
    pdata = tr + te
    with util.ContextTimer() as t:
        X, Y = pdata.xy()
        # Squared median pairwise distances as Gaussian widths.
        sigmax2 = util.meddistance(X, subsample=1000) ** 2
        sigmay2 = util.meddistance(Y, subsample=1000) ** 2
        fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=r + 1)
        fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=r + 2)
        ffhsic = it.FiniteFeatureHSIC(
            fmx, fmy, n_simulate=n_simulate, alpha=alpha, seed=r + 89
        )
        ffhsic_result = ffhsic.perform_test(pdata)
    return {
        'indtest': ffhsic,
        'test_result': ffhsic_result,
        'time_secs': t.secs,
    }
def job_rdcperm_nc_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test with permutations.

    No copula transformation. Use the median heuristic directly on the
    data. The full sample (train + test merged) is used for testing.
    """
    pdata = tr + te
    n_permute = 500
    # n_features=10 from Lopez-Paz et al., 2013 paper.
    with util.ContextTimer() as t:
        X, Y = pdata.xy()
        # Squared median pairwise distances on the raw (untransformed) data.
        sigmax2 = util.meddistance(X, subsample=1000) ** 2
        sigmay2 = util.meddistance(Y, subsample=1000) ** 2
        fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=r + 19)
        fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=r + 220)
        rdcperm = it.RDCPerm(
            fmx,
            fmy,
            n_permute=n_permute,
            alpha=alpha,
            seed=r + 100,
            use_copula=False,
        )
        rdcperm_result = rdcperm.perform_test(pdata)
    return {
        'indtest': rdcperm,
        'test_result': rdcperm_result,
        'time_secs': t.secs,
    }
def test_rdc(self):
    """RDC statistic is non-negative and its eigenvalues lie in [0, 1]."""
    feature_pairs = 10
    n = 30
    for trial in range(1, 7):
        ps = data.PS2DSinFreq(freq=1)
        pdata = ps.sample(n, seed=trial + 4)

        fmx = fea.RFFKGauss(1, feature_pairs, seed=trial + 10)
        fmy = fea.RFFKGauss(2.0, feature_pairs + 1, seed=trial + 9)
        rdc = it.RDC(fmx, fmy, alpha=0.01)
        stat, evals = rdc.compute_stat(pdata, return_eigvals=True)

        self.assertGreaterEqual(stat, 0)
        # Canonical-correlation eigenvalues must have magnitude in [0, 1].
        magnitudes = np.abs(evals)
        self.assertTrue(np.all(magnitudes >= 0))
        self.assertTrue(np.all(magnitudes <= 1))
def test_list_permute_spectral(self):
    """
    Check that simulating the null from the spectral approach gives
    roughly the same distribution as doing actual permutations.
    """
    ps = data.PS2DSinFreq(freq=2)
    n_features = 5
    n_simulate = 3000
    n_permute = 3000
    for seed in [283, 2]:
        with util.NumpySeedContext(seed=seed):
            pdata = ps.sample(n=200, seed=seed + 1)
            X, Y = pdata.xy()

            sigmax2 = 1
            sigmay2 = 0.8
            fmx = feature.RFFKGauss(
                sigmax2, n_features=n_features, seed=seed + 3
            )
            fmy = feature.RFFKGauss(
                sigmay2, n_features=n_features, seed=seed + 23
            )
            Zx = fmx.gen_features(X)
            Zy = fmy.gen_features(Y)

            # Null statistics via actual permutations.
            list_perm = indtest.FiniteFeatureHSIC.list_permute(
                X, Y, fmx, fmy, n_permute=n_permute, seed=seed + 82
            )
            # Null statistics via the spectral simulation.
            list_spectral, _, _ = (
                indtest.FiniteFeatureHSIC.list_permute_spectral(
                    Zx, Zy, n_simulate=n_simulate, seed=seed + 119
                )
            )

            # The relative histogram frequencies should not differ much.
            freq_p, _ = np.histogram(list_perm)
            freq_s, _ = np.histogram(list_spectral)
            nfreq_p = freq_p / np.sum(freq_p)
            nfreq_s = freq_s / np.sum(freq_s)
            arr_diff = np.abs(nfreq_p - nfreq_s)
            self.assertTrue(np.all(arr_diff <= 0.2))
def test_approximation(self):
    """Z Z^T from random Fourier features approximates the Gauss kernel."""
    n, d = 100, 3
    X = np.random.rand(n, d) * 2 - 4
    sigma2 = 2.7
    feature_pairs = 50

    rff = feature.RFFKGauss(sigma2, feature_pairs, seed=2)
    Z = rff.gen_features(X)
    Krff = Z.dot(Z.T)

    # Compare against the exact Gaussian kernel matrix.
    exact_kernel = kernel.KGauss(sigma2)
    K = exact_kernel.eval(X, X)

    # Average entrywise error (Frobenius norm / n^2) should be small.
    diff = np.linalg.norm((Krff - K), "fro")
    self.assertLessEqual(diff / n ** 2, 0.5)
def test_general(self):
    """Feature matrix has shape (n, 2*feature_pairs) and is deterministic."""
    n, d = 31, 3
    X = np.random.rand(n, d) * 2 - 4
    sigma2 = 3.7
    feature_pairs = 51

    rff = feature.RFFKGauss(sigma2, feature_pairs, seed=2)
    first = rff.gen_features(X)
    second = rff.gen_features(X)

    # Each feature pair contributes a cosine and a sine column.
    self.assertEqual(first.shape[0], n)
    self.assertEqual(first.shape[1], 2 * feature_pairs)
    # Same seed, same input => identical features on repeated calls.
    np.testing.assert_array_almost_equal(first, second)