def job_nfsicJ10_med(paired_source, tr, te, r, n_permute=None):
    """
    NFSIC test with J=10 randomly drawn test locations and kernels obtained
    from kl_kgauss_median (Gaussian width set with the median heuristic).

    The training and test parts are merged, so the full sample is used for
    the test. No training/testing split.

    paired_source: not used by this job.
    tr, te: the two parts of the data; combined with `tr + te`.
    r: repetition index; every seed used here is an offset of r.
    n_permute: passed through unchanged to it.NFSIC.

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    n_locs = 10
    full_data = tr + te
    with util.ContextTimer() as timer:
        # Alternative initializations that were tried and disabled:
        #   it.GaussNFSIC.init_locs_2randn(pdata, J, seed=r+2)
        #   it.GaussNFSIC.init_locs_joint_randn(pdata, J, seed=r+2)
        # (original note: "May overfit and increase type-I errors?")
        with util.NumpySeedContext(seed=r + 92):
            dim_x = full_data.dx()
            dim_y = full_data.dy()
            # V must be drawn before W so the random stream is unchanged.
            V = np.random.randn(n_locs, dim_x)
            W = np.random.randn(n_locs, dim_y)

        kern_x, kern_y = kl_kgauss_median(full_data)
        test = it.NFSIC(
            kern_x,
            kern_y,
            V,
            W,
            alpha=alpha,
            reg='auto',
            n_permute=n_permute,
            seed=r + 3,
        )
        outcome = test.perform_test(full_data)

    # timer.secs is read only after the timer block has been exited.
    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_rdcperm_med(paired_source, tr, te, r, n_features=10):
    """
    Randomized Dependence Coefficient test with permutations.

    Both X and Y are passed through fea.MarginalCDFMap (copula transform)
    and the squared median distance of the transformed data is used as the
    Gaussian width of the random Fourier features. The merged sample
    `tr + te` is used for the test.

    paired_source: not used by this job.
    r: repetition index; every seed used here is an offset of r.
    n_features: number of features given to each fea.RFFKGauss
        (n_features=10 is from the Lopez-Paz et al., 2013 paper).

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    full_data = tr + te
    n_permute = 500
    with util.ContextTimer() as timer:
        X, Y = full_data.xy()

        # Copula transform of both X and Y.
        copula = fea.MarginalCDFMap()
        X_cdf = copula.gen_features(X)
        Y_cdf = copula.gen_features(Y)

        # Squared median distances on the transformed data.
        width_x2 = util.meddistance(X_cdf, subsample=1000) ** 2
        width_y2 = util.meddistance(Y_cdf, subsample=1000) ** 2

        feat_x = fea.RFFKGauss(width_x2, n_features=n_features, seed=r + 19)
        feat_y = fea.RFFKGauss(width_y2, n_features=n_features, seed=r + 220)
        test = it.RDCPerm(
            feat_x, feat_y, n_permute=n_permute, alpha=alpha, seed=r + 100
        )
        outcome = test.perform_test(full_data)

    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_fhsic_med(paired_source, tr, te, r):
    """
    HSIC with random Fourier features. The null distribution is simulated
    with the spectrums of the empirical cross covariance operators.

    - Gaussian kernels.
    - No parameter selection procedure: the median heuristic is used for
      both X and Y.
    - The full sample (`tr + te`) is used for testing.

    paired_source: not used by this job.
    r: repetition index; every seed used here is an offset of r.

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    n_simulate = 2000
    # number of random features
    n_features = 10
    # Merge training and test sets; the whole sample is tested.
    full_data = tr + te

    with util.ContextTimer() as timer:
        X, Y = full_data.xy()
        width_x2 = util.meddistance(X, subsample=1000) ** 2
        width_y2 = util.meddistance(Y, subsample=1000) ** 2

        feat_x = fea.RFFKGauss(width_x2, n_features=n_features, seed=r + 1)
        feat_y = fea.RFFKGauss(width_y2, n_features=n_features, seed=r + 2)
        test = it.FiniteFeatureHSIC(
            feat_x, feat_y, n_simulate=n_simulate, alpha=alpha, seed=r + 89
        )
        outcome = test.perform_test(full_data)

    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_nyhsic_med(paired_source, tr, te, r):
    """
    HSIC with Nystrom approximation. The null distribution is simulated
    with the spectrums of the empirical cross covariance operators.

    - Gaussian kernels.
    - No parameter selection procedure: the median heuristic is used for
      both X and Y.
    - The full sample (`tr + te`) is used for testing.

    paired_source: not used by this job.
    r: repetition index; every seed used here is an offset of r.

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    n_simulate = 2000
    # number of rows subsampled from each of X and Y as inducing points
    n_features = 10
    # Merge training and test sets; the whole sample is tested.
    full_data = tr + te

    with util.ContextTimer() as timer:
        X, Y = full_data.xy()
        kern_x, kern_y = kl_kgauss_median(full_data)

        # Randomly choose the inducing points from X and from Y.
        inducing_x = util.subsample_rows(X, n_features, seed=r + 2)
        inducing_y = util.subsample_rows(Y, n_features, seed=r + 3)

        test = it.NystromHSIC(
            kern_x,
            kern_y,
            inducing_x,
            inducing_y,
            n_simulate=n_simulate,
            alpha=alpha,
            seed=r + 89,
        )
        outcome = test.perform_test(full_data)

    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def compute(self):
    """
    Run one repetition of the job function and record its result.

    Steps: sample `self.n` points from `self.paired_source` (seed
    `self.rep`), split them into training/test parts, call
    `self.job_func(paired_source, tr, te, r)`, submit the result to
    `self.aggregator`, and save it with glo.ex_save_result.

    `tr_proportion`, `alpha` and `ex` are read from module scope.
    """
    # Randomly wait a few seconds so that multiple processes accessing the
    # same Theano function do not cause a lock problem. It is not known
    # whether this actually helps.
    # np.random.rand() yields a scalar; the former np.random.rand(1) handed a
    # 1-element array to time.sleep, which relies on the array-to-scalar
    # conversion deprecated in recent NumPy releases.
    time.sleep(np.random.rand() * 3)

    paired_source = self.paired_source
    r = self.rep
    n = self.n
    job_func = self.job_func
    prob_label = self.prob_label
    func_name = job_func.__name__

    pdata = paired_source.sample(n, seed=r)
    with util.ContextTimer() as t:
        logger.info("computing. %s. prob=%s, r=%d, n=%d" %
                    (func_name, pdata.label, r, n))
        tr, te = pdata.split_tr_te(tr_proportion=tr_proportion, seed=r + 21)

        job_result = job_func(paired_source, tr, te, r)

        # Wrap the result and submit it to this job's own aggregator.
        result = SingleResult(job_result)
        self.aggregator.submit_result(result)

    # t.secs is read only after the timer block has been exited.
    logger.info("done. ex1: %s, prob=%s, r=%d, n=%d. Took: %.3g s " %
                (func_name, pdata.label, r, n, t.secs))

    # save result
    fname = '%s-%s-r%d_n%d_a%.3f_trp%.2f.p' \
        % (prob_label, func_name, r, n, alpha, tr_proportion)
    glo.ex_save_result(ex, job_result, prob_label, fname)
def job_qhsic_med(paired_source, tr, te, r):
    """
    Quadratic-time HSIC using the permutation test.

    - Gaussian kernels.
    - No parameter selection procedure: the median heuristic is used for
      both X and Y.
    - The full sample (`tr + te`) is used for testing.

    When the merged sample has 5000 points or more the test is not run;
    a placeholder result (pvalue 1, H0 not rejected, zero time) is
    returned instead.

    paired_source: not used by this job.
    r: repetition index; the test seed is r + 1.

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    # Merge training and test sets; the whole sample is tested.
    full_data = tr + te
    n_permute = 500

    def make_test():
        # Build the test object with median-heuristic Gaussian kernels.
        kern_x, kern_y = kl_kgauss_median(full_data)
        return it.QuadHSIC(kern_x, kern_y, n_permute, alpha=alpha, seed=r + 1)

    if full_data.sample_size() < 5000:
        # Actually do the test. Kernel construction is part of the timing.
        with util.ContextTimer() as timer:
            test = make_test()
            outcome = test.perform_test(full_data)
        return {
            'indtest': test,
            'test_result': outcome,
            'time_secs': timer.secs,
        }

    # Give up: the sample is too big for the quadratic-time test.
    test = make_test()
    placeholder = {
        'alpha': alpha,
        'pvalue': 1,
        'test_stat': -1,
        'h0_rejected': False,
        'time_secs': 0,
        'n_permute': n_permute,
    }
    return {'indtest': test, 'test_result': placeholder, 'time_secs': 0}
def job_nfsicJ3_opt(paired_source, tr, te, r, J=3):
    """
    NFSIC with optimized test locations.

    The test locations and Gaussian widths are obtained from
    it.GaussNFSIC.optimize_locs_widths on `tr`; the test itself is
    performed on `te`.

    paired_source: not used by this job.
    r: repetition index; every seed used here is an offset of r.
    J: number of test locations ('n_test_locs').

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    with util.ContextTimer() as timer:
        opt_kwargs = dict(
            n_test_locs=J,
            max_iter=200,
            V_step=1,
            W_step=1,
            gwidthx_step=1,
            gwidthy_step=1,
            batch_proportion=1.0,
            tol_fun=1e-4,
            step_pow=0.5,
            seed=r + 2,
            reg=1e-6,
        )
        # The last returned item (optimization info) is not used here.
        V_opt, W_opt, gwx_opt, gwy_opt, _ = it.GaussNFSIC.optimize_locs_widths(
            tr, alpha, **opt_kwargs
        )
        test = it.GaussNFSIC(
            gwx_opt, gwy_opt, V_opt, W_opt, alpha, reg='auto', seed=r + 3
        )
        outcome = test.perform_test(te)

    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_rdcperm_nc_med(paired_source, tr, te, r, n_features=10):
    """
    Randomized Dependence Coefficient test with permutations, without the
    copula transformation (use_copula=False).

    The squared median distance of the raw X and Y is used as the Gaussian
    width of the random Fourier features. The merged sample `tr + te` is
    used for the test.

    paired_source: not used by this job.
    r: repetition index; every seed used here is an offset of r.
    n_features: number of features given to each fea.RFFKGauss
        (n_features=10 is from the Lopez-Paz et al., 2013 paper).

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    full_data = tr + te
    n_permute = 500
    with util.ContextTimer() as timer:
        X, Y = full_data.xy()

        # Squared median distances on the untransformed data.
        width_x2 = util.meddistance(X, subsample=1000) ** 2
        width_y2 = util.meddistance(Y, subsample=1000) ** 2

        feat_x = fea.RFFKGauss(width_x2, n_features=n_features, seed=r + 19)
        feat_y = fea.RFFKGauss(width_y2, n_features=n_features, seed=r + 220)
        test = it.RDCPerm(
            feat_x,
            feat_y,
            n_permute=n_permute,
            alpha=alpha,
            seed=r + 100,
            use_copula=False,
        )
        outcome = test.perform_test(full_data)

    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_rdc_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test.

    - Gaussian width = median heuristic on the copula-transformed data.
    - n_features random features for each of X and Y (the default of 10 is
      from the Lopez-Paz et al., 2013 paper).
    - The full dataset (`tr + te`) is used for testing.

    paired_source: not used by this job.
    r: repetition index; the feature-map seeds are offsets of r.

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    full_data = tr + te
    with util.ContextTimer() as timer:
        X, Y = full_data.xy()

        # Copula transform of both X and Y.
        copula = fea.MarginalCDFMap()
        X_cdf = copula.gen_features(X)
        Y_cdf = copula.gen_features(Y)

        # Squared median distances on the transformed data.
        width_x2 = util.meddistance(X_cdf, subsample=1000) ** 2
        width_y2 = util.meddistance(Y_cdf, subsample=1000) ** 2

        feat_x = fea.RFFKGauss(width_x2, n_features=n_features, seed=r + 19)
        feat_y = fea.RFFKGauss(width_y2, n_features=n_features, seed=r + 220)
        test = it.RDC(feat_x, feat_y, alpha=alpha)
        outcome = test.perform_test(full_data)

    return {'indtest': test, 'test_result': outcome, 'time_secs': timer.secs}
def job_nfsic_grid(paired_source, tr, te, r):
    """
    NFSIC where the test locations are randomized, and the Gaussian widths
    are optimized by a grid search.

    The locations and the grid search use `tr`; the test is performed on
    `te`. Candidate squared widths are the squared median distances scaled
    by 30 factors 2**linspace(-4, 4).

    paired_source: not used by this job.
    r: repetition index; the location seed is r + 2.

    `J` (number of test locations) and `alpha` are read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    with util.ContextTimer() as timer:
        # Randomize the test locations by fitting Gaussians to the data.
        V, W = it.GaussNFSIC.init_locs_2randn(tr, J, seed=r + 2)
        x_train, y_train = tr.xy()

        n_candidates = 30
        factors_x = 2.0 ** np.linspace(-4, 4, n_candidates)
        # The same factors are used for Y.
        factors_y = factors_x

        med_x = util.meddistance(x_train, 1000)
        med_y = util.meddistance(y_train, 1000)
        cand_x = np.hstack(((med_x ** 2) * factors_x))
        cand_y = np.hstack(((med_y ** 2) * factors_y))

        # Second returned item is not needed here.
        best_idx, _ = it.GaussNFSIC.grid_search_gwidth(tr, V, W, cand_x, cand_y)

        # These are width^2.
        chosen_x = cand_x[best_idx[0]]
        chosen_y = cand_y[best_idx[1]]

        # Perform the test on the held-out part.
        test = it.GaussNFSIC(chosen_x, chosen_y, V, W, alpha)
        outcome = test.perform_test(te)

    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_nfsicJ10_stoopt(paired_source, tr, te, r, n_permute=None):
    """
    NFSIC with J=10 test locations, where locations and Gaussian widths
    come from it.GaussNFSIC.optimize_locs_widths run on `tr`.

    The widths from kl_kgauss_median(tr) serve as reference values: they
    define the bounds given to the optimizer, and the optimized widths are
    afterwards clamped to [5e-2, 5e3] times the reference (and to at least
    1e-5). The test is performed on `te`.

    paired_source: not used by this job.
    r: repetition index; every seed used here is an offset of r.
    n_permute: passed through unchanged to it.GaussNFSIC.

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    n_locs = 10
    # Reference widths; computed before the timer starts.
    kern_x, kern_y = kl_kgauss_median(tr)
    ref_x2 = kern_x.sigma2
    ref_y2 = kern_y.sigma2
    low_factor = 5e-2
    high_factor = 5e3

    with util.ContextTimer() as timer:
        opt_kwargs = dict(
            n_test_locs=n_locs,
            max_iter=100,
            V_step=1,
            W_step=1,
            gwidthx_step=1,
            gwidthy_step=1,
            batch_proportion=1,
            tol_fun=1e-4,
            step_pow=0.5,
            seed=r + 2,
            reg=1e-6,
            gwidthx_lb=ref_x2 * 1e-3,
            gwidthx_ub=ref_x2 * 1e3,
            gwidthy_lb=ref_y2 * 1e-3,
            gwidthy_ub=ref_y2 * 1e3,
        )
        # The last returned item (optimization info) is not used here.
        V_opt, W_opt, gwx, gwy, _ = it.GaussNFSIC.optimize_locs_widths(
            tr, alpha, **opt_kwargs
        )

        # Make sure the optimized widths are not too extreme.
        gwx = max(low_factor * ref_x2, 1e-5, min(high_factor * ref_x2, gwx))
        gwy = max(low_factor * ref_y2, 1e-5, min(high_factor * ref_y2, gwy))

        test = it.GaussNFSIC(
            gwx,
            gwy,
            V_opt,
            W_opt,
            alpha=alpha,
            reg='auto',
            n_permute=n_permute,
            seed=r + 3,
        )
        outcome = test.perform_test(te)

    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_qhsic_med(paired_source, tr, te, r):
    """
    Quadratic-time HSIC using the permutation test (300 permutations).

    - Gaussian kernels.
    - No parameter selection procedure: the median heuristic is used for
      both X and Y.
    - The full sample (`tr + te`) is used for testing.

    paired_source: not used by this job.
    r: repetition index; the test seed is r + 1.

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    # Merge training and test sets; the whole sample is tested.
    full_data = tr + te
    n_permute = 300

    with util.ContextTimer() as timer:
        kern_x, kern_y = kl_kgauss_median(full_data)
        test = it.QuadHSIC(kern_x, kern_y, n_permute, alpha=alpha, seed=r + 1)
        outcome = test.perform_test(full_data)

    return {'indtest': test, 'test_result': outcome, 'time_secs': timer.secs}
def job_nfsic_med(paired_source, tr, te, r):
    """
    NFSIC in which the test locations are randomized, and the Gaussian
    width is set with the median heuristic.

    The full sample (`tr + te`) is used. No training/testing split.

    paired_source: not used by this job.
    r: repetition index; the location seed is r + 2.

    `J` (number of test locations) and `alpha` are read from module scope.

    Return a dictionary with keys 'test_result' and 'time_secs'. The test
    object itself is not part of the returned dictionary.
    """
    full_data = tr + te
    with util.ContextTimer() as timer:
        V, W = it.GaussNFSIC.init_locs_2randn(full_data, J, seed=r + 2)
        kern_x, kern_y = kl_kgauss_median(full_data)
        test = it.NFSIC(kern_x, kern_y, V, W, alpha=alpha, reg='auto')
        outcome = test.perform_test(full_data)

    # An 'indtest' entry holding the test object was disabled here.
    return {
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def compute(self):
    """
    Load the problem, run one repetition of the job function and record
    its result.

    Steps: load the paired source with exglo.get_problem_pickle from
    `self.folder_path` using the name `self.prob_label + '.n0'`, sample
    `self.n` points (seed `self.rep`), split them into training/test
    parts, call `self.job_func(paired_source, tr, te, r)`, submit the
    result to `self.aggregator`, and save it with glo.ex_save_result.

    `tr_proportion`, `alpha` and `ex` are read from module scope.
    """
    # Randomly wait a few seconds so that multiple processes accessing the
    # same Theano function do not cause a lock problem. It is not known
    # whether this actually helps.
    # np.random.rand() yields a scalar; the former np.random.rand(1) handed a
    # 1-element array to time.sleep, which relies on the array-to-scalar
    # conversion deprecated in recent NumPy releases.
    time.sleep(np.random.rand() * 2)

    # Load the data and construct a PairedSource here.
    # The data can be big, so each computing node loads it by itself
    # (no data passing).
    folder_path = self.folder_path
    prob_label = self.prob_label
    # Only the first returned item (the paired source) is needed.
    paired_source, _, _ = exglo.get_problem_pickle(
        folder_path, prob_label + '.n0')

    n = self.n
    r = self.rep
    job_func = self.job_func
    func_name = job_func.__name__

    pdata = paired_source.sample(n, seed=r)
    with util.ContextTimer() as t:
        logger.info("computing. %s. prob=%s, r=%d, n=%d" %
                    (func_name, pdata.label, r, n))
        tr, te = pdata.split_tr_te(tr_proportion=tr_proportion, seed=r + 21)

        job_result = job_func(paired_source, tr, te, r)

        # Wrap the result and submit it to this job's own aggregator.
        result = SingleResult(job_result)
        self.aggregator.submit_result(result)

    # t.secs is read only after the timer block has been exited.
    logger.info("done. ex1: %s, prob=%s, r=%d, n=%d. Took: %.3g s " %
                (func_name, pdata.label, r, n, t.secs))

    # save result
    fname = '%s-%s-r%d_n%d_a%.3f_trp%.2f.p' \
        % (prob_label, func_name, r, n, alpha, tr_proportion)
    glo.ex_save_result(ex, job_result, prob_label, fname)
def job_nfsicJ10_cperm_stoopt(paired_source, tr, te, r):
    """
    Copula-transformed variant of job_nfsicJ10_stoopt with permutations.

    - Copula transform the data (fea.MarginalCDFMap applied to each of the
      X and Y parts of `tr` and `te`).
    - Use permutations (n_permute=500) to simulate from the null
      distribution.

    The 'time_secs' entry returned by job_nfsicJ10_stoopt is overwritten
    with the time measured here, which includes the transformation.

    Return the (updated) dictionary produced by job_nfsicJ10_stoopt.
    """
    n_permute = 500
    with util.ContextTimer() as timer:
        # Copula transform of both X and Y, for each data part.
        copula = fea.MarginalCDFMap()
        x_train, y_train = tr.xy()
        x_test, y_test = te.xy()
        u_train, v_train, u_test, v_test = [
            copula.gen_features(block)
            for block in (x_train, y_train, x_test, y_test)
        ]

        tr_copula = data.PairedData(u_train, v_train)
        te_copula = data.PairedData(u_test, v_test)
        result = job_nfsicJ10_stoopt(
            paired_source, tr_copula, te_copula, r, n_permute
        )

    # timer.secs is read only after the timer block has been exited.
    result['time_secs'] = timer.secs
    return result
def job_nfsicJ3_perm_stoopt(paired_source, tr, te, r):
    """
    NFSIC with J=3 optimized test locations, using permutations
    (n_permute=500) to simulate from the null distribution.

    The test locations and Gaussian widths are obtained from
    it.GaussNFSIC.optimize_locs_widths on `tr`; the test itself is
    performed on `te`.

    paired_source: not used by this job.
    r: repetition index; every seed used here is an offset of r.

    `alpha` is read from module scope.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    n_permute = 500
    n_locs = 3
    with util.ContextTimer() as timer:
        opt_kwargs = dict(
            n_test_locs=n_locs,
            max_iter=300,
            V_step=1,
            W_step=1,
            gwidthx_step=1,
            gwidthy_step=1,
            batch_proportion=0.7,
            tol_fun=1e-4,
            step_pow=0.5,
            seed=r + 2,
            reg=1e-6,
        )
        # The last returned item (optimization info) is not used here.
        V_opt, W_opt, gwx_opt, gwy_opt, _ = it.GaussNFSIC.optimize_locs_widths(
            tr, alpha, **opt_kwargs
        )
        test = it.GaussNFSIC(
            gwx_opt,
            gwy_opt,
            V_opt,
            W_opt,
            alpha,
            reg='auto',
            n_permute=n_permute,
            seed=r + 3,
        )
        outcome = test.perform_test(te)

    return {
        'indtest': test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }