def run_full_MMD_test(x1, x2, y1, y2, alpha=0.01, output='stat'):
    '''
    Runs the quadratic-time MMD two-sample test on y1 vs. y2, with the
    Gaussian kernel width chosen by grid search.

    The data are split 50/50 (seed 10). The training half is used to pick
    the kernel width; the result is computed on the held-out test half.

    x1, x2: not referenced in the body; kept for interface compatibility.
    y1, y2: the two samples, passed to data.TSTData.
    alpha: significance level passed to the grid search and to QuadMMDTest.
    output: the desired output, one of 'stat', 'p_value', 'full'

    Returns the result of compute_stat, compute_pvalue or perform_test of
    the QuadMMDTest on the test half, depending on output.
    Raises ValueError if output is not one of the supported values.
    '''
    # Fail fast: an unknown mode used to run the whole pipeline and then
    # silently return None.
    if output not in ('stat', 'p_value', 'full'):
        raise ValueError(
            "output must be one of 'stat', 'p_value', 'full'; got %r"
            % (output,))

    tst_data = data.TSTData(y1, y2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)

    # Candidate squared widths: median distance squared, scaled by 20
    # factors spaced evenly in log2 between 2**-4 and 2**4.
    med = general_utils.meddistance(tr.stack_xy(), 1000)
    list_gwidth = np.sort((med ** 2) * (2.0 ** np.linspace(-4, 4, 20)))
    list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

    # grid search to choose the best Gaussian width
    besti, _ = QuadMMDTest.grid_search_kernel(tr, list_kernels, alpha)

    # perform test with the selected kernel and 200 permutations
    mmd_test = QuadMMDTest(list_kernels[besti], n_permute=200, alpha=alpha)
    if output == 'stat':
        return mmd_test.compute_stat(te)
    if output == 'p_value':
        return mmd_test.compute_pvalue(te)
    return mmd_test.perform_test(te)
def RHSIC(t1, y1, t2, y2, alpha=0.01, output='stat', opt_kernel=True):
    '''
    Runs the quadratic-time HSIC test on random features generated from
    y1 and y2.

    y1 and y2 are rescaled, turned into random features (num_feat=20) by
    general_utils.generate_random_features_ind, and split 50/50 into a
    training and a test half. Kernel widths are chosen on the training
    half; the result is computed on the test half.

    t1, t2: not referenced in the body; kept for interface compatibility.
    y1, y2: numpy arrays holding the two samples.
    alpha: significance level passed to compute_pvalue / perform_test.
    output: the desired output, one of 'stat', 'p_value', 'full'
    opt_kernel: if falsy, use the median-distance Gaussian widths directly;
        otherwise grid-search 5 widths around each median.

    Returns the result of compute_stat, compute_pvalue or perform_test of
    the QuadHSICTest on the test half, depending on output.
    Raises ValueError if output is not one of the supported values.
    '''
    # Fail fast: an unknown mode used to run the whole pipeline and then
    # silently return None.
    if output not in ('stat', 'p_value', 'full'):
        raise ValueError(
            "output must be one of 'stat', 'p_value', 'full'; got %r"
            % (output,))

    # rescale data
    # NOTE(review): each sample is divided by its own largest absolute
    # value, not by a joint maximum over both samples -- confirm this is
    # the intended normalisation.
    y1 = y1 / np.abs(y1).max()
    y2 = y2 / np.abs(y2).max()

    # generate random features
    X, Y = general_utils.generate_random_features_ind(y1, y2, num_feat=20)
    w1, w2 = [len(t) for t in y1], [len(t) for t in y2]
    tst_data = data.TSTData(X, Y, w1, w2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5)
    xtr, ytr = tr.xy()

    # Compute median pairwise distance
    med_x = general_utils.meddistance(xtr, 1000)
    med_y = general_utils.meddistance(ytr, 1000)

    if not opt_kernel:
        best_ker_x = kernel_utils.KGauss(med_x ** 2)
        best_ker_y = kernel_utils.KGauss(med_y ** 2)
    else:
        # Candidate squared widths: median squared, scaled by 5 factors
        # spaced evenly in log2 between 2**-1 and 2**1.
        scales = 2.0 ** np.linspace(-1, 1, 5)
        list_kernels_x = [kernel_utils.KGauss(gw2)
                          for gw2 in np.sort((med_x ** 2) * scales)]
        list_kernels_y = [kernel_utils.KGauss(gw2)
                          for gw2 in np.sort((med_y ** 2) * scales)]

        # grid search to choose the best Gaussian width
        bestix, bestiy, _, _ = QuadHSICTest.grid_search_kernel(
            tr, list_kernels_x, list_kernels_y)
        best_ker_x = list_kernels_x[bestix]
        best_ker_y = list_kernels_y[bestiy]

    hsic_test = QuadHSICTest(best_ker_x, best_ker_y)
    if output == 'stat':
        return hsic_test.compute_stat(te)
    if output == 'p_value':
        return hsic_test.compute_pvalue(te, alpha=alpha)
    return hsic_test.perform_test(te, alpha=alpha)
def Btest(wtst_data, B=None, output='stat', alpha=0.01):
    """
    B-Test: a block-based maximum mean discrepancy (MMD) kernel two-sample
    test.

    The samples are partitioned into blocks by BTest.block_indeces. For
    each block a quadratic-time MMD test is run with a grid-searched
    Gaussian kernel (50/50 train/test split, seed 10, 200 permutations),
    and the per-block results are averaged.

    :param wtst_data: data object providing x1x2y1y2(); only Y1 and Y2 are
        used here
    :param B: number of blocks, forwarded to BTest.block_indeces
    :param output: 'stat' to average the test statistics, 'p_value' to
        average the p-values
    :param alpha: significance level used for the kernel grid search and
        the per-block test
    :return: the per-block results summed and divided by the number of
        blocks in the first index list
    :raises ValueError: if output is neither 'stat' nor 'p_value'
    """
    # Fail fast: any other mode used to leave the accumulator untouched and
    # return 0 as if it were a real result.
    if output not in ('stat', 'p_value'):
        raise ValueError(
            "output must be one of 'stat', 'p_value'; got %r" % (output,))

    # list of indices for each block
    split_list_x1, split_list_x2 = BTest.block_indeces(
        wtst_data=wtst_data, B=B)
    _, _, Y1, Y2 = wtst_data.x1x2y1y2()

    out = 0
    for indeces_1, indeces_2 in zip(split_list_x1, split_list_x2):
        tst_data = data.TSTData(Y1[indeces_1, :], Y2[indeces_2, :])
        tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)

        # Candidate squared widths: median distance squared, scaled by 20
        # factors spaced evenly in log2 between 2**-4 and 2**4.
        med = general_utils.meddistance(tr.stack_xy(), 1000)
        list_gwidth = np.sort((med ** 2) * (2.0 ** np.linspace(-4, 4, 20)))
        list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

        # grid search to choose the best Gaussian width
        besti, _ = QuadMMDTest.grid_search_kernel(
            tr, list_kernels, alpha=alpha)

        # perform test on this block
        mmd_test = QuadMMDTest(
            list_kernels[besti], n_permute=200, alpha=alpha)
        if output == 'stat':
            out += mmd_test.compute_stat(te)
        else:
            out += mmd_test.compute_pvalue(te)

    return out / len(split_list_x1)
def HSIC(t1, y1, t2, y2, alpha=0.01, output='stat', opt_kernel=True):
    '''
    Runs the quadratic-time HSIC test on interpolated versions of the two
    samples.

    (t1, y1) and (t2, y2) are each passed through interpolate(); the second
    value it returns is used as the sample. The data are split 50/50 into
    a training and a test half. Kernel widths are chosen on the training
    half; the result is computed on the test half.

    t1, y1: first sample, passed to interpolate().
    t2, y2: second sample, passed to interpolate().
    alpha: significance level passed to compute_pvalue / perform_test.
    output: the desired output, one of 'stat', 'p_value', 'full'
    opt_kernel: if falsy, use the median-distance Gaussian widths directly;
        otherwise grid-search 5 widths around each median.

    Returns the result of compute_stat, compute_pvalue or perform_test of
    the QuadHSICTest on the test half, depending on output.
    Raises ValueError if output is not one of the supported values.
    '''
    # Fail fast: an unknown mode used to run the whole pipeline and then
    # silently return None.
    if output not in ('stat', 'p_value', 'full'):
        raise ValueError(
            "output must be one of 'stat', 'p_value', 'full'; got %r"
            % (output,))

    # interpolate; the first returned value is not needed afterwards
    _, X = interpolate(t1, y1)
    _, Y = interpolate(t2, y2)

    w1, w2 = [len(t) for t in y1], [len(t) for t in y2]
    tst_data = data.TSTData(X, Y, w1, w2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5)
    xtr, ytr = tr.xy()

    # Compute median pairwise distance
    med_x = general_utils.meddistance(xtr, 1000)
    med_y = general_utils.meddistance(ytr, 1000)

    if not opt_kernel:
        best_ker_x = kernel_utils.KGauss(med_x ** 2)
        best_ker_y = kernel_utils.KGauss(med_y ** 2)
    else:
        # Candidate squared widths: median squared, scaled by 5 factors
        # spaced evenly in log2 between 2**-1 and 2**1.
        scales = 2.0 ** np.linspace(-1, 1, 5)
        list_kernels_x = [kernel_utils.KGauss(gw2)
                          for gw2 in np.sort((med_x ** 2) * scales)]
        list_kernels_y = [kernel_utils.KGauss(gw2)
                          for gw2 in np.sort((med_y ** 2) * scales)]

        # grid search to choose the best Gaussian width
        bestix, bestiy, _, _ = QuadHSICTest.grid_search_kernel(
            tr, list_kernels_x, list_kernels_y)
        best_ker_x = list_kernels_x[bestix]
        best_ker_y = list_kernels_y[bestiy]

    hsic_test = QuadHSICTest(best_ker_x, best_ker_y)
    if output == 'stat':
        return hsic_test.compute_stat(te)
    if output == 'p_value':
        return hsic_test.compute_pvalue(te, alpha=alpha)
    return hsic_test.perform_test(te, alpha=alpha)
def RMMD(t1, y1, t2, y2, alpha=0.01, output='stat'):
    '''
    Runs the quadratic-time MMD two-sample test on random features
    generated from the two samples, with the Gaussian kernel width chosen
    by grid search.

    The features are split 50/50 (seed 10). The training half is used to
    pick the kernel width; the result is computed on the test half.

    t1, t2: passed with y1 / y2 to general_utils.generate_random_features.
        If t1 is None, features are built from y1 and y2 alone with
        generate_random_features_1d, and mean/variance of each sample is
        added to its features.
    y1, y2: the two samples.
    alpha: significance level passed to the grid search and to QuadMMDTest.
    output: the desired output, one of 'stat', 'p_value', 'full'

    Returns the result of compute_stat, compute_pvalue or perform_test of
    the QuadMMDTest on the test half, depending on output.
    Raises ValueError if output is not one of the supported values.
    '''
    # Fail fast: an unknown mode used to run the whole pipeline and then
    # silently return None.
    if output not in ('stat', 'p_value', 'full'):
        raise ValueError(
            "output must be one of 'stat', 'p_value', 'full'; got %r"
            % (output,))

    # Identity check: `t1 == None` is evaluated elementwise when t1 is a
    # numpy array, which makes the `if` raise "truth value is ambiguous".
    if t1 is None:
        # generate random features
        x1 = general_utils.generate_random_features_1d(
            y1, num_feat=100) + np.mean(y1) / np.var(y1)
        x2 = general_utils.generate_random_features_1d(
            y2, num_feat=100) + np.mean(y2) / np.var(y2)
    else:
        # generate random features
        x1 = general_utils.generate_random_features(t1, y1, num_feat=100)
        x2 = general_utils.generate_random_features(t2, y2, num_feat=100)

    # define training and testing sets
    w1, w2 = [len(t) for t in y1], [len(t) for t in y2]
    tst_data = data.TSTData(x1, x2, w1, w2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)

    # Candidate squared widths: median distance squared, scaled by 10
    # factors spaced evenly in log2 between 2**-3 and 2**3.
    med = general_utils.meddistance(tr.stack_xy(), 1000)
    list_gwidth = np.sort((med ** 2) * (2.0 ** np.linspace(-3, 3, 10)))
    list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

    # grid search to choose the best Gaussian width
    besti, _ = QuadMMDTest.grid_search_kernel(tr, list_kernels, alpha)

    # perform test with the selected kernel and 200 permutations
    mmd_test = QuadMMDTest(list_kernels[besti], n_permute=200, alpha=alpha)
    if output == 'stat':
        return mmd_test.compute_stat(te)
    if output == 'p_value':
        return mmd_test.compute_pvalue(te)
    return mmd_test.perform_test(te)