def run_full_MMD_test(x1, x2, y1, y2, alpha=0.01, output='stat'):
    '''
    Runs full MMD test with all optimization procedures included.
    output: the desired output, one of 'stat', 'p_value', 'full'
    '''
    tst_data = data.TSTData(y1, y2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)
    xtr, ytr = tr.xy()
    xytr = tr.stack_xy()
    sig2 = general_utils.meddistance(xytr, subsample=1000)
    k = kernel_utils.KGauss(sig2)

    # choose the best parameter and perform a test with permutations
    med = general_utils.meddistance(tr.stack_xy(), 1000)
    list_gwidth = np.hstack(((med**2) * (2.0**np.linspace(-4, 4, 20))))
    list_gwidth.sort()
    list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

    # grid search to choose the best Gaussian width
    besti, powers = QuadMMDTest.grid_search_kernel(tr, list_kernels, alpha)

    # perform the test with the selected kernel
    best_ker = list_kernels[besti]
    mmd_test = QuadMMDTest(best_ker, n_permute=200, alpha=alpha)
    if output == 'stat':
        return mmd_test.compute_stat(te)
    if output == 'p_value':
        return mmd_test.compute_pvalue(te)
    if output == 'full':
        return mmd_test.perform_test(te)
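
# Usage sketch (illustrative, not part of the original test suite): runs the
# full MMD test on synthetic data drawn with data.generate_samples_random, the
# same helper used in stat_comparisons_with_error below. The sample size and
# parameter values are arbitrary demonstration choices.
def example_run_full_MMD_test():
    x1, x2, y1, y2 = data.generate_samples_random(
        size=200, mu=0.5, var=1, dx=20, dy=20,
        noise="gaussian", f1='linear', f2='linear')
    # p-value under H0: the two samples y1, y2 come from the same distribution
    return run_full_MMD_test(x1, x2, y1, y2, alpha=0.01, output='p_value')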
def RHSIC(t1, y1, t2, y2, alpha=0.01, output='stat', opt_kernel=True):
    '''
    Runs full HSIC test on random-feature embeddings with all optimization
    procedures included.
    output: the desired output, one of 'stat', 'p_value', 'full'
    '''
    # rescale data
    max_y = max(np.concatenate((abs(y1.flatten()), abs(y2.flatten()))))
    y1 = y1 / max(abs(y1.flatten()))
    y2 = y2 / max(abs(y2.flatten()))
    sig = general_utils.meddistance(
        np.vstack((np.hstack((t1, y1)), np.hstack((t2, y2)))), subsample=1000)

    # generate random features
    X, Y = general_utils.generate_random_features_ind(y1, y2, num_feat=20)
    w1, w2 = [len(t) for t in y1], [len(t) for t in y2]
    tst_data = data.TSTData(X, Y, w1, w2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5)
    xtr, ytr = tr.xy()

    # compute median pairwise distances
    med_x = general_utils.meddistance(xtr, 1000)
    med_y = general_utils.meddistance(ytr, 1000)

    if not opt_kernel:
        best_ker_x = kernel_utils.KGauss(med_x**2)
        best_ker_y = kernel_utils.KGauss(med_y**2)
    else:
        list_gwidth = np.hstack(((med_x**2) * (2.0**np.linspace(-1, 1, 5))))
        list_gwidth.sort()
        list_kernels_x = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]
        list_gwidth = np.hstack(((med_y**2) * (2.0**np.linspace(-1, 1, 5))))
        list_gwidth.sort()
        list_kernels_y = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

        # grid search to choose the best Gaussian widths
        bestix, bestiy, _, _ = QuadHSICTest.grid_search_kernel(
            tr, list_kernels_x, list_kernels_y)
        best_ker_x = list_kernels_x[bestix]
        best_ker_y = list_kernels_y[bestiy]

    hsic_test = QuadHSICTest(best_ker_x, best_ker_y)
    if output == 'stat':
        return hsic_test.compute_stat(te)
    if output == 'p_value':
        return hsic_test.compute_pvalue(te, alpha=alpha)
    if output == 'full':
        return hsic_test.perform_test(te, alpha=alpha)
def Btest(wtst_data, B=None, output='stat'):
    """
    B-test: a fast maximum mean discrepancy (MMD) kernel two-sample test with
    low sample complexity, a tractable null distribution, and consistency.

    :param wtst_data: a data.WTSTData object holding the two samples
    :param B: number of blocks
    :param output: the desired output, one of 'stat', 'p_value'
    :return: block-averaged statistic or p-value for the hypothesis of equal
        sample distributions
    """
    # list of indices for each block
    split_list_x1, split_list_x2 = BTest.block_indeces(wtst_data=wtst_data, B=B)
    X1, X2, Y1, Y2 = wtst_data.x1x2y1y2()
    out = 0
    for indices_1, indices_2 in zip(split_list_x1, split_list_x2):
        tst_data = data.TSTData(Y1[indices_1, :], Y2[indices_2, :])
        tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)
        xtr, ytr = tr.xy()
        xytr = tr.stack_xy()
        sig2 = general_utils.meddistance(xytr, subsample=1000)
        k = kernel_utils.KGauss(sig2)

        # choose the best parameter and perform a test with permutations
        med = general_utils.meddistance(tr.stack_xy(), 1000)
        list_gwidth = np.hstack(((med**2) * (2.0**np.linspace(-4, 4, 20))))
        list_gwidth.sort()
        list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

        # grid search to choose the best Gaussian width
        besti, powers = QuadMMDTest.grid_search_kernel(tr, list_kernels, alpha=0.01)

        # perform the test on this block
        best_ker = list_kernels[besti]
        mmd_test = QuadMMDTest(best_ker, n_permute=200, alpha=0.01)
        if output == 'stat':
            out += mmd_test.compute_stat(te)
        if output == 'p_value':
            out += mmd_test.compute_pvalue(te)
    return out / len(split_list_x1)
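
# Usage sketch (illustrative): the B-test expects a data.WTSTData object, as
# constructed in stat_comparisons_with_error below; B controls how many blocks
# the sample is split into, trading power for speed. The values used here are
# arbitrary demonstration choices.
def example_Btest():
    x1, x2, y1, y2 = data.generate_samples_random(
        size=400, mu=0.5, var=1, dx=20, dy=20,
        noise="gaussian", f1='linear', f2='linear')
    wtst_data = data.WTSTData(x1, x2, y1, y2)
    # average p-value over 4 blocks
    return Btest(wtst_data, B=4, output='p_value')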
def HSIC(t1, y1, t2, y2, alpha=0.01, output='stat', opt_kernel=True):
    '''
    Runs full HSIC test on interpolated trajectories with all optimization
    procedures included.
    output: the desired output, one of 'stat', 'p_value', 'full'
    '''
    # interpolate
    t1, X = interpolate(t1, y1)
    t2, Y = interpolate(t2, y2)
    w1, w2 = [len(t) for t in y1], [len(t) for t in y2]
    tst_data = data.TSTData(X, Y, w1, w2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5)
    xtr, ytr = tr.xy()

    # compute median pairwise distances
    med_x = general_utils.meddistance(xtr, 1000)
    med_y = general_utils.meddistance(ytr, 1000)

    if not opt_kernel:
        best_ker_x = kernel_utils.KGauss(med_x**2)
        best_ker_y = kernel_utils.KGauss(med_y**2)
    else:
        list_gwidth = np.hstack(((med_x**2) * (2.0**np.linspace(-1, 1, 5))))
        list_gwidth.sort()
        list_kernels_x = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]
        list_gwidth = np.hstack(((med_y**2) * (2.0**np.linspace(-1, 1, 5))))
        list_gwidth.sort()
        list_kernels_y = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

        # grid search to choose the best Gaussian widths
        bestix, bestiy, _, _ = QuadHSICTest.grid_search_kernel(
            tr, list_kernels_x, list_kernels_y)
        best_ker_x = list_kernels_x[bestix]
        best_ker_y = list_kernels_y[bestiy]

    hsic_test = QuadHSICTest(best_ker_x, best_ker_y)
    if output == 'stat':
        return hsic_test.compute_stat(te)
    if output == 'p_value':
        return hsic_test.compute_pvalue(te, alpha=alpha)
    if output == 'full':
        return hsic_test.perform_test(te, alpha=alpha)
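
# Usage sketch (illustrative): HSIC expects two collections of trajectories.
# The list-of-arrays format below is an assumption inferred from the
# per-trajectory length computation `[len(t) for t in y1]` inside HSIC; the
# exact format required by `interpolate` may differ in the original code.
def example_HSIC():
    rng = np.random.RandomState(0)
    # 50 trajectories per group, each observed at 30 sorted random times
    t1 = [np.sort(rng.uniform(0, 1, 30)) for _ in range(50)]
    y1 = [np.sin(2 * np.pi * t) + 0.1 * rng.randn(30) for t in t1]
    t2 = [np.sort(rng.uniform(0, 1, 30)) for _ in range(50)]
    y2 = [np.cos(2 * np.pi * t) + 0.1 * rng.randn(30) for t in t2]
    return HSIC(t1, y1, t2, y2, alpha=0.01, output='p_value')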
def stat_comparisons_with_error(methods, param_name, params, size=300, mu=0,
                                var=1, dx=20, dy=20, output='stat'):
    '''
    Runs a full performance comparison across methods.
    methods: list of test functions to be compared
    param_name: name of the parameter to iterate over
    params: parameter values to iterate over
    '''
    stat_values = defaultdict(int)
    for param in params:
        # select which parameter to vary
        if param_name == 'mu':
            mu = param
        if param_name == 'var':
            var = param
        if param_name == 'dx':
            dx = param
        if param_name == 'dy':
            dy = param
        if param_name == 'prop':
            prop = param
        if param_name == 'size':
            size = param

        # create the dataset for this parameter value
        x1, x2, y1, y2 = data.generate_samples_random(
            size=size, mu=mu, var=var, dx=dx, dy=dy,
            noise="gaussian", f1='linear', f2='linear')

        # run each test on the generated data and record the requested output
        for method in methods:
            method_name = method.__name__
            key = 'method: {}; param value: {} '.format(method_name, param)
            tic = time.time()
            stat = method(x1, x2, y1, y2, output=output)
            toc = (time.time() - tic) / 2.
            stat_values[key] = stat

        # record the KMM approximation error for this parameter value
        key = 'KMM approx. error; param value: {} '.format(param)
        wtst_data = data.WTSTData(x1, x2, y1, y2)
        x1x2 = wtst_data.stack_x1x2()
        sig2x = meddistance(x1x2, subsample=1000)
        kx = kernel_utils.KGauss(sig2x)
        mmd_test = tst.WQuadMMDTest(kx, kx, n_permute=200, alpha=0.01)
        error = mmd_test.print_objective_KMM(x1, x2, kx, B=5)
        stat_values[key] = error
    return stat_values
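
# Usage sketch (illustrative): compares two of the tests defined in this file
# while sweeping the mean shift mu; the parameter grid and sample size are
# arbitrary demonstration choices.
def example_stat_comparisons():
    methods = [run_full_MMD_test, run_full_WMMD_test]
    results = stat_comparisons_with_error(
        methods, param_name='mu', params=[0.0, 0.5, 1.0],
        size=300, output='p_value')
    for key, value in results.items():
        print(key, value)
    return results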
def run_full_WMMD_test(x1, x2, y1, y2, alpha=0.01, output='stat'):
    '''
    Runs full WMMD test with all optimization procedures included.
    output: the desired output, one of 'stat', 'p_value', 'full'
    Requires the same number of instances in both populations.
    '''
    wtst_data = data.WTSTData(x1, x2, y1, y2)
    tr, te = wtst_data.split_tr_te(tr_proportion=0.5)
    y1y2 = tr.stack_y1y2()
    x1x2 = tr.stack_x1x2()
    sig2y = general_utils.meddistance(y1y2, subsample=1000)
    sig2x = general_utils.meddistance(x1x2, subsample=1000)
    k = kernel_utils.KGauss(sig2y)
    kx = kernel_utils.KGauss(sig2x)

    # choose the best parameter and perform a test with permutations
    med = general_utils.meddistance(tr.stack_y1y2(), 1000)
    list_gwidth = np.hstack(((med**2) * (2.0**np.linspace(-4, 4, 20))))
    list_gwidth.sort()
    list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

    # grid search to choose the best Gaussian width
    besti, powers = WQuadMMDTest.grid_search_kernel(tr, list_kernels, kx, alpha)

    # perform the test with the selected kernel
    best_ker = list_kernels[besti]
    mmd_test = WQuadMMDTest(best_ker, kx, n_permute=200, alpha=alpha)
    if output == 'stat':
        return mmd_test.compute_stat(te)
    if output == 'p_value':
        return mmd_test.compute_pvalue(te)
    if output == 'full':
        return mmd_test.perform_test(te)
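
# Usage sketch (illustrative): the weighted MMD test takes the covariates
# x1, x2 alongside the outcomes y1, y2, presumably so that differences in the
# covariate distributions can be reweighted away (cf. the KMM objective used in
# stat_comparisons_with_error and test_KMM). Values below are demonstration
# choices only.
def example_run_full_WMMD_test():
    x1, x2, y1, y2 = data.generate_samples_random(
        size=200, mu=0.5, var=1, dx=20, dy=20,
        noise="gaussian", f1='linear', f2='linear')
    return run_full_WMMD_test(x1, x2, y1, y2, alpha=0.01, output='p_value')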
def RMMD(t1, y1, t2, y2, alpha=0.01, output='stat'):
    '''
    Runs full MMD test on random-feature embeddings with all optimization
    procedures included.
    output: the desired output, one of 'stat', 'p_value', 'full'
    '''
    if t1 is None:
        # generate random features from the observations alone
        x1 = general_utils.generate_random_features_1d(
            y1, num_feat=100) + np.mean(y1) / np.var(y1)
        x2 = general_utils.generate_random_features_1d(
            y2, num_feat=100) + np.mean(y2) / np.var(y2)
    else:
        # generate random features from the (time, observation) pairs
        x1 = general_utils.generate_random_features(t1, y1, num_feat=100)
        x2 = general_utils.generate_random_features(t2, y2, num_feat=100)

    # define training and testing sets
    w1, w2 = [len(t) for t in y1], [len(t) for t in y2]
    tst_data = data.TSTData(x1, x2, w1, w2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)
    xtr, ytr = tr.xy()
    xytr = tr.stack_xy()

    # choose the best parameter and perform a test with permutations
    med = general_utils.meddistance(tr.stack_xy(), 1000)
    list_gwidth = np.hstack(((med**2) * (2.0**np.linspace(-3, 3, 10))))
    list_gwidth.sort()
    list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

    # grid search to choose the best Gaussian width
    besti, powers = QuadMMDTest.grid_search_kernel(tr, list_kernels, alpha)

    # perform the test with the selected kernel
    best_ker = list_kernels[besti]
    mmd_test = QuadMMDTest(best_ker, n_permute=200, alpha=alpha)
    if output == 'stat':
        return mmd_test.compute_stat(te)
    if output == 'p_value':
        return mmd_test.compute_pvalue(te)
    if output == 'full':
        return mmd_test.perform_test(te)
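
# Usage sketch (illustrative): RMMD embeds each trajectory with random features
# before running the MMD test. As in the HSIC example above, the list-of-arrays
# input format is an assumption inferred from `[len(t) for t in y1]`; the exact
# format expected by generate_random_features may differ in the original code.
def example_RMMD():
    rng = np.random.RandomState(1)
    t1 = [np.sort(rng.uniform(0, 1, 30)) for _ in range(50)]
    y1 = [np.sin(2 * np.pi * t) + 0.1 * rng.randn(30) for t in t1]
    t2 = [np.sort(rng.uniform(0, 1, 30)) for _ in range(50)]
    y2 = [np.sin(2 * np.pi * t) + 0.3 * rng.randn(30) for t in t2]
    return RMMD(t1, y1, t2, y2, alpha=0.01, output='p_value')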
def test_KMM():
    # sample a cloud along a parabola: x lies in [-6, 5]
    x = 11 * np.random.random(200) - 6.0
    y = x**2 + 10 * np.random.random(200) - 5
    x1 = np.c_[x, y]

    # sample a second cloud restricted to x in [-6, -4]
    x = 2 * np.random.random(100) - 6.0
    y = x**2 + 10 * np.random.random(100) - 5
    x2 = np.c_[x, y]

    x1x2 = np.vstack((x1, x2))
    sig2 = general_utils.meddistance(x1x2, subsample=1000)
    print(sig2)
    k = kernel_utils.KGauss(sig2)

    # compute kernel mean matching importance weights for the points in x1
    coef = tst.WQuadMMDTest.kernel_mean_matching(x1, x2, k, B=10)

    plt.close()
    plt.figure()
    plt.scatter(x1[:, 0], x1[:, 1], color='black', marker='x')
    plt.scatter(x2[:, 0], x2[:, 1], color='red')
    plt.scatter(x1[:, 0], x1[:, 1], color='green', s=coef * 10, alpha=0.5)
    # number of points receiving non-negligible weight
    print(np.sum(coef > 1e-2))