def test_basic(self):
    """
    Test basic things. Make sure SC_UME runs under normal usage.
    """
    mp, varp = 4, 1
    # q must not be exactly the true (data-generating) model. If it were,
    # our assumption would be violated and the asymptotic null distribution
    # would not hold.
    mq, varq = 0.5, 1

    # draw some data
    n = 2999  # sample size
    seed = 89
    with util.NumpySeedContext(seed=seed):
        X = np.random.randn(n, 1)*varp**0.5 + mp
        Y = np.random.randn(n, 1)*varq**0.5 + mq
        Z = np.random.randn(n, 1)

        datap = data.Data(X)
        dataq = data.Data(Y)
        datar = data.Data(Z)

    # hyperparameters of the test: median-heuristic Gaussian widths
    medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
    medyz = util.meddistance(np.vstack((Y, Z)), subsample=1000)
    k = kernel.KGauss(sigma2=medxz**2)
    l = kernel.KGauss(sigma2=medyz**2)

    # 2 sets of test locations, drawn from Gaussians fitted to X and Y
    J = 3
    Jp = J
    Jq = J
    V = util.fit_gaussian_draw(X, Jp, seed=seed+2)
    W = util.fit_gaussian_draw(Y, Jq, seed=seed+3)

    # construct a UME three-sample test
    alpha = 0.01  # significance level
    scume = mct.SC_UME(datap, dataq, k, l, V, W, alpha=alpha)
    test_result = scume.perform_test(datar)

    # p (mean 4) is much further from r (mean 0) than q (mean 0.5) is,
    # so the test should reject H0.
    #print(test_result)
    assert test_result['h0_rejected']

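# A minimal sketch (not part of the original test suite; the method name is
# hypothetical and self is unused) of the complementary case: here p is the
# model closer to r, so the test should NOT reject H0. It reuses only the
# calls exercised in test_basic above and assumes the same module-level
# imports (np, data, kernel, util, mct).
def sketch_sc_ume_no_rejection(self):
    mp, varp = 0.5, 1   # p = N(0.5, 1): the better of the two models
    mq, varq = 4, 1     # q = N(4, 1): clearly worse
    n = 2999
    seed = 89
    with util.NumpySeedContext(seed=seed):
        X = np.random.randn(n, 1)*varp**0.5 + mp
        Y = np.random.randn(n, 1)*varq**0.5 + mq
        Z = np.random.randn(n, 1)          # r = N(0, 1)
    datap, dataq, datar = data.Data(X), data.Data(Y), data.Data(Z)

    # median-heuristic Gaussian kernels, as in test_basic
    k = kernel.KGauss(sigma2=util.meddistance(np.vstack((X, Z)), subsample=1000)**2)
    l = kernel.KGauss(sigma2=util.meddistance(np.vstack((Y, Z)), subsample=1000)**2)

    # J test locations per set, drawn from Gaussians fitted to X and Y
    J = 3
    V = util.fit_gaussian_draw(X, J, seed=seed + 2)
    W = util.fit_gaussian_draw(Y, J, seed=seed + 3)

    scume = mct.SC_UME(datap, dataq, k, l, V, W, alpha=0.01)
    test_result = scume.perform_test(datar)
    # With p fitting r better than q, a rejection would be unexpected here.
    assert not test_result['h0_rejected']
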
def met_gumeJ1_2V_rand(P, Q, data_source, n, r, J=1, use_1set_locs=False):
    """
    UME-based three-sample test.
        * Use J=1 test location by default.
        * Use two sets (2V) of test locations by default: V and W, each
          having J locations. Constrain V=W if use_1set_locs=True.
        * The test locations are selected at random from the data. The
          selected points are removed before the test is performed.
        * Gaussian kernels for the two UME statistics. The median heuristic
          is used to select each width.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p, ds_q, data_source, n, r, only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        # remove the first J points from each set
        X, Y, Z = datp.data(), datq.data(), datr.data()

        # pool containing 3*J points
        pool3J = np.vstack((X[:J, :], Y[:J, :], Z[:J, :]))
        X, Y, Z = (X[J:, :], Y[J:, :], Z[J:, :])

        datp, datq, datr = [data.Data(a) for a in [X, Y, Z]]
        assert X.shape[0] == Y.shape[0]
        assert Y.shape[0] == Z.shape[0]
        assert Z.shape[0] == n - J
        assert datp.sample_size() == n - J
        assert datq.sample_size() == n - J
        assert datr.sample_size() == n - J

        #XYZ = np.vstack((X, Y, Z))
        #stds = np.std(util.subsample_rows(XYZ, min(n-3*J, 500),
        #    seed=r+87), axis=0)
        d = X.shape[1]
        # Replace the pooled points with locations drawn from a scaled
        # standard Gaussian (the noisy-pool variant is kept commented out).
        with util.NumpySeedContext(seed=r * 191):
            #pool3J = pool3J + np.random.randn(3*J, d)*np.max(stds)*3
            pool3J = np.random.randn(3 * J, d) * 2

        # median heuristic to set the Gaussian widths
        medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
        medyz = util.meddistance(np.vstack((Z, Y)), subsample=1000)
        if use_1set_locs:
            # use the first J points of the pool as the single set of test locations
            #V = util.subsample_rows(pool3J, J, r)
            V = pool3J[:J, :]
            W = V
            k = kernel.KGauss(sigma2=np.mean([medxz, medyz])**2)
            l = k
        else:
            # use two sets of locations: V and W
            #VW = util.subsample_rows(pool3J, 2*J, r)
            VW = pool3J[:2 * J, :]
            V = VW[:J, :]
            W = VW[J:, :]

            # 2 Gaussian kernels
            k = kernel.KGauss(sigma2=medxz**2)
            l = kernel.KGauss(sigma2=medyz**2)

        # construct the test; alpha is the significance level defined at the
        # module level of this experiment script
        scume = mct.SC_UME(datp, datq, k, l, V, W, alpha=alpha)
        scume_rand_result = scume.perform_test(datr)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test': scume,
        'test_result': scume_rand_result,
        'time_secs': t.secs,
    }

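# An illustrative helper (hypothetical name, not part of the experiment code)
# that spells out the two location/kernel configurations used in
# met_gumeJ1_2V_rand: one shared set V=W with a single averaged-width Gaussian
# kernel, or two separate sets V, W with separate median-heuristic widths.
# It assumes the same module-level imports (np, kernel) as the function above.
def make_ume_locs_and_kernels(pool3J, medxz, medyz, J, use_1set_locs):
    if use_1set_locs:
        # one set of J locations shared by both UME statistics
        V = pool3J[:J, :]
        W = V
        k = kernel.KGauss(sigma2=np.mean([medxz, medyz])**2)
        l = k
    else:
        # two disjoint sets of J locations and two separate Gaussian kernels
        VW = pool3J[:2 * J, :]
        V, W = VW[:J, :], VW[J:, :]
        k = kernel.KGauss(sigma2=medxz**2)
        l = kernel.KGauss(sigma2=medyz**2)
    return V, W, k, l
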
def met_gumeJ1_3sopt_tr50(P, Q, data_source, n, r, J=1, tr_proportion=0.5):
    """
    UME-based three-sample test.
        * Use J=1 test location by default (in the set V=W).
        * 3sopt = optimize the test locations by maximizing the 3-sample
          test's power criterion. There is only one set of test locations.
        * One Gaussian kernel for the two UME statistics. The Gaussian width
          is also optimized.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p, ds_q, data_source, n, r, only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        # split the data into training/test sets
        [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
            [D.split_tr_te(tr_proportion=tr_proportion, seed=r)
             for D in [datp, datq, datr]]
        Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]
        Xyztr = np.vstack((Xtr, Ytr, Ztr))

        # initialize optimization parameters.
        # Initialize the Gaussian widths with the median heuristic
        medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
        medyz = util.meddistance(np.vstack((Ztr, Ytr)), subsample=1000)
        gwidth0 = np.mean([medxz, medyz])**2

        # pick a subset of points in the training set for V, W
        V0 = util.subsample_rows(Xyztr, J, seed=r + 2)

        # optimization options
        opt_options = {
            'max_iter': 100,
            'reg': 1e-6,
            'tol_fun': 1e-7,
            'locs_bounds_frac': 50,
            'gwidth_lb': 0.1,
            'gwidth_ub': 6**2,
        }

        V_opt, gw2_opt, opt_result = mct.SC_GaussUME.optimize_3sample_criterion(
            datptr, datqtr, datrtr, V0, gwidth0, **opt_options)
        k_opt = kernel.KGauss(gw2_opt)

        # construct a UME test; alpha is the significance level defined at the
        # module level of this experiment script
        scume_opt3 = mct.SC_UME(datpte, datqte, k_opt, k_opt, V_opt, V_opt,
                                alpha=alpha)
        scume_opt3_result = scume_opt3.perform_test(datrte)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test': scume,
        'test_result': scume_opt3_result,
        'time_secs': t.secs,
    }

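# A minimal sketch (assumptions: toy 1-d Gaussian data, the module-level
# imports np, data, kernel, util, mct, and a hypothetical function name) of
# running the 3-sample power-criterion optimization above outside the
# experiment harness. It mirrors the calls in met_gumeJ1_3sopt_tr50.
def sketch_3sopt_on_toy_data(n=600, seed=11, alpha=0.05, J=2):
    with util.NumpySeedContext(seed=seed):
        X = np.random.randn(n, 1) + 2.0    # p = N(2, 1)
        Y = np.random.randn(n, 1) + 0.5    # q = N(0.5, 1)
        Z = np.random.randn(n, 1)          # r = N(0, 1)
    datp, datq, datr = data.Data(X), data.Data(Y), data.Data(Z)

    # 50/50 training/test split, as in met_gumeJ1_3sopt_tr50
    (datptr, datpte), (datqtr, datqte), (datrtr, datrte) = \
        [D.split_tr_te(tr_proportion=0.5, seed=seed) for D in [datp, datq, datr]]
    Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]

    # median-heuristic width and J random training points to initialize V
    medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
    medyz = util.meddistance(np.vstack((Ytr, Ztr)), subsample=1000)
    gwidth0 = np.mean([medxz, medyz])**2
    V0 = util.subsample_rows(np.vstack((Xtr, Ytr, Ztr)), J, seed=seed + 2)

    # same optimizer options as in met_gumeJ1_3sopt_tr50
    opt_options = {
        'max_iter': 100,
        'reg': 1e-6,
        'tol_fun': 1e-7,
        'locs_bounds_frac': 50,
        'gwidth_lb': 0.1,
        'gwidth_ub': 6**2,
    }
    V_opt, gw2_opt, _ = mct.SC_GaussUME.optimize_3sample_criterion(
        datptr, datqtr, datrtr, V0, gwidth0, **opt_options)

    # evaluate the optimized test on the held-out split
    k_opt = kernel.KGauss(gw2_opt)
    scume = mct.SC_UME(datpte, datqte, k_opt, k_opt, V_opt, V_opt, alpha=alpha)
    return scume.perform_test(datrte)
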
def met_gumeJ1_2sopt_tr50(P, Q, data_source, n, r, J=1, tr_proportion=0.5):
    """
    UME-based three-sample test.
        * Use J=1 test location by default.
        * 2sopt = optimize the two sets of test locations by maximizing the
          2-sample test's power criterion. Each set is optimized separately.
        * Gaussian kernels for the two UME statistics. The Gaussian widths
          are also optimized separately.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p, ds_q, data_source, n, r, only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        # split the data into training/test sets
        [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
            [D.split_tr_te(tr_proportion=tr_proportion, seed=r)
             for D in [datp, datq, datr]]
        Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]

        # initialize optimization parameters.
        # Initialize the Gaussian widths with the median heuristic
        medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
        medyz = util.meddistance(np.vstack((Ytr, Ztr)), subsample=1000)
        gwidth0p = medxz**2
        gwidth0q = medyz**2

        # numbers of test locations in V, W
        Jp = J
        Jq = J

        # pick a subset of points in the training set for V, W
        Xyztr = np.vstack((Xtr, Ytr, Ztr))
        VW = util.subsample_rows(Xyztr, Jp + Jq, seed=r + 1)
        V0 = VW[:Jp, :]
        W0 = VW[Jp:, :]

        # optimization options
        opt_options = {
            'max_iter': 100,
            'reg': 1e-4,
            'tol_fun': 1e-6,
            'locs_bounds_frac': 50,
            'gwidth_lb': 0.1,
            'gwidth_ub': 10**2,
        }

        umep_params, umeq_params = mct.SC_GaussUME.optimize_2sets_locs_widths(
            datptr, datqtr, datrtr, V0, W0, gwidth0p, gwidth0q, **opt_options)
        (V_opt, gw2p_opt, opt_infop) = umep_params
        (W_opt, gw2q_opt, opt_infoq) = umeq_params
        k_opt = kernel.KGauss(gw2p_opt)
        l_opt = kernel.KGauss(gw2q_opt)

        # construct a UME test; alpha is the significance level defined at the
        # module level of this experiment script
        scume_opt2 = mct.SC_UME(datpte, datqte, k_opt, l_opt, V_opt, W_opt,
                                alpha=alpha)
        scume_opt2_result = scume_opt2.perform_test(datrte)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test': scume,
        'test_result': scume_opt2_result,
        'time_secs': t.secs,
    }

def test_optimize_2sets_locs_widths(self):
    """
    Make sure SC_GaussUME.optimize_2sets_locs_widths runs end-to-end and the
    optimized locations and widths can be used to construct an SC_UME test.
    """
    mp, varp = 2, 1
    # q must not be exactly the true (data-generating) model. If it were,
    # our assumption would be violated and the asymptotic null distribution
    # would not hold.
    mq, varq = 1, 1

    # draw some data
    n = 800  # sample size
    seed = 6
    with util.NumpySeedContext(seed=seed):
        X = np.random.randn(n, 1)*varp**0.5 + mp
        Y = np.random.randn(n, 1)*varq**0.5 + mq
        Z = np.random.randn(n, 1)

        datap = data.Data(X)
        dataq = data.Data(Y)
        datar = data.Data(Z)

    # split the data into training/test sets
    [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
        [D.split_tr_te(tr_proportion=0.3, seed=85)
         for D in [datap, dataq, datar]]
    Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]

    # initialize optimization parameters.
    # Initialize the Gaussian widths with the median heuristic
    medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
    medyz = util.meddistance(np.vstack((Ytr, Ztr)), subsample=1000)
    gwidth0p = medxz**2
    gwidth0q = medyz**2

    # numbers of test locations in V, W
    J = 2
    Jp = J
    Jq = J

    # pick a subset of points in the training set for V, W
    Xyztr = np.vstack((Xtr, Ytr, Ztr))
    VW = util.subsample_rows(Xyztr, Jp+Jq, seed=73)
    V0 = VW[:Jp, :]
    W0 = VW[Jp:, :]

    # optimization options
    opt_options = {
        'max_iter': 100,
        'reg': 1e-4,
        'tol_fun': 1e-6,
        'locs_bounds_frac': 100,
        'gwidth_lb': None,
        'gwidth_ub': None,
    }

    umep_params, umeq_params = mct.SC_GaussUME.optimize_2sets_locs_widths(
        datptr, datqtr, datrtr, V0, W0, gwidth0p, gwidth0q, **opt_options)
    (V_opt, gw2p_opt, opt_infop) = umep_params
    (W_opt, gw2q_opt, opt_infoq) = umeq_params
    k_opt = kernel.KGauss(gw2p_opt)
    l_opt = kernel.KGauss(gw2q_opt)

    # construct a UME test
    alpha = 0.01  # significance level
    scume_opt2 = mct.SC_UME(datpte, datqte, k_opt, l_opt, V_opt, W_opt,
                            alpha=alpha)
    # smoke test: just make sure the optimized test runs; no assertion on
    # the test outcome
    scume_opt2.perform_test(datrte)