def ume_test(X, Y, Z, V, alpha=0.01, mode='mean'):
    """
    Perform a UME three-sample test. All the data are assumed to be
    preprocessed.

    Args:
        X: n x d ndarray, a sample from P
        Y: n x d ndarray, a sample from Q
        Z: n x d ndarray, a sample from R
        V: J x d ndarray, a set of J test locations
        alpha: a user specified significance level
        mode: 'mean' to use SC_MMD.median_heuristic_bounliphone to pick the
            squared Gaussian width; otherwise use the squared median distance
            on the pooled sample.

    Returns:
        a dictionary of the form
        {
            alpha: 0.01,
            pvalue: 0.0002,
            test_stat: 2.3,
            h0_rejected: True,
            time_secs: ...
        }
    """
    # Pick the squared Gaussian width for the kernel.
    if mode == 'mean':
        gwidth = SC_MMD.median_heuristic_bounliphone(X, Y, Z, subsample=1000)
    else:
        pooled = np.vstack((X, Y, Z))
        gwidth = util.meddistance(pooled, subsample=1000)**2
    gauss_k = kernel.KGauss(gwidth)
    # Use the same kernel and the same test locations for both UME statistics.
    scume = SC_UME(data.Data(X), data.Data(Y), gauss_k, gauss_k, V, V, alpha)
    return scume.perform_test(data.Data(Z))
def obj(sqrt_gwidth, V):
    # Negated DC_FSSD power criterion as a function of the square root of
    # the Gaussian width and the test locations V. Negated because the
    # surrounding optimizer minimizes, while we want to maximize power.
    gwidth2 = sqrt_gwidth**2
    k = kernel.KGauss(gwidth2)
    power = DC_FSSD.power_criterion(p, q, datar, k, k, V, V, reg=reg)
    if added_obj is not None:
        # Fold in the caller-supplied extra objective term, if any.
        power = power + added_obj(gwidth2, V)
    return -power
def test_basic(self):
    """
    Test basic things. Make sure SC_UME runs under normal usage.
    """
    mean_p, var_p = 4, 1
    # q cannot be the true model.
    # That violates our assumption and the asymptotic null distribution
    # does not hold.
    mean_q, var_q = 0.5, 1

    # draw some data
    n = 2999  # sample size
    seed = 89
    with util.NumpySeedContext(seed=seed):
        X = np.random.randn(n, 1)*var_p**0.5 + mean_p
        Y = np.random.randn(n, 1)*var_q**0.5 + mean_q
        Z = np.random.randn(n, 1)
    datap, dataq, datar = data.Data(X), data.Data(Y), data.Data(Z)

    # hyperparameters of the test: median-heuristic Gaussian widths
    medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
    medyz = util.meddistance(np.vstack((Y, Z)), subsample=1000)
    k = kernel.KGauss(sigma2=medxz**2)
    l = kernel.KGauss(sigma2=medyz**2)

    # 2 sets of test locations, J locations each
    J = 3
    V = util.fit_gaussian_draw(X, J, seed=seed + 2)
    W = util.fit_gaussian_draw(Y, J, seed=seed + 3)

    # construct a UME test at significance level alpha
    alpha = 0.01
    scume = mct.SC_UME(datap, dataq, k, l, V, W, alpha=alpha)
    test_result = scume.perform_test(datar)

    # make sure it rejects
    #print(test_result)
    assert test_result['h0_rejected']
def __init__(self, p, q, gwidth2p, gwidth2q, V, W, alpha=0.01):
    """
    Construct a DC_FSSD test that uses two Gaussian kernels.

    :param p: a kmod.density.UnnormalizedDensity (model 1)
    :param q: a kmod.density.UnnormalizedDensity (model 2)
    :param gwidth2p: squared Gaussian width for the kernel k in FSSD(p, k, V)
    :param gwidth2q: squared Gaussian width for the kernel l in FSSD(q, l, W)
    :param V: Jp x d numpy array of Jp test locations used in FSSD(p, k, V)
    :param W: Jq x d numpy array of Jq test locations used in FSSD(q, l, W)
    :param alpha: significance level of the test
    :raises ValueError: if either squared width is not a positive real number
    """
    # Reject non-real or non-positive widths before building the kernels.
    if not util.is_real_num(gwidth2p) or gwidth2p <= 0:
        raise ValueError(
            'gwidth2p must be positive real. Was {}'.format(gwidth2p))
    if not util.is_real_num(gwidth2q) or gwidth2q <= 0:
        raise ValueError(
            'gwidth2q must be positive real. Was {}'.format(gwidth2q))
    k = kernel.KGauss(gwidth2p)
    l = kernel.KGauss(gwidth2q)
    super(DC_GaussFSSD, self).__init__(p, q, k, l, V, W, alpha)
def __init__(self, datap, dataq, gwidth2p, gwidth2q, V, W, alpha=0.01):
    """
    Construct an SC_UME test that uses two Gaussian kernels.

    :param datap: a kmod.data.Data object representing an i.i.d. sample X
        (from model 1)
    :param dataq: a kmod.data.Data object representing an i.i.d. sample Y
        (from model 2)
    :param gwidth2p: squared Gaussian width for UME(P, R)
    :param gwidth2q: squared Gaussian width for UME(Q, R)
    :param V: Jp x d numpy array of Jp test locations used in UME(p, r)
    :param W: Jq x d numpy array of Jq test locations used in UME(q, r)
    :param alpha: significance level of the test
    :raises ValueError: if either squared width is not a positive real number
    """
    # Both squared widths must be positive real numbers.
    for pname, width in [('gwidth2p', gwidth2p), ('gwidth2q', gwidth2q)]:
        if not util.is_real_num(width) or width <= 0:
            raise ValueError(
                '{} must be positive real. Was {}'.format(pname, width))
    k = kernel.KGauss(gwidth2p)
    l = kernel.KGauss(gwidth2q)
    super(SC_GaussUME, self).__init__(datap, dataq, k, l, V, W, alpha)
def met_gmmd_med(P, Q, data_source, n, r):
    """
    Use met_gmmd_med_bounliphone(). It uses the median heuristic following
    Bounliphone et al., 2016.

    Bounliphone et al., 2016's MMD-based 3-sample test.
    * Gaussian kernel.
    * Gaussian width = mean of (median heuristic on (X, Z), median heuristic
        on (Y, Z))
    * Use full sample for testing (no holding out for optimization)
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p, ds_q, data_source, n, r,
                                  only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        X, Y, Z = datp.data(), datq.data(), datr.data()

        # Gaussian width: average of the two pairwise median heuristics.
        medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
        medyz = util.meddistance(np.vstack((Y, Z)), subsample=1000)
        sigma = np.mean([medxz, medyz])
        k = kernel.KGauss(sigma2=sigma**2)

        # NOTE(review): alpha is read from module scope here — confirm it is
        # defined as a global in this experiment module.
        scmmd = mct.SC_MMD(datp, datq, k, alpha=alpha)
        scmmd_result = scmmd.perform_test(datr)

    return {
        # This key "test" can be removed.
        #'test': scmmd,
        'test_result': scmmd_result,
        'time_secs': t.secs,
    }
def met_gmmd_med_bounliphone(P, Q, data_source, n, r):
    """
    Bounliphone et al., 2016's MMD-based 3-sample test.
    * Gaussian kernel.
    * Gaussian width = chosen as described in
        https://github.com/wbounliphone/relative_similarity_test/blob/4884786aa3fe0f41b3ee76c9587de535a6294aee/relativeSimilarityTest_finalversion.m
    * Use full sample for testing (no holding out for optimization)
    """
    if not (P.has_datasource() and Q.has_datasource()):
        # Not applicable. Return {}.
        return {}

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p, ds_q, data_source, n, r,
                                  only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        X, Y, Z = datp.data(), datq.data(), datr.data()

        # Squared Gaussian width from Bounliphone et al.'s heuristic.
        med2 = mct.SC_MMD.median_heuristic_bounliphone(
            X, Y, Z, subsample=1000, seed=r + 3)
        k = kernel.KGauss(sigma2=med2)

        # NOTE(review): alpha is read from module scope here — confirm it is
        # defined as a global in this experiment module.
        scmmd = mct.SC_MMD(datp, datq, k, alpha=alpha)
        scmmd_result = scmmd.perform_test(datr)

    return {
        # This key "test" can be removed.
        # 'test': scmmd,
        'test_result': scmmd_result,
        'time_secs': t.secs,
    }
def test_basic(self):
    """
    Nothing special. Just test basic things.
    """
    seed = 13
    # sample
    n = 103
    alpha = 0.01
    for d in [1, 4]:
        mean = np.zeros(d)
        variance = 1
        p = density.IsotropicNormal(mean, variance)
        # q deliberately differs from p in its variance
        q = density.IsotropicNormal(mean, variance + 3)

        # only one dimension of the mean is shifted
        #draw_mean = mean + np.hstack((1, np.zeros(d-1)))
        draw_mean = mean + 0
        draw_variance = variance + 1
        X = util.randn(n, d, seed=seed)*np.sqrt(draw_variance) + draw_mean
        dat = data.Data(X)

        # Test with different numbers of test locations
        for J in [1, 3]:
            gwidth2 = util.meddistance(X, subsample=1000)**2
            k = kernel.KGauss(gwidth2)

            # random test locations
            V = util.fit_gaussian_draw(X, J, seed=seed + 1)
            W = util.fit_gaussian_draw(X, J, seed=seed + 8)
            mcfssd = mct.DC_FSSD(p, q, k, k, V, W, alpha=0.01)
            stat = mcfssd.compute_stat(dat)
            h1_mean, h1_var = mcfssd.get_H1_mean_variance(dat)
            result = mcfssd.perform_test(dat)

            # assertions: p-value in [0, 1], statistic consistent with
            # sqrt(n) * H1 mean
            self.assertGreaterEqual(result['pvalue'], 0)
            self.assertLessEqual(result['pvalue'], 1)
            testing.assert_approx_equal(stat, (n**0.5)*h1_mean)
def met_gumeJ1_2V_rand(P, Q, data_source, n, r, J=1, use_1set_locs=False):
    """
    UME-based three-sample test.
        * Use J=1 test location by default.
        * Use two sets (2V) of test locations by default: V and W, each having J
            locations.  Will constrain V=W if use_1set_locs=True.
        * The test locations are selected at random from the data. Selected
            points are removed for testing.
        * Gaussian kernels for the two UME statistics. Median heuristic is used
            to select each width.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p, ds_q, data_source, n, r, only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        # remove the first J points from each set
        X, Y, Z = datp.data(), datq.data(), datr.data()

        # containing 3*J points
        pool3J = np.vstack((X[:J, :], Y[:J, :], Z[:J, :]))
        X, Y, Z = (X[J:, :], Y[J:, :], Z[J:, :])

        # re-wrap the remaining n-J points of each sample
        datp, datq, datr = [data.Data(a) for a in [X, Y, Z]]
        assert X.shape[0] == Y.shape[0]
        assert Y.shape[0] == Z.shape[0]
        assert Z.shape[0] == n - J
        assert datp.sample_size() == n - J
        assert datq.sample_size() == n - J
        assert datr.sample_size() == n - J

        #XYZ = np.vstack((X, Y, Z))
        #stds = np.std(util.subsample_rows(XYZ, min(n-3*J, 500),
        #    seed=r+87), axis=0)
        d = X.shape[1]
        # add a little noise to the locations.
        # NOTE(review): as written, pool3J is overwritten with pure Gaussian
        # noise here, discarding the data-derived pool built above (the
        # data-based variant is commented out) — confirm this is intentional.
        with util.NumpySeedContext(seed=r * 191):
            #pool3J = pool3J + np.random.randn(3*J, d)*np.max(stds)*3
            pool3J = np.random.randn(3 * J, d) * 2

        # median heuristic to set the Gaussian widths
        medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
        medyz = util.meddistance(np.vstack((Z, Y)), subsample=1000)
        if use_1set_locs:
            # randomly select J points from the pool3J for the J test locations
            #V = util.subsample_rows(pool3J, J, r)
            V = pool3J[:J, :]
            W = V
            # one shared kernel: width = mean of the two median heuristics
            k = kernel.KGauss(sigma2=np.mean([medxz, medyz])**2)
            l = k
        else:
            # use two sets of locations: V and W
            #VW = util.subsample_rows(pool3J, 2*J, r)
            VW = pool3J[:2 * J, :]
            V = VW[:J, :]
            W = VW[J:, :]

            # 2 Gaussian kernels
            k = kernel.KGauss(sigma2=medxz**2)
            l = kernel.KGauss(sigma2=medyz**2)

        # construct the test
        # NOTE(review): alpha is read from module scope — confirm it is a
        # module-level global.
        scume = mct.SC_UME(datp, datq, k, l, V, W, alpha=alpha)
        scume_rand_result = scume.perform_test(datr)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test':scume,
        'test_result': scume_rand_result,
        'time_secs': t.secs}
def met_gumeJ1_3sopt_tr50(P, Q, data_source, n, r, J=1, tr_proportion=0.5):
    """
    UME-based three-sample test
        * Use J=1 test location by default (in the set V=W).
        * 3sopt = optimize the test locations by maximizing the 3-sample test's
            power criterion. There is only one set of test locations.
        * One Gaussian kernel for the two UME statistics. Optimize the Gaussian width
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p, ds_q, data_source, n, r, only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        # split the data into training/test sets
        [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
            [D.split_tr_te(tr_proportion=tr_proportion, seed=r) for D in [datp, datq, datr]]
        Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]
        Xyztr = np.vstack((Xtr, Ytr, Ztr))
        # initialize optimization parameters.
        # Initialize the Gaussian widths with the median heuristic
        medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
        medyz = util.meddistance(np.vstack((Ztr, Ytr)), subsample=1000)
        gwidth0 = np.mean([medxz, medyz])**2

        # pick a subset of points in the training set for V, W
        V0 = util.subsample_rows(Xyztr, J, seed=r + 2)

        # optimization options
        opt_options = {
            'max_iter': 100,
            'reg': 1e-6,
            'tol_fun': 1e-7,
            'locs_bounds_frac': 50,
            'gwidth_lb': 0.1,
            'gwidth_ub': 6**2,
        }

        # optimize the shared locations V and the single squared width on
        # the training split; opt_result is not used further here
        V_opt, gw2_opt, opt_result = mct.SC_GaussUME.optimize_3sample_criterion(
            datptr, datqtr, datrtr, V0, gwidth0, **opt_options)
        k_opt = kernel.KGauss(gw2_opt)

        # construct a UME test on the held-out test split
        # NOTE(review): alpha is read from module scope — confirm it is a
        # module-level global.
        scume_opt3 = mct.SC_UME(datpte, datqte, k_opt, k_opt, V_opt, V_opt,
                                alpha=alpha)
        scume_opt3_result = scume_opt3.perform_test(datrte)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test':scume,
        'test_result': scume_opt3_result,
        'time_secs': t.secs}
def met_gumeJ1_2sopt_tr50(P, Q, data_source, n, r, J=1, tr_proportion=0.5):
    """
    UME-based three-sample test
        * Use J=1 test location by default.
        * 2sopt = optimize the two sets of test locations by maximizing the
            2-sample test's power criterion. Each set is optimized separately.
        * Gaussian kernels for the two UME statistics. The Gaussian widths are
            also optimized separately.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p, ds_q, data_source, n, r, only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        # split the data into training/test sets
        [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
            [D.split_tr_te(tr_proportion=tr_proportion, seed=r) for D in [datp, datq, datr]]
        Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]

        # initialize optimization parameters.
        # Initialize the Gaussian widths with the median heuristic
        medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
        medyz = util.meddistance(np.vstack((Ytr, Ztr)), subsample=1000)
        gwidth0p = medxz**2
        gwidth0q = medyz**2

        # numbers of test locations in V, W
        Jp = J
        Jq = J

        # pick a subset of points in the training set for V, W
        Xyztr = np.vstack((Xtr, Ytr, Ztr))
        VW = util.subsample_rows(Xyztr, Jp + Jq, seed=r + 1)
        V0 = VW[:Jp, :]
        W0 = VW[Jp:, :]

        # optimization options
        opt_options = {
            'max_iter': 100,
            'reg': 1e-4,
            'tol_fun': 1e-6,
            'locs_bounds_frac': 50,
            'gwidth_lb': 0.1,
            'gwidth_ub': 10**2,
        }

        # optimize each (locations, width) pair separately on the training
        # split
        umep_params, umeq_params = mct.SC_GaussUME.optimize_2sets_locs_widths(
            datptr, datqtr, datrtr, V0, W0, gwidth0p, gwidth0q, **opt_options)
        (V_opt, gw2p_opt, opt_infop) = umep_params
        (W_opt, gw2q_opt, opt_infoq) = umeq_params
        k_opt = kernel.KGauss(gw2p_opt)
        l_opt = kernel.KGauss(gw2q_opt)

        # construct a UME test on the held-out test split
        # NOTE(review): alpha is read from module scope — confirm it is a
        # module-level global.
        scume_opt2 = mct.SC_UME(datpte, datqte, k_opt, l_opt, V_opt, W_opt,
                                alpha=alpha)
        scume_opt2_result = scume_opt2.perform_test(datrte)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test':scume,
        'test_result': scume_opt2_result,
        'time_secs': t.secs}
def obj_feat_space(sqrt_gwidth, V):
    # Negated SC_UME power criterion as a function of the square root of the
    # Gaussian width and the test locations V. Negated because the optimizer
    # minimizes, while the criterion should be maximized.
    gwidth2 = sqrt_gwidth**2
    gauss_k = kernel.KGauss(gwidth2)
    return -SC_UME.power_criterion(
        datap, dataq, datar, gauss_k, gauss_k, V, V, reg=reg)
def test_optimize_2sets_locs_widths(self):
    """
    Make sure SC_GaussUME.optimize_2sets_locs_widths runs end to end and
    that a UME test can be built from the optimized parameters.
    """
    mean_p, var_p = 2, 1
    # q cannot be the true model.
    # That violates our assumption and the asymptotic null distribution
    # does not hold.
    mean_q, var_q = 1, 1

    # draw some data
    n = 800  # sample size
    seed = 6
    with util.NumpySeedContext(seed=seed):
        X = np.random.randn(n, 1)*var_p**0.5 + mean_p
        Y = np.random.randn(n, 1)*var_q**0.5 + mean_q
        Z = np.random.randn(n, 1)
    datap, dataq, datar = data.Data(X), data.Data(Y), data.Data(Z)

    # split the data into training/test sets
    [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
        [D.split_tr_te(tr_proportion=0.3, seed=85)
         for D in [datap, dataq, datar]]
    Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]

    # Initialize the Gaussian widths with the median heuristic
    medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
    medyz = util.meddistance(np.vstack((Ytr, Ztr)), subsample=1000)
    gwidth0p = medxz**2
    gwidth0q = medyz**2

    # pick Jp + Jq training points as the initial test locations V0, W0
    J = 2
    Jp = J
    Jq = J
    Xyztr = np.vstack((Xtr, Ytr, Ztr))
    VW = util.subsample_rows(Xyztr, Jp+Jq, seed=73)
    V0, W0 = VW[:Jp, :], VW[Jp:, :]

    # optimization options
    opt_options = {
        'max_iter': 100,
        'reg': 1e-4,
        'tol_fun': 1e-6,
        'locs_bounds_frac': 100,
        'gwidth_lb': None,
        'gwidth_ub': None,
    }

    umep_params, umeq_params = mct.SC_GaussUME.optimize_2sets_locs_widths(
        datptr, datqtr, datrtr, V0, W0, gwidth0p, gwidth0q, **opt_options)
    (V_opt, gw2p_opt, opt_infop) = umep_params
    (W_opt, gw2q_opt, opt_infoq) = umeq_params
    k_opt = kernel.KGauss(gw2p_opt)
    l_opt = kernel.KGauss(gw2q_opt)

    # construct and run a UME test with the optimized parameters
    alpha = 0.01  # significance level
    scume_opt2 = mct.SC_UME(datpte, datqte, k_opt, l_opt, V_opt, W_opt,
                            alpha=alpha)
    scume_opt2.perform_test(datrte)