Exemple #1
0
def run_full_MMD_test(x1, x2, y1, y2, alpha=0.01, output='stat'):
    '''
    Runs QuadMMDTest on y1 vs. y2, with the Gaussian kernel width chosen by
    grid search on one half of the data and the test evaluated on the other.

    x1, x2: not used by this function
    y1, y2: the two samples handed to data.TSTData
    alpha: significance level passed to the grid search and to the test
    output: the desired output, one of 'stat', 'p_value', 'full'

    Returns compute_stat(te), compute_pvalue(te) or perform_test(te) for
    'stat', 'p_value' and 'full' respectively; any other value of output
    falls through and returns None.
    '''
    tst_data = data.TSTData(y1, y2)
    # 50/50 split with a fixed seed
    tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)

    # candidate squared widths: (median distance)^2 scaled by 2^-4 ... 2^4
    med = general_utils.meddistance(tr.stack_xy(), 1000)
    list_gwidth = np.sort((med**2) * (2.0**np.linspace(-4, 4, 20)))
    list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

    # grid search on the training half to choose the best Gaussian width
    besti, _ = QuadMMDTest.grid_search_kernel(tr, list_kernels, alpha)
    best_ker = list_kernels[besti]

    # perform the test on the held-out half with the selected kernel
    mmd_test = QuadMMDTest(best_ker, n_permute=200, alpha=alpha)
    if output == 'stat':
        return mmd_test.compute_stat(te)
    if output == 'p_value':
        return mmd_test.compute_pvalue(te)
    if output == 'full':
        return mmd_test.perform_test(te)
Exemple #2
0
def RHSIC(t1, y1, t2, y2, alpha=0.01, output='stat', opt_kernel=True):
    '''
    Runs QuadHSICTest on random features generated from y1 and y2.

    t1, t2: not used by this function
    y1, y2: the two samples; each is rescaled by its own largest absolute
        value before random features are generated
    alpha: significance level passed to compute_pvalue / perform_test
    output: the desired output, one of 'stat', 'p_value', 'full'
    opt_kernel: if truthy, pick the Gaussian widths by grid search around the
        median distance; otherwise use the median distance directly

    Returns compute_stat(te), compute_pvalue(te, alpha) or
    perform_test(te, alpha) according to output; any other value of output
    falls through and returns None.
    '''
    # rescale data
    # NOTE(review): the previous version also computed the joint maximum over
    # y1 and y2 but never used it; confirm that per-sample scaling is intended.
    y1 = y1 / np.max(np.abs(y1))
    y2 = y2 / np.max(np.abs(y2))

    # generate random features
    X, Y = general_utils.generate_random_features_ind(y1, y2, num_feat=20)

    w1 = [len(t) for t in y1]
    w2 = [len(t) for t in y2]
    tst_data = data.TSTData(X, Y, w1, w2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5)
    xtr, ytr = tr.xy()

    # Compute median pairwise distance
    med_x = general_utils.meddistance(xtr, 1000)
    med_y = general_utils.meddistance(ytr, 1000)

    if not opt_kernel:
        best_ker_x = kernel_utils.KGauss(med_x**2)
        best_ker_y = kernel_utils.KGauss(med_y**2)
    else:
        # candidate squared widths: median^2 scaled by 2^-1 ... 2^1
        scales = 2.0**np.linspace(-1, 1, 5)
        list_kernels_x = [
            kernel_utils.KGauss(gw2) for gw2 in np.sort((med_x**2) * scales)
        ]
        list_kernels_y = [
            kernel_utils.KGauss(gw2) for gw2 in np.sort((med_y**2) * scales)
        ]

        # grid search to choose the best Gaussian width
        bestix, bestiy, _, _ = QuadHSICTest.grid_search_kernel(
            tr, list_kernels_x, list_kernels_y)

        best_ker_x = list_kernels_x[bestix]
        best_ker_y = list_kernels_y[bestiy]

    hsic_test = QuadHSICTest(best_ker_x, best_ker_y)

    if output == 'stat':
        return hsic_test.compute_stat(te)
    if output == 'p_value':
        return hsic_test.compute_pvalue(te, alpha=alpha)
    if output == 'full':
        return hsic_test.perform_test(te, alpha=alpha)
Exemple #3
0
    def Btest(wtst_data, B=None, output='stat', alpha=0.01):
        """
        Block-wise MMD two-sample test: the data are partitioned into blocks,
        a QuadMMDTest with a grid-searched Gaussian kernel is run on each
        block, and the per-block results are averaged.

        :param wtst_data: data object providing x1x2y1y2(); only Y1 and Y2
            are used here
        :param B: number of blocks, forwarded to BTest.block_indeces
        :param output: 'stat' to average the per-block test statistics,
            'p_value' to average the per-block p-values
        :param alpha: significance level used for the kernel grid search and
            the per-block test (defaults to the previously hard-coded 0.01)
        :return: the average over blocks of the requested quantity; for any
            other value of output nothing is accumulated and the result is 0
        """

        # list of indices for each block
        split_list_x1, split_list_x2 = BTest.block_indeces(wtst_data=wtst_data,
                                                           B=B)
        _, _, Y1, Y2 = wtst_data.x1x2y1y2()
        out = 0

        for indeces_1, indeces_2 in zip(split_list_x1, split_list_x2):

            tst_data = data.TSTData(Y1[indeces_1, :], Y2[indeces_2, :])
            # 50/50 split with a fixed seed
            tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)

            # candidate squared widths: median^2 scaled by 2^-4 ... 2^4
            med = general_utils.meddistance(tr.stack_xy(), 1000)
            list_gwidth = np.sort((med**2) * (2.0**np.linspace(-4, 4, 20)))
            list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

            # grid search to choose the best Gaussian width
            besti, _ = QuadMMDTest.grid_search_kernel(tr,
                                                      list_kernels,
                                                      alpha=alpha)
            best_ker = list_kernels[besti]

            # perform test on the held-out half of the block
            mmd_test = QuadMMDTest(best_ker, n_permute=200, alpha=alpha)

            if output == 'stat':
                out += mmd_test.compute_stat(te)

            if output == 'p_value':
                out += mmd_test.compute_pvalue(te)

        return out / len(split_list_x1)
Exemple #4
0
def HSIC(t1, y1, t2, y2, alpha=0.01, output='stat', opt_kernel=True):
    '''
    Runs QuadHSICTest on the interpolated versions of (t1, y1) and (t2, y2).

    t1, y1: first sample, passed to interpolate()
    t2, y2: second sample, passed to interpolate()
    alpha: significance level passed to compute_pvalue / perform_test
    output: the desired output, one of 'stat', 'p_value', 'full'
    opt_kernel: if truthy, pick the Gaussian widths by grid search around the
        median distance; otherwise use the median distance directly

    Returns compute_stat(te), compute_pvalue(te, alpha) or
    perform_test(te, alpha) according to output; any other value of output
    falls through and returns None.
    '''

    # interpolate; only the second element of each result is used below
    _, X = interpolate(t1, y1)
    _, Y = interpolate(t2, y2)

    # per-instance lengths are taken from the original (un-interpolated) data
    w1 = [len(t) for t in y1]
    w2 = [len(t) for t in y2]
    tst_data = data.TSTData(X, Y, w1, w2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5)
    xtr, ytr = tr.xy()

    # Compute median pairwise distance
    med_x = general_utils.meddistance(xtr, 1000)
    med_y = general_utils.meddistance(ytr, 1000)

    if not opt_kernel:
        best_ker_x = kernel_utils.KGauss(med_x**2)
        best_ker_y = kernel_utils.KGauss(med_y**2)
    else:
        # candidate squared widths: median^2 scaled by 2^-1 ... 2^1
        scales = 2.0**np.linspace(-1, 1, 5)
        list_kernels_x = [
            kernel_utils.KGauss(gw2) for gw2 in np.sort((med_x**2) * scales)
        ]
        list_kernels_y = [
            kernel_utils.KGauss(gw2) for gw2 in np.sort((med_y**2) * scales)
        ]

        # grid search to choose the best Gaussian width
        bestix, bestiy, _, _ = QuadHSICTest.grid_search_kernel(
            tr, list_kernels_x, list_kernels_y)

        best_ker_x = list_kernels_x[bestix]
        best_ker_y = list_kernels_y[bestiy]

    hsic_test = QuadHSICTest(best_ker_x, best_ker_y)

    if output == 'stat':
        return hsic_test.compute_stat(te)
    if output == 'p_value':
        return hsic_test.compute_pvalue(te, alpha=alpha)
    if output == 'full':
        return hsic_test.perform_test(te, alpha=alpha)
Exemple #5
0
def stat_comparisons_with_error(methods,param_name,params,size=300, mu = 0, var=1, dx=20, dy=20,output='stat'):
    '''
    Code to compute full performance comparison runs, together with the
    KMM objective value reported by WQuadMMDTest.print_objective_KMM.

    methods: callables to be tested; each is invoked as
        method(x1, x2, y1, y2, output=output) and identified by its __name__
    param_name: which data-generation setting to vary, one of 'mu', 'var',
        'dx', 'dy', 'size'; any other name (including 'prop') leaves the
        generated data unchanged across iterations
    params: values of that setting to iterate over
    size, mu, var, dx, dy: defaults forwarded to data.generate_samples_random
    output: forwarded to every method

    Returns a defaultdict(int) with one entry per (method, param value) pair
    and one 'KMM approx. error' entry per param value.
    '''

    stat_values = defaultdict(int)

    for param in params:

        # define which parameter to iterate over
        if param_name == 'mu':
            mu = param
        elif param_name == 'var':
            var = param
        elif param_name == 'dx':
            dx = param
        elif param_name == 'dy':
            dy = param
        elif param_name == 'size':
            size = param

        # Generate one dataset for the current setting.
        x1, x2, y1, y2 = data.generate_samples_random(size=size, mu = mu, var=var, dx=dx, dy=dy,
                                                      noise ="gaussian",f1='linear', f2='linear')

        # Run every method on the generated data and record its output.
        for method in methods:
            key = 'method: {}; param value: {} '.format(method.__name__, param)
            stat_values[key] = method(x1, x2, y1, y2,output=output)

        # Record the value returned by print_objective_KMM for the same data.
        key = 'KMM approx. error; param value: {} '.format(param)
        wtst_data = data.WTSTData(x1, x2, y1, y2)
        x1x2 = wtst_data.stack_x1x2()
        sig2x = meddistance(x1x2, subsample=1000)
        kx = kernel_utils.KGauss(sig2x)
        mmd_test = tst.WQuadMMDTest(kx, kx, n_permute=200, alpha=0.01)
        stat_values[key] = mmd_test.print_objective_KMM(x1, x2, kx, B=5)

    return stat_values
Exemple #6
0
def run_full_WMMD_test(x1, x2, y1, y2, alpha=0.01, output='stat'):
    '''
    Runs full WMMD test with all optimization procedures included
    require same number of instances in both populations

    x1, x2, y1, y2: the data handed to data.WTSTData
    alpha: significance level passed to the kernel grid search and to the
        test (previously overwritten with 0.01 inside the function)
    output: the desired output, one of 'stat', 'p_value', 'full'

    Returns compute_stat(te), compute_pvalue(te) or perform_test(te)
    according to output; any other value of output falls through and
    returns None.
    '''
    wtst_data = data.WTSTData(x1, x2, y1, y2)
    tr, te = wtst_data.split_tr_te(tr_proportion=0.5)

    # kernel on x, built from the median distance of the stacked x samples
    sig2x = general_utils.meddistance(tr.stack_x1x2(), subsample=1000)
    kx = kernel_utils.KGauss(sig2x)

    # candidate squared widths for the y kernel: median^2 scaled by 2^-4 ... 2^4
    med = general_utils.meddistance(tr.stack_y1y2(), 1000)
    list_gwidth = np.sort((med**2) * (2.0**np.linspace(-4, 4, 20)))
    list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

    # grid search to choose the best Gaussian width
    besti, _ = WQuadMMDTest.grid_search_kernel(tr, list_kernels, kx,
                                               alpha)
    best_ker = list_kernels[besti]

    # perform test on the held-out half with the selected kernel
    mmd_test = WQuadMMDTest(best_ker, kx, n_permute=200, alpha=alpha)
    if output == 'stat':
        return mmd_test.compute_stat(te)
    if output == 'p_value':
        return mmd_test.compute_pvalue(te)
    if output == 'full':
        return mmd_test.perform_test(te)
Exemple #7
0
def RMMD(t1, y1, t2, y2, alpha=0.01, output='stat'):
    '''
    Runs QuadMMDTest on random features generated from the two samples, with
    the Gaussian kernel width chosen by grid search on a training half.

    t1, t2: passed with y1/y2 to generate_random_features; when t1 is None
        the features are generated from y1 and y2 alone
    y1, y2: the two samples
    alpha: significance level passed to the grid search and to the test
    output: the desired output, one of 'stat', 'p_value', 'full'

    Returns compute_stat(te), compute_pvalue(te) or perform_test(te)
    according to output; any other value of output falls through and
    returns None.
    '''

    # identity check: `t1 == None` is elementwise on NumPy arrays and cannot
    # be used as an `if` condition for arrays with more than one element
    if t1 is None:
        # generate random features from the values only, shifted by mean/var
        x1 = general_utils.generate_random_features_1d(
            y1, num_feat=100) + np.mean(y1) / np.var(y1)
        x2 = general_utils.generate_random_features_1d(
            y2, num_feat=100) + np.mean(y2) / np.var(y2)
    else:
        # generate random features
        x1 = general_utils.generate_random_features(t1, y1, num_feat=100)
        x2 = general_utils.generate_random_features(t2, y2, num_feat=100)

    # define training and testing sets
    w1 = [len(t) for t in y1]
    w2 = [len(t) for t in y2]
    tst_data = data.TSTData(x1, x2, w1, w2)
    tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)

    # candidate squared widths: median^2 scaled by 2^-3 ... 2^3
    med = general_utils.meddistance(tr.stack_xy(), 1000)
    list_gwidth = np.sort((med**2) * (2.0**np.linspace(-3, 3, 10)))
    list_kernels = [kernel_utils.KGauss(gw2) for gw2 in list_gwidth]

    # grid search to choose the best Gaussian width
    besti, _ = QuadMMDTest.grid_search_kernel(tr, list_kernels, alpha)
    best_ker = list_kernels[besti]

    # perform test on the held-out half with the selected kernel
    mmd_test = QuadMMDTest(best_ker, n_permute=200, alpha=alpha)
    if output == 'stat':
        return mmd_test.compute_stat(te)
    if output == 'p_value':
        return mmd_test.compute_pvalue(te)
    if output == 'full':
        return mmd_test.perform_test(te)
Exemple #8
0
def test_KMM():
    '''
    Draws two noisy parabola samples with different x ranges, computes
    kernel-mean-matching coefficients for the first against the second, and
    plots both samples with the first one re-drawn at sizes proportional to
    its coefficients.
    '''

    def _noisy_parabola(n_points, x_width):
        # x is uniform on [-6, -6 + x_width]; y is x^2 plus uniform noise
        # on [-5, 5]
        xs = x_width * np.random.random(n_points) - 6.0
        ys = xs**2 + 10 * np.random.random(n_points) - 5
        return np.c_[xs, ys]

    wide_sample = _noisy_parabola(200, 11)  # x lies in [-6,5]
    narrow_sample = _noisy_parabola(100, 2)  # x lies in [-6,-4]

    # kernel parameter from the median distance of the pooled points
    pooled = np.vstack((wide_sample, narrow_sample))
    bandwidth = general_utils.meddistance(pooled, subsample=1000)
    print(bandwidth)
    kernel = kernel_utils.KGauss(bandwidth)

    weights = tst.WQuadMMDTest.kernel_mean_matching(
        wide_sample, narrow_sample, kernel, B=10)

    plt.close()
    plt.figure()
    wide_x, wide_y = wide_sample[:, 0], wide_sample[:, 1]
    plt.scatter(wide_x, wide_y, color='black', marker='x')
    plt.scatter(narrow_sample[:, 0], narrow_sample[:, 1], color='red')
    # overlay the first sample again, marker area scaled by its coefficient
    plt.scatter(wide_x, wide_y, color='green', s=weights * 10, alpha=0.5)

    # count of coefficients above 1e-2; the value is computed but not used
    np.sum(weights > 1e-2)