Ejemplo n.º 1
0
def SDCIT(Kx: np.ndarray, Ky: np.ndarray, Kz: np.ndarray, Dz=None, size_of_null_sample=1000, with_null=False, seed=None, adjust=True, to_shuffle=True):
    """Self-Discrepancy Conditional Independence Test (Lee and Honavar, 2017).

    Parameters
    ----------
    Kx : np.ndarray
        N by N kernel matrix of X
    Ky : np.ndarray
        N by N kernel matrix of Y
    Kz : np.ndarray
        N by N kernel matrix of Z
    Dz : np.ndarray
        N by N pairwise distance matrix of Z; derived from ``Kz`` via ``K2D``
        when omitted
    size_of_null_sample : int
        Number of samples drawn for the null distribution
    with_null : bool
        If true, the (centred) null distribution is returned as a third item
    seed : int
        Random seed; when given, NumPy's global RNG is re-seeded with it
    adjust : bool
        Whether to correct the null distribution and the test statistic using
        the 'permutation error' information
    to_shuffle : bool
        Shuffle the order of the given data first, which reduces the chance
        of ending up with a bad permutation

    Returns
    -------
    tuple
        ``(test_statistic, p_value)``, or
        ``(test_statistic, p_value, null_distribution)`` if ``with_null``.

    References
    ----------
        Lee, S., Honavar, V. (2017). Self-Discrepancy Conditional Independence Test.
        In Proceedings of the Thirty-third Conference on Uncertainty in Artificial Intelligence. Corvallis, Oregon: AUAI Press.
    """
    if seed is not None:
        np.random.seed(seed)

    Dz = K2D(Kz) if Dz is None else Dz

    if to_shuffle:
        # A categorical Z can produce an ordered 'block' matrix, which may
        # harm the permutation step; shuffling breaks that ordering.
        Kx, Ky, Kz, Dz = shuffling(seed, Kx, Ky, Kz, Dz)

    Kxz = Kx * Kz

    observed_stat, observed_error, mask, _ = MMSD(Ky, Kz, Kxz, Dz)
    mask, Pidx = mask_and_perm(penalized_distance(Dz, mask))

    # The null sample starts from the already-permuted Ky and uses the updated
    # mask so that already-permuted pairs are not permuted with each other.
    permuted_Ky = Ky[np.ix_(Pidx, Pidx)]
    null_distances = penalized_distance(Dz, mask)
    raw_null, raw_errors = emp_MMSD(Kxz, permuted_Ky, Kz, null_distances, size_of_null_sample)

    if adjust:
        null_sample, statistic = adjust_errors(raw_errors, raw_null, observed_error, observed_stat)
    else:
        null_sample, statistic = raw_null, observed_stat
    # Both branches report a zero-mean null distribution.
    null_sample = null_sample - null_sample.mean()

    p_value = p_value_of(statistic, null_sample)
    if with_null:
        return statistic, p_value, null_sample
    return statistic, p_value
Ejemplo n.º 2
0
def para(N, independent, trial):
    """Collect p-values of SDCIT, KCIT and KCIT2 over a range of kernel widths.

    Loads the chaotic-series ``.mat`` file identified by the fixed prefix
    ``'0.3'``, ``trial``, ``independent`` and ``N`` from ``SDCIT_DATA_DIR``,
    builds exponential kernels from max-normalised squared Euclidean
    distances, and runs the three tests once per bandwidth multiplier.

    Parameters
    ----------
    N : int
        Sample-size tag used in the data file name.
    independent : int or bool
        Selects which variables of the loaded record play the roles of
        X, Y and Z; also part of the data file name.
    trial : int
        Trial tag of the data file; reused as the random seed of every test.

    Returns
    -------
    list of list
        Rows of ``[test_name, N, trial, multiplier, independent, p_value]``,
        three rows (SDCIT, KCIT, KCIT2) per multiplier.
    """
    mat_path = os.path.expanduser(
        SDCIT_DATA_DIR +
        '/{}_{}_{}_{}_chaotic.mat'.format('0.3', trial, independent, N))
    data = scipy.io.loadmat(mat_path,
                            squeeze_me=True,
                            struct_as_record=False)['data']

    if independent:
        X, Y, Z = data.Xt1, data.Yt, data.Xt[:, 0:2]
    else:
        X, Y, Z = data.Yt1, data.Xt, data.Yt[:, 0:2]

    def _unit_scaled_sq_dists(values):
        # Squared Euclidean distances rescaled so the largest entry is 1.
        dists = euclidean_distances(values, squared=True)
        dists /= np.max(dists)
        return dists

    DX, DY, DZ = (_unit_scaled_sq_dists(values) for values in (X, Y, Z))
    mX, mY, mZ = (0.5 / medd(dists) for dists in (DX, DY, DZ))

    multipliers = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100]

    outs = []
    for multiplier in multipliers:
        KX = np.exp(-mX * DX * multiplier)
        KY = np.exp(-mY * DY * multiplier)
        KZ = np.exp(-mZ * DZ * multiplier)

        # Tests are invoked in this order (KCIT, KCIT2, SDCIT); each receives
        # the same seed. Index 2 / index 1 of the results hold the p-values.
        kcit_p = python_kcit_K(KX, KY, KZ, seed=trial)[2]
        kcit2_p = python_kcit_K2(KX, KY, Z, seed=trial)[2]
        sdcit_p = SDCIT(KX, KY, KZ,
                        Dz=K2D(KZ),
                        size_of_null_sample=500,
                        seed=trial,
                        to_shuffle=False)[1]

        outs.extend([
            ['SDCIT', N, trial, multiplier, independent, sdcit_p],
            ['KCIT', N, trial, multiplier, independent, kcit_p],
            ['KCIT2', N, trial, multiplier, independent, kcit2_p],
        ])
    return outs
def experiment(obj_filename):
    """Compare SDCIT with KCIPT on one chaotic data set and cache the results.

    Does nothing when ``obj_filename`` already exists. Otherwise it

    1. runs SDCIT and KCIPT (with ``B = 100``) on the data of trial 0,
    2. derives a ``desired_B`` for KCIPT from the ratio of the two null
       standard deviations,
    3. reruns KCIPT with ``desired_B`` to get its outer null distribution,
    4. bootstraps the KCIPT test statistic 1000 times (seeds 0..999),
    5. pickles ``[sdcit_mmd, sdcit_null, mmds100, outer_null100, desired_B,
       mmds_B, outer_null_B, distr_boot]`` to ``obj_filename``, and
    6. appends the rows of 300 ``test_chaotic`` runs to
       ``kcipt_chaotic_<desired_B>.csv`` under ``SDCIT_RESULT_DIR``.

    Parameters
    ----------
    obj_filename : str
        Path of the pickle file used as the cache / completion marker.
    """
    if os.path.exists(obj_filename):
        return

    trial = 0
    gamma_param = 0.0
    N = 400
    independent = 1
    initial_B = 100
    kx, ky, kz, _ = read_chaotic(independent, gamma_param, trial, N)

    # Compare SDCIT and KCIPT100
    print('SDCIT ... ')
    sdcit_mmd, sdcit_pval, sdcit_null = SDCIT(kx, ky, kz, with_null=True, seed=trial, to_shuffle=False)
    print('KCIPT {} ... '.format(initial_B))
    _, mmds100, _, outer_null100 = c_KCIPT(kx, ky, kz, K2D(kz), initial_B, 10000, 10000, n_jobs=PARALLEL_JOBS, seed=trial)

    # Infer desired B from the squared ratio of the null standard deviations
    desired_B = int(initial_B * (outer_null100.std() / sdcit_null.std()) ** 2)
    print('Desired B: {}'.format(desired_B))

    # Prepare outer null distribution
    print('KCIPT {} ... '.format(desired_B))
    _, mmds_B, _, outer_null_B = c_KCIPT(kx, ky, kz, K2D(kz), desired_B, 10000, 10000, n_jobs=PARALLEL_JOBS, seed=trial)

    print('TS distributions for KCIPT {} ... '.format(desired_B))
    time.sleep(1)
    distr_boot = np.zeros((1000,))
    for ii in trange(len(distr_boot)):
        # Use a separate name here: rebinding `mmds_B` would replace the MMDs
        # of the seed=trial run above, and the pickle below would then store
        # the last bootstrap run's values next to `outer_null_B`.
        _, boot_mmds, _, _ = c_KCIPT(kx, ky, kz, K2D(kz), desired_B, 0, 0, n_jobs=PARALLEL_JOBS, seed=ii)
        distr_boot[ii] = boot_mmds.mean()

    with open(obj_filename, 'wb') as f:
        pickle.dump([sdcit_mmd, sdcit_null, mmds100, outer_null100, desired_B, mmds_B, outer_null_B, distr_boot], f)

    print(independent, gamma_param, N)
    outs = [test_chaotic(independent, gamma_param, tt, N, B=desired_B, n_jobs=PARALLEL_JOBS) for tt in trange(300)]
    with open(SDCIT_RESULT_DIR + '/kcipt_chaotic_{}.csv'.format(desired_B), 'a') as f:
        for out in outs:
            print(*out, sep=',', file=f, flush=True)
Ejemplo n.º 4
0
def viz_SDCIT_adjust(Kx: np.ndarray,
                     Ky: np.ndarray,
                     Kz: np.ndarray,
                     Dz=None,
                     size_of_null_sample=1000,
                     seed=None):
    """Plot the SDCIT null distribution before and after error adjustment.

    Follows the same steps as SDCIT without the initial shuffling, passes the
    raw null sample through ``viz_adjust_errors`` and writes overlaid
    histograms of the unadjusted and adjusted null samples (both scaled by
    1000) to ``results/viz_adjust_error_two_dists.pdf``.

    Parameters
    ----------
    Kx, Ky, Kz : np.ndarray
        Kernel matrices of X, Y and Z.
    Dz : np.ndarray, optional
        Pairwise distance matrix of Z; derived from ``Kz`` via ``K2D`` when
        omitted.
    size_of_null_sample : int
        Number of samples drawn for the null distribution.
    seed : int, optional
        When given, NumPy's global RNG is re-seeded with it.
    """
    if seed is not None:
        np.random.seed(seed)

    Dz = K2D(Kz) if Dz is None else Dz
    Kxz = Kx * Kz

    observed_stat, observed_error, mask, _ = MMSD(Ky, Kz, Kxz, Dz)
    mask, Pidx = mask_and_perm(penalized_distance(Dz, mask))

    # Draw the null sample from the already-permuted Ky with the updated mask
    # so that already-permuted pairs are not permuted with each other.
    permuted_Ky = Ky[np.ix_(Pidx, Pidx)]
    null_distances = penalized_distance(Dz, mask)
    raw_null, raw_errors = emp_MMSD(Kxz, permuted_Ky, Kz, null_distances,
                                    size_of_null_sample)

    adjusted_null, _ = viz_adjust_errors(raw_errors, raw_null,
                                         observed_error, observed_stat)

    sns.set(style='white', font_scale=1)
    plt.figure(figsize=[4, 1.5])

    # Tick labels are blanked after each histogram is drawn.
    ax = sns.distplot(raw_null * 1000, hist=True, kde=False,
                      label='unadjusted')
    ax.set(xticklabels=[])
    ax.set(yticklabels=[])

    ax = sns.distplot(adjusted_null * 1000, hist=True, kde=False,
                      label='adjusted')
    ax.set(yticklabels=[])
    ax.set(xticklabels=[])

    plt.legend()
    sns.despine()
    plt.savefig('results/viz_adjust_error_two_dists.pdf',
                transparent=True,
                bbox_inches='tight',
                pad_inches=0.02)
    plt.close()
Ejemplo n.º 5
0
                # Time one c_SDCIT run per (null-sample size, trial) pair.
                # NOTE(review): `independent`, `N` and the output handle `f`
                # come from the enclosing scope, which is not visible here.
                for b in [500, 1000]:
                    for trial in range(300):
                        # Load the chaotic-series record for this trial (file
                        # name prefix is fixed to '0.0').
                        mat_load = scipy.io.loadmat(os.path.expanduser(
                            SDCIT_DATA_DIR + '/{}_{}_{}_{}_chaotic.mat'.format(
                                '0.0', trial, independent, N)),
                                                    squeeze_me=True,
                                                    struct_as_record=False)
                        data = mat_load['data']
                        X = data.Yt1
                        Y = data.Xt
                        Z = data.Yt[:, 0:2]

                        # The measured interval covers kernel construction,
                        # the distance matrix and the test itself; loading the
                        # file is excluded.
                        start = time.time()

                        kkk = rbf_kernel_median(X, Y, Z)
                        # The last kernel returned is used as the kernel of Z.
                        Dz = K2D(kkk[-1])
                        c_SDCIT(*kkk,
                                Dz=Dz,
                                size_of_null_sample=b,
                                seed=trial,
                                to_shuffle=False)

                        endtime = time.time()
                        # One comma-separated row: elapsed seconds, trial, N, b.
                        print(endtime - start,
                              trial,
                              N,
                              b,
                              file=f,
                              sep=',',
                              flush=True)
Ejemplo n.º 6
0
def read_chaotic(independent, gamma, trial, N, dir_at=SDCIT_DATA_DIR + '/'):
    """Load one chaotic data set and return its kernels and Z distances.

    The raw ``X, Y, Z`` come from ``read_chaotic_data``; kernels are built
    with ``rbf_kernel_median`` and the distance matrix of Z is derived from
    the Z kernel via ``K2D``.

    Returns
    -------
    tuple
        ``(kx, ky, kz, Dz)``
    """
    X, Y, Z = read_chaotic_data(independent, gamma, trial, N, dir_at)
    kx, ky, kz = rbf_kernel_median(X, Y, Z)
    return kx, ky, kz, K2D(kz)
Ejemplo n.º 7
0
def simple_random_test_inspect(seed, structure_random_p, n, mu, sd,
                               independent, vertex_kernel_hop, slope, fname,
                               titlestr):
    """Plot given vs. permuted (x, y) pairs of a random relational skeleton.

    Generates a random skeleton and values, builds the conditioning kernel
    twice via ``get_KZG`` (``ignore_gk=False`` and ``ignore_gk=True``),
    reorders the y values with the index returned by ``permuted`` for each
    kernel, and saves a three-panel scatter plot ('given', 'with GK',
    'without GK') to ``fname``.

    Parameters
    ----------
    seed : int
        Seed for NumPy's global RNG.
    structure_random_p, n
        Passed to ``generate_structure`` as ``(n, structure_random_p)``.
    mu, sd
        Passed through to ``generate_values``.
    independent : bool
        When true, ``slope`` is forced to 0 before generating values.
    vertex_kernel_hop
        Passed through to ``get_KZG``.
    slope
        Passed to ``generate_values`` (overridden to 0 when ``independent``).
    fname : str
        Output path of the figure.
    titlestr : str
        Figure super-title.
    """
    import pandas as pd
    import seaborn
    import matplotlib.pyplot as plt

    np.random.seed(seed)
    U = RelationalVariable(RelationalPath([A]), X)
    V = RelationalVariable(RelationalPath([A, AB, B]), Y)
    W = None

    # 1. Structuring
    skeleton = generate_structure(n, structure_random_p)

    # 2. Values
    if independent:
        slope = 0

    generate_values(independent, mu, sd, skeleton, slope)

    K_ZG0 = get_KZG(skeleton, vertex_kernel_hop, U, V, W, ignore_gk=False)
    K_ZG1 = get_KZG(skeleton, vertex_kernel_hop, U, V, W, ignore_gk=True)

    fetcher = SkeletonDataInterface(skeleton, to_shuffle=False)
    flatten_data = fetcher.flatten([U, V], with_base_items=True)
    base_items = flatten_data[:, 0]
    u_i_s = flatten_data[:, 1]
    v_i_s = flatten_data[:, 2]

    # Each entry is a tuple of pairs; the second element of every pair is
    # collected as the value.
    x_values = np.array(
        [pair[1] for tuple_of_pairs in u_i_s for pair in tuple_of_pairs])
    y_values = np.array(
        [pair[1] for tuple_of_pairs in v_i_s for pair in tuple_of_pairs])

    PP0 = permuted(K2D(K_ZG0))
    PP1 = permuted(K2D(K_ZG1))

    # Colour code per base item: 2 * (#AC neighbours of the item) +
    # (#BD neighbours of the first item reached through A-AB-B).
    ac_counts = np.array(
        [len(skeleton.neighbors(item, AC)) for item in base_items])
    bd_counts = np.array([
        len(
            skeleton.neighbors(
                list(
                    terminal_set(skeleton, RelationalPath([A, AB, B]),
                                 item))[0], BD)) for item in base_items
    ])
    cc = 2 * ac_counts + bd_counts

    def _panel(ys, label):
        # Only y is permuted; 'cc' stays tied to x so colours are preserved.
        return pd.DataFrame({'x': x_values, 'y': ys, 'cc': cc, 'type': label})

    df = pd.concat([
        _panel(y_values, 'given'),
        _panel(y_values[PP0], 'with GK'),  # graph-kernel
        _panel(y_values[PP1], 'without GK'),  # graph-kernel
    ], ignore_index=True)

    # NOTE(review): the original computed a per-('type', 'cc') min-max
    # normalisation with groupby().transform() here but discarded the result,
    # so it never affected the plot. The no-op is removed; confirm whether
    # the normalisation was meant to be applied.

    seaborn.set(style='white',
                font_scale=1.3,
                palette=seaborn.color_palette('Set1', 4))
    # FacetGrid creates its own figure, so no separate plt.figure() call is
    # made (the extra figure was never drawn on nor closed).
    # NOTE(review): recent seaborn releases name this argument `height`
    # instead of `size`; kept as-is to match the seaborn version in use.
    g = seaborn.FacetGrid(df,
                          col="type",
                          hue='cc',
                          hue_order=[0, 3, 2, 1],
                          size=3,
                          aspect=1)
    # linewidth is numeric (the original passed the string '0').
    g.map(plt.scatter, "x", "y", alpha=0.5, linewidth=0)

    # Blank out titles, tick labels and axis labels on each of the panels.
    titles = [' ', ' ', ' ']
    for ax, title in zip(g.axes.flat, titles):
        ax.set_title(title)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_xlabel('')
        ax.set_ylabel('')
    plt.suptitle(titlestr)
    plt.savefig(fname, transparent=True, bbox_inches='tight', pad_inches=0.02)
    plt.close()
Ejemplo n.º 8
0
def c_SDCIT(Kx, Ky, Kz, Dz=None, size_of_null_sample=1000, with_null=False, seed=None, n_jobs=1, adjust=True, to_shuffle=True):
    """C-based Self-Discrepancy Conditional Independence Test (Lee and Honavar, 2017).

    Parameters
    ----------
    Kx : np.ndarray
        N by N kernel matrix of X
    Ky : np.ndarray
        N by N kernel matrix of Y
    Kz : np.ndarray
        N by N kernel matrix of Z
    Dz : np.ndarray
        N by N pairwise distance matrix of Z; derived from ``Kz`` via ``K2D``
        when omitted
    size_of_null_sample : int
        Number of samples drawn for the null distribution
    with_null : bool
        If true, the (centred) null distribution is returned as a third item
    seed : int
        Random seed; when given, NumPy's global RNG is re-seeded with it
    n_jobs : int
        Number of threads to be used
    adjust : bool
        Whether to correct the null distribution and the test statistic using
        the 'permutation error' information
    to_shuffle : bool
        Shuffle the order of the given data first, which reduces the chance
        of ending up with a bad permutation

    Returns
    -------
    tuple
        ``(test_statistic, p_value)``, or
        ``(test_statistic, p_value, null_distribution)`` if ``with_null``.

    References
    ----------
        Lee, S., Honavar, V. (2017). Self-Discrepancy Conditional Independence Test.
        In Proceedings of the Thirty-third Conference on Uncertainty in Artificial Intelligence. Corvallis, Oregon: AUAI Press.
    """
    if seed is not None:
        np.random.seed(seed)

    Dz = K2D(Kz) if Dz is None else Dz

    if to_shuffle:
        # A categorical Z can produce an ordered 'block' matrix, which may
        # harm the permutation step; shuffling breaks that ordering.
        Kx, Ky, Kz, Dz = shuffling(seed, Kx, Ky, Kz, Dz)

    # Inputs for the compiled routine, plus the buffers it writes into.
    Kxz, Ky, Kz, Dz = cythonize(Kx * Kz, Ky, Kz, Dz)

    def _float_buffer(length):
        return np.zeros((length,), dtype='float64')

    raw_null = _float_buffer(size_of_null_sample)
    error_raw_null = _float_buffer(size_of_null_sample)
    mmsd = _float_buffer(1)
    error_mmsd = _float_buffer(1)

    cy_sdcit(Kxz, Ky, Kz, Dz, size_of_null_sample, random_seeds(), n_jobs, mmsd, error_mmsd, raw_null, error_raw_null)

    def _halve_spread(sample):
        # Halve every deviation from the sample mean; the mean is unchanged.
        return 0.5 * (sample - sample.mean()) + sample.mean()

    observed_stat = mmsd[0]
    observed_error = error_mmsd[0]
    raw_null = _halve_spread(raw_null)
    error_raw_null = _halve_spread(error_raw_null)

    if adjust:
        null_sample, statistic = adjust_errors(error_raw_null, raw_null, observed_error, observed_stat)
    else:
        null_sample, statistic = raw_null, observed_stat
    # Both branches report a zero-mean null distribution.
    null_sample = null_sample - null_sample.mean()

    p_value = p_value_of(statistic, null_sample)
    if with_null:
        return statistic, p_value, null_sample
    return statistic, p_value