def SDCIT(Kx: np.ndarray, Ky: np.ndarray, Kz: np.ndarray, Dz=None, size_of_null_sample=1000, with_null=False, seed=None, adjust=True, to_shuffle=True):
    """Self-Discrepancy Conditional Independence Test (Lee and Honavar, 2017).

    Parameters
    ----------
    Kx : np.ndarray
        N by N kernel matrix of X
    Ky : np.ndarray
        N by N kernel matrix of Y
    Kz : np.ndarray
        N by N kernel matrix of Z
    Dz : np.ndarray
        N by N pairwise distance matrix of Z; derived from ``Kz`` with
        ``K2D`` when omitted
    size_of_null_sample : int
        The number of samples in the null distribution
    with_null : bool
        If true, the (centered) null distribution is returned as well
    seed : int
        Random seed; when given, NumPy's global random state is re-seeded
    adjust : bool
        Whether to adjust the null distribution and the test statistic based
        on 'permutation error' information
    to_shuffle : bool
        Shuffle the order of the given data at the beginning, which minimizes
        possible issues with getting a bad permutation

    Returns
    -------
    tuple
        ``(test_statistic, p_value)``, or
        ``(test_statistic, p_value, null_distribution)`` when ``with_null``
        is true.

    References
    ----------
    Lee, S., Honavar, V. (2017). Self-Discrepancy Conditional Independence
    Test. In Proceedings of the Thirty-third Conference on Uncertainty in
    Artificial Intelligence. Corvallis, Oregon: AUAI Press.
    """
    if seed is not None:
        np.random.seed(seed)
    if Dz is None:
        Dz = K2D(Kz)
    if to_shuffle:
        # Categorical Z may yield an ordered 'block' matrix, which may harm
        # the permutation step, hence the up-front shuffle.
        Kx, Ky, Kz, Dz = shuffling(seed, Kx, Ky, Kz, Dz)

    Kxz = Kx * Kz

    observed, observed_error, mask, _ = MMSD(Ky, Kz, Kxz, Dz)
    mask, perm = mask_and_perm(penalized_distance(Dz, mask))

    # The null sample is drawn on the permuted Ky with distances penalized by
    # the updated mask, to avoid permutation between already permuted pairs.
    permuted_Ky = Ky[np.ix_(perm, perm)]
    null_distances = penalized_distance(Dz, mask)
    raw_null, raw_error_null = emp_MMSD(Kxz, permuted_Ky, Kz, null_distances, size_of_null_sample)

    if adjust:
        null, statistic = adjust_errors(raw_error_null, raw_null, observed_error, observed)
    else:
        null, statistic = raw_null, observed

    # Either way the null distribution is centered at zero before use.
    null = null - null.mean()
    p_value = p_value_of(statistic, null)

    if with_null:
        return statistic, p_value, null
    return statistic, p_value
def para(N, independent, trial):
    """Compare SDCIT, KCIT and KCIT2 p-values over a sweep of kernel widths.

    Loads the chaotic-series ``.mat`` file identified by ``trial``,
    ``independent`` and ``N`` (gamma fixed to ``'0.3'`` in the file name) from
    ``SDCIT_DATA_DIR``, builds exponential kernels from max-normalized squared
    Euclidean distances, and runs the three tests once per width multiplier.

    Parameters
    ----------
    N : int
        Sample size; used in the file name and echoed in every result row.
    independent
        Selects which fields of the loaded data play the roles of X, Y and Z;
        also part of the file name.
    trial : int
        Trial index; part of the file name and the seed of every test.

    Returns
    -------
    list of list
        Rows ``[method, N, trial, multiplier, independent, p_value]`` with
        ``method`` in ``'SDCIT'``, ``'KCIT'``, ``'KCIT2'`` (in that order for
        each multiplier).
    """
    def normalized_sq_distances(points):
        # Squared Euclidean distances scaled so that the largest entry is 1.
        dist = euclidean_distances(points, squared=True)
        dist /= np.max(dist)
        return dist

    mat_path = os.path.expanduser(
        SDCIT_DATA_DIR + '/{}_{}_{}_{}_chaotic.mat'.format('0.3', trial, independent, N))
    data = scipy.io.loadmat(mat_path, squeeze_me=True, struct_as_record=False)['data']

    if independent:
        X, Y, Z = data.Xt1, data.Yt, data.Xt[:, 0:2]
    else:
        X, Y, Z = data.Yt1, data.Xt, data.Yt[:, 0:2]

    DX = normalized_sq_distances(X)
    DY = normalized_sq_distances(Y)
    DZ = normalized_sq_distances(Z)

    # Base kernel coefficients derived from medd() of each distance matrix.
    mX = 0.5 / medd(DX)
    mY = 0.5 / medd(DY)
    mZ = 0.5 / medd(DZ)

    rows = []
    for multiplier in (0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100):
        KX = np.exp(-mX * DX * multiplier)
        KY = np.exp(-mY * DY * multiplier)
        KZ = np.exp(-mZ * DZ * multiplier)
        Dz = K2D(KZ)

        # Every test is seeded with the trial index; call order is kept fixed.
        p_kcit = python_kcit_K(KX, KY, KZ, seed=trial)[2]
        p_kcit2 = python_kcit_K2(KX, KY, Z, seed=trial)[2]
        p_sdcit = SDCIT(KX, KY, KZ, Dz=Dz, size_of_null_sample=500, seed=trial, to_shuffle=False)[1]

        rows.extend([
            ['SDCIT', N, trial, multiplier, independent, p_sdcit],
            ['KCIT', N, trial, multiplier, independent, p_kcit],
            ['KCIT2', N, trial, multiplier, independent, p_kcit2],
        ])
    return rows
def experiment(obj_filename):
    """Run the SDCIT vs. KCIPT comparison once and cache it in ``obj_filename``.

    Nothing happens when ``obj_filename`` already exists. Otherwise the
    function runs SDCIT and KCIPT on one chaotic data set, infers the number
    of KCIPT bootstraps ``B`` whose outer-null spread matches SDCIT's null
    spread, collects KCIPT statistics for that ``B``, pickles everything to
    ``obj_filename``, and finally appends 300 ``test_chaotic`` result rows to
    ``kcipt_chaotic_<B>.csv`` under ``SDCIT_RESULT_DIR``.

    NOTE(review): the source formatting was lost; this assumes the entire body
    was guarded by the existence check, since the trailing statements only use
    names bound inside that guard -- confirm.

    Parameters
    ----------
    obj_filename : str
        Path of the pickle file used as both cache marker and output.
    """
    if os.path.exists(obj_filename):
        return

    trial = 0
    gamma_param = 0.0
    N = 400
    independent = 1
    initial_B = 100
    kx, ky, kz, Dz = read_chaotic(independent, gamma_param, trial, N)

    # Compare SDCIT and KCIPT100
    print('SDCIT ... ')
    sdcit_mmd, _sdcit_pval, sdcit_null = SDCIT(kx, ky, kz, with_null=True, seed=trial, to_shuffle=False)
    print('KCIPT {} ... '.format(initial_B))
    _, mmds100, _, outer_null100 = c_KCIPT(kx, ky, kz, K2D(kz), initial_B, 10000, 10000,
                                           n_jobs=PARALLEL_JOBS, seed=trial)

    # Infer desired B from the squared ratio of the two null standard deviations
    std_ratio = outer_null100.std() / sdcit_null.std()
    desired_B = int(initial_B * std_ratio ** 2)
    print('Desired B: {}'.format(desired_B))

    # Prepare outer null distribution
    print('KCIPT {} ... '.format(desired_B))
    _, mmds_B, _, outer_null_B = c_KCIPT(kx, ky, kz, K2D(kz), desired_B, 10000, 10000,
                                         n_jobs=PARALLEL_JOBS, seed=trial)

    print('TS distributions for KCIPT {} ... '.format(desired_B))
    time.sleep(1)
    distr_boot = np.zeros((1000,))
    for boot_seed in trange(len(distr_boot)):
        # NOTE(review): this rebinds mmds_B, so the value pickled below is the
        # one from the last iteration here, not from the call above -- confirm
        # that this is intended.
        _, mmds_B, _, _ = c_KCIPT(kx, ky, kz, K2D(kz), desired_B, 0, 0,
                                  n_jobs=PARALLEL_JOBS, seed=boot_seed)
        distr_boot[boot_seed] = mmds_B.mean()

    payload = [sdcit_mmd, sdcit_null, mmds100, outer_null100, desired_B, mmds_B, outer_null_B, distr_boot]
    with open(obj_filename, 'wb') as pickle_file:
        pickle.dump(payload, pickle_file)

    print(independent, gamma_param, N)
    outs = [test_chaotic(independent, gamma_param, tt, N, B=desired_B, n_jobs=PARALLEL_JOBS)
            for tt in trange(300)]
    csv_path = SDCIT_RESULT_DIR + '/kcipt_chaotic_{}.csv'.format(desired_B)
    with open(csv_path, 'a') as csv_file:
        for row in outs:
            print(*row, sep=',', file=csv_file, flush=True)
def viz_SDCIT_adjust(Kx: np.ndarray, Ky: np.ndarray, Kz: np.ndarray, Dz=None, size_of_null_sample=1000, seed=None):
    """Plot the unadjusted and error-adjusted SDCIT null distributions.

    Runs the same statistic / null-sample computation as SDCIT (without the
    optional shuffle), passes the results through ``viz_adjust_errors`` and
    saves overlaid histograms of both null distributions to
    ``results/viz_adjust_error_two_dists.pdf``.

    Parameters
    ----------
    Kx, Ky, Kz : np.ndarray
        Kernel matrices of X, Y and Z.
    Dz : np.ndarray
        Pairwise distance matrix of Z; derived from ``Kz`` with ``K2D`` when
        omitted.
    size_of_null_sample : int
        The number of samples in the null distribution.
    seed : int
        Random seed; when given, NumPy's global random state is re-seeded.
    """
    if seed is not None:
        np.random.seed(seed)
    if Dz is None:
        Dz = K2D(Kz)

    Kxz = Kx * Kz

    observed, observed_error, mask, _ = MMSD(Ky, Kz, Kxz, Dz)
    mask, perm = mask_and_perm(penalized_distance(Dz, mask))

    # avoid permutation between already permuted pairs.
    permuted_Ky = Ky[np.ix_(perm, perm)]
    null_distances = penalized_distance(Dz, mask)
    raw_null, raw_error_null = emp_MMSD(Kxz, permuted_Ky, Kz, null_distances, size_of_null_sample)

    # Only the adjusted null is plotted; the adjusted statistic is not needed.
    adjusted_null, _ = viz_adjust_errors(raw_error_null, raw_null, observed_error, observed)

    sns.set(style='white', font_scale=1)
    plt.figure(figsize=[4, 1.5])
    for values, label in ((raw_null, 'unadjusted'), (adjusted_null, 'adjusted')):
        ax = sns.distplot(values * 1000, hist=True, kde=False, label=label)
        # Tick labels are blanked on both axes.
        ax.set(xticklabels=[])
        ax.set(yticklabels=[])
    plt.legend()
    sns.despine()
    plt.savefig('results/viz_adjust_error_two_dists.pdf', transparent=True, bbox_inches='tight', pad_inches=0.02)
    plt.close()
# Time c_SDCIT (kernel construction included) for null-sample sizes 500 and
# 1000 over 300 trials, writing one "elapsed,trial,N,b" record per run.
# NOTE(review): `independent`, `N` and the output handle `f` are not bound in
# this fragment; presumably an enclosing scope provides them -- confirm.
for b in [500, 1000]:
    for trial in range(300):
        mat_load = scipy.io.loadmat(os.path.expanduser(
            SDCIT_DATA_DIR + '/{}_{}_{}_{}_chaotic.mat'.format(
                '0.0', trial, independent, N)),
            squeeze_me=True, struct_as_record=False)
        data = mat_load['data']
        X = data.Yt1
        Y = data.Xt
        Z = data.Yt[:, 0:2]

        # Timed region: kernel computation, distance conversion and the test.
        # File loading above is deliberately outside the measurement.
        start = time.time()
        kkk = rbf_kernel_median(X, Y, Z)
        # The last kernel returned is used as the kernel of Z for K2D.
        Dz = K2D(kkk[-1])
        c_SDCIT(*kkk, Dz=Dz, size_of_null_sample=b, seed=trial, to_shuffle=False)
        endtime = time.time()
        # flush=True so each record is written out as soon as it is produced.
        print(endtime - start, trial, N, b, file=f, sep=',', flush=True)
def read_chaotic(independent, gamma, trial, N, dir_at=SDCIT_DATA_DIR + '/'):
    """Load one chaotic data set and return its kernels plus Z distances.

    Parameters
    ----------
    independent, gamma, trial, N
        Forwarded unchanged to ``read_chaotic_data`` to select the data set.
    dir_at : str
        Directory forwarded to ``read_chaotic_data``; defaults to
        ``SDCIT_DATA_DIR`` with a trailing slash.

    Returns
    -------
    tuple
        ``(kx, ky, kz, Dz)``: the three kernels from ``rbf_kernel_median``
        and ``K2D(kz)``.
    """
    x_data, y_data, z_data = read_chaotic_data(independent, gamma, trial, N, dir_at)
    kernels = rbf_kernel_median(x_data, y_data, z_data)
    kx, ky, kz = kernels
    return kx, ky, kz, K2D(kz)
def simple_random_test_inspect(seed, structure_random_p, n, mu, sd, independent, vertex_kernel_hop, slope, fname, titlestr):
    """Generate a random relational skeleton and plot given vs. permuted data.

    Three scatter panels of x against y are saved to ``fname``: the values as
    generated ('given'), and y permuted according to ``K_ZG`` computed with
    ``ignore_gk=False`` ('with GK') and ``ignore_gk=True`` ('without GK').
    Points are colored by a class derived from neighbor counts in the
    skeleton.

    Parameters
    ----------
    seed : int
        Seed for NumPy's global random state.
    structure_random_p, n
        Forwarded to ``generate_structure(n, structure_random_p)``.
    mu, sd
        Forwarded to ``generate_values``.
    independent : bool
        When true, ``slope`` is forced to 0 before generating values.
    vertex_kernel_hop
        Forwarded to ``get_KZG``.
    slope
        Forwarded to ``generate_values`` (overridden when ``independent``).
    fname : str
        Output path handed to ``plt.savefig``.
    titlestr : str
        Figure super-title.
    """
    np.random.seed(seed)

    U = RelationalVariable(RelationalPath([A]), X)
    V = RelationalVariable(RelationalPath([A, AB, B]), Y)
    W = None

    # 1. Structuring
    skeleton = generate_structure(n, structure_random_p)

    # 2. Values
    if independent:
        slope = 0
    generate_values(independent, mu, sd, skeleton, slope)

    K_ZG0 = get_KZG(skeleton, vertex_kernel_hop, U, V, W, ignore_gk=False)
    K_ZG1 = get_KZG(skeleton, vertex_kernel_hop, U, V, W, ignore_gk=True)

    fetcher = SkeletonDataInterface(skeleton, to_shuffle=False)
    flatten_data = fetcher.flatten([U, V], with_base_items=True)
    base_items = flatten_data[:, 0]
    u_i_s = flatten_data[:, 1]
    v_i_s = flatten_data[:, 2]

    # Each entry is a tuple of pairs; element 1 of every pair is the value.
    x_values = np.array([pair[1] for tuple_of_pairs in u_i_s for pair in tuple_of_pairs])
    y_values = np.array([pair[1] for tuple_of_pairs in v_i_s for pair in tuple_of_pairs])

    PP0 = permuted(K2D(K_ZG0))
    PP1 = permuted(K2D(K_ZG1))
    yy0 = y_values[PP0]
    yy1 = y_values[PP1]

    import pandas as pd
    import seaborn
    import matplotlib.pyplot as plt

    def color_class(item):
        # 2 * (#AC-neighbors of the base item) + (#BD-neighbors of the first
        # item reached from it along [A, AB, B]).
        b_item = list(terminal_set(skeleton, RelationalPath([A, AB, B]), item))[0]
        return 2 * len(skeleton.neighbors(item, AC)) + len(skeleton.neighbors(b_item, BD))

    # Do not apply the permutation to 'cc'. This is to preserve colors!
    cc = np.array([color_class(item) for item in base_items])

    df0 = pd.DataFrame({'x': x_values, 'y': y_values, 'cc': cc, 'type': 'given'})
    df1 = pd.DataFrame({'x': x_values, 'y': yy0, 'cc': cc, 'type': 'with GK'})  # graph-kernel
    df2 = pd.DataFrame({'x': x_values, 'y': yy1, 'cc': cc, 'type': 'without GK'})  # graph-kernel
    df = pd.concat([df0, df1, df2], ignore_index=True)

    # NOTE(review): the previous version computed a per-('type', 'cc') min-max
    # normalization via groupby().transform() but discarded the result, so the
    # plot always used raw values. The dead computation is removed and the raw
    # values are kept; assign the transform result if normalization was wanted.

    seaborn.set(style='white', font_scale=1.3, palette=seaborn.color_palette('Set1', 4))

    # FacetGrid creates its own figure, so no plt.figure() call precedes it:
    # the earlier explicit figure was never drawn on nor closed and leaked one
    # figure per call. The facet height is left at its default of 3 because
    # the old `size=` spelling is rejected by newer seaborn releases.
    g = seaborn.FacetGrid(df, col="type", hue='cc', hue_order=[0, 3, 2, 1], aspect=1)
    # linewidth is numeric (it used to be the string '0').
    g.map(plt.scatter, "x", "y", alpha=0.5, linewidth=0)

    # Blank out titles, tick labels and axis labels on the three panels.
    titles = [' ', ' ', ' ']
    for ax, title in zip(g.axes.flat, titles):
        ax.set_title(title)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_xlabel('')
        ax.set_ylabel('')

    plt.suptitle(titlestr)
    plt.savefig(fname, transparent=True, bbox_inches='tight', pad_inches=0.02)
    plt.close()
def c_SDCIT(Kx, Ky, Kz, Dz=None, size_of_null_sample=1000, with_null=False, seed=None, n_jobs=1, adjust=True, to_shuffle=True):
    """C-based Self-Discrepancy Conditional Independence Test (Lee and Honavar, 2017).

    Parameters
    ----------
    Kx : np.ndarray
        N by N kernel matrix of X
    Ky : np.ndarray
        N by N kernel matrix of Y
    Kz : np.ndarray
        N by N kernel matrix of Z
    Dz : np.ndarray
        N by N pairwise distance matrix of Z; derived from ``Kz`` with
        ``K2D`` when omitted
    size_of_null_sample : int
        The number of samples in the null distribution
    with_null : bool
        If true, the (centered) null distribution is returned as well
    seed : int
        Random seed; when given, NumPy's global random state is re-seeded
    n_jobs : int
        Number of threads to be used
    adjust : bool
        Whether to adjust the null distribution and the test statistic based
        on 'permutation error' information
    to_shuffle : bool
        Shuffle the order of the given data at the beginning, which minimizes
        possible issues with getting a bad permutation

    Returns
    -------
    tuple
        ``(test_statistic, p_value)``, or
        ``(test_statistic, p_value, null_distribution)`` when ``with_null``
        is true.

    References
    ----------
    Lee, S., Honavar, V. (2017). Self-Discrepancy Conditional Independence
    Test. In Proceedings of the Thirty-third Conference on Uncertainty in
    Artificial Intelligence. Corvallis, Oregon: AUAI Press.
    """
    def shrink_about_mean(values):
        # Halve the spread of `values` around their own mean.
        # NOTE(review): the rationale for the 0.5 factor is not visible here.
        center = values.mean()
        return 0.5 * (values - center) + center

    if seed is not None:
        np.random.seed(seed)
    if Dz is None:
        Dz = K2D(Kz)
    if to_shuffle:
        # Categorical Z may yield an ordered 'block' matrix, which may harm
        # the permutation step, hence the up-front shuffle.
        Kxz_inputs = shuffling(seed, Kx, Ky, Kz, Dz)
        Kx, Ky, Kz, Dz = Kxz_inputs

    Kxz = Kx * Kz

    # Prepare inputs and the output buffers that cy_sdcit fills in place.
    Kxz, Ky, Kz, Dz = cythonize(Kxz, Ky, Kz, Dz)
    null_buf = np.zeros((size_of_null_sample,), dtype='float64')
    error_null_buf = np.zeros((size_of_null_sample,), dtype='float64')
    mmsd = np.zeros((1,), dtype='float64')
    error_mmsd = np.zeros((1,), dtype='float64')

    # run SDCIT
    cy_sdcit(Kxz, Ky, Kz, Dz, size_of_null_sample, random_seeds(), n_jobs,
             mmsd, error_mmsd, null_buf, error_null_buf)

    # post-process outputs
    observed = mmsd[0]
    observed_error = error_mmsd[0]
    raw_null = shrink_about_mean(null_buf)
    raw_error_null = shrink_about_mean(error_null_buf)

    if adjust:
        null, statistic = adjust_errors(raw_error_null, raw_null, observed_error, observed)
    else:
        null, statistic = raw_null, observed

    # Either way the null distribution is centered at zero before use.
    null = null - null.mean()
    p_value = p_value_of(statistic, null)

    if with_null:
        return statistic, p_value, null
    return statistic, p_value