def MI_test(n, burn_in, cc_samples, which_test, n_MI_samples=500, correlation=0):
    """Run one analysis via ``do_test`` and summarize MI between columns 0 and 1.

    The per-sample mutual information and Linfoot values returned by
    ``iu.mutual_information`` are averaged with ``numpy.mean`` and formatted
    into a one-entry report that is both printed and returned.

    :param n: forwarded to ``do_test``; reported as ``N`` in the summary
    :param burn_in: forwarded to ``do_test``; reported as ``burn_in``
    :param cc_samples: forwarded to ``do_test``; reported as ``samples``
    :param which_test: label of the test; ``"correlated"`` selects the summary
        format that shows the correlation value, any other value is printed
        verbatim
    :type which_test: str
    :param n_MI_samples: ``n_samples`` passed to ``iu.mutual_information``
    :type n_MI_samples: int
    :param correlation: forwarded to ``do_test`` as ``correlation``
    :returns: str -- the summary text
    """
    # NOTE(review): the test type handed to do_test is always "correlated";
    # which_test only affects the summary label -- confirm this is intended.
    M_c, X_Ls, X_Ds = do_test(0, 0, n, burn_in, cc_samples, "correlated",
                              correlation=correlation, do_plot=False)

    # query column 0 and 1
    MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0, 1)],
                                        n_samples=n_MI_samples)

    # collapse the per-sample estimates into a single number each
    MI = numpy.mean(MI)
    Linfoot = numpy.mean(Linfoot)

    if which_test == "correlated":
        test_strn = "Test: correlation (%1.2f), N: %i, burn_in: %i, samples: %i, MI_samples: %i\n\tMI: %f, Linfoot %f" % (
            correlation, n, burn_in, cc_samples, n_MI_samples, MI, Linfoot)
    else:
        test_strn = "Test: %s, N: %i, burn_in: %i, samples: %i, MI_samples: %i\n\tMI: %f, Linfoot %f" % (
            which_test, n, burn_in, cc_samples, n_MI_samples, MI, Linfoot)

    # Function-call form: valid on Python 3 and prints the same text on Python 2.
    print(test_strn)
    return test_strn
def MI_test(n, burn_in, cc_samples, which_test, n_MI_samples=500, correlation=0):
    """Summarize the mean MI and Linfoot between columns 0 and 1 of a test run.

    States come from ``do_test``; ``iu.mutual_information`` is given a seed
    callable that draws from ``random.randrange(32000)``. The averaged
    estimates are formatted into a summary that is printed and returned.

    :param n: forwarded to ``do_test``; reported as ``N``
    :param burn_in: forwarded to ``do_test``; reported as ``burn_in``
    :param cc_samples: forwarded to ``do_test``; reported as ``samples``
    :param which_test: test label; ``"correlated"`` switches to the summary
        format that shows the correlation value
    :type which_test: str
    :param n_MI_samples: ``n_samples`` passed to ``iu.mutual_information``
    :type n_MI_samples: int
    :param correlation: forwarded to ``do_test`` as ``correlation``
    :returns: str -- the summary text
    """

    def get_next_seed():
        # fresh seed drawn from the module-level random generator
        return random.randrange(32000)

    M_c, X_Ls, X_Ds = do_test(
        0, 0, n, burn_in, cc_samples, "correlated",
        correlation=correlation, do_plot=False,
    )

    # query column 0 and 1
    mi_estimates, linfoot_estimates = iu.mutual_information(
        M_c, X_Ls, X_Ds, [(0, 1)], get_next_seed, n_samples=n_MI_samples
    )
    mean_mi = numpy.mean(mi_estimates)
    mean_linfoot = numpy.mean(linfoot_estimates)

    # everything after the leading label field is common to both formats
    shared_fields = (n, burn_in, cc_samples, n_MI_samples, mean_mi, mean_linfoot)

    if which_test != "correlated":
        template = "Test: %s, N: %i, burn_in: %i, samples: %i, MI_samples: %i\n\tMI: %f, Linfoot %f"
        test_strn = template % ((which_test,) + shared_fields)
    else:
        template = "Test: correlation (%1.2f), N: %i, burn_in: %i, samples: %i, MI_samples: %i\n\tMI: %f, Linfoot %f"
        test_strn = template % ((correlation,) + shared_fields)

    print(test_strn)
    return test_strn
def run_mi_test_local(data_dict):
    """Compute the average within-view mutual information for one configuration.

    A correlated state is generated with ``mitu.generate_correlated_state``,
    analyzed for ``burn_in`` steps with a ``LE.LocalEngine``, and the mutual
    information of every pair of columns that share a view in the analyzed
    state is averaged.

    :param data_dict: run configuration; the keys read are ``SEED``,
        ``CCSEED``, ``num_clusters``, ``num_cols``, ``num_rows``,
        ``num_views``, ``corr``, ``burn_in``, ``id``, ``dataset`` and
        ``sample``
    :type data_dict: dict
    :returns: dict -- ``id``, ``dataset`` and ``sample`` copied from
        ``data_dict`` plus ``mi``, the mean MI over all same-view column
        pairs (``0.0`` when no view holds two or more columns)
    """
    gen_seed = data_dict['SEED']
    crosscat_seed = data_dict['CCSEED']
    num_clusters = data_dict['num_clusters']
    num_cols = data_dict['num_cols']
    num_rows = data_dict['num_rows']
    num_views = data_dict['num_views']
    corr = data_dict['corr']
    burn_in = data_dict['burn_in']

    mean_range = float(num_clusters) * 2.0

    # Seeds the module-level random generator; kept for its global side effect.
    random.seed(gen_seed)

    # generate the state (only the table, metadata and latent state are used)
    T, M_c, _, X_L, X_D, _ = mitu.generate_correlated_state(
        num_rows, num_cols, num_views, num_clusters, mean_range, corr,
        seed=gen_seed)

    engine = LE.LocalEngine(crosscat_seed)
    X_L, X_D = engine.analyze(M_c, T, X_L, X_D, n_steps=burn_in)

    # view membership of each column after analysis
    view_assignment = numpy.array(X_L['column_partition']['assignments'])
    n_views = max(view_assignment) + 1

    # for each view accumulate the MI between all pairs of its columns
    mi_total = 0.0
    n_pairs = 0
    for view in range(n_views):
        columns_in_view = numpy.nonzero(view_assignment == view)[0]
        for pair in itertools.combinations(columns_in_view, 2):
            MI_i, _ = iu.mutual_information(
                M_c, [X_L], [X_D], [pair], n_samples=1000)
            mi_total += MI_i[0][0]
            n_pairs += 1

    # leave the result at 0.0 when there was nothing to average
    mean_mi = mi_total / n_pairs if n_pairs else 0.0

    return dict(
        id=data_dict['id'],
        dataset=data_dict['dataset'],
        sample=data_dict['sample'],
        mi=mean_mi,
    )
def mutual_information(
        self, M_c, X_L_list, X_D_list, Q, seed, n_samples=1000):
    """Estimate the mutual information of each column pair in Q.

    A seed callable is built from ``seed`` with ``make_get_next_seed`` and the
    computation is delegated to ``iu.mutual_information``.

    :param M_c: passed through to ``iu.mutual_information``
    :param X_L_list: passed through to ``iu.mutual_information``
    :param X_D_list: passed through to ``iu.mutual_information``
    :param Q: List of tuples where each tuple contains the two column
        indexes to compare
    :type Q: list of two-tuples of ints
    :param seed: value handed to ``make_get_next_seed``
    :param n_samples: the number of simple predictive samples to use
    :type n_samples: int

    :returns: list of list -- where each sublist is a set of MIs and
        Linfoots from each crosscat sample.
    """
    seed_source = make_get_next_seed(seed)
    estimates = iu.mutual_information(
        M_c, X_L_list, X_D_list, Q, seed_source, n_samples)
    return estimates
def mutual_information(self, M_c, X_L_list, X_D_list, Q, n_samples=1000):
    """Estimate the mutual information of each column pair in Q.

    All arguments are forwarded unchanged to ``iu.mutual_information``.

    :param M_c: The column metadata
    :type M_c: dict
    :param X_L_list: list of the latent variables associated with the latent
        state
    :type X_L_list: list of dict
    :param X_D_list: list of the particular cluster assignments of each row
        in each view
    :type X_D_list: list of list of lists
    :param Q: List of tuples where each tuple contains the two column indexes
        to compare
    :type Q: list of two-tuples of ints
    :param n_samples: the number of simple predictive samples to use
    :type n_samples: int

    :returns: list of list, where each sublist is a set of MIs and Linfoots
        from each crosscat sample.
    """
    estimates = iu.mutual_information(M_c, X_L_list, X_D_list, Q, n_samples)
    return estimates
datas.append(T) print "num_samples: %i, width: %f" % (n, w) M_c = du.gen_M_c_from_T(T, cctypes) X_Ls = [] X_Ds = [] for ns in range(n_samples): state = State.p_State(M_c, T) state.transition(n_steps=burn_in) X_Ds.append(state.get_X_D()) X_Ls.append(state.get_X_L()) MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0, 1)], n_samples=5000) data_d = numpy.transpose(MI) if nr == 0: data = data_d else: data = numpy.hstack((data, data_d)) mi_ests[nr] = mi_est nr += 1 pl.figure(tight_layout=True, figsize=(len(widths) * 4, 4)) i = 0
pr, p = pearsonr(T[:,0], T[:,1]) print "num_samples: %i, R: %f, d: %i. Actual R: %f" % (n, r, d+1, pr) M_c = du.gen_M_c_from_T(T) X_Ls = [] X_Ds = [] for _ in range(n_samples): state = State.p_State(M_c, T) state.transition(n_steps=burn_in) X_Ds.append(state.get_X_D()) X_Ls.append(state.get_X_L()) MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0,1)], n_samples=200) if d == 0: data_d = numpy.transpose(Linfoot) else: data_d = numpy.vstack((data_d, numpy.transpose(Linfoot))) if nr == 0: data = data_d else: data = numpy.hstack((data, data_d)) nr += 1 pl.subplot(2,2,subplot)
def run_test(args):
    """Plot how the CrossCat MI estimate behaves as the number of rows grows.

    One correlated two-column table is generated at the largest row count; for
    each row count on a log-spaced grid the first ``n`` rows are analyzed
    ``num_times`` times and the MI between columns 0 and 1 is estimated. Two
    PNG files are written, both named from ``args.filename`` plus the current
    Unix time: ``<base>_DATA.png`` shows the data subsets and ``<base>.png``
    shows the mean estimate with its spread against the reference MI values.

    :param args: object with attributes ``rho``, ``num_times``,
        ``min_num_rows``, ``max_num_rows``, ``n_grid``, ``filename`` and
        ``discrete``
    :returns: None
    """
    rho = args.rho
    num_times = args.num_times
    filename = args.filename
    discrete = args.discrete

    row_grid = log_linspace(args.min_num_rows, args.max_num_rows, args.n_grid)
    num_samples = [int(ns) for ns in row_grid.tolist()]

    burn_in = 200

    # MIs[t, j]: estimate from repetition t at row count num_samples[j]
    MIs = numpy.zeros((num_times, len(num_samples)))

    if discrete:
        T, true_mi, external_mi = gen_correlated_data_discrete(
            num_samples[-1], rho)
        cctypes = ['multinomial'] * 2
        dtype_str = "discrete"
    else:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ['continuous'] * 2
        dtype_str = "continuous"

    data_subs = []
    for n_index, n in enumerate(num_samples):
        # Take exactly the first n rows so the subset size matches the row
        # count shown in the plot titles and on the x axis.
        T_sub = numpy.copy(T[:n, :])
        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()

            MI, Linfoot = iu.mutual_information(
                M_c, [X_L], [X_D], [(0, 1)], n_samples=5000)

            estimate = MI[0][0]
            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, estimate))
            MIs[t, n_index] = estimate

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"

    # plot data: one panel per row count
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    for i, (num_rows, T_s) in enumerate(zip(num_samples, data_subs)):
        pl.subplot(1, len(data_subs), i + 1)
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(
                T_s[:, 0], T_s[:, 1], bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=.3, s=81)
        pl.title('#r: ' + str(num_rows))

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))
    pl.savefig(datname)
    pl.clf()

    # plot convergence
    pl.figure(tight_layout=True, figsize=(5, 4))

    # standard deviation across repetitions (not divided by sqrt(num_times))
    spread = numpy.std(MIs, axis=0)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=spread, c='blue')
    pl.plot(num_samples, mean, c="blue", alpha=.8, label='mean MI')
    pl.plot(num_samples, [true_mi] * len(num_samples),
            color='red', alpha=.8, label='true MI')
    pl.plot(num_samples, [external_mi] * len(num_samples),
            color=(0, .5, .5), alpha=.8, label='external MI')

    pl.xlabel('#rows in X (log)')
    # NOTE(review): the curves are raw MI values, not differences from the
    # true MI -- confirm whether this axis label should change.
    pl.ylabel('CrossCat MI - true MI')

    pl.legend(loc=0, prop={'size': 8})
    pl.gca().set_xscale('log')

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))
    pl.savefig(figname)
def run_test(args):
    """Plot how the CrossCat MI estimate behaves as the number of rows grows.

    One correlated two-column table is generated at the largest row count; for
    each row count on a log-spaced grid the first ``n`` rows are analyzed
    ``num_times`` times and the MI between columns 0 and 1 is estimated. Two
    PNG files are written, both named from ``args.filename`` plus the current
    Unix time: ``<base>_DATA.png`` shows the data subsets and ``<base>.png``
    shows the mean estimate with its spread against the reference MI values.

    :param args: object with attributes ``rho``, ``num_times``,
        ``min_num_rows``, ``max_num_rows``, ``n_grid``, ``filename`` and
        ``discrete``
    :returns: None
    """
    rho = args.rho
    num_times = args.num_times
    filename = args.filename
    discrete = args.discrete

    row_grid = log_linspace(args.min_num_rows, args.max_num_rows, args.n_grid)
    num_samples = [int(ns) for ns in row_grid.tolist()]

    burn_in = 200

    # MIs[t, j]: estimate from repetition t at row count num_samples[j]
    MIs = numpy.zeros((num_times, len(num_samples)))

    if discrete:
        T, true_mi, external_mi = gen_correlated_data_discrete(num_samples[-1], rho)
        cctypes = ["multinomial"] * 2
        dtype_str = "discrete"
    else:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ["continuous"] * 2
        dtype_str = "continuous"

    data_subs = []
    for n_index, n in enumerate(num_samples):
        # Take exactly the first n rows so the subset size matches the row
        # count shown in the plot titles and on the x axis.
        T_sub = numpy.copy(T[:n, :])
        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()

            MI, Linfoot = iu.mutual_information(
                M_c, [X_L], [X_D], [(0, 1)], n_samples=5000
            )

            estimate = MI[0][0]
            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, estimate))
            MIs[t, n_index] = estimate

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"

    # plot data: one panel per row count
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    for i, (num_rows, T_s) in enumerate(zip(num_samples, data_subs)):
        pl.subplot(1, len(data_subs), i + 1)
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(T_s[:, 0], T_s[:, 1], bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=0.3, s=81)
        pl.title("#r: " + str(num_rows))

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))
    pl.savefig(datname)
    pl.clf()

    # plot convergence
    pl.figure(tight_layout=True, figsize=(5, 4))

    # standard deviation across repetitions (not divided by sqrt(num_times))
    spread = numpy.std(MIs, axis=0)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=spread, c="blue")
    pl.plot(num_samples, mean, c="blue", alpha=0.8, label="mean MI")
    pl.plot(
        num_samples, [true_mi] * len(num_samples),
        color="red", alpha=0.8, label="true MI",
    )
    pl.plot(
        num_samples, [external_mi] * len(num_samples),
        color=(0, 0.5, 0.5), alpha=0.8, label="external MI",
    )

    pl.xlabel("#rows in X (log)")
    # NOTE(review): the curves are raw MI values, not differences from the
    # true MI -- confirm whether this axis label should change.
    pl.ylabel("CrossCat MI - true MI")

    pl.legend(loc=0, prop={"size": 8})
    pl.gca().set_xscale("log")

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))
    pl.savefig(figname)
pr, p = pearsonr(T[:,0], T[:,1]) print("num_samples: %i, R: %f, d: %i. Actual R: %f" % (n, r, d+1, pr)) M_c = du.gen_M_c_from_T(T,cctypes) X_Ls = [] X_Ds = [] for _ in range(n_samples): state = State.p_State(M_c, T) state.transition(n_steps=burn_in) X_Ds.append(state.get_X_D()) X_Ls.append(state.get_X_L()) MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0,1)], get_next_seed, n_samples=5000) if d == 0: data_d = numpy.transpose(Linfoot) else: data_d = numpy.vstack((data_d, numpy.transpose(Linfoot))) if nr == 0: data = data_d else: data = numpy.hstack((data, data_d)) nr += 1 if discrete: