def GenDataFromPartitions(col_part, row_parts, mean_gen, std_gen, std_data,
                          seed=None):
    """Generate a continuous data table from fixed column and row partitions.

    Each column is assigned to a view by ``col_part``; within a view the rows
    are grouped into categories by the matching row of ``row_parts``.  For
    every (column, category) cell a mean is drawn from
    ``Normal(mean_gen, std_gen)`` and the cell's values are then drawn from
    ``Normal(mean, std_data)``.

    Args:
        col_part: sequence giving, for each column, the index of its view.
        row_parts: 2-D numpy array; row ``v`` holds the category index of
            every data row under view ``v``.
        mean_gen: mean of the distribution the category means are drawn from.
        std_gen: standard deviation of that distribution.
        std_data: standard deviation of the data around each category mean.
        seed: optional integer seed.  When omitted, a seed is derived from
            the wall clock (the historical behaviour).

    Returns:
        Tuple ``(T, M_r, M_c)``: the data as a list of row lists, followed by
        the results of ``du.gen_M_r_from_T(T)`` and ``du.gen_M_c_from_T(T)``.

    Note:
        This function reseeds the global ``numpy.random`` generator.
    """
    n_cols = len(col_part)
    n_rows = row_parts.shape[1]

    if seed is None:
        seed = int(time() * 100)
    # numpy.random.seed only accepts values in [0, 2**32 - 1]; a clock-derived
    # seed is far larger than that, so fold it into the valid range.
    np.random.seed(seed % (2 ** 32))

    T = np.zeros((n_rows, n_cols))

    for col in range(n_cols):
        view = col_part[col]
        row_part = row_parts[view, :]
        cats = max(row_part) + 1
        for cat in range(cats):
            row_dex = np.nonzero(row_part == cat)[0]
            mean = np.random.normal(mean_gen, std_gen)
            # Fill the whole category in one assignment.  This draws the same
            # number of variates in the same order as the per-row loop did,
            # without assigning 1-element arrays to scalar slots.
            T[row_dex, col] = np.random.normal(mean, std_data, len(row_dex))

    T = T.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    return T, M_r, M_c
def generate_multinomial_data(next_seed, n_cols, n_rows, n_views):
    """Build a synthetic multinomial table whose cells are cluster labels.

    One row partition is drawn per view with ``eu.CRP(n_rows, 2.0)`` and then
    shuffled.  Every column is mapped to view 0, so only the first partition
    ends up in the data: each cell holds the cluster label of its row.

    Args:
        next_seed: seed handed to ``random.seed`` before any sampling.
        n_cols: number of columns in the table.
        n_rows: number of rows in the table.
        n_views: number of row partitions to draw.

    Returns:
        Tuple ``(T, M_r, M_c)`` where ``T`` and ``M_c`` are the outputs of
        ``du.convert_columns_to_multinomial`` and ``M_r`` comes from
        ``du.gen_M_r_from_T``.
    """
    # --- partitions ---------------------------------------------------
    random.seed(next_seed)
    cols_to_views = [0] * n_cols
    rows_in_views_to_cols = []
    for _ in range(n_views):
        row_partition = eu.CRP(n_rows, 2.0)
        random.shuffle(row_partition)
        rows_in_views_to_cols.append(row_partition)

    # --- data ---------------------------------------------------------
    data = numpy.zeros((n_rows, n_cols), dtype=float)
    for col, view in enumerate(cols_to_views):
        for row in range(n_rows):
            data[row, col] = rows_in_views_to_cols[view][row]

    T = data.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    all_columns = list(range(n_cols))
    T, M_c = du.convert_columns_to_multinomial(T, M_c, all_columns)

    return T, M_r, M_c
def _forward_sample_from_prior(inf_seed_and_n_samples, M_c, T,
                               probe_columns=(0,),
                               ROW_CRP_ALPHA_GRID=(), COLUMN_CRP_ALPHA_GRID=(),
                               S_GRID=(), MU_GRID=(),
                               N_GRID=default_n_grid,
                               ):
    """Collect diagnostics from repeated engine initialisations.

    The incoming table is replaced by an all-zero table of the same shape, so
    only the shape of ``T`` is used.  A single ``LE.LocalEngine`` is then
    initialised ``n_samples`` times and the diagnostics of each resulting
    ``X_L`` are accumulated.

    Args:
        inf_seed_and_n_samples: pair ``(inf_seed, n_samples)``.
        M_c: column metadata passed through to ``engine.initialize``.
        T: data table; only its shape is used.
        probe_columns: columns handed to ``generate_diagnostics_funcs``.
        ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID, S_GRID, MU_GRID, N_GRID:
            hyperparameter grids forwarded unchanged to ``engine.initialize``.

    Returns:
        The diagnostics container produced by ``collect_diagnostics``
        (initially a ``collections.defaultdict(list)``).
    """
    inf_seed, n_samples = inf_seed_and_n_samples

    # Replace the data with zeros of the same shape before sampling.
    T = numpy.zeros(numpy.array(T).shape).tolist()
    M_r = du.gen_M_r_from_T(T)
    engine = LE.LocalEngine(inf_seed)

    grid_kwargs = dict(
        ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
        S_GRID=S_GRID,
        MU_GRID=MU_GRID,
        N_GRID=N_GRID,
    )

    diagnostics_data = collections.defaultdict(list)
    diagnostics_funcs = None
    for _ in range(n_samples):
        X_L, _X_D = engine.initialize(M_c, M_r, T, **grid_kwargs)
        # The diagnostic functions are built once, from the first state.
        if diagnostics_funcs is None:
            diagnostics_funcs = generate_diagnostics_funcs(X_L, probe_columns)
        diagnostics_data = collect_diagnostics(
            X_L, diagnostics_data, diagnostics_funcs)

    return diagnostics_data
def GenDataFromPartitions(col_part, row_parts, mean_gen, std_gen, std_data, seed):
    """Generate a continuous data table from fixed column and row partitions.

    Each column is assigned to a view by ``col_part``; within a view the rows
    are grouped into categories by the matching row of ``row_parts``.  For
    every (column, category) cell a mean is drawn from
    ``Normal(mean_gen, std_gen)`` and the cell's values are then drawn from
    ``Normal(mean, std_data)``.  All draws come from a private
    ``numpy.random.RandomState(seed)``, so the global generator is untouched.

    Args:
        col_part: sequence giving, for each column, the index of its view.
        row_parts: 2-D numpy array; row ``v`` holds the category index of
            every data row under view ``v``.
        mean_gen: mean of the distribution the category means are drawn from.
        std_gen: standard deviation of that distribution.
        std_data: standard deviation of the data around each category mean.
        seed: seed for the private ``RandomState``.

    Returns:
        Tuple ``(T, M_r, M_c)``: the data as a list of row lists, followed by
        the results of ``du.gen_M_r_from_T(T)`` and ``du.gen_M_c_from_T(T)``.
    """
    n_cols = len(col_part)
    n_rows = row_parts.shape[1]

    rng = np.random.RandomState(seed)

    T = np.zeros((n_rows, n_cols))

    for col in range(n_cols):
        view = col_part[col]
        row_part = row_parts[view, :]
        cats = max(row_part) + 1
        for cat in range(cats):
            row_dex = np.nonzero(row_part == cat)[0]
            mean = rng.normal(mean_gen, std_gen)
            # Fill the whole category in one assignment.  This draws the same
            # number of variates in the same order as the per-row loop did,
            # without assigning 1-element arrays to scalar slots.
            T[row_dex, col] = rng.normal(mean, std_data, len(row_dex))

    T = T.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    return T, M_r, M_c
def generate_multinomial_data(next_seed, n_cols, n_rows, n_views):
    """Generate a synthetic multinomial table whose cells are cluster labels.

    One row partition is drawn per view with ``eu.CRP(n_rows, 2.0)`` and then
    shuffled.  Every column is assigned to view 0, so only the first
    partition ends up in the data: each cell holds the cluster label of its
    row.

    Args:
        next_seed: seed handed to ``random.seed`` before any sampling.
        n_cols: number of columns in the table.
        n_rows: number of rows in the table.
        n_views: number of row partitions to draw.

    Returns:
        Tuple ``(T, M_r, M_c)`` where ``T`` and ``M_c`` are the outputs of
        ``du.convert_columns_to_multinomial`` and ``M_r`` comes from
        ``du.gen_M_r_from_T``.
    """
    # generate the partitions
    random.seed(next_seed)

    cols_to_views = [0 for _ in range(n_cols)]
    rows_in_views_to_cols = []
    for view in range(n_views):
        partition = eu.CRP(n_rows, 2.0)
        random.shuffle(partition)
        rows_in_views_to_cols.append(partition)

    # generate the data
    data = numpy.zeros((n_rows, n_cols), dtype=float)
    for col in range(n_cols):
        view = cols_to_views[col]
        for row in range(n_rows):
            cluster = rows_in_views_to_cols[view][row]
            data[row, col] = cluster

    T = data.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    # Pass a real list: under Python 3 ``range`` is a lazy object, not a list.
    T, M_c = du.convert_columns_to_multinomial(T, M_c, list(range(n_cols)))

    return T, M_r, M_c
def run_posterior_chain(seed, M_c, T, num_iters,
                        probe_columns=(0,),
                        ROW_CRP_ALPHA_GRID=(), COLUMN_CRP_ALPHA_GRID=(),
                        S_GRID=(), MU_GRID=(),
                        N_GRID=default_n_grid,
                        plot_rand_idx=None,
                        ):
    """Run one chain for ``num_iters`` iterations and gather diagnostics.

    A ``LE.LocalEngine`` seeded with ``seed`` is initialised with the
    ``'from_the_prior'`` option, then ``run_posterior_chain_iter`` is applied
    ``num_iters`` times.  On the iteration whose index equals the arbitrated
    ``plot_rand_idx`` the current state is plotted to ``./T_<idx>``.

    Args:
        seed: seed for the local engine.
        M_c: column metadata.
        T: data table.
        num_iters: number of chain iterations.
        probe_columns: columns handed to ``generate_diagnostics_funcs``.
        ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID, S_GRID, MU_GRID, N_GRID:
            hyperparameter grids forwarded to the engine and to each
            iteration.
        plot_rand_idx: passed through ``arbitrate_plot_rand_idx`` to decide
            which iteration, if any, gets plotted.

    Returns:
        The ``collections.defaultdict(list)`` handed to every iteration as
        its diagnostics container.
    """
    plot_rand_idx = arbitrate_plot_rand_idx(plot_rand_idx, num_iters)

    engine = LE.LocalEngine(seed)
    M_r = du.gen_M_r_from_T(T)
    grid_kwargs = dict(
        ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
        S_GRID=S_GRID,
        MU_GRID=MU_GRID,
        N_GRID=N_GRID,
    )
    X_L, X_D = engine.initialize(M_c, M_r, T, 'from_the_prior', **grid_kwargs)

    diagnostics_funcs = generate_diagnostics_funcs(X_L, probe_columns)
    diagnostics_data = collections.defaultdict(list)

    for idx in range(num_iters):
        M_c, T, X_L, X_D = run_posterior_chain_iter(
            engine, M_c, T, X_L, X_D, diagnostics_data, diagnostics_funcs,
            ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID, S_GRID, MU_GRID,
            N_GRID=N_GRID,
        )
        if idx != plot_rand_idx:
            continue
        # This DOESN'T work with multithreading
        pu.plot_views(numpy.array(T), X_D, X_L, M_c,
                      filename='T_%s' % idx, dir='./', close=True,
                      format=image_format)

    return diagnostics_data
def run_posterior_chain(seed, M_c, T, num_iters,
                        probe_columns=(0, ),
                        ROW_CRP_ALPHA_GRID=(),
                        COLUMN_CRP_ALPHA_GRID=(),
                        S_GRID=(),
                        MU_GRID=(),
                        N_GRID=default_n_grid,
                        CT_KERNEL=0,
                        plot_rand_idx=None):
    """Advance a single chain ``num_iters`` times, recording diagnostics.

    The chain starts from ``engine.initialize(..., 'from_the_prior', ...)`` on
    a ``LE.LocalEngine`` seeded with ``seed``.  Each iteration delegates to
    ``run_posterior_chain_iter``, which receives the diagnostics container
    and ``CT_KERNEL``.  The state is plotted to ``./T_<idx>`` on the one
    iteration selected by ``arbitrate_plot_rand_idx``.

    Args:
        seed: seed for the local engine.
        M_c: column metadata.
        T: data table.
        num_iters: number of chain iterations.
        probe_columns: columns handed to ``generate_diagnostics_funcs``.
        ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID, S_GRID, MU_GRID, N_GRID:
            hyperparameter grids forwarded to the engine and to each
            iteration.
        CT_KERNEL: forwarded unchanged to ``run_posterior_chain_iter``.
        plot_rand_idx: passed through ``arbitrate_plot_rand_idx`` to decide
            which iteration, if any, gets plotted.

    Returns:
        The ``collections.defaultdict(list)`` handed to every iteration as
        its diagnostics container.
    """
    plot_rand_idx = arbitrate_plot_rand_idx(plot_rand_idx, num_iters)
    engine = LE.LocalEngine(seed)
    M_r = du.gen_M_r_from_T(T)

    initial_state = engine.initialize(
        M_c, M_r, T, 'from_the_prior',
        ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
        S_GRID=S_GRID,
        MU_GRID=MU_GRID,
        N_GRID=N_GRID,
    )
    X_L, X_D = initial_state

    diagnostics_funcs = generate_diagnostics_funcs(X_L, probe_columns)
    diagnostics_data = collections.defaultdict(list)

    for iteration in range(num_iters):
        M_c, T, X_L, X_D = run_posterior_chain_iter(
            engine, M_c, T, X_L, X_D,
            diagnostics_data, diagnostics_funcs,
            ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID,
            S_GRID, MU_GRID,
            N_GRID=N_GRID,
            CT_KERNEL=CT_KERNEL,
        )
        should_plot = (iteration == plot_rand_idx)
        if should_plot:
            # This DOESN'T work with multithreading
            plot_name = 'T_%s' % iteration
            pu.plot_views(numpy.array(T), X_D, X_L, M_c,
                          filename=plot_name, dir='./', close=True,
                          format=image_format)

    return diagnostics_data
# create the data if True: T, M_r, M_c = du.gen_factorial_data_objects( gen_seed, num_clusters, num_cols, num_rows, num_splits, max_mean=max_mean, max_std=max_std, ) else: with open('SynData2.csv') as fh: import numpy import csv T = numpy.array([ row for row in csv.reader(fh) ], dtype=float).tolist() M_r = du.gen_M_r_from_T(T) M_c = du.gen_M_c_from_T(T) # create the state p_State = State.p_State(M_c, T, N_GRID=N_GRID, SEED=inf_seed) p_State.plot_T(filename='T') # transition the sampler print("p_State.get_marginal_logp():", p_State.get_marginal_logp()) for transition_idx in range(num_transitions): print("transition #: %s" % transition_idx) p_State.transition() counts = [ view_state['row_partition_model']['counts'] for view_state in p_State.get_X_L()['view_state']
def test_kl_divergence_as_a_function_of_N_and_transitions():
    """Measure and plot KL divergence against data size and transitions.

    For each of ``do_times`` repetitions and each ``N`` in ``N_list``, a
    single continuous column is generated with ``sdg.gen_data``, ``n_chains``
    chains are initialised on a multiprocessing engine, and the KL divergence
    returned by ``qtu.KL_divergence`` is recorded at initialisation and after
    every ``transition_interval`` analysis steps.  The values are averaged
    over chains and repetitions and shown in three pylab subplots.

    Returns:
        ``KLD``: numpy array of shape ``(len(N_list), t_iterations + 1)``
        holding the averaged KL divergences.
    """
    n_clusters = 3
    n_chains = 8
    do_times = 4

    # N_list = [25, 50, 100, 250, 500, 1000, 2000]
    N_list = [25, 50, 100, 175, 250, 400, 500]

    max_transitions = 500
    transition_interval = 50
    # Floor division: this count is used as an array dimension and as a
    # range() bound, both of which require an int under Python 3.
    t_iterations = max_transitions // transition_interval

    cctype = "continuous"
    cluster_weights = [1.0 / float(n_clusters)] * n_clusters
    separation = 0.5

    get_next_seed = lambda: random.randrange(2147483647)

    # x-axis shared by the contour plot and the per-transition error bars
    transition_axis = list(range(0, max_transitions + 1, transition_interval))

    # data grid
    KLD = numpy.zeros((len(N_list), t_iterations + 1))

    for _ in range(do_times):
        for n, N in enumerate(N_list):
            T, M_c, struc = sdg.gen_data(
                [cctype],
                N,
                [0],
                [cluster_weights],
                [separation],
                seed=get_next_seed(),
                distargs=[None],
                return_structure=True,
            )

            M_r = du.gen_M_r_from_T(T)
            component_params = struc["component_params"][0]

            # precompute the support and pdf to speed up calculation of KL divergence
            support = qtu.get_mixture_support(
                cctype, ccmext.p_ContinuousComponentModel,
                component_params, nbins=1000, support=0.995
            )
            true_log_pdf = qtu.get_mixture_pdf(
                support, ccmext.p_ContinuousComponentModel,
                component_params, cluster_weights
            )

            def chain_kld(X_L, X_D):
                # KL divergence of one chain's state against the true mixture
                return qtu.KL_divergence(
                    ccmext.p_ContinuousComponentModel,
                    component_params,
                    cluster_weights,
                    M_c,
                    X_L,
                    X_D,
                    n_samples=1000,
                    support=support,
                    true_log_pdf=true_log_pdf,
                )

            # initialize a multiprocessing engine
            mstate = mpe.MultiprocessingEngine(cpu_count=8)
            X_L_list, X_D_list = mstate.initialize(M_c, M_r, T, n_chains=n_chains)

            # reading at initialisation
            for X_L, X_D in zip(X_L_list, X_D_list):
                KLD[n, 0] += chain_kld(X_L, X_D)

            # run transition_interval then take a reading. Rinse and repeat.
            for t in range(t_iterations):
                X_L_list, X_D_list = mstate.analyze(
                    M_c, T, X_L_list, X_D_list, n_steps=transition_interval)
                for X_L, X_D in zip(X_L_list, X_D_list):
                    KLD[n, t + 1] += chain_kld(X_L, X_D)

    KLD /= float(n_chains * do_times)

    pylab.subplot(1, 3, 1)
    pylab.contourf(transition_axis, N_list, KLD)
    pylab.title("KL divergence")
    pylab.ylabel("N")
    pylab.xlabel("# transitions")

    pylab.subplot(1, 3, 2)
    m_N = numpy.mean(KLD, axis=1)
    # standard error = std / sqrt(n); the previous exponent of -0.5 multiplied
    # by sqrt(n) instead of dividing by it
    e_N = numpy.std(KLD, axis=1) / float(KLD.shape[1]) ** 0.5
    pylab.errorbar(N_list, m_N, yerr=e_N)
    pylab.title("KL divergence by N")
    pylab.xlabel("N")
    pylab.ylabel("KL divergence")

    pylab.subplot(1, 3, 3)
    m_t = numpy.mean(KLD, axis=0)
    e_t = numpy.std(KLD, axis=0) / float(KLD.shape[0]) ** 0.5
    pylab.errorbar(transition_axis, m_t, yerr=e_t)
    pylab.title("KL divergence by transitions")
    pylab.xlabel("trasition")
    pylab.ylabel("KL divergence")

    pylab.show()

    return KLD
def generate_correlated_state(num_rows, num_cols, num_views, num_clusters, mean_range, corr, seed=0):
    """Generate a data table with correlated columns plus a matching state.

    Columns are assigned to ``num_views`` views and, separately for each
    view, rows are assigned to ``num_clusters`` clusters; every view and
    every cluster is guaranteed at least one member before the remaining
    assignments are drawn uniformly at random.  Each (view, cluster) cell is
    then filled with the output of ``generate_correlated_data`` using means
    drawn uniformly from ``[-mean_range / 2, mean_range / 2]``.

    Args:
        num_rows: number of rows in the table.
        num_cols: number of columns in the table.
        num_views: number of views; must not exceed ``num_cols``.
        num_clusters: number of row clusters in each view.
        mean_range: width of the interval the column means are drawn from.
        corr: forwarded unchanged to ``generate_correlated_data``.
        seed: seed for both ``random`` and ``numpy.random`` (the global
            generators are reseeded).

    Returns:
        Tuple ``(T, M_c, M_r, X_L, X_D, cols_to_views)`` where ``T`` is a
        numpy array of shape ``(num_rows, num_cols)`` and ``cols_to_views``
        is the list mapping each column to its view.
    """
    # assert (num_clusters <= num_rows)
    assert (num_views <= num_cols)
    T = numpy.zeros((num_rows, num_cols))

    random.seed(seed)
    numpy.random.seed(seed=seed)
    get_next_seed = lambda: random.randrange(2147483647)

    # generate an assignment of columns to views (uniform)
    # list(...) is required: a Python 3 range object supports neither
    # .append() nor random.shuffle()
    cols_to_views = list(range(num_views))
    view_counts = numpy.ones(num_views, dtype=int)
    for i in range(num_views, num_cols):
        r = random.randrange(num_views)
        cols_to_views.append(r)
        view_counts[r] += 1

    random.shuffle(cols_to_views)

    assert (len(cols_to_views) == num_cols)
    assert (max(cols_to_views) == num_views - 1)

    # for each view, generate an assignment of rows to num_clusters
    row_to_clusters = []
    cluster_counts = []
    for view in range(num_views):
        row_to_cluster = list(range(num_clusters))
        cluster_counts_i = numpy.ones(num_clusters, dtype=int)
        for i in range(num_clusters, num_rows):
            r = random.randrange(num_clusters)
            row_to_cluster.append(r)
            cluster_counts_i[r] += 1

        random.shuffle(row_to_cluster)

        assert (len(row_to_cluster) == num_rows)
        assert (max(row_to_cluster) == num_clusters - 1)

        row_to_clusters.append(row_to_cluster)
        cluster_counts.append(cluster_counts_i)

    assert (len(row_to_clusters) == num_views)

    # the column assignment is fixed from here on; convert it once
    cols_to_views_arr = numpy.array(cols_to_views)

    # generate the correlated data
    for view in range(num_views):
        # get the indices of the columns in this view
        col_indices = numpy.nonzero(cols_to_views_arr == view)[0]
        rows_to_cluster_arr = numpy.array(row_to_clusters[view])
        for cluster in range(num_clusters):
            cell_cols = view_counts[view]
            cell_rows = cluster_counts[view][cluster]
            means = numpy.random.uniform(-mean_range / 2.0, mean_range / 2.0, cell_cols)
            X = generate_correlated_data(cell_rows, cell_cols, means, corr, seed=get_next_seed())
            # get the indices of the rows in this view and this cluster
            row_indices = numpy.nonzero(rows_to_cluster_arr == cluster)[0]
            # insert the data
            for col in range(cell_cols):
                for row in range(cell_rows):
                    r = row_indices[row]
                    c = col_indices[col]
                    T[r, c] = X[row, col]

    M_c = du.gen_M_c_from_T(T)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = generate_X_L_and_X_D(T, M_c, cols_to_views, row_to_clusters, seed=get_next_seed())

    return T, M_c, M_r, X_L, X_D, cols_to_views
def generate_correlated_state(num_rows, num_cols, num_views, num_clusters, mean_range, corr, seed=0):
    """Generate a data table with correlated columns plus a matching state.

    Columns are assigned to ``num_views`` views and, separately for each
    view, rows are assigned to ``num_clusters`` clusters; every view and
    every cluster is guaranteed at least one member before the remaining
    assignments are drawn uniformly at random.  Each (view, cluster) cell is
    then filled with the output of ``generate_correlated_data`` using means
    drawn uniformly from ``[-mean_range/2, mean_range/2]``.

    Args:
        num_rows: number of rows in the table.
        num_cols: number of columns in the table.
        num_views: number of views; must not exceed ``num_cols``.
        num_clusters: number of row clusters in each view.
        mean_range: width of the interval the column means are drawn from.
        corr: forwarded unchanged to ``generate_correlated_data``.
        seed: seed for both ``random`` and ``numpy.random`` (the global
            generators are reseeded).

    Returns:
        Tuple ``(T, M_c, M_r, X_L, X_D, cols_to_views)`` where ``T`` is a
        numpy array of shape ``(num_rows, num_cols)`` and ``cols_to_views``
        is the list mapping each column to its view.
    """
    # assert(num_clusters <= num_rows)
    assert(num_views <= num_cols)
    T = numpy.zeros((num_rows, num_cols))

    random.seed(seed)
    numpy.random.seed(seed=seed)
    get_next_seed = lambda : random.randrange(2147483647)

    # generate an assignment of columns to views (uniform)
    # A real list is needed here: under Python 3, range() returns an object
    # without .append() that random.shuffle() cannot reorder.
    cols_to_views = list(range(num_views))
    view_counts = numpy.ones(num_views, dtype=int)
    for i in range(num_views, num_cols):
        r = random.randrange(num_views)
        cols_to_views.append(r)
        view_counts[r] += 1

    random.shuffle(cols_to_views)

    assert(len(cols_to_views) == num_cols)
    assert(max(cols_to_views) == num_views-1)

    # for each view, generate an assignment of rows to num_clusters
    row_to_clusters = []
    cluster_counts = []
    for view in range(num_views):
        row_to_cluster = list(range(num_clusters))
        cluster_counts_i = numpy.ones(num_clusters, dtype=int)
        for i in range(num_clusters, num_rows):
            r = random.randrange(num_clusters)
            row_to_cluster.append(r)
            cluster_counts_i[r] += 1

        random.shuffle(row_to_cluster)

        assert(len(row_to_cluster) == num_rows)
        assert(max(row_to_cluster) == num_clusters-1)

        row_to_clusters.append(row_to_cluster)
        cluster_counts.append(cluster_counts_i)

    assert(len(row_to_clusters) == num_views)

    # generate the correlated data
    for view in range(num_views):
        for cluster in range(num_clusters):
            cell_cols = view_counts[view]
            cell_rows = cluster_counts[view][cluster]
            means = numpy.random.uniform(-mean_range/2.0, mean_range/2.0, cell_cols)
            X = generate_correlated_data(cell_rows, cell_cols, means, corr, seed=get_next_seed())
            # get the indices of the columns in this view
            col_indices = numpy.nonzero(numpy.array(cols_to_views) == view)[0]
            # get the indices of the rows in this view and this cluster
            row_indices = numpy.nonzero(numpy.array(row_to_clusters[view]) == cluster)[0]
            # insert the data
            for col in range(cell_cols):
                for row in range(cell_rows):
                    r = row_indices[row]
                    c = col_indices[col]
                    T[r, c] = X[row, col]

    M_c = du.gen_M_c_from_T(T)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = generate_X_L_and_X_D(T, M_c, cols_to_views, row_to_clusters, seed=get_next_seed())

    return T, M_c, M_r, X_L, X_D, cols_to_views
def test_kl_divergence_as_a_function_of_N_and_transitions():
    """Measure and plot KL divergence against data size and transitions.

    For each of ``do_times`` repetitions and each ``N`` in ``N_list``, a
    single continuous column is generated with ``sdg.gen_data``, ``n_chains``
    chains are initialised on a multiprocessing engine, and the KL divergence
    returned by ``qtu.KL_divergence`` is recorded at initialisation and after
    every ``transition_interval`` analysis steps.  The values are averaged
    over chains and repetitions and shown in three pylab subplots.

    Returns:
        ``KLD``: numpy array of shape ``(len(N_list), t_iterations+1)``
        holding the averaged KL divergences.
    """
    n_clusters = 3
    n_chains = 8
    do_times = 4

    # N_list = [25, 50, 100, 250, 500, 1000, 2000]
    N_list = [25, 50, 100, 175, 250, 400, 500]

    max_transitions = 500
    transition_interval = 50
    # Integer division is required: the result sizes an array and bounds a
    # range(), and "/" produces a float under Python 3.
    t_iterations = max_transitions // transition_interval

    cctype = 'continuous'
    cluster_weights = [1.0/float(n_clusters)]*n_clusters
    separation = .5

    get_next_seed = lambda : random.randrange(2147483647)

    # data grid
    KLD = numpy.zeros((len(N_list), t_iterations+1))

    for _ in range(do_times):
        for n in range(len(N_list)):
            N = N_list[n]
            T, M_c, struc = sdg.gen_data([cctype], N, [0], [cluster_weights],
                                         [separation], seed=get_next_seed(),
                                         distargs=[None],
                                         return_structure=True)

            M_r = du.gen_M_r_from_T(T)

            # precompute the support and pdf to speed up calculation of KL divergence
            support = qtu.get_mixture_support(cctype,
                                              ccmext.p_ContinuousComponentModel,
                                              struc['component_params'][0],
                                              nbins=1000, support=.995)
            true_log_pdf = qtu.get_mixture_pdf(support,
                                               ccmext.p_ContinuousComponentModel,
                                               struc['component_params'][0],
                                               cluster_weights)

            # initialize a multiprocessing engine
            mstate = mpe.MultiprocessingEngine(cpu_count=8)
            X_L_list, X_D_list = mstate.initialize(M_c, M_r, T, n_chains=n_chains)

            # Column 0 holds the reading taken right after initialisation;
            # column t (t >= 1) holds the reading after t analysis rounds.
            for t in range(t_iterations+1):
                if t > 0:
                    # run transition_interval then take a reading. Rinse and repeat.
                    X_L_list, X_D_list = mstate.analyze(M_c, T, X_L_list, X_D_list,
                                                        n_steps=transition_interval)
                for i in range(len(X_L_list)):
                    X_L = X_L_list[i]
                    X_D = X_D_list[i]
                    KLD[n, t] += qtu.KL_divergence(ccmext.p_ContinuousComponentModel,
                                                   struc['component_params'][0],
                                                   cluster_weights, M_c, X_L, X_D,
                                                   n_samples=1000, support=support,
                                                   true_log_pdf=true_log_pdf)

    KLD /= float(n_chains*do_times)

    transitions = list(range(0, max_transitions+1, transition_interval))

    pylab.subplot(1, 3, 1)
    # The axis values, N_list and KLD are three separate arguments; they were
    # previously swallowed by a misplaced list(...) parenthesis.
    pylab.contourf(transitions, N_list, KLD)
    pylab.title('KL divergence')
    pylab.ylabel('N')
    pylab.xlabel('# transitions')

    pylab.subplot(1, 3, 2)
    m_N = numpy.mean(KLD, axis=1)
    # standard error = std / sqrt(n); an exponent of -.5 multiplied by
    # sqrt(n) instead of dividing by it
    e_N = numpy.std(KLD, axis=1)/float(KLD.shape[1])**.5
    pylab.errorbar(N_list, m_N, yerr=e_N)
    pylab.title('KL divergence by N')
    pylab.xlabel('N')
    pylab.ylabel('KL divergence')

    pylab.subplot(1, 3, 3)
    m_t = numpy.mean(KLD, axis=0)
    e_t = numpy.std(KLD, axis=0)/float(KLD.shape[0])**.5
    pylab.errorbar(transitions, m_t, yerr=e_t)
    pylab.title('KL divergence by transitions')
    pylab.xlabel('trasition')
    pylab.ylabel('KL divergence')

    pylab.show()

    return KLD