def GenDataFromPartitions(col_part, row_parts, mean_gen, std_gen, std_data, seed):
    n_cols = len(col_part)
    n_rows = row_parts.shape[1]
    rng = np.random.RandomState(seed)

    T = np.zeros((n_rows, n_cols))

    for col in range(n_cols):
        view = col_part[col]
        row_part = row_parts[view, :]
        cats = max(row_part) + 1
        for cat in range(cats):
            row_dex = np.nonzero(row_part == cat)[0]
            n_rows_cat = len(row_dex)
            mean = rng.normal(mean_gen, std_gen)
            X = rng.normal(mean, std_data, (n_rows_cat, 1))
            i = 0
            for row in row_dex:
                T[row, col] = X[i]
                i += 1

    T = T.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    return T, M_r, M_c
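# A minimal usage sketch for GenDataFromPartitions. The inputs below are
# illustrative assumptions, not values taken from the surrounding code:
# two columns in a single view, eight rows split into two row categories.
# It assumes numpy and du (data utilities) are imported as the function expects.
import numpy as np

example_col_part = [0, 0]                                   # both columns -> view 0
example_row_parts = np.array([[0, 0, 0, 0, 1, 1, 1, 1]])    # one view, two categories
T_ex, M_r_ex, M_c_ex = GenDataFromPartitions(example_col_part, example_row_parts,
                                             mean_gen=0.0, std_gen=10.0,
                                             std_data=0.1, seed=0)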
def generate_multinomial_data(next_seed, n_cols, n_rows, n_views):
    # generate the partitions
    random.seed(next_seed)
    cols_to_views = [0 for _ in range(n_cols)]
    rows_in_views_to_cols = []
    for view in range(n_views):
        partition = eu.CRP(n_rows, 2.0)
        random.shuffle(partition)
        rows_in_views_to_cols.append(partition)

    # generate the data
    data = numpy.zeros((n_rows, n_cols), dtype=float)
    for col in range(n_cols):
        view = cols_to_views[col]
        for row in range(n_rows):
            cluster = rows_in_views_to_cols[view][row]
            data[row, col] = cluster

    T = data.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)
    T, M_c = du.convert_columns_to_multinomial(T, M_c, range(n_cols))

    return T, M_r, M_c
def GenDataFromPartitions(col_part, row_parts, mean_gen, std_gen, std_data):
    n_cols = len(col_part)
    n_rows = row_parts.shape[1]

    seed = int(time() * 100)
    np.random.seed(seed)

    T = np.zeros((n_rows, n_cols))

    for col in range(n_cols):
        view = col_part[col]
        row_part = row_parts[view, :]
        cats = max(row_part) + 1
        for cat in range(cats):
            row_dex = np.nonzero(row_part == cat)[0]
            n_rows_cat = len(row_dex)
            mean = np.random.normal(mean_gen, std_gen)
            X = np.random.normal(mean, std_data, (n_rows_cat, 1))
            i = 0
            for row in row_dex:
                T[row, col] = X[i]
                i += 1

    T = T.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    return T, M_r, M_c
def generate_multinomial_data(next_seed, n_cols, n_rows, n_views):
    # generate the partitions
    random.seed(next_seed)
    cols_to_views = [0 for _ in range(n_cols)]
    rows_in_views_to_cols = []
    for view in range(n_views):
        partition = eu.CRP(n_rows, 2.0)
        random.shuffle(partition)
        rows_in_views_to_cols.append(partition)

    # generate the data
    data = numpy.zeros((n_rows, n_cols), dtype=float)
    for col in range(n_cols):
        view = cols_to_views[col]
        for row in range(n_rows):
            cluster = rows_in_views_to_cols[view][row]
            data[row, col] = cluster

    T = data.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)
    T, M_c = du.convert_columns_to_multinomial(T, M_c, list(range(n_cols)))

    return T, M_r, M_c
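# Hedged usage sketch for generate_multinomial_data, assuming `eu` (which
# provides the CRP helper) and `du` are already imported as the function above
# expects. Note that every column lands in view 0 because cols_to_views is
# initialized to zeros; the argument values below are illustrative only.
T_mn, M_r_mn, M_c_mn = generate_multinomial_data(next_seed=0, n_cols=4,
                                                 n_rows=50, n_views=2)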
def do_test(which_plot, max_plots, n, burn_in, cc_samples, which_test,
            correlation=0, do_plot=False):
    if which_test == "correlated":
        X = correlated(correlation, n=n)
    elif which_test == "square":
        X = square(n=n)
    elif which_test == "ring":
        X = ring(n=n)
    elif which_test == "circle":
        X = circle(n=n)
    elif which_test == "diamond":
        X = diamond(n=n)
    elif which_test == "blob":
        X = correlated(0.0, n=n)
    elif which_test == "dots":
        X = four_dots(n=n)
    elif which_test == "mixed":
        X = numpy.vstack((correlated(.95, n=n / 2), correlated(0, n=n / 2)))

    get_next_seed = lambda: random.randrange(32000)

    # Build a state
    M_c = du.gen_M_c_from_T(X.tolist())
    state = State.p_State(M_c, X.tolist())

    X_Ls = []
    X_Ds = []

    # collect crosscat samples
    for _ in range(cc_samples):
        state = State.p_State(M_c, X.tolist())
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    SX = sample_data_from_crosscat(M_c, X_Ls, X_Ds, get_next_seed, n)

    if do_plot:
        pl.subplot(2, max_plots, which_plot)
        pl.scatter(X[:, 0], X[:, 1], c='blue', alpha=.5)
        pl.title("Original data")
        pl.subplot(2, max_plots, max_plots + which_plot)
        pl.scatter(SX[:, 0], SX[:, 1], c='red', alpha=.5)
        pl.title("Sampled data")
        pl.show()

    return M_c, X_Ls, X_Ds
def plot(results, filename=None):
    n_samples = results['config']['n_samples']
    samples = sorted(results['samples'])
    conf = results['conf']

    X_L = results['X_L_list'][0]
    X_D = results['X_D_list'][0]

    hgrm, _ = np.histogram(X_D[0], len(set(X_D[0])))
    max_mass_mode = np.argmax(hgrm)
    suffstats = X_L['view_state'][0]['column_component_suffstats'][0][max_mass_mode]
    counts = suffstats['N']
    sum_x = suffstats['sum_x']
    sum_x_sq = suffstats['sum_x_squared']
    scale = counts / results['config']['n_samples']

    component_model = ccm.p_ContinuousComponentModel(
        X_L['column_hypers'][0], counts, sum_x, sum_x_sq)

    plt.figure(facecolor='white')

    ax = plt.subplot(1, 2, 1)
    ax.hist(samples, min(31, int(n_samples / 10)), normed=True,
            label='Samples', ec='none', fc='gray')

    T = [[x] for x in samples]
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    xvals = np.linspace(np.min(samples), np.max(samples), 300)
    Q = [(n_samples, 0, x) for x in xvals]
    p = [su.simple_predictive_probability(M_c, X_L, X_D, [], [q]) for q in Q]
    p = np.array(p)
    ax.plot(xvals, np.exp(p), c='#bbbbbb', label='Predictive probability', lw=3)

    p = [component_model.calc_element_predictive_logp(x) for x in xvals]
    ax.plot(xvals, np.exp(p) * scale, c='#222222', label='Summary mode', lw=3)

    plt.xlabel('Samples')
    plt.legend(loc=0)

    ax = plt.subplot(1, 2, 2)
    ax.bar([0, 1], [conf, 1.0 - conf], fc='#333333', ec='none')
    ax.set_ylim([0, 1])
    ax.set_xlim([-.25, 2])
    ax.set_xticks([.5, 1.5])
    plt.ylabel('Probability mass')
    ax.set_xticklabels(['Summary mode', 'All other modes'])

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
def gen_data_crosscat(mode, T):
    # edit transition list according to
    all_transitions = []
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'] * 2)
    state = State.p_State(M_c, T)
    if mode == 'crp_mixture':
        # fix the views
        X_D = state.get_X_D()
        X_L = state.get_X_L()
        X_D = [X_D[0]]
        X_L['column_partition']['assignments'] = [1, 1]
        state = State.p_State(M_c, T, X_L=X_L, X_D=X_D)
def gen_data_crosscat(mode, T):
    # edit transition list according to
    all_transitions = []
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'] * 2)
    state = State.p_State(M_c, T)
    if mode == 'crp_mixture':
        # fix the views
        X_D = state.get_X_D()
        X_L = state.get_X_L()
        X_D = [X_D[0]]
        X_L['column_partition']['assignments'] = [1, 1]
        state = State.p_State(M_c, T, X_L=X_L, X_D=X_D)
def do_test(which_plot, max_plots, n, burn_in, cc_samples, which_test,
            correlation=0, do_plot=False):
    if which_test == "correlated":
        X = correlated(correlation, n=n)
    elif which_test == "square":
        X = square(n=n)
    elif which_test == "ring":
        X = ring(n=n)
    elif which_test == "circle":
        X = circle(n=n)
    elif which_test == "diamond":
        X = diamond(n=n)
    elif which_test == "blob":
        X = correlated(0.0, n=n)
    elif which_test == "dots":
        X = four_dots(n=n)
    elif which_test == "mixed":
        X = numpy.vstack((correlated(0.95, n=n / 2), correlated(0, n=n / 2)))

    get_next_seed = lambda: random.randrange(32000)

    # Build a state
    M_c = du.gen_M_c_from_T(X.tolist())
    state = State.p_State(M_c, X.tolist())

    X_Ls = []
    X_Ds = []

    # collect crosscat samples
    for _ in range(cc_samples):
        state = State.p_State(M_c, X.tolist())
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    SX = sample_data_from_crosscat(M_c, X_Ls, X_Ds, get_next_seed, n)

    if do_plot:
        pl.subplot(2, max_plots, which_plot)
        pl.scatter(X[:, 0], X[:, 1], c="blue", alpha=0.5)
        pl.title("Original data")
        pl.subplot(2, max_plots, max_plots + which_plot)
        pl.scatter(SX[:, 0], SX[:, 1], c="red", alpha=0.5)
        pl.title("Sampled data")
        pl.show()

    return M_c, X_Ls, X_Ds
pl.figure()
burn_in = 400

mi_ests = numpy.zeros(len(widths))

datas = []

nr = 0
for w in widths:
    T, mi_est = gen_ring(n, w, SEED=get_next_seed())

    datas.append(T)

    print("num_samples: %i, width: %f" % (n, w))

    M_c = du.gen_M_c_from_T(T, cctypes)
    X_Ls = []
    X_Ds = []

    for ns in range(n_samples):
        state = State.p_State(M_c, T)
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0, 1)],
                                        n_samples=5000)

    data_d = numpy.transpose(MI)
def check_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture the
    distribution of the data.
    1. Draws 100 random points from a standard normal distribution
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initialize a crosscat state with that data
    5. Get one sample after 100 transitions
    6. Draw predictive samples
    7. Calculates the 95 percent support of the continuous distribution, or the
       entire support of the discrete distribution
    8. Calculate the true pdf for each point in the support
    9. Calculate the predictive probability given the sample for each point in
       the support
    10. (OPTIONAL) Plot the original data, predictive samples, pdf, and
        predictive probabilities
    11. Calculate goodness of fit stats (returns p value)
    """
    N = 250

    get_next_seed = lambda: random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(
        X, gen_seed=get_next_seed())[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])
    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c, X_L, X_D, [], [(N, 0)],
                                    get_next_seed, n=N)).flatten(1)

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
                                                     [] * len(Q), Q)

    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types.
    # For some reason the normed property isn't normalizing the multinomial
    # histogram to 1.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(component_model_type.log_pdf(
                          numpy.array(discrete_support), model_parameters)),
                      c="blue", s=100, label="true pdf", alpha=1)

        # pylab.ylim([0,2])

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities),
                      c="red", s=100, label="predictive probability", alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype,
               str(get_params_string(model_parameters)), n_transitions,
               test_str, round(p, 4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
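# Hypothetical driver, assuming `ccmext` is the continuous component-model
# extension module these tests use elsewhere (ccmext.p_ContinuousComponentModel):
# run the single-feature check and report the goodness-of-fit p-value.
p_value = check_one_feature_sampler(ccmext.p_ContinuousComponentModel,
                                    show_plot=False)
print("single-feature sampler p-value: %f" % p_value)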
def test_one_feature_mixture(component_model_type, num_clusters=3,
                             show_plot=False, seed=None):
    """
    """
    random.seed(seed)

    N = 1000
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                                     [separation], seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T = numpy.array(T)
    T_list = T

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    state = State.p_State(M_c, T_list)

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten(1)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype, component_model_type, structure['component_params'][0],
        nbins=500)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
                                                     [] * len(Q), Q)

    # get histogram. Different behavior for discrete and continuous types.
    # For some reason the normed property isn't normalizing the multinomial
    # histogram to 1.
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
                      edgecolor="none", s=100, label="true pdf", alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
                      edgecolor="none", s=100, label="predictive probability",
                      alpha=1, zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str,
               round(p, 4))

        pylab.title(title_string, fontsize=12)
        pylab.show()

    return p
# create the data
if True:
    T, M_r, M_c = du.gen_factorial_data_objects(
        gen_seed, num_clusters, num_cols, num_rows, num_splits,
        max_mean=max_mean, max_std=max_std,
    )
else:
    with open('SynData2.csv') as fh:
        import numpy
        import csv
        T = numpy.array([row for row in csv.reader(fh)], dtype=float).tolist()
        M_r = du.gen_M_r_from_T(T)
        M_c = du.gen_M_c_from_T(T)

# create the state
p_State = State.p_State(M_c, T, N_GRID=N_GRID, SEED=inf_seed)
p_State.plot_T(filename='T')

# transition the sampler
print("p_State.get_marginal_logp():", p_State.get_marginal_logp())
for transition_idx in range(num_transitions):
    print("transition #: %s" % transition_idx)
    p_State.transition()
    counts = [
        view_state['row_partition_model']['counts']
        for view_state in p_State.get_X_L()['view_state']
    ]
def continuous_imputation_confidence(samples, imputed,
                                     column_component_suffstats_i,
                                     n_steps=100, n_chains=1,
                                     return_metadata=False):
    # XXX: the confidence in continuous imputation is "the probability that
    # there exists a unimodal summary", which is defined as the proportion of
    # probability mass in the largest mode of a DPMM inferred from the
    # simulated samples. We run crosscat on the samples for a given number of
    # iterations, then calculate the proportion of mass in the largest mode.
    #
    # NOTE: The definition of confidence and its implementation do not agree.
    # The probability of a unimodal summary is P(k=1|X), where k is the number
    # of components in some infinite mixture model. I would describe the
    # current implementation as "Is there a mode with sufficient enough mass
    # that we can ignore the other modes". If this second formulation is to be
    # used, it means that we need to not use the median of all the samples as
    # the imputed value, but the median of the samples of the summary mode,
    # because the summary (the imputed value) should come from the summary
    # mode.
    #
    # There are a lot of problems with this second formulation.
    # 0. SLOW. Like, for real.
    # 1. Non-deterministic. The answer will be different given the same
    #    samples.
    # 2. Inaccurate. Approximate inference about approximate inferences.
    #    In practice confidences on the same samples could be significantly
    #    different because the Gibbs sampler that underlies crosscat is
    #    susceptible to getting stuck in local maxima. Of course, this could
    #    be mitigated to some extent by using more chains, but things are
    #    slow enough as it is.
    # 3. Confidence (interval) has a distinct meaning to the people who will
    #    be using this software. A unimodal summary does not necessarily mean
    #    that inferences are within an acceptable range. We are going to need
    #    to be loud about this. Maybe there should be a notion of tolerance?
    #
    # An alternative: mutual predictive coverage
    # ------------------------------------------
    # Divide the number of samples in the intersection of the 90% CI's of each
    # component model by the number of samples in the union of the 90% CI's of
    # each component model.
    from crosscat.cython_code import State

    # XXX: assumes samples comes in as a 1-D numpy.array or 1-D list
    num_samples = float(len(samples))

    T = [[x] for x in samples]

    # XXX: This is a highly problematic consequence of the current definition
    # of confidence. If the number of samples is 1, then the confidence is
    # always 1 because there will be exactly 1 mode in the DPMM (recall the
    # DPMM can have, at maximum, as many modes as data points). I figure if
    # we're going to give a bad answer, we should give it quickly.
    if num_samples == 1:
        return 1.0

    confs = []

    tlist = [
        'column_hyperparameters',
        'row_partition_hyperparameters',
        'row_partition_assignments'
    ]

    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    if return_metadata:
        X_L_list = []
        X_D_list = []

    for _ in range(n_chains):
        ccstate = State.p_State(M_c, T)
        ccstate.transition(which_transitions=tlist, n_steps=n_steps)

        X_D = ccstate.get_X_D()

        assignment = X_D[0]
        num_cats = max(assignment) + 1
        props = numpy.histogram(assignment, num_cats)[0] / num_samples
        confs.append(max(props))

        if return_metadata:
            X_L_list.append(ccstate.get_X_L())
            X_D_list.append(X_D)

    conf = numpy.mean(confs)
    if return_metadata:
        return conf, X_L_list, X_D_list
    else:
        return conf
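# Hedged sketch of how the confidence behaves; the sample values below are
# made up and `numpy`/`du` are assumed to be imported as the function above
# expects. A clearly bimodal set of imputation samples should put roughly
# half of the DPMM's mass in its largest mode, so conf should sit well
# below 1.0 (the imputed value and suffstats argument are unused here).
bimodal = list(numpy.random.normal(0, 1, 50)) + list(numpy.random.normal(10, 1, 50))
conf = continuous_imputation_confidence(bimodal, numpy.median(bimodal), None,
                                        n_steps=50, n_chains=1)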
def generate_correlated_state(num_rows, num_cols, num_views, num_clusters,
                              mean_range, corr, seed=0):
    #

    # assert(num_clusters <= num_rows)
    assert(num_views <= num_cols)

    T = numpy.zeros((num_rows, num_cols))

    random.seed(seed)
    numpy.random.seed(seed=seed)
    get_next_seed = lambda: random.randrange(2147483647)

    # generate an assignment of columns to views (uniform)
    cols_to_views = list(range(num_views))
    view_counts = numpy.ones(num_views, dtype=int)
    for i in range(num_views, num_cols):
        r = random.randrange(num_views)
        cols_to_views.append(r)
        view_counts[r] += 1

    random.shuffle(cols_to_views)

    assert(len(cols_to_views) == num_cols)
    assert(max(cols_to_views) == num_views - 1)

    # for each view, generate an assignment of rows to num_clusters
    row_to_clusters = []
    cluster_counts = []
    for view in range(num_views):
        row_to_cluster = list(range(num_clusters))
        cluster_counts_i = numpy.ones(num_clusters, dtype=int)
        for i in range(num_clusters, num_rows):
            r = random.randrange(num_clusters)
            row_to_cluster.append(r)
            cluster_counts_i[r] += 1

        random.shuffle(row_to_cluster)

        assert(len(row_to_cluster) == num_rows)
        assert(max(row_to_cluster) == num_clusters - 1)

        row_to_clusters.append(row_to_cluster)
        cluster_counts.append(cluster_counts_i)

    assert(len(row_to_clusters) == num_views)

    # generate the correlated data
    for view in range(num_views):
        for cluster in range(num_clusters):
            cell_cols = view_counts[view]
            cell_rows = cluster_counts[view][cluster]
            means = numpy.random.uniform(-mean_range / 2.0, mean_range / 2.0,
                                         cell_cols)
            X = generate_correlated_data(cell_rows, cell_cols, means, corr,
                                         seed=get_next_seed())
            # get the indices of the columns in this view
            col_indices = numpy.nonzero(numpy.array(cols_to_views) == view)[0]
            # get the indices of the rows in this view and this cluster
            row_indices = numpy.nonzero(
                numpy.array(row_to_clusters[view]) == cluster)[0]
            # insert the data
            for col in range(cell_cols):
                for row in range(cell_rows):
                    r = row_indices[row]
                    c = col_indices[col]
                    T[r, c] = X[row, col]

    M_c = du.gen_M_c_from_T(T)
    M_r = du.gen_M_r_from_T(T)

    X_L, X_D = generate_X_L_and_X_D(T, M_c, cols_to_views, row_to_clusters,
                                    seed=get_next_seed())

    return T, M_c, M_r, X_L, X_D, cols_to_views
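# Illustrative call (the argument values are arbitrary choices, not values
# from the code above): build a 100x4 table with two views, three clusters
# per view, and strongly correlated columns, together with an X_L/X_D latent
# state that is consistent with the generated partitions.
T_corr, M_c_corr, M_r_corr, X_L_corr, X_D_corr, ctv_corr = generate_correlated_state(
    num_rows=100, num_cols=4, num_views=2, num_clusters=3,
    mean_range=10.0, corr=0.9, seed=0)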
def test_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture the
    distribution of the data.
    1. Draws 100 random points from a standard normal distribution
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initialize a crosscat state with that data
    5. Get one sample after 100 transitions
    6. Draw predictive samples
    7. Calculates the 95 percent support of the continuous distribution, or the
       entire support of the discrete distribution
    8. Calculate the true pdf for each point in the support
    9. Calculate the predictive probability given the sample for each point in
       the support
    10. (OPTIONAL) Plot the original data, predictive samples, pdf, and
        predictive probabilities
    11. Calculate goodness of fit stats (returns p value)
    """
    N = 250

    get_next_seed = lambda: random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(X)[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])
    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c, X_L, X_D, [], [(N, 0)],
                                    get_next_seed, n=N)).flatten(1)

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
                                                     [] * len(Q), Q)

    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types.
    # For some reason the normed property isn't normalizing the multinomial
    # histogram to 1.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(component_model_type.log_pdf(
                          numpy.array(discrete_support), model_parameters)),
                      c="blue", s=100, label="true pdf", alpha=1)

        # pylab.ylim([0,2])

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities),
                      c="red", s=100, label="predictive probability", alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype,
               str(get_params_string(model_parameters)), n_transitions,
               test_str, round(p, 4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def test_impute_vs_column_average_single(component_model_type, num_clusters,
                                         seed=0):
    """ tests predictive row generation vs column average
        Note: This test does not make sense for categorical data
        Inputs:
            - component_model_type: main class from datatype. Ex:
                ccmext.p_ContinuousComponentModel
            - num_clusters: the number of clusters in the data
            - seed: (optional) int to seed the RNG
        Returns:
            - the mean squared error of the predictive sample column
            - the mean squared error of the column average column
    """

    random.seed(seed)

    N = 100

    get_next_seed = lambda: random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # generate a partition of rows to clusters (evenly-weighted)
    Z = list(range(num_clusters))
    for z in range(N - num_clusters):
        Z.append(random.randrange(num_clusters))

    random.shuffle(Z)

    # generate the data
    T = numpy.array([[0]] * N, dtype=float)

    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a row from the sample
    T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0],
                                         seed=get_next_seed())

    # generate a row of column averages
    T_colave = numpy.ones(T.shape) * numpy.mean(T)

    # get the mean squared error
    err_sample = numpy.mean((T_generated - T) ** 2.0)
    err_colave = numpy.mean((T_colave - T) ** 2.0)

    return err_sample, err_colave
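# Hedged example, assuming `ccmext` is the continuous component-model module
# named in the docstring above: predictive row generation should typically
# give a smaller mean squared error than the column-average baseline when the
# clusters are well separated.
err_sample, err_colave = test_impute_vs_column_average_single(
    ccmext.p_ContinuousComponentModel, num_clusters=2, seed=0)
print("predictive MSE: %f, column-average MSE: %f" % (err_sample, err_colave))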
def test_one_feature_mixture(component_model_type, num_clusters=3,
                             show_plot=False, seed=None):
    """
    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                                     [separation], seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # pdb.set_trace()
    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype, component_model_type, structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten(1)

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
                                                     [] * len(Q), Q)

    # get histogram. Different behavior for discrete and continuous types.
    # For some reason the normed property isn't normalizing the multinomial
    # histogram to 1.
    # T = T[:,0]
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(50, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
                      edgecolor="none", s=100, label="true pdf", alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
                      edgecolor="none", s=100, label="predictive probability",
                      alpha=1, zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str,
               round(p, 4))

        pylab.title(title_string, fontsize=12)
        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def test_impute_vs_column_average_single(component_model_type, num_clusters,
                                         seed=0):
    """ tests predictive row generation vs column average
        Note: This test does not make sense for categorical data
        Inputs:
            - component_model_type: main class from datatype. Ex:
                ccmext.p_ContinuousComponentModel
            - num_clusters: the number of clusters in the data
            - seed: (optional) int to seed the RNG
        Returns:
            - the mean squared error of the predictive sample column
            - the mean squared error of the column average column
    """

    random.seed(seed)

    N = 100

    get_next_seed = lambda: random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # generate a partition of rows to clusters (evenly-weighted)
    Z = list(range(num_clusters))
    for z in range(N - num_clusters):
        Z.append(random.randrange(num_clusters))

    random.shuffle(Z)

    # generate the data
    T = numpy.array([[0]] * N, dtype=float)

    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a row from the sample
    T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0],
                                         seed=get_next_seed())

    # generate a row of column averages
    T_colave = numpy.ones(T.shape) * numpy.mean(T)

    # get the mean squared error
    err_sample = numpy.mean((T_generated - T) ** 2.0)
    err_colave = numpy.mean((T_colave - T) ** 2.0)

    return err_sample, err_colave
def run_test(args):
    rho = args.rho
    num_times = args.num_times
    min_num_rows = args.min_num_rows
    max_num_rows = args.max_num_rows
    n_grid = args.n_grid
    filename = args.filename
    discrete = args.discrete

    num_samples = []
    for ns in log_linspace(min_num_rows, max_num_rows, n_grid).tolist():
        num_samples.append(int(ns))

    variances = []

    burn_in = 200

    MIs = numpy.zeros((num_times, len(num_samples)))

    mi_diff = numpy.zeros((len(num_samples), num_times))

    if not discrete:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ['continuous'] * 2
    else:
        T, true_mi, external_mi = gen_correlated_data_discrete(
            num_samples[-1], rho)
        cctypes = ['multinomial'] * 2

    data_subs = []

    n_index = 0
    for n in num_samples:
        T_sub = numpy.copy(T[0:n - 1, :])

        data = []

        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()
            MI, Linfoot = iu.mutual_information(M_c, [X_L], [X_D], [(0, 1)],
                                                n_samples=5000)

            mi_diff[n_index, t] = true_mi - MI[0][0]

            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, MI[0][0]))

            MIs[t, n_index] = MI[0][0]

        n_index += 1

    if discrete:
        dtype_str = "discrete"
    else:
        dtype_str = "continuous"

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"

    pl.figure()

    # plot data
    # pl.subplot(1,2,1)
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    i = 0
    for T_s in data_subs:
        pl.subplot(1, len(data_subs), i + 1)
        num_rows = num_samples[i]
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(T_s[:, 0], T_s[:, 1],
                                                        bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=.3, s=81)
        pl.title('#r: ' + str(num_rows))
        i += 1

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(datname)
    pl.clf()

    pl.figure(tight_layout=True, figsize=(5, 4))
    # plot convergence
    # pl.subplot(1,2,2)
    # standard deviation
    stderr = numpy.std(MIs, axis=0)  # /(float(num_times)**.5)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=stderr, c='blue')
    pl.plot(num_samples, mean, c="blue", alpha=.8, label='mean MI')
    pl.plot(num_samples, [true_mi] * len(num_samples), color='red', alpha=.8,
            label='true MI')
    pl.plot(num_samples, [external_mi] * len(num_samples), color=(0, .5, .5),
            alpha=.8, label='external MI')
    pl.title('convergence')
    pl.xlabel('#rows in X (log)')
    pl.ylabel('CrossCat MI - true MI')
    pl.legend(loc=0, prop={'size': 8})
    pl.gca().set_xscale('log')

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))
    pl.savefig(figname)
def gen_data(cctypes, n_rows, cols_to_views, cluster_weights, separation,
             seed=0, distargs=None, return_structure=False):
    """
    Generates synthetic data.
    Inputs:
        - cctypes: List of strings. Each entry, i, is the cctype of column i.
          ex: cctypes = ['continuous','continuous', 'multinomial']
        - n_rows: integer. the number of rows
        - cols_to_views: List of integers. Each entry, i, is the view, v, to
          which column i is assigned. v \in [0,...,n_cols-1].
          ex: cols_to_views = [0, 0, 1]
        - cluster_weights: List of lists of floats. A num_views length list of
          lists. Each sublist, W, is a list of cluster weights for the view,
          thus W should always sum to 1.
          ex (two views, first view has 2 clusters, second view has 3 clusters):
          cluster_weights = [[.3, .7], [.25, .5, .25]]
        - separation: list of floats. Each entry, i, is the separation, C, of
          the clusters in view i. C \in [0,1] where 0 is no separation and 1
          is well-separated.
          ex (2 views): separation = [.5, .7]
        - seed: optional
        - distargs: optional (only if continuous). distargs is an n_columns
          length list where each entry is either None or a dict appropriate
          for the cctype in that column. For a normal feature, the entry
          should be None; for a multinomial feature, the entry should be a
          dict with the entry K (the number of categories).
        - return_structure: (bool, optional). Also returns a dict with the
          data generation structure included. A dict with keys:
            - component_params: an n_cols length list of lists. Each list is
              a set of component model parameters for each cluster in the view
              to which that column belongs
            - cols_to_views: a list assigning each column to a view
            - rows_to_clusters: an n_views length list of lists. Each entry,
              rows_to_clusters[v][r], is the cluster to which all rows in
              columns belonging to view v are assigned
    Returns:
        T, M_c
    Example:
        >>> cctypes = ['continuous','continuous','multinomial','continuous','multinomial']
        >>> distargs = [None, None, dict(K=5), None, dict(K=2)]
        >>> n_rows = 10
        >>> cols_to_views = [0, 0, 1, 1, 2]
        >>> cluster_weights = [[.3, .7],[.5, .5],[.2, .3, .5]]
        >>> separation = [.9, .6, .9]
        >>> T, M_c = gen_data(cctypes, n_rows, cols_to_views, cluster_weights,
        ...                   separation, seed=0, distargs=distargs)
    """

    # check inputs
    if not isinstance(n_rows, int):
        raise TypeError("n_rows should be an integer")
    if not isinstance(cctypes, list):
        raise TypeError("cctypes should be a list")
    n_cols_cctypes = len(cctypes)
    for cctype in cctypes:
        if not isinstance(cctype, str):
            raise TypeError("cctypes should be a list of strings")
        # NOTE: will have to update when new component models are added
        if cctype not in ['continuous', 'multinomial', 'cyclic']:
            raise ValueError("invalid cctype in cctypes: %s." % cctype)

    if not isinstance(cols_to_views, list):
        raise TypeError("cols_to_views should be a list")
    if len(cols_to_views) != n_cols_cctypes:
        raise ValueError("number of columns in cctypes does not match number "
                         "of columns in cols_to_views")
    if min(cols_to_views) != 0:
        raise ValueError("min value of cols_to_views should be 0")
    n_views_cols_to_views = max(cols_to_views) + 1
    set_ctv = set(cols_to_views)
    if len(set_ctv) != n_views_cols_to_views:
        raise ValueError("View indices skipped in cols_to_views")

    # check cluster weights
    if not isinstance(cluster_weights, list):
        raise TypeError("cluster_weights should be a list")
    if n_views_cols_to_views != len(cluster_weights):
        raise ValueError("The number of views in cols_to_views and "
                         "cluster_weights do not agree.")
    # check each set of weights
    for W in cluster_weights:
        if not isinstance(W, list):
            raise TypeError("cluster_weights should be a list of lists")
        if math.fabs(sum(W) - 1.0) > .0000001:
            raise ValueError("each vector of weights should sum to 1")

    if not isinstance(separation, list):
        raise TypeError("separation should be a list")
    if len(separation) != n_views_cols_to_views:
        raise ValueError(
            "number of views in separation and cols_to_views do not agree")
    for c in separation:
        if not isinstance(c, float) or c > 1.0 or c < 0.0:
            raise ValueError(
                "each value in separation should be a float from 0 to 1")

    num_views = len(separation)
    n_cols = len(cols_to_views)

    # check the cctypes vs the distargs
    if distargs is None:
        distargs = [None for i in range(n_cols)]
    if not isinstance(distargs, list):
        raise TypeError("distargs should be a list")
    if len(distargs) != n_cols:
        raise ValueError("distargs should have an entry for each column")
    for i in range(n_cols):
        if cctypes[i] == 'continuous' or cctypes[i] == 'cyclic':
            if distargs[i] is not None:
                raise ValueError(
                    "distargs entry for 'continuous' cctype should be None")
        elif cctypes[i] == 'multinomial':
            if not isinstance(distargs[i], dict):
                raise TypeError(
                    "distargs for cctype 'multinomial' should be a dict")
            if len(distargs[i].keys()) != 1:
                raise KeyError(
                    "distargs for cctype 'multinomial' should have one key, 'K'")
            if 'K' not in distargs[i].keys():
                raise KeyError(
                    "distargs for cctype 'multinomial' should have the key 'K'")
        else:
            raise ValueError("invalid cctype in cctypes: %s." % cctypes[i])

    random.seed(seed)
    numpy.random.seed(seed)

    # Generate the rows to categories partitions (multinomial)
    rows_to_clusters = []
    for W in cluster_weights:
        cW = list(W)
        for i in range(1, len(cW)):
            cW[i] += cW[i - 1]

        K = len(cW)

        rows_to_clusters_view = list(range(K))
        for r in range(K, n_rows):
            rows_to_clusters_view.append(p_draw(cW))

        random.shuffle(rows_to_clusters_view)
        assert len(rows_to_clusters_view) == n_rows

        rows_to_clusters.append(rows_to_clusters_view)

    get_next_seed = lambda: random.randrange(2147483647)

    # start generating the data
    data_table = numpy.zeros((n_rows, n_cols))
    component_params = []
    for col in range(n_cols):
        view = cols_to_views[col]

        # get the number of clusters in the view
        num_clusters = len(cluster_weights[view])

        cctype = cctypes[col]

        C = separation[view]

        # generate a set of C-separated component model parameters
        component_parameters = generate_separated_model_parameters(
            cctype, C, num_clusters, get_next_seed, distargs=distargs[col])

        component_params.append(component_parameters)

        # get the data generation function
        gen = get_data_generator[cctype]
        for row in range(n_rows):
            # get the cluster for this row
            cluster = rows_to_clusters[view][row]
            params = component_parameters[cluster]
            x = gen(params, 1, gen_seed=get_next_seed())[0]
            data_table[row, col] = x

    T = data_table.tolist()
    M_c = du.gen_M_c_from_T(T, cctypes=cctypes)

    if return_structure:
        structure = dict()
        structure['component_params'] = component_params
        structure['cols_to_views'] = cols_to_views
        structure['rows_to_clusters'] = rows_to_clusters
        structure['cluster_weights'] = cluster_weights
        return T, M_c, structure
    else:
        return T, M_c
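# Usage sketch for the return_structure=True path, with continuous columns
# only so distargs can stay at its default. The argument values below are
# illustrative, not taken from the docstring example above.
example_cctypes = ['continuous', 'continuous', 'continuous']
T_syn, M_c_syn, structure = gen_data(example_cctypes, n_rows=20,
                                     cols_to_views=[0, 0, 1],
                                     cluster_weights=[[.5, .5], [1.0]],
                                     separation=[.9, .5], seed=0,
                                     return_structure=True)
# structure carries 'component_params', 'cols_to_views', 'rows_to_clusters',
# and 'cluster_weights' for downstream checks (e.g. building mixture support).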
def generate_correlated_state(num_rows, num_cols, num_views, num_clusters,
                              mean_range, corr, seed=0):
    #

    # assert (num_clusters <= num_rows)
    assert (num_views <= num_cols)

    T = numpy.zeros((num_rows, num_cols))

    random.seed(seed)
    numpy.random.seed(seed=seed)
    get_next_seed = lambda: random.randrange(2147483647)

    # generate an assignment of columns to views (uniform)
    cols_to_views = list(range(num_views))
    view_counts = numpy.ones(num_views, dtype=int)
    for i in range(num_views, num_cols):
        r = random.randrange(num_views)
        cols_to_views.append(r)
        view_counts[r] += 1

    random.shuffle(cols_to_views)

    assert (len(cols_to_views) == num_cols)
    assert (max(cols_to_views) == num_views - 1)

    # for each view, generate an assignment of rows to num_clusters
    row_to_clusters = []
    cluster_counts = []
    for view in range(num_views):
        row_to_cluster = list(range(num_clusters))
        cluster_counts_i = numpy.ones(num_clusters, dtype=int)
        for i in range(num_clusters, num_rows):
            r = random.randrange(num_clusters)
            row_to_cluster.append(r)
            cluster_counts_i[r] += 1

        random.shuffle(row_to_cluster)

        assert (len(row_to_cluster) == num_rows)
        assert (max(row_to_cluster) == num_clusters - 1)

        row_to_clusters.append(row_to_cluster)
        cluster_counts.append(cluster_counts_i)

    assert (len(row_to_clusters) == num_views)

    # generate the correlated data
    for view in range(num_views):
        for cluster in range(num_clusters):
            cell_cols = view_counts[view]
            cell_rows = cluster_counts[view][cluster]
            means = numpy.random.uniform(-mean_range / 2.0, mean_range / 2.0,
                                         cell_cols)
            X = generate_correlated_data(cell_rows, cell_cols, means, corr,
                                         seed=get_next_seed())
            # get the indices of the columns in this view
            col_indices = numpy.nonzero(numpy.array(cols_to_views) == view)[0]
            # get the indices of the rows in this view and this cluster
            row_indices = numpy.nonzero(
                numpy.array(row_to_clusters[view]) == cluster)[0]
            # insert the data
            for col in range(cell_cols):
                for row in range(cell_rows):
                    r = row_indices[row]
                    c = col_indices[col]
                    T[r, c] = X[row, col]

    M_c = du.gen_M_c_from_T(T)
    M_r = du.gen_M_r_from_T(T)

    X_L, X_D = generate_X_L_and_X_D(T, M_c, cols_to_views, row_to_clusters,
                                    seed=get_next_seed())

    return T, M_c, M_r, X_L, X_D, cols_to_views
pl.figure()
burn_in = 400

mi_ests = numpy.zeros(len(widths))

datas = []

nr = 0
for w in widths:
    T, mi_est = gen_ring(n, w, SEED=get_next_seed())

    datas.append(T)

    print("num_samples: %i, width: %f" % (n, w))

    M_c = du.gen_M_c_from_T(T, cctypes)
    X_Ls = []
    X_Ds = []

    for ns in range(n_samples):
        state = State.p_State(M_c, T)
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0, 1)],
                                        n_samples=5000)

    data_d = numpy.transpose(MI)

    if nr == 0:
        data = data_d
def continuous_imputation_confidence(samples, imputed,
                                     column_component_suffstats_i,
                                     n_steps=100, n_chains=1,
                                     return_metadata=False):
    # XXX: the confidence in continuous imputation is "the probability that
    # there exists a unimodal summary", which is defined as the proportion of
    # probability mass in the largest mode of a DPMM inferred from the
    # simulated samples. We run crosscat on the samples for a given number of
    # iterations, then calculate the proportion of mass in the largest mode.
    #
    # NOTE: The definition of confidence and its implementation do not agree.
    # The probability of a unimodal summary is P(k=1|X), where k is the number
    # of components in some infinite mixture model. I would describe the
    # current implementation as "Is there a mode with sufficient enough mass
    # that we can ignore the other modes". If this second formulation is to be
    # used, it means that we need to not use the median of all the samples as
    # the imputed value, but the median of the samples of the summary mode,
    # because the summary (the imputed value) should come from the summary
    # mode.
    #
    # There are a lot of problems with this second formulation.
    # 0. SLOW. Like, for real.
    # 1. Non-deterministic. The answer will be different given the same
    #    samples.
    # 2. Inaccurate. Approximate inference about approximate inferences.
    #    In practice confidences on the same samples could be significantly
    #    different because the Gibbs sampler that underlies crosscat is
    #    susceptible to getting stuck in local maxima. Of course, this could
    #    be mitigated to some extent by using more chains, but things are
    #    slow enough as it is.
    # 3. Confidence (interval) has a distinct meaning to the people who will
    #    be using this software. A unimodal summary does not necessarily mean
    #    that inferences are within an acceptable range. We are going to need
    #    to be loud about this. Maybe there should be a notion of tolerance?
    #
    # An alternative: mutual predictive coverage
    # ------------------------------------------
    # Divide the number of samples in the intersection of the 90% CI's of each
    # component model by the number of samples in the union of the 90% CI's of
    # each component model.
    from crosscat.cython_code import State

    # XXX: assumes samples comes in as a 1-D numpy.array or 1-D list
    num_samples = float(len(samples))

    T = [[x] for x in samples]

    # XXX: This is a highly problematic consequence of the current definition
    # of confidence. If the number of samples is 1, then the confidence is
    # always 1 because there will be exactly 1 mode in the DPMM (recall the
    # DPMM can have, at maximum, as many modes as data points). I figure if
    # we're going to give a bad answer, we should give it quickly.
    if num_samples == 1:
        return 1.0

    confs = []

    tlist = ['column_hyperparameters',
             'row_partition_hyperparameters',
             'row_partition_assignments']

    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    if return_metadata:
        X_L_list = []
        X_D_list = []

    for _ in range(n_chains):
        ccstate = State.p_State(M_c, T)
        ccstate.transition(which_transitions=tlist, n_steps=n_steps)

        X_D = ccstate.get_X_D()

        assignment = X_D[0]
        num_cats = max(assignment) + 1
        props = numpy.histogram(assignment, num_cats)[0] / num_samples
        confs.append(max(props))

        if return_metadata:
            X_L_list.append(ccstate.get_X_L())
            X_D_list.append(X_D)

    conf = numpy.mean(confs)
    if return_metadata:
        return conf, X_L_list, X_D_list
    else:
        return conf
def gen_data(cctypes, n_rows, cols_to_views, cluster_weights, separation,
             seed=0, distargs=None, return_structure=False):
    """
    Generates synthetic data.
    Inputs:
        - cctypes: List of strings. Each entry, i, is the cctype of column i.
          ex: cctypes = ['continuous','continuous', 'multinomial']
        - n_rows: integer. the number of rows
        - cols_to_views: List of integers. Each entry, i, is the view, v, to
          which column i is assigned. v \in [0,...,n_cols-1].
          ex: cols_to_views = [0, 0, 1]
        - cluster_weights: List of lists of floats. A num_views length list of
          lists. Each sublist, W, is a list of cluster weights for the view,
          thus W should always sum to 1.
          ex (two views, first view has 2 clusters, second view has 3 clusters):
          cluster_weights = [[.3, .7], [.25, .5, .25]]
        - separation: list of floats. Each entry, i, is the separation, C, of
          the clusters in view i. C \in [0,1] where 0 is no separation and 1
          is well-separated.
          ex (2 views): separation = [.5, .7]
        - seed: optional
        - distargs: optional (only if continuous). distargs is an n_columns
          length list where each entry is either None or a dict appropriate
          for the cctype in that column. For a normal feature, the entry
          should be None; for a multinomial feature, the entry should be a
          dict with the entry K (the number of categories).
        - return_structure: (bool, optional). Also returns a dict with the
          data generation structure included. A dict with keys:
            - component_params: an n_cols length list of lists. Each list is
              a set of component model parameters for each cluster in the view
              to which that column belongs
            - cols_to_views: a list assigning each column to a view
            - rows_to_clusters: an n_views length list of lists. Each entry,
              rows_to_clusters[v][r], is the cluster to which all rows in
              columns belonging to view v are assigned
    Returns:
        T, M_c
    Example:
        >>> cctypes = ['continuous','continuous','multinomial','continuous','multinomial']
        >>> distargs = [None, None, dict(K=5), None, dict(K=2)]
        >>> n_rows = 10
        >>> cols_to_views = [0, 0, 1, 1, 2]
        >>> cluster_weights = [[.3, .7],[.5, .5],[.2, .3, .5]]
        >>> separation = [.9, .6, .9]
        >>> T, M_c = gen_data(cctypes, n_rows, cols_to_views, cluster_weights,
        ...                   separation, seed=0, distargs=distargs)
    """

    # check inputs
    if not isinstance(n_rows, int):
        raise TypeError("n_rows should be an integer")
    if not isinstance(cctypes, list):
        raise TypeError("cctypes should be a list")
    n_cols_cctypes = len(cctypes)
    for cctype in cctypes:
        if not isinstance(cctype, str):
            raise TypeError("cctypes should be a list of strings")
        # NOTE: will have to update when new component models are added
        if cctype not in ['continuous', 'multinomial']:
            raise ValueError("invalid cctype in cctypes: %s." % cctype)

    if not isinstance(cols_to_views, list):
        raise TypeError("cols_to_views should be a list")
    if len(cols_to_views) != n_cols_cctypes:
        raise ValueError("number of columns in cctypes does not match number "
                         "of columns in cols_to_views")
    if min(cols_to_views) != 0:
        raise ValueError("min value of cols_to_views should be 0")
    n_views_cols_to_views = max(cols_to_views) + 1
    set_ctv = set(cols_to_views)
    if len(set_ctv) != n_views_cols_to_views:
        raise ValueError("View indices skipped in cols_to_views")

    # check cluster weights
    if not isinstance(cluster_weights, list):
        raise TypeError("cluster_weights should be a list")
    if n_views_cols_to_views != len(cluster_weights):
        raise ValueError("The number of views in cols_to_views and "
                         "cluster_weights do not agree.")
    # check each set of weights
    for W in cluster_weights:
        if not isinstance(W, list):
            raise TypeError("cluster_weights should be a list of lists")
        if math.fabs(sum(W) - 1.0) > .0000001:
            raise ValueError("each vector of weights should sum to 1")

    if not isinstance(separation, list):
        raise TypeError("separation should be a list")
    if len(separation) != n_views_cols_to_views:
        raise ValueError(
            "number of views in separation and cols_to_views do not agree")
    for c in separation:
        if not isinstance(c, float) or c > 1.0 or c < 0.0:
            raise ValueError(
                "each value in separation should be a float from 0 to 1")

    num_views = len(separation)
    n_cols = len(cols_to_views)

    # check the cctypes vs the distargs
    if distargs is None:
        distargs = [None for i in range(n_cols)]
    if not isinstance(distargs, list):
        raise TypeError("distargs should be a list")
    if len(distargs) != n_cols:
        raise ValueError("distargs should have an entry for each column")
    for i in range(n_cols):
        if cctypes[i] == 'continuous':
            if distargs[i] is not None:
                raise ValueError(
                    "distargs entry for 'continuous' cctype should be None")
        elif cctypes[i] == 'multinomial':
            if not isinstance(distargs[i], dict):
                raise TypeError(
                    "distargs for cctype 'multinomial' should be a dict")
            if len(distargs[i].keys()) != 1:
                raise KeyError(
                    "distargs for cctype 'multinomial' should have one key, 'K'")
            if 'K' not in distargs[i].keys():
                raise KeyError(
                    "distargs for cctype 'multinomial' should have the key 'K'")
        else:
            raise ValueError("invalid cctype in cctypes: %s." % cctypes[i])

    random.seed(seed)
    numpy.random.seed(seed)

    # Generate the rows to categories partitions (multinomial)
    rows_to_clusters = []
    for W in cluster_weights:
        cW = list(W)
        for i in range(1, len(cW)):
            cW[i] += cW[i - 1]

        K = len(cW)

        rows_to_clusters_view = list(range(K))
        for r in range(K, n_rows):
            rows_to_clusters_view.append(p_draw(cW))

        random.shuffle(rows_to_clusters_view)
        assert len(rows_to_clusters_view) == n_rows

        rows_to_clusters.append(rows_to_clusters_view)

    get_next_seed = lambda: random.randrange(2147483647)

    # start generating the data
    data_table = numpy.zeros((n_rows, n_cols))
    component_params = []
    for col in range(n_cols):
        view = cols_to_views[col]

        # get the number of clusters in the view
        num_clusters = len(cluster_weights[view])

        cctype = cctypes[col]

        C = separation[view]

        # generate a set of C-separated component model parameters
        component_parameters = generate_separated_model_parameters(
            cctype, C, num_clusters, get_next_seed, distargs=distargs[col])

        component_params.append(component_parameters)

        # get the data generation function
        gen = get_data_generator[cctype]
        for row in range(n_rows):
            # get the cluster for this row
            cluster = rows_to_clusters[view][row]
            params = component_parameters[cluster]
            x = gen(params, 1, gen_seed=get_next_seed())[0]
            data_table[row, col] = x

    T = data_table.tolist()
    M_c = du.gen_M_c_from_T(T, cctypes=cctypes)

    if return_structure:
        structure = dict()
        structure['component_params'] = component_params
        structure['cols_to_views'] = cols_to_views
        structure['rows_to_clusters'] = rows_to_clusters
        return T, M_c, structure
    else:
        return T, M_c
def run_test(args):
    rho = args.rho
    num_times = args.num_times
    min_num_rows = args.min_num_rows
    max_num_rows = args.max_num_rows
    n_grid = args.n_grid
    filename = args.filename
    discrete = args.discrete

    num_samples = []
    for ns in log_linspace(min_num_rows, max_num_rows, n_grid).tolist():
        num_samples.append(int(ns))

    variances = []

    burn_in = 200

    MIs = numpy.zeros((num_times, len(num_samples)))

    mi_diff = numpy.zeros((len(num_samples), num_times))

    if not discrete:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ["continuous"] * 2
    else:
        T, true_mi, external_mi = gen_correlated_data_discrete(
            num_samples[-1], rho)
        cctypes = ["multinomial"] * 2

    data_subs = []

    n_index = 0
    for n in num_samples:
        T_sub = numpy.copy(T[0:n - 1, :])

        data = []

        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()
            MI, Linfoot = iu.mutual_information(M_c, [X_L], [X_D], [(0, 1)],
                                                n_samples=5000)

            mi_diff[n_index, t] = true_mi - MI[0][0]

            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, MI[0][0]))

            MIs[t, n_index] = MI[0][0]

        n_index += 1

    if discrete:
        dtype_str = "discrete"
    else:
        dtype_str = "continuous"

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"

    pl.figure()

    # plot data
    # pl.subplot(1,2,1)
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    i = 0
    for T_s in data_subs:
        pl.subplot(1, len(data_subs), i + 1)
        num_rows = num_samples[i]
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(T_s[:, 0], T_s[:, 1],
                                                        bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=0.3, s=81)
        pl.title("#r: " + str(num_rows))
        i += 1

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(datname)
    pl.clf()

    pl.figure(tight_layout=True, figsize=(5, 4))
    # plot convergence
    # pl.subplot(1,2,2)
    # standard deviation
    stderr = numpy.std(MIs, axis=0)  # /(float(num_times)**.5)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=stderr, c="blue")
    pl.plot(num_samples, mean, c="blue", alpha=0.8, label="mean MI")
    pl.plot(num_samples, [true_mi] * len(num_samples), color="red", alpha=0.8,
            label="true MI")
    pl.plot(num_samples, [external_mi] * len(num_samples), color=(0, 0.5, 0.5),
            alpha=0.8, label="external MI")
    pl.title("convergence")
    pl.xlabel("#rows in X (log)")
    pl.ylabel("CrossCat MI - true MI")
    pl.legend(loc=0, prop={"size": 8})
    pl.gca().set_xscale("log")

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))
    pl.savefig(figname)