def gen_data(filename, argin, save_csv=True):
    """ Generates a synthetic table with given properties.
    For full documentation see sdg.gen_data
    """
    cctypes = argin['cctypes']
    n_rows = argin['num_rows']
    n_cols = argin['num_cols']
    n_views = argin['num_views']
    n_clusters = argin['num_clusters']
    separation = argin['separation']
    seed = argin['seed']

    if 'distargs' in argin.keys():
        distargs = argin['distargs']
    else:
        distargs = None

    random.seed(seed)

    # need to generate cluster_weights and cols_to_views
    cols_to_views = list(range(n_views))
    for c in range(n_views, n_cols):
        cols_to_views.append(random.randrange(n_views))

    cluster_weights = []
    for v in range(n_views):
        cluster_weights.append([1./n_clusters]*n_clusters)

    T, _, structure = sdg.gen_data(cctypes, n_rows, cols_to_views,
                                   cluster_weights, separation, seed=seed,
                                   distargs=distargs, return_structure=True)
    T = numpy.array(T)

    if save_csv:
        header = ['col_'+str(col) for col in range(n_cols)]
        # write the data to a list of lists
        out = [header]
        for row in range(n_rows):
            row_out = []
            for col in range(n_cols):
                if cctypes[col] == 'continuous':
                    value = T[row][col]
                elif cctypes[col] == 'multinomial':
                    value = int(T[row][col])
                else:
                    raise ValueError("unsupported cctype: %s" % cctypes[col])
                row_out.append(value)
            out.append(row_out)
        list_to_csv(filename, out)

    return T, structure
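# Illustrative usage sketch (an assumption, not part of the original module):
# a minimal argin dict for gen_data above. The key names mirror the lookups
# in the function body; the values and the output filename are hypothetical,
# and separation is given as a per-view list, following the sdg.gen_data
# calls elsewhere in this suite.
def _example_gen_data_usage():
    example_argin = {
        'cctypes': ['continuous', 'continuous', 'continuous', 'continuous'],
        'num_rows': 100,
        'num_cols': 4,
        'num_views': 2,
        'num_clusters': 2,
        'separation': [.5, .5],
        'seed': 0,
    }
    T, structure = gen_data('example_data.csv', example_argin, save_csv=True)
    return T, structure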
def test_proper_set_up_all_continuous(self):
    T, M_c = sdg.gen_data(self.cctypes_all_contiuous,
                          self.n_rows,
                          self.cols_to_views_good,
                          self.cluster_weights_good,
                          self.separation_good,
                          seed=0,
                          distargs=None)
    assert len(T) == self.n_rows
    assert len(T[0]) == len(self.cols_to_views_good)
def test_proper_set_up_mixed(self):
    distargs = [None, None, dict(K=5), None, dict(K=5)]
    T, M_c = sdg.gen_data(self.cctypes_mixed,
                          self.n_rows,
                          self.cols_to_views_good,
                          self.cluster_weights_good,
                          self.separation_good,
                          seed=0,
                          distargs=distargs)
    assert len(T) == self.n_rows
    assert len(T[0]) == len(self.cols_to_views_good)
def test_different_seeds_should_produce_different_data(self):
    distargs = [None]*5
    T1, M_c = sdg.gen_data(self.cctypes_all_contiuous, self.n_rows,
                           self.cols_to_views_good, self.cluster_weights_good,
                           self.separation_good, seed=0, distargs=distargs)
    T2, M_c = sdg.gen_data(self.cctypes_all_contiuous, self.n_rows,
                           self.cols_to_views_good, self.cluster_weights_good,
                           self.separation_good, seed=12345, distargs=distargs)
    A1 = numpy.array(T1)
    A2 = numpy.array(T2)
    assert not numpy.all(A1 == A2)
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    max_cols = argin["max_cols"]
    rho = argin["rho"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]
    multimodal = argin["multimodal"]
    separation = argin["separation"]
    all_cols = max_cols + 4  # max_cols plus number of dependent columns
    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)
        numpy.random.seed(seed)

    # build full data file
    # generate column indices and header
    col_names = ["col_%i" % i for i in range(all_cols)]

    Zv = [0, 0, 1, 1]  # our needles
    Zv.extend(range(2, all_cols-2))

    min_clusters = 3
    max_clusters = 10

    T_array = numpy.zeros((num_rows, all_cols))
    Sigma = numpy.array([[1.0, rho], [rho, 1.0]])
    mu = numpy.array([0, 0])

    if multimodal:
        T = [[0]*all_cols]*num_rows
        Zv = [0, 0, 1, 1]  # our needles
        Zv.extend(range(2, all_cols-2))
        random.shuffle(Zv)
        num_views = max(Zv)+1
        # per-view separation: the scalar separation applies to every view
        separation = [separation]*num_views
        min_clusters = 4
        max_clusters = 5
        cluster_weights = []
        # generate weights.
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append([1.0/num_clusters]*num_clusters)
        # assumed defaults (data_mode and multinomial_categories are not
        # supplied through argin): all-continuous data
        data_mode = 'continuous'
        multinomial_categories = 0
        cctypes, distargs = eu.get_column_types(data_mode, all_cols,
                                                multinomial_categories)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights,
                            separation, distargs=distargs)
        T_array = numpy.array(T)
    else:
        T_array[:, 0:1+1] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        T_array[:, 2:3+1] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        separation = .5
        for col in range(4, all_cols):
            num_clusters = random.randrange(min_clusters, max_clusters)+1
            for row in range(num_rows):
                k = random.randrange(num_clusters)
                T_array[row, col] = numpy.random.randn() + k*6*separation
        T = T_array.tolist()

    # save file to .csv
    exp_path = 'expdata/hb/'
    eu.make_folder(exp_path)
    filename = exp_path + "haystack_break_exp.csv"
    table = "haystack_break_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)
    # done building data file

    # get column step size (powers of two)
    num_steps = int(math.log(max_cols, 2)) - 1
    step_size = [2**t for t in range(2, num_steps+1)]
    assert step_size[-1] <= max_cols
    if step_size[-1] < max_cols:
        step_size.append(max_cols)
    assert step_size[0] == 4 and step_size[-1] == max_cols

    # the needle column names
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    result = dict()
    result['steps'] = []
    for num_distractor_columns in step_size:
        # create subdata
        T_sub = take_T_column_subset(T, range(4+num_distractor_columns))
        subpath = exp_path + 'd_' + str(num_distractor_columns) + '/'
        eu.make_folder(subpath)
        subfilename = subpath + "haystack_break_exp_" + \
            str(num_distractor_columns) + ".csv"
        eu.list_to_csv(subfilename, T_sub)
        col_names_sub = T_sub[0]

        # generate queries
        queries, pairs = generate_dependence_queries(needle_a_cols,
                                                     needle_b_cols,
                                                     col_names_sub, table,
                                                     num_indep_queries)
        num_queries = len(queries)
        dependence_probs = numpy.zeros((num_iters+1, num_queries))

        client = Client()
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, subfilename))
        init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
        print init_string
        client(init_string)
        client('SHOW DIAGNOSTICS FOR %s;' % table)

        # do the analyses
        for i in range(0, num_iters+1):
            if i > 0:
                if ct_kernel == 1:
                    client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
                else:
                    client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)

            for q in range(num_queries):
                query = queries[q]
                out = client(query, pretty=False, pandas_output=False)
                dependence_probs[i, q] = out[0]['data'][0][1]

        subresult = dict()
        # store the queries in subresult
        subresult['query_col1'] = []
        subresult['query_col2'] = []
        subresult['dependence_probs'] = dependence_probs
        for pair in pairs:
            subresult['query_col1'].append(pair[0])
            subresult['query_col2'].append(pair[1])

        # for each query, note whether those columns were actually independent
        independent = [True]*num_queries
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False
        subresult['cols_independent'] = independent
        subresult['distractor_cols'] = num_distractor_columns
        result['steps'].append(subresult)

    result['config'] = argin
    result['data'] = T_array
    return result
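# Illustrative sketch (an assumption, not part of the original script): a
# minimal argin dict for the haystack-break run_experiment above. Key names
# mirror the lookups in the function body; the values are arbitrary small
# settings for a smoke run.
def _example_haystack_break_config():
    return {
        "num_iters": 10,
        "num_chains": 4,
        "num_rows": 100,
        "max_cols": 32,  # must be at least 8 so the step-size asserts hold
        "rho": .9,
        "num_indep_queries": 20,
        "independent_clusters": True,
        "ct_kernel": 0,
        "multimodal": False,
        "separation": .9,
        "seed": 0,
    }

# result = run_experiment(_example_haystack_break_config())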
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    with_id = argin["with_id"]
    needles = argin["needles"]
    mixed_types = argin["mixed_types"]
    multinomial_categories = argin["multinomial_categories"]
    separation = argin["separation"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    # generate column indices and header
    col_names = ["col_%i" % i for i in range(num_cols)]

    if mixed_types and multinomial_categories > 0:
        data_mode = 'mixed'
    elif multinomial_categories > 0:
        data_mode = 'multinomial'
    else:
        data_mode = 'continuous'

    if needles:
        T = [[0]*num_cols]*num_rows
        Zv = [0, 0, 1, 1]  # our needles
        Zv.extend(range(2, num_cols-2))
        # random.shuffle(Zv)
        num_views = max(Zv)+1
        separation = [.95]*2
        separation.extend([0.0]*(num_views-2))
        min_clusters = 4
        max_clusters = 5
        cluster_weights = []
        # generate weights.
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                if independent_clusters:
                    num_clusters = random.randrange(min_clusters, max_clusters)
                else:
                    num_clusters = 1
            cluster_weights.append([1.0/num_clusters]*num_clusters)

        cctypes, distargs = eu.get_column_types(data_mode, num_cols,
                                                multinomial_categories)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights,
                            separation, distargs=distargs)
    else:
        T, cctypes = eu.generate_noise(data_mode, num_rows, num_cols)

    # # prepend the row_id
    # if with_id:
    #     needle_a_cols = (1,2)
    #     needle_b_cols = (3,4)
    #     col_names.insert(0, 'ID')
    #     # TODO: ID type
    #     cctypes.insert(0,'continuous')
    #     # header = "ID,%s" % header
    #     if needles:
    #         Zv.insert(0, num_views)
    #     for row in range(num_rows):
    #         T[row].insert(0, row)
    # else:

    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    # save file to .csv
    filename = "needles_exp.csv"
    table = "needles_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)

    # generate queries
    queries, pairs = generate_dependence_queries(needle_a_cols,
                                                 needle_b_cols,
                                                 col_names, table,
                                                 num_indep_queries)
    num_queries = len(queries)
    dependence_probs = numpy.zeros((num_iters, num_queries))

    client = Client()
    client('DROP BTABLE %s;' % table, yes=True)
    client('CREATE BTABLE %s FROM %s;' % (table, filename))
    init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
    print init_string
    client(init_string)
    client('SHOW DIAGNOSTICS FOR %s;' % table)

    # do the analyses
    for i in range(num_iters):
        if ct_kernel == 1:
            client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
        else:
            client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)

        for q in range(num_queries):
            query = queries[q]
            out = client(query, pretty=False, pandas_output=False)
            dependence_probs[i, q] = out[0]['data'][0][1]

    result = dict()
    # store the queries in result
    result['query_col1'] = []
    result['query_col2'] = []
    result['dependence_probs'] = dependence_probs
    for pair in pairs:
        result['query_col1'].append(pair[0])
        result['query_col2'].append(pair[1])

    # for each query, note whether those columns were actually independent
    independent = [True]*num_queries
    if needles:
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False
    result['cols_independent'] = independent

    result['config'] = argin
    result['config']['data_mode'] = data_mode

    client('SHOW DIAGNOSTICS FOR %s;' % table)
    return result
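# Illustrative sketch (an assumption, not part of the original script): a
# minimal argin dict for the needles run_experiment above, with key names
# taken from the lookups in the function body and arbitrary small values
# for a smoke run.
def _example_needles_config():
    return {
        "num_iters": 10,
        "num_chains": 4,
        "num_rows": 100,
        "num_cols": 16,
        "with_id": False,
        "needles": True,
        "mixed_types": False,
        "multinomial_categories": 0,
        "separation": .9,
        "num_indep_queries": 20,
        "independent_clusters": True,
        "ct_kernel": 0,
        "seed": 0,
    }

# result = run_experiment(_example_needles_config())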
def test_predictive_sample_improvement(component_model_type, seed=0,
                                       show_plot=True):
    """ Shows the error of predictive sample over iterations. """

    num_transitions = 100
    num_samples = 10
    num_clusters = 2
    separation = .9  # cluster separation
    N = 150
    random.seed(seed)
    get_next_seed = lambda: random.randrange(2147483647)

    # generate a single column of data from the component_model
    cctype = component_model_type.cctype
    T, M_c, struc = sdg.gen_data([cctype], N, [0], [[.5, .5]], [separation],
                                 seed=get_next_seed(),
                                 distargs=[distargs[cctype]],
                                 return_structure=True)
    T_array = numpy.array(T)

    X = numpy.zeros((N, num_transitions))
    KL = numpy.zeros((num_samples, num_transitions))

    support = qtu.get_mixture_support(cctype, component_model_type,
                                      struc['component_params'][0],
                                      nbins=1000, support=.995)
    true_log_pdf = qtu.get_mixture_pdf(support, component_model_type,
                                       struc['component_params'][0], [.5, .5])

    for s in range(num_samples):
        # generate the state
        state = State.p_State(M_c, T, SEED=get_next_seed())

        for i in range(num_transitions):
            # transition
            state.transition()

            # get partitions and generate a predictive column
            X_L = state.get_X_L()
            X_D = state.get_X_D()

            T_inf = sdg.predictive_columns(M_c, X_L, X_D, [0],
                                           seed=get_next_seed())

            if cctype == 'multinomial':
                K = distargs[cctype]['K']
                weights = numpy.zeros(K)
                for params in struc['component_params'][0]:
                    weights += numpy.array(params['weights'])*(1.0/num_clusters)
                weights *= float(N)
                inf_hist = qtu.bincount(T_inf, bins=range(K))
                err, _ = stats.power_divergence(inf_hist, weights,
                                                lambda_='pearson')
                err = numpy.ones(N)*err
            else:
                err = (T_array - T_inf)**2.0

            KL[s, i] = qtu.KL_divergence(component_model_type,
                                         struc['component_params'][0],
                                         [.5, .5], M_c, X_L, X_D,
                                         true_log_pdf=true_log_pdf,
                                         support=support)

            for j in range(N):
                X[j, i] += err[j]

    X /= num_samples

    # mean and standard error
    X_mean = numpy.mean(X, axis=0)
    X_err = numpy.std(X, axis=0)/float(num_samples)**.5

    KL_mean = numpy.mean(KL, axis=0)
    KL_err = numpy.std(KL, axis=0)/float(num_samples)**.5

    if show_plot:
        pylab.subplot(1, 2, 1)
        pylab.errorbar(range(num_transitions), X_mean, yerr=X_err)
        pylab.xlabel('iteration')
        pylab.ylabel('error across each data point')
        pylab.title('error of predictive sample over iterations, N=%i' % N)

        pylab.subplot(1, 2, 2)
        pylab.errorbar(range(num_transitions), KL_mean, yerr=KL_err)
        pylab.xlabel('iteration')
        pylab.ylabel('KL divergence')
        pylab.title('KL divergence, N=%i' % N)

        pylab.show()

    # error should decrease over time
    return X_mean[0] > X_mean[-1] and KL_mean[0] > KL_mean[-1]
def test_one_feature_mixture(component_model_type, num_clusters=3,
                             show_plot=False, seed=None):
    """ Tests predictive-sample inference for a single-column mixture with a
    goodness-of-fit test. """
    random.seed(seed)

    N = 1000
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                                     [separation], seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)
    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    state = State.p_State(M_c, T_list)

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten(1)

    # get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype, component_model_type, structure['component_params'][0],
        nbins=500)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    # get histogram. Different behavior for discrete and continuous types.
    # For some reason the normed property isn't normalizing the multinomial
    # histogram to 1.
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
                                    normed=True)
        edges = edges[0:-1]

    # goodness-of-fit tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness-of-fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp,
                                         lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
                      edgecolor="none", s=100, label="true pdf", alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
                      edgecolor="none", s=100, label="predictive probability",
                      alpha=1, zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \n" \
            "inference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str,
               round(p, 4))

        pylab.title(title_string, fontsize=12)
        pylab.show()

    return p
def test_one_feature_mixture(component_model_type, num_clusters=3,
                             show_plot=False, seed=None):
    """ Tests predictive-sample inference for a single-column mixture with a
    goodness-of-fit test, saving the plot to file. """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                                     [separation], seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)
    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    state = State.p_State(M_c, T_list)

    # get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype, component_model_type, structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten(1)

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    # get histogram. Different behavior for discrete and continuous types.
    # For some reason the normed property isn't normalizing the multinomial
    # histogram to 1.
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(50, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
                                    normed=True)
        edges = edges[0:-1]

    # goodness-of-fit tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness-of-fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp,
                                         lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
                      edgecolor="none", s=100, label="true pdf", alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
                      edgecolor="none", s=100, label="predictive probability",
                      alpha=1, zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \n" \
            "inference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str,
               round(p, 4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixture.png"
        pylab.savefig(filename)
        pylab.close()

    return p
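# Illustrative sketch (an assumption, not part of the original tests):
# invoking the mixture test above with the continuous component model
# extension used elsewhere in this suite. Assumes ccmext is imported at
# module level and that the module-level distargs dict maps 'continuous'
# to None, as in the other tests here.
def _example_one_feature_mixture_run():
    return test_one_feature_mixture(ccmext.p_ContinuousComponentModel,
                                    num_clusters=3, show_plot=False, seed=0)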
def test_kl_divergence_as_a_function_of_N_and_transitions():
    n_clusters = 3
    n_chains = 8
    do_times = 4

    # N_list = [25, 50, 100, 250, 500, 1000, 2000]
    N_list = [25, 50, 100, 175, 250, 400, 500]

    max_transitions = 500
    transition_interval = 50
    t_iterations = max_transitions/transition_interval

    cctype = 'continuous'
    cluster_weights = [1.0/float(n_clusters)]*n_clusters
    separation = .5

    get_next_seed = lambda: random.randrange(2147483647)

    # data grid
    KLD = numpy.zeros((len(N_list), t_iterations+1))

    for _ in range(do_times):
        for n in range(len(N_list)):
            N = N_list[n]
            T, M_c, struc = sdg.gen_data([cctype], N, [0], [cluster_weights],
                                         [separation], seed=get_next_seed(),
                                         distargs=[None],
                                         return_structure=True)
            M_r = du.gen_M_r_from_T(T)

            # precompute the support and pdf to speed up calculation of
            # KL divergence
            support = qtu.get_mixture_support(
                cctype, ccmext.p_ContinuousComponentModel,
                struc['component_params'][0], nbins=1000, support=.995)
            true_log_pdf = qtu.get_mixture_pdf(
                support, ccmext.p_ContinuousComponentModel,
                struc['component_params'][0], cluster_weights)

            # initialize a multiprocessing engine
            mstate = mpe.MultiprocessingEngine(cpu_count=8)
            X_L_list, X_D_list = mstate.initialize(M_c, M_r, T,
                                                   n_chains=n_chains)

            # KL divergences
            for i in range(len(X_L_list)):
                X_L = X_L_list[i]
                X_D = X_D_list[i]
                KLD[n, 0] += qtu.KL_divergence(
                    ccmext.p_ContinuousComponentModel,
                    struc['component_params'][0], cluster_weights, M_c,
                    X_L, X_D, n_samples=1000, support=support,
                    true_log_pdf=true_log_pdf)

            # run transition_interval, then take a reading. Rinse and repeat.
            for t in range(t_iterations):
                X_L_list, X_D_list = mstate.analyze(
                    M_c, T, X_L_list, X_D_list, n_steps=transition_interval)

                for i in range(len(X_L_list)):
                    X_L = X_L_list[i]
                    X_D = X_D_list[i]
                    KLD[n, t+1] += qtu.KL_divergence(
                        ccmext.p_ContinuousComponentModel,
                        struc['component_params'][0], cluster_weights, M_c,
                        X_L, X_D, n_samples=1000, support=support,
                        true_log_pdf=true_log_pdf)

    KLD /= float(n_chains*do_times)

    pylab.subplot(1, 3, 1)
    pylab.contourf(range(0, max_transitions+1, transition_interval),
                   N_list, KLD)
    pylab.title('KL divergence')
    pylab.ylabel('N')
    pylab.xlabel('# transitions')

    pylab.subplot(1, 3, 2)
    m_N = numpy.mean(KLD, axis=1)
    # standard error: std divided by the square root of the number of readings
    e_N = numpy.std(KLD, axis=1)/float(KLD.shape[1])**.5
    pylab.errorbar(N_list, m_N, yerr=e_N)
    pylab.title('KL divergence by N')
    pylab.xlabel('N')
    pylab.ylabel('KL divergence')

    pylab.subplot(1, 3, 3)
    m_t = numpy.mean(KLD, axis=0)
    e_t = numpy.std(KLD, axis=0)/float(KLD.shape[0])**.5
    pylab.errorbar(range(0, max_transitions+1, transition_interval),
                   m_t, yerr=e_t)
    pylab.title('KL divergence by transitions')
    pylab.xlabel('transitions')
    pylab.ylabel('KL divergence')

    pylab.show()

    return KLD