def _do_simple_predictive_sample(M_c, X_L, X_D, Y, Q, n, get_next_seed):
    """Draw ``n`` predictive samples with the sampler matching the input.

    ``su.get_is_multistate(X_L, X_D)`` selects between
    ``su.simple_predictive_sample_multistate`` and
    ``su.simple_predictive_sample``; both receive the same positional
    arguments ``(M_c, X_L, X_D, Y, Q, get_next_seed, n)``.

    Returns:
        Whatever the selected sampler returns.
    """
    if su.get_is_multistate(X_L, X_D):
        sampler = su.simple_predictive_sample_multistate
    else:
        sampler = su.simple_predictive_sample
    return sampler(M_c, X_L, X_D, Y, Q, get_next_seed, n)
def create_test_set(M_c, T, X_L, X_D, n_test, seed_seed=0):
    """Draw ``n_test`` unconstrained predictive samples of a complete row.

    Every column of ``T`` is queried at row index ``len(T) + 1``, i.e. a row
    index beyond the rows present in ``T``.

    Args:
        M_c, X_L, X_D: passed through to ``su.simple_predictive_sample``.
        T: table of rows; ``len(T[0])`` gives the number of columns queried.
        n_test: number of samples to draw.
        seed_seed: seed handed to ``gu.int_generator`` to produce the
            stream of per-draw seeds.

    Returns:
        The result of ``su.simple_predictive_sample``.
    """
    sample_row_idx = len(T) + 1
    n_cols = len(T[0])
    Y = []  # no constraints
    Q = [(sample_row_idx, col_idx) for col_idx in range(n_cols)]
    int_generator = gu.int_generator(seed_seed)

    def get_next_seed():
        # builtin next() works on Python 2.6+ and Python 3, unlike the
        # Python-2-only ``.next()`` method
        return next(int_generator)

    return su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,
                                       n=n_test)
def run_test_continuous(n, observed):
    """Compare predictive samples with the predictive density for one cell.

    Generates a table with ``du.gen_factorial_data_objects``, initializes a
    ``State.p_State`` from it, draws ``n`` predictive samples for a single
    (row, column) query, and evaluates ``su.simple_predictive_probability``
    at the retained sample values. The trapezoid-rule area under the
    evaluated density is printed, and a histogram of the samples overlaid
    with the density is saved to a new ``run_test_continuous_*.png`` file in
    the current directory and then shown.

    Args:
        n: number of predictive samples to draw.
        observed: if truthy, query row 10; otherwise query row index 40
            (``n_rows``).
    """
    import os

    n_rows = 40
    query_row = 10 if observed else n_rows
    query_column = 1
    Q = [(query_row, query_column)]

    # generate the data and initialize a state from it
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2, n_rows, 1)
    state = State.p_State(M_c, T)
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,
                                          n=n)
    X_array = numpy.sort(numpy.array(samples))
    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out values more than two standard deviations from the mean
    X_filter_low = numpy.nonzero(X_array < mean_X - 2. * std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X + 2. * std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for the area calculation below
    X_array = numpy.sort(X_array)
    X = X_array.tolist()

    # build the queries and get the pdf values
    Qs = [(query_row, query_column, x) for x in X]
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # area under the evaluated density, calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X) - 1):
        area_density += (X[i + 1] - X[i]) * (densities[i + 1] + densities[i]) / 2.0

    print("Area of PDF (should be close to, but not greater than, 1): " +
          str(area_density))
    print(
        "*Note: The area will be less than one because the range (integral) is truncated."
    )

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    # NOTE(review): recent Matplotlib releases replace `normed` with
    # `density`; confirm the supported Matplotlib version before changing.
    pylab.hist(X, 100, normed=1, histtype='stepfilled', label='samples',
               alpha=.5, color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    fd, fig_filename = tempfile.mkstemp(prefix='run_test_continuous_',
                                        suffix='.png', dir='.')
    # mkstemp hands back an open descriptor; savefig writes via the path,
    # so release the descriptor instead of leaking it
    os.close(fd)
    # save before show(): once an interactive window is closed the current
    # figure may be gone and a later savefig would write an empty image
    pylab.savefig(fig_filename)
    pylab.show()
def run_test_multinomial(n, observed):
    """Compare predictive samples with predictive probabilities (multinomial).

    Generates a table with ``generate_multinomial_data``, initializes a
    ``State.p_State`` from it, draws ``n`` predictive samples for a single
    (row, column) query, and evaluates ``su.simple_predictive_probability``
    at each distinct sampled value. The sum of the evaluated probabilities
    is printed, and a bar chart of sample frequencies overlaid with the
    probabilities is saved to a new ``run_test_multinomial_*.png`` file in
    the current directory and then shown.

    Args:
        n: number of predictive samples to draw.
        observed: if truthy, query row 10; otherwise query row index 40
            (``n_rows``).
    """
    import os

    n_rows = 40
    query_row = 10 if observed else n_rows
    query_column = 1
    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = generate_multinomial_data(get_next_seed(), 2, n_rows, 1)
    state = State.p_State(M_c, T)
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,
                                          n=n)
    X_array = numpy.sort(numpy.array(samples))
    unique_values = numpy.unique(X_array)
    X = unique_values.tolist()

    # build the queries and get the probability of each distinct value
    Qs = [(query_row, query_column, x) for x in X]
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    print("Sum of densities (should be 1): %f" % (numpy.sum(densities)))

    pylab.clf()

    # PLOT: probability vs samples distribution
    # one bin per distinct value; the extra right edge closes the last bin
    mbins = numpy.append(unique_values, max(unique_values) + 1)

    pdf, bins = numpy.histogram(X_array, mbins)
    # normalize counts to relative frequencies
    pdf = pdf / float(numpy.sum(pdf))
    pylab.bar(mbins[0:-1], pdf, label="samples", alpha=.5)
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    fd, fig_filename = tempfile.mkstemp(prefix='run_test_multinomial_',
                                        suffix='.png', dir='.')
    # mkstemp hands back an open descriptor; savefig writes via the path,
    # so release the descriptor instead of leaking it
    os.close(fd)
    # save before show(): once an interactive window is closed the current
    # figure may be gone and a later savefig would write an empty image
    pylab.savefig(fig_filename)
    pylab.show()
def check_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture
    the distribution of the data.

    1. Generates N (250) points from the default data parameters of the
       component model type
    2. Draws hyperparameters from that data, builds a component model and
       samples model parameters from it
    3. Generates N points from those model parameters
    4. Initializes a crosscat state with that data
    5. Runs 100 transitions and takes the resulting X_L / X_D
    6. Draws N predictive samples for row index N, column 0
    7. Gets the support returned by ``generate_discrete_support``
    8. Calculates the predictive probability of each point in the support
       (the values are exponentiated before use)
    9. (OPTIONAL) Plots the original data, predictive samples, pdf, and
       predictive probabilities and saves them to
       ``<model_type>_single.png``
    10. Calculates goodness of fit stats: a two-sample KS test for
        continuous types, a Pearson chi-square test for discrete types

    Returns:
        The p value of the goodness-of-fit test.
    """
    N = 250
    n_transitions = 100

    def get_next_seed():
        return random.randrange(2147483647)

    model_type = component_model_type.model_type
    discrete = is_discrete[model_type]

    data_params = default_data_parameters[model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(
        X, gen_seed=get_next_seed())[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    # 'F' is the explicit spelling of the legacy ``flatten(1)``, which
    # current NumPy rejects because ``order`` must be a string.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c, X_L, X_D, [], [(N, 0)],
                                    get_next_seed, n=N)).flatten('F')

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point (no constraints)
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types:
    # discrete histograms are normalized by hand to relative frequencies.
    if discrete:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # ``density`` replaces the deprecated ``normed`` keyword; the bins
        # are equal-width here, so the normalization is the same
        T_hist, edges = numpy.histogram(
            T, bins=min(20, len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(component_model_type.log_pdf(
                          numpy.array(discrete_support), model_parameters)),
                      c="blue", s=100, label="true pdf", alpha=1)

        # pylab.ylim([0,2])

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities),
                      c="red", s=100, label="predictive probability", alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p, 4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def predictive_columns(M_c, X_L, X_D, columns_list, optional_settings=False, seed=0):
    """ Generates rows of data from the inferred distributions
    Inputs:
        - M_c: crosscat metadata (See documentation)
        - X_L: crosscat metadata (See documentation)
        - X_D: crosscat metadata (See documentation)
        - columns_list: a list of columns to sample
        - optional_settings: list of dicts of optional arguments. Each column
            in columns_list should have its own list entry which is either
            None or a dict with possible keys:
                - missing_data: Proportion missing data
        - seed: non-negative int used to seed ``random`` before sampling
    Returns:
        - a num_rows by len(columns_list) numpy array, where num_rows is the
        original number of rows in the crosscat table.
    Raises:
        - TypeError: columns_list or optional_settings is not a list, an
            entry of columns_list is not an int, or seed is not an int
        - ValueError: a column index is out of range or seed is negative
        - KeyError: a dict in optional_settings has an unsupported key
    """
    # supported arguments for optional_settings
    supported_arguments = ['missing_data']

    num_rows = len(X_D[0])
    num_cols = len(M_c['column_metadata'])

    if not isinstance(columns_list, list):
        raise TypeError("columns_list should be a list")

    for col in columns_list:
        if not isinstance(col, int):
            raise TypeError("every entry in columns_list shuold be an integer")
        if col < 0 or col >= num_cols:
            # the largest valid index is num_cols - 1, not num_cols
            raise ValueError("%i is not a valid column. Valid entries are 0-%i"
                             % (col, num_cols - 1))

    if not isinstance(seed, int):
        raise TypeError("seed should be an int")

    if seed < 0:
        raise ValueError("seed should be positive")

    if optional_settings:
        if not isinstance(optional_settings, list):
            raise TypeError("optional_settings should be a list")

        for col_setting in optional_settings:
            if isinstance(col_setting, dict):
                # plain key iteration works on both Python 2 and 3
                for key in col_setting:
                    if key not in supported_arguments:
                        raise KeyError("Invalid key in optional_settings, '%s'" % key)
    else:
        optional_settings = [None] * len(columns_list)

    random.seed(seed)

    X = numpy.zeros((num_rows, len(columns_list)))

    def get_next_seed():
        return random.randrange(2147483647)

    for c, col in enumerate(columns_list):
        for row in range(num_rows):
            X[row, c] = su.simple_predictive_sample(
                M_c, X_L, X_D, [], [(row, col)], get_next_seed, n=1)[0][0]

        # check if there are optional arguments for this column
        col_setting = optional_settings[c]
        # `in` replaces the free-function call has_key(...), which is not a
        # builtin (has_key was a Python 2 dict *method*)
        if isinstance(col_setting, dict) and 'missing_data' in col_setting:
            proportion = col_setting['missing_data']
            X = add_missing_data_to_column(X, c, proportion)

    assert X.shape[0] == num_rows
    assert X.shape[1] == len(columns_list)

    return X
p_State.transition(which_transitions=['column_partition_assignments','row_partition_assignments']) # quick test just to make sure things output what they're supposed to x = 0.0; query_row = len(row[0]) # tests unobserved # query_row = 3; # tests observed Q = [(query_row,0,x)] Y = [] # no contraints # Y = [(1,0,.1),(3,0,.1),(22,0,105),(30,0,100)] # generic constraints p = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q) n = 1000; samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,n=n) X = [sample[0] for sample in samples] pylab.figure(facecolor='white') pdf, bins, patches = pylab.hist(X,50,normed=True, histtype='bar',label='samples',edgecolor='none') pylab.show() pdf_max = max(pdf) Qs = []; for i in range(n): Qtmp = (query_row,0,X[i]) Qs.append(Qtmp) Ps = su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs)
X_D = save_dict['X_D'] # FIXME: test constraints # Y = [su.Bunch(index=2,value=2.3), su.Bunch(index=0,value=-4.)] Y = None # test simple_predictive_sample_observed views_replicating_samples_params = su.determine_replicating_samples_params( X_L, X_D) views_samples = [] for replicating_samples_params in views_replicating_samples_params: this_view_samples = [] for replicating_sample_params in replicating_samples_params: this_view_this_sample = su.simple_predictive_sample( M_c, X_L, X_D, get_next_seed=get_next_seed, **replicating_sample_params) this_view_samples.extend(this_view_this_sample) views_samples.append(this_view_samples) for view_idx, view_samples in enumerate(views_samples): data_array = numpy.array(view_samples) pu.plot_T(data_array) pylab.title('simple_predictive_sample observed, view %s on local' % view_idx) # test simple_predictive_sample_observed REMOTE # hostname = 'ec2-23-22-208-4.compute-1.amazonaws.com' URI = 'http://' + hostname + ':8007' method_name = 'simple_predictive_sample' #
random_state = numpy.random.RandomState(inf_seed) M_c = save_dict['M_c'] X_L = save_dict['X_L'] X_D = save_dict['X_D'] # FIXME: test constraints # Y = [su.Bunch(index=2,value=2.3), su.Bunch(index=0,value=-4.)] Y = None # test simple_predictive_sample_observed views_replicating_samples_params = su.determine_replicating_samples_params(X_L, X_D) views_samples = [] for replicating_samples_params in views_replicating_samples_params: this_view_samples = [] for replicating_sample_params in replicating_samples_params: this_view_this_sample = su.simple_predictive_sample( M_c, X_L, X_D, get_next_seed=get_next_seed, **replicating_sample_params) this_view_samples.extend(this_view_this_sample) views_samples.append(this_view_samples) for view_idx, view_samples in enumerate(views_samples): data_array = numpy.array(view_samples) pu.plot_T(data_array) pylab.title('simple_predictive_sample observed, view %s on local' % view_idx) # test simple_predictive_sample_observed REMOTE # hostname = 'ec2-23-22-208-4.compute-1.amazonaws.com' URI = 'http://' + hostname + ':8007' method_name = 'simple_predictive_sample' # views_samples = [] for replicating_samples_params in views_replicating_samples_params: this_view_samples = []
def predictive_columns(M_c, X_L, X_D, columns_list, optional_settings=False, seed=0):
    """ Generates rows of data from the inferred distributions
    Inputs:
        - M_c: crosscat metadata (See documentation)
        - X_L: crosscat metadata (See documentation)
        - X_D: crosscat metadata (See documentation)
        - columns_list: a list of columns to sample
        - optional_settings: list of dicts of optional arguments. Each column
            in columns_list should have its own list entry which is either
            None or a dict with possible keys:
                - missing_data: Proportion missing data
        - seed: non-negative int used to seed ``random`` before sampling
    Returns:
        - a num_rows by len(columns_list) numpy array, where num_rows is the
        original number of rows in the crosscat table.
    Raises:
        - TypeError: columns_list or optional_settings is not a list, an
            entry of columns_list is not an int, or seed is not an int
        - ValueError: a column index is out of range or seed is negative
        - KeyError: a dict in optional_settings has an unsupported key
    """
    # supported arguments for optional_settings
    supported_arguments = ['missing_data']

    num_rows = len(X_D[0])
    num_cols = len(M_c['column_metadata'])

    if not isinstance(columns_list, list):
        raise TypeError("columns_list should be a list")

    for col in columns_list:
        if not isinstance(col, int):
            raise TypeError("every entry in columns_list shuold be an integer")
        if col < 0 or col >= num_cols:
            # the largest valid index is num_cols - 1, not num_cols
            raise ValueError(
                "%i is not a valid column. Valid entries are 0-%i"
                % (col, num_cols - 1))

    if not isinstance(seed, int):
        raise TypeError("seed should be an int")

    if seed < 0:
        raise ValueError("seed should be positive")

    if optional_settings:
        if not isinstance(optional_settings, list):
            raise TypeError("optional_settings should be a list")

        for col_setting in optional_settings:
            if isinstance(col_setting, dict):
                # only the keys are inspected, so iterate the dict directly
                for key in col_setting:
                    if key not in supported_arguments:
                        raise KeyError(
                            "Invalid key in optional_settings, '%s'" % key)
    else:
        optional_settings = [None] * len(columns_list)

    random.seed(seed)

    X = numpy.zeros((num_rows, len(columns_list)))

    def get_next_seed():
        return random.randrange(2147483647)

    for c, col in enumerate(columns_list):
        for row in range(num_rows):
            X[row, c] = su.simple_predictive_sample(
                M_c, X_L, X_D, [], [(row, col)], get_next_seed, n=1)[0][0]

        # check if there are optional arguments for this column
        col_setting = optional_settings[c]
        # `in` replaces the free-function call has_key(...), which is not a
        # builtin (has_key was a Python 2 dict *method*)
        if isinstance(col_setting, dict) and 'missing_data' in col_setting:
            proportion = col_setting['missing_data']
            X = add_missing_data_to_column(X, c, proportion)

    assert X.shape[0] == num_rows
    assert X.shape[1] == len(columns_list)

    return X
def run_test_continuous(n, observed):
    """Compare predictive samples with the predictive density for one cell.

    Generates a table with ``du.gen_factorial_data_objects``, initializes a
    ``State.p_State`` from it, draws ``n`` predictive samples for a single
    (row, column) query, and evaluates ``su.simple_predictive_probability``
    at the retained sample values. The trapezoid-rule area under the
    evaluated density is printed, a histogram of the samples overlaid with
    the density is shown, and the function then waits for Enter on stdin.

    Args:
        n: number of predictive samples to draw.
        observed: if truthy, query row 10; otherwise query row index 40
            (``n_rows``).
    """
    n_rows = 40
    query_row = 10 if observed else n_rows
    query_column = 1
    Q = [(query_row, query_column)]

    # generate the data and initialize a state from it
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2, n_rows, 1)
    state = State.p_State(M_c, T)
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,
                                          n=n)
    X_array = numpy.sort(numpy.array(samples))
    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out values more than two standard deviations from the mean
    X_filter_low = numpy.nonzero(X_array < mean_X - 2. * std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X + 2. * std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for the area calculation below
    X_array = numpy.sort(X_array)
    X = X_array.tolist()

    # build the queries and get the pdf values
    Qs = [(query_row, query_column, x) for x in X]
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # area under the evaluated density, calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X) - 1):
        area_density += (X[i + 1] - X[i]) * (densities[i + 1] + densities[i]) / 2.0

    # single-argument print(...) behaves the same as a statement on
    # Python 2 and as a function on Python 3
    print("Area of PDF (should be close to, but not greater than, 1): " + str(area_density))
    print("*Note: The area will be less than one because the range (integral) is truncated.")

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pylab.hist(X, 100, normed=1, histtype='stepfilled', label='samples',
               alpha=.5, color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')
    pylab.show()

    # raw_input exists only on Python 2; fall back to input on Python 3
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter("Press Enter when finished...")
def run_test_multinomial(n, observed):
    """Compare predictive samples with predictive probabilities (multinomial).

    Generates a table with ``generate_multinomial_data``, initializes a
    ``State.p_State`` from it, draws ``n`` predictive samples for a single
    (row, column) query, and evaluates ``su.simple_predictive_probability``
    at each distinct sampled value. The sum of the evaluated probabilities
    is printed, a bar chart of sample frequencies overlaid with the
    probabilities is shown, and the function then waits for Enter on stdin.

    Args:
        n: number of predictive samples to draw.
        observed: if truthy, query row 10; otherwise query row index 40
            (``n_rows``).
    """
    n_rows = 40
    query_row = 10 if observed else n_rows
    query_column = 1
    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = generate_multinomial_data(get_next_seed(), 2, n_rows, 1)
    state = State.p_State(M_c, T)
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,
                                          n=n)
    X_array = numpy.sort(numpy.array(samples))
    unique_values = numpy.unique(X_array)
    X = unique_values.tolist()

    # build the queries and get the probability of each distinct value
    Qs = [(query_row, query_column, x) for x in X]
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # single-argument print(...) behaves the same as a statement on
    # Python 2 and as a function on Python 3
    print("Sum of densities (should be 1): %f" % (numpy.sum(densities)))

    pylab.clf()

    # PLOT: probability vs samples distribution
    # one bin per distinct value; the extra right edge closes the last bin
    mbins = numpy.append(unique_values, max(unique_values) + 1)

    pdf, bins = numpy.histogram(X_array, mbins)
    # normalize counts to relative frequencies
    pdf = pdf / float(numpy.sum(pdf))
    pylab.bar(mbins[0:-1], pdf, label="samples", alpha=.5)
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')
    pylab.show()

    # raw_input exists only on Python 2; fall back to input on Python 3
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter("Press Enter when finished...")
'column_partition_assignments', 'row_partition_assignments' ]) # quick test just to make sure things output what they're supposed to x = 0.0 query_row = len(row[0]) # tests unobserved # query_row = 3; # tests observed Q = [(query_row, 0, x)] Y = [] # no contraints # Y = [(1,0,.1),(3,0,.1),(22,0,105),(30,0,100)] # generic constraints p = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q) n = 1000 samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed, n=n) X = [sample[0] for sample in samples] pylab.figure(facecolor='white') pdf, bins, patches = pylab.hist(X, 50, normed=True, histtype='bar', label='samples', edgecolor='none') pylab.show() pdf_max = max(pdf) Qs = []
def test_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture
    the distribution of the data.

    1. Generates N (250) points from the default data parameters of the
       component model type
    2. Draws hyperparameters from that data, builds a component model and
       samples model parameters from it
    3. Generates N points from those model parameters
    4. Initializes a crosscat state with that data
    5. Runs 100 transitions and takes the resulting X_L / X_D
    6. Draws N predictive samples for row index N, column 0
    7. Gets the support returned by ``generate_discrete_support``
    8. Calculates the predictive probability of each point in the support
       (the values are exponentiated before use)
    9. (OPTIONAL) Plots the original data, predictive samples, pdf, and
       predictive probabilities and saves them to
       ``<model_type>_single.png``
    10. Calculates goodness of fit stats: a two-sample KS test for
        continuous types, a Pearson chi-square test for discrete types

    Returns:
        The p value of the goodness-of-fit test.
    """
    N = 250
    n_transitions = 100

    def get_next_seed():
        return random.randrange(2147483647)

    model_type = component_model_type.model_type
    discrete = is_discrete[model_type]

    data_params = default_data_parameters[model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(X)[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    # 'F' is the explicit spelling of the legacy ``flatten(1)``, which
    # current NumPy rejects because ``order`` must be a string.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c, X_L, X_D, [], [(N, 0)],
                                    get_next_seed, n=N)).flatten('F')

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point (no constraints)
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types:
    # discrete histograms are normalized by hand to relative frequencies.
    if discrete:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # ``density`` replaces the deprecated ``normed`` keyword; the bins
        # are equal-width here, so the normalization is the same
        T_hist, edges = numpy.histogram(
            T, bins=min(20, len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(component_model_type.log_pdf(
                          numpy.array(discrete_support), model_parameters)),
                      c="blue", s=100, label="true pdf", alpha=1)

        # pylab.ylim([0,2])

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities),
                      c="red", s=100, label="predictive probability", alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p, 4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p