def KL_divergence(component_model_class, parameters_list, component_weights,
                  M_c, X_L, X_D, n_samples=1000, true_log_pdf=None,
                  support=None):
    """Estimate the KL divergence between a known mixture and a latent state.

    The true mixture log-density and the predictive log-probabilities of the
    latent state are evaluated on a common support and passed to
    ``KL_divergence_arrays``.

    :param component_model_class: component model class; its ``cctype``
        attribute selects the support routine and the discrete/continuous flag
    :param parameters_list: per-component parameters, forwarded to
        ``get_mixture_support`` and ``get_mixture_pdf``
    :param component_weights: mixture weights, forwarded to ``get_mixture_pdf``
    :param M_c: The column metadata
    :param X_L: the latent variables associated with the latent state
    :param X_D: cluster assignments; ``len(X_D[0])`` is used as the query row
    :param n_samples: number of bins requested when the support is generated
    :param true_log_pdf: optional precomputed true log-density on ``support``
        (numpy array); computed with ``get_mixture_pdf`` when omitted
    :param support: optional points at which to evaluate (numpy array);
        generated with ``get_mixture_support`` when omitted
    :returns: float -- the value returned by ``KL_divergence_arrays``
    :raises TypeError: if a supplied ``support`` or ``true_log_pdf`` is not a
        ``numpy.ndarray``
    """
    # FIXME: Add validation code
    cctype = component_model_class.cctype

    # get support (X)
    if support is None:
        support = get_mixture_support(cctype, component_model_class,
                                      parameters_list, nbins=n_samples,
                                      support=.995)
    elif not isinstance(support, numpy.ndarray):
        raise TypeError("support must be a numpy array (vector)")

    # get true pdf
    if true_log_pdf is None:
        true_log_pdf = get_mixture_pdf(support, component_model_class,
                                       parameters_list, component_weights)
    elif not isinstance(true_log_pdf, numpy.ndarray):
        raise TypeError("true_log_pdf should be a numpy array (vector)")

    # Query column 0 at row index len(X_D[0]) for every support point.
    row = len(X_D[0])
    Q = [(row, 0, x) for x in support]

    # get predictive probabilities; no constraints are applied.
    # (The previous ``[]*len(Q)`` always evaluated to ``[]`` as well.)
    pred_probs = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    kld = KL_divergence_arrays(support, pred_probs, true_log_pdf,
                               is_discrete[cctype])

    return float(kld)
def simple_predictive_probability(self, M_c, X_L, X_D, Y, Q, epsilon=0.001):
    """Evaluate the predictive probability of cell values under a latent state.

    This method only forwards its arguments to
    ``su.simple_predictive_probability``.

    :param M_c: The column metadata
    :type M_c: dict
    :param X_L: the latent variables associated with the latent state
    :type X_L: dict
    :param X_D: the particular cluster assignments of each row in each view
    :type X_D: list of lists
    :param Y: A list of constraints to apply. Each constraint is a triplet
              of (r, d, v): r is the row index, d is the column index and v
              is the value of the constraint
    :type Y: list of lists
    :param Q: A list of queries. Each query is a triplet of (r, d, v): r is
              the row index, d is the column index and v is the value whose
              probability is requested
    :type Q: list of lists
    :param epsilon: the window around the specified value to take the delta
                    in cdf of (passed through unchanged)
    :type epsilon: float
    :returns: list of floats -- one entry per element of Q, as produced by
              ``su.simple_predictive_probability``

    """
    # Pure delegation: all computation lives in the sample-utilities module.
    probabilities = su.simple_predictive_probability(
        M_c, X_L, X_D, Y, Q, epsilon)
    return probabilities
def plot(results, filename=None):
    """Plot the predictive samples against the heaviest cluster ("summary mode").

    Left panel: histogram of the samples, the predictive probability curve of
    the first latent state, and the predictive density of the component with
    the most rows, scaled by that component's share of ``n_samples``.
    Right panel: bar chart of ``conf`` versus ``1 - conf``.

    :param results: dict providing ``results['config']['n_samples']``,
        ``results['samples']``, ``results['conf']``, ``results['X_L_list']``
        and ``results['X_D_list']`` (only the first X_L/X_D entry is used)
    :type results: dict
    :param filename: where to save the figure; when None the figure is shown
        interactively instead
    :type filename: str or None
    """
    n_samples = results['config']['n_samples']
    samples = sorted(results['samples'])
    conf = results['conf']

    X_L = results['X_L_list'][0]
    X_D = results['X_D_list'][0]

    # Pick the cluster of view 0 that holds the most rows.
    hgrm, _ = np.histogram(X_D[0], len(set(X_D[0])))
    max_mass_mode = np.argmax(hgrm)
    suffstats = X_L['view_state'][0]['column_component_suffstats'][0][max_mass_mode]

    counts = suffstats['N']
    sum_x = suffstats['sum_x']
    sum_x_sq = suffstats['sum_x_squared']
    # float() forces true division so the scale is not truncated to an
    # integer when both operands are ints under Python 2.
    scale = counts / float(n_samples)

    component_model = ccm.p_ContinuousComponentModel(
        X_L['column_hypers'][0], counts, sum_x, sum_x_sq)

    plt.figure(facecolor='white')

    # --- left panel: samples vs. predictive curves -------------------------
    ax = plt.subplot(1, 2, 1)
    # NOTE(review): Matplotlib 3.1 removed ``normed``; switch to
    # ``density=True`` if the project targets a newer Matplotlib.
    ax.hist(samples, min(31, int(n_samples/10)), normed=True, label='Samples',
            ec='none', fc='gray')

    T = [[x] for x in samples]
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    xvals = np.linspace(np.min(samples), np.max(samples), 300)
    # Query row index n_samples, column 0, at each x value.
    Q = [(n_samples, 0, x) for x in xvals]
    p = [su.simple_predictive_probability(M_c, X_L, X_D, [], [q]) for q in Q]
    p = np.array(p)

    # The returned values are exponentiated before plotting.
    ax.plot(xvals, np.exp(p), c='#bbbbbb', label='Predicitive probability',
            lw=3)

    p = [component_model.calc_element_predictive_logp(x) for x in xvals]
    ax.plot(xvals, np.exp(p)*scale, c='#222222', label='Summary mode', lw=3)

    plt.xlabel('Samples')
    plt.legend(loc=0)

    # --- right panel: probability mass of the summary mode ------------------
    ax = plt.subplot(1, 2, 2)
    ax.bar([0, 1], [conf, 1.0-conf], fc='#333333', ec='none')
    ax.set_ylim([0, 1])
    ax.set_xlim([-.25, 2])
    ax.set_xticks([.5, 1.5])
    plt.ylabel('Probability mass')
    ax.set_xticklabels(['Summary mode', 'All other modes'])

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
def simple_predictive_probability(self, M_c, X_L, X_D, Y, Q):
    """Evaluate the predictive probability of cell values under a latent state.

    This method only forwards its arguments to
    ``su.simple_predictive_probability``.

    :param Y: A list of constraints to apply when querying. Each constraint
        is a triplet of (r, d, v): r is the row index, d is the column
        index and v is the value of the constraint
    :type Y: list of lists
    :param Q: A list of values to query. Each value is a triplet of
        (r, d, v): r is the row index, d is the column index, and v is the
        value at which the density is evaluated.
    :type Q: list of lists

    :returns: list of floats -- one entry per element of Q, as produced by
        ``su.simple_predictive_probability``

    """
    # Pure delegation: all computation lives in the sample-utilities module.
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)
    return probabilities
def KL_divergence(component_model_class, parameters_list, component_weights,
                  M_c, X_L, X_D, n_samples=1000, true_log_pdf=None,
                  support=None):
    """Estimate the KL divergence between a known mixture and a latent state.

    The true mixture log-density and the predictive log-probabilities of the
    latent state are evaluated on a common support and passed to
    ``KL_divergence_arrays``.

    :param component_model_class: component model class; its ``cctype``
        attribute selects the support routine and the discrete/continuous flag
    :param parameters_list: per-component parameters, forwarded to
        ``get_mixture_support`` and ``get_mixture_pdf``
    :param component_weights: mixture weights, forwarded to ``get_mixture_pdf``
    :param M_c: The column metadata
    :param X_L: the latent variables associated with the latent state
    :param X_D: cluster assignments; ``len(X_D[0])`` is used as the query row
    :param n_samples: number of bins requested when the support is generated
    :param true_log_pdf: optional precomputed true log-density on ``support``
        (numpy array); computed with ``get_mixture_pdf`` when omitted
    :param support: optional points at which to evaluate (numpy array);
        generated with ``get_mixture_support`` when omitted
    :returns: float -- the value returned by ``KL_divergence_arrays``
    :raises TypeError: if a supplied ``support`` or ``true_log_pdf`` is not a
        ``numpy.ndarray``
    """
    # FIXME: Add validation code
    cctype = component_model_class.cctype

    # get support (X)
    if support is None:
        support = get_mixture_support(cctype, component_model_class,
                                      parameters_list, nbins=n_samples,
                                      support=.995)
    elif not isinstance(support, numpy.ndarray):
        raise TypeError("support must be a numpy array (vector)")

    # get true pdf
    if true_log_pdf is None:
        true_log_pdf = get_mixture_pdf(support, component_model_class,
                                       parameters_list, component_weights)
    elif not isinstance(true_log_pdf, numpy.ndarray):
        raise TypeError("true_log_pdf should be a numpy array (vector)")

    # Query column 0 at row index len(X_D[0]) for every support point.
    row = len(X_D[0])
    Q = [(row, 0, x) for x in support]

    # get predictive probabilities; no constraints are applied.
    # (The previous ``[] * len(Q)`` always evaluated to ``[]`` as well.)
    pred_probs = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    kld = KL_divergence_arrays(support, pred_probs, true_log_pdf,
                               is_discrete[cctype])

    return float(kld)
def simple_predictive_probability(self, M_c, X_L, X_D, Y, Q):
    """Evaluate the predictive probability of cell values under a latent state.

    This method only forwards its arguments to
    ``su.simple_predictive_probability``.

    :param M_c: The column metadata
    :type M_c: dict
    :param X_L: the latent variables associated with the latent state
    :type X_L: dict
    :param X_D: the particular cluster assignments of each row in each view
    :type X_D: list of lists
    :param Y: A list of constraints to apply. Each constraint is a triplet
              of (r, d, v): r is the row index, d is the column index and v
              is the value of the constraint
    :type Y: list of lists
    :param Q: A list of queries. Each query is a triplet of (r, d, v): r is
              the row index, d is the column index and v is the value whose
              probability is requested
    :type Q: list of lists
    :returns: list of floats -- one entry per element of Q, as produced by
              ``su.simple_predictive_probability``

    """
    # Pure delegation: all computation lives in the sample-utilities module.
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)
    return probabilities
def simple_predictive_probability(self, M_c, X_L, X_D, Y, Q):
    """Evaluate the predictive probability of cell values under a latent state.

    This method only forwards its arguments to
    ``su.simple_predictive_probability``.

    :param M_c: The column metadata
    :type M_c: dict
    :param X_L: the latent variables associated with the latent state
    :type X_L: dict
    :param X_D: the particular cluster assignments of each row in each view
    :type X_D: list of lists
    :param Y: A list of constraints to apply when querying. Each constraint
              is a triplet of (r, d, v): r is the row index, d is the column
              index and v is the value of the constraint
    :type Y: list of lists
    :param Q: A list of values to query. Each value is a triplet of
              (r, d, v): r is the row index, d is the column index, and v is
              the value at which the density is evaluated.
    :type Q: list of lists
    :returns: list of floats -- one entry per element of Q, as produced by
              ``su.simple_predictive_probability``

    """
    # Pure delegation: all computation lives in the sample-utilities module.
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)
    return probabilities
def run_test_continuous(n, observed):
    """Compare predictive samples with predictive densities for one cell.

    Draws ``n`` predictive samples for a single cell, discards samples more
    than two standard deviations from the mean, evaluates the predictive
    probability at each remaining sample, prints the trapezoid-rule area
    under the resulting curve, and plots the sample histogram against the
    densities. The figure is saved to a fresh ``run_test_continuous_*.png``
    file in the current directory and then shown.

    :param n: number of predictive samples to draw
    :type n: int
    :param observed: if true, query row index 10; otherwise query row index
        ``n_rows`` (40)
    :type observed: bool
    """
    # Local import: only needed to close the descriptor mkstemp hands back.
    import os

    n_rows = 40

    query_row = 10 if observed else n_rows
    query_column = 1

    Q = [(query_row, query_column)]

    # generate the data and build a state from it
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2,
                                                n_rows, 1)

    state = State.p_State(M_c, T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q,
                                          get_next_seed, n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values (more than 2 standard deviations away)
    X_filter_low = numpy.nonzero(X_array < mean_X - 2. * std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X + 2. * std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries: same cell, one query per retained sample value
    Qs = [(query_row, query_column, x) for x in X]

    # get pdf values (the returned values are exponentiated)
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under Ps2 and pdfs is about 1
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X) - 1):
        area_density += (X[i + 1] - X[i]) * (densities[i + 1] + densities[i]) / 2.0

    print("Area of PDF (should be close to, but not greater than, 1): " +
          str(area_density))
    print(
        "*Note: The area will be less than one because the range (integral) is truncated."
    )

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    # NOTE(review): Matplotlib 3.1 removed ``normed``; switch to
    # ``density=True`` if the project targets a newer Matplotlib.
    pdf, bins, patches = pylab.hist(X, 100, normed=1, histtype='stepfilled',
                                    label='samples', alpha=.5,
                                    color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    fd, fig_filename = tempfile.mkstemp(prefix='run_test_continuous_',
                                        suffix='.png', dir='.')
    # mkstemp returns an open OS-level descriptor; close it so it is not
    # leaked (savefig reopens the file by name).
    os.close(fd)

    # Save before showing: once a blocking show() returns, the figure may
    # already be destroyed and savefig would write an empty image.
    pylab.savefig(fig_filename)
    pylab.show()
def run_test_multinomial(n, observed):
    """Compare predictive samples with predictive probabilities for one cell.

    Draws ``n`` predictive samples for a single cell of multinomial data,
    evaluates the predictive probability at each distinct sampled value,
    prints the sum of those probabilities, and plots the normalized sample
    counts against them. The figure is saved to a fresh
    ``run_test_multinomial_*.png`` file in the current directory and then
    shown.

    :param n: number of predictive samples to draw
    :type n: int
    :param observed: if true, query row index 10; otherwise query row index
        ``n_rows`` (40)
    :type observed: bool
    """
    # Local import: only needed to close the descriptor mkstemp hands back.
    import os

    n_rows = 40

    query_row = 10 if observed else n_rows
    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = generate_multinomial_data(get_next_seed(), 2, n_rows, 1)

    state = State.p_State(M_c, T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q,
                                          get_next_seed, n=n)
    X_array = numpy.sort(numpy.array(samples))

    # distinct sampled values, computed once and reused for the bin edges
    unique_values = numpy.unique(X_array)
    X = unique_values.tolist()

    # build the queries: same cell, one query per distinct value
    Qs = [(query_row, query_column, x) for x in X]

    # get pdf values (the returned values are exponentiated)
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    print("Sum of densities (should be 1): %f" % (numpy.sum(densities)))

    pylab.clf()

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    # One bin per distinct value; the extra edge closes the last bin.
    mbins = numpy.append(unique_values, max(unique_values) + 1)

    pdf, bins = numpy.histogram(X_array, mbins)

    pdf = pdf / float(numpy.sum(pdf))
    pylab.bar(mbins[0:-1], pdf, label="samples", alpha=.5)
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    fd, fig_filename = tempfile.mkstemp(prefix='run_test_multinomial_',
                                        suffix='.png', dir='.')
    # mkstemp returns an open OS-level descriptor; close it so it is not
    # leaked (savefig reopens the file by name).
    os.close(fd)

    # Save before showing: once a blocking show() returns, the figure may
    # already be destroyed and savefig would write an empty image.
    pylab.savefig(fig_filename)
    pylab.show()
def check_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture
    the distribution of the data.

    1. Generates N (= 250) points from the default data parameters of the
       model type
    2. Initializes a component model with that data (and drawn
       hyperparameters) and samples model parameters from it
    3. Draws N points from those model parameters
    4. Initializes a crosscat state with that data
    5. Gets one sample (X_L, X_D) after 100 transitions
    6. Draws N predictive samples at row index N, column 0
    7. Builds the support with ``generate_discrete_support``
    8. Calculates the predictive probability given the sample for each
       point in the support
    9. Calculates goodness of fit stats: a two-sample KS test for continuous
       types, a Pearson chi-square test for discrete types
    10. (OPTIONAL) Plots the original data, predictive samples, true pdf and
        predictive probabilities, saved as ``<model_type>_single.png``

    :param component_model_type: component model class providing the
        generation, hyperparameter and pdf routines used above
    :param show_plot: when true, render and save the diagnostic plot
    :type show_plot: bool
    :returns: the p value of the goodness-of-fit test
    """
    N = 250

    def get_next_seed():
        return random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(
        X, gen_seed=get_next_seed())[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so the samples are flattened to 1-D. ``flatten(1)`` (an integer order
    # argument) is rejected by current NumPy; element order is irrelevant
    # for a single-column result, so the default order is used.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c, X_L, X_D, [], [(N, 0)],
                                    get_next_seed, n=N)).flatten()

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point; no constraints
    # (the previous ``[]*len(Q)`` always evaluated to ``[]`` as well)
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    T = numpy.array(T)

    discrete = is_discrete[component_model_type.model_type]

    # get histogram. Different behavior for discrete and continuous types:
    # discrete histograms are normalized by hand so they sum to 1.
    if discrete:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # ``density=True`` replaces the ``normed`` keyword, which newer
        # NumPy no longer accepts.
        T_hist, edges = numpy.histogram(
            T, bins=min(20, len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous (2-sample test
        # between the predictive samples and the generated data)
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp,
                                         lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(component_model_type.log_pdf(
                          numpy.array(discrete_support), model_parameters)),
                      c="blue", s=100, label="true pdf", alpha=1)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities),
                      c="red", s=100, label="predictive probability",
                      alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
# move stuff around a little bit for i in range(100): p_State.transition(which_transitions=['column_partition_assignments','row_partition_assignments']) # quick test just to make sure things output what they're supposed to x = 0.0; query_row = len(row[0]) # tests unobserved # query_row = 3; # tests observed Q = [(query_row,0,x)] Y = [] # no contraints # Y = [(1,0,.1),(3,0,.1),(22,0,105),(30,0,100)] # generic constraints p = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q) n = 1000; samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,n=n) X = [sample[0] for sample in samples] pylab.figure(facecolor='white') pdf, bins, patches = pylab.hist(X,50,normed=True, histtype='bar',label='samples',edgecolor='none') pylab.show() pdf_max = max(pdf) Qs = []; for i in range(n): Qtmp = (query_row,0,X[i])
def test_one_feature_mixture(component_model_type, num_clusters=3,
                             show_plot=False, seed=None):
    """Goodness-of-fit test for a single column drawn from a mixture.

    Generates N (= 1000) points from ``num_clusters`` equally weighted
    components, runs 200 crosscat transitions on a state built from that
    data, draws a predictive column and evaluates the predictive
    probability on the mixture support. Continuous types are scored with a
    two-sample KS test, discrete types with a Pearson chi-square test.

    :param component_model_type: component model class; its ``cctype`` and
        ``model_type`` attributes select the data generator and the test
    :param num_clusters: number of mixture components
    :type num_clusters: int
    :param show_plot: when true, plot histograms, the true pdf and the
        predictive probabilities, and show the figure
    :type show_plot: bool
    :param seed: seed passed to ``random.seed``
    :returns: the p value of the goodness-of-fit test
    """
    random.seed(seed)

    N = 1000
    separation = .9

    def get_next_seed():
        return random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                                     [separation], seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T = numpy.array(T)
    # NOTE(review): despite its name, T_list is the numpy array itself
    # here -- confirm that the state constructor accepts an array.
    T_list = T

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so the samples are flattened to 1-D. ``flatten(1)`` (an integer order
    # argument) is rejected by current NumPy; element order is irrelevant
    # for a single-column result, so the default order is used.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten()

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype, component_model_type, structure['component_params'][0],
        nbins=500)

    # calculate simple predictive probability for each point; no constraints
    # (the previous ``[]*len(Q)`` always evaluated to ``[]`` as well)
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    discrete = is_discrete[component_model_type.model_type]

    # get histogram. Different behavior for discrete and continuous types:
    # discrete counts are normalized by hand so they sum to 1.
    if discrete:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # ``density=True`` replaces the ``normed`` keyword, which newer
        # NumPy no longer accepts.
        T_hist, edges = numpy.histogram(
            T, bins=min(20, len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous (2-sample test
        # between the predictive samples and the generated data)
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp,
                                         lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
                      edgecolor="none", s=100, label="true pdf", alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
                      edgecolor="none", s=100,
                      label="predictive probability", alpha=1, zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        pylab.show()

    return p
def run_test_continuous(n, observed):
    """Compare predictive samples with predictive densities for one cell.

    Draws ``n`` predictive samples for a single cell, discards samples more
    than two standard deviations from the mean, evaluates the predictive
    probability at each remaining sample, prints the trapezoid-rule area
    under the resulting curve, plots the sample histogram against the
    densities, and waits for Enter.

    The print calls and the input prompt are written so the function runs
    unchanged on both Python 2 and Python 3.

    :param n: number of predictive samples to draw
    :type n: int
    :param observed: if true, query row index 10; otherwise query row index
        ``n_rows`` (40)
    :type observed: bool
    """
    n_rows = 40

    query_row = 10 if observed else n_rows
    query_column = 1

    Q = [(query_row, query_column)]

    # generate the data and build a state from it
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2,
                                                n_rows, 1)

    state = State.p_State(M_c, T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q,
                                          get_next_seed, n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values (more than 2 standard deviations away)
    X_filter_low = numpy.nonzero(X_array < mean_X-2.*std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X+2.*std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries: same cell, one query per retained sample value
    Qs = [(query_row, query_column, x) for x in X]

    # get pdf values (the returned values are exponentiated)
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under Ps2 and pdfs is about 1
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X)-1):
        area_density += (X[i+1]-X[i])*(densities[i+1]+densities[i])/2.0

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Area of PDF (should be close to, but not greater than, 1): " + str(area_density))
    print("*Note: The area will be less than one because the range (integral) is truncated.")

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    # NOTE(review): Matplotlib 3.1 removed ``normed``; switch to
    # ``density=True`` if the project targets a newer Matplotlib.
    pdf, bins, patches = pylab.hist(X, 100, normed=1, histtype='stepfilled',
                                    label='samples', alpha=.5,
                                    color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter("Press Enter when finished...")
def run_test_multinomial(n, observed):
    """Compare predictive samples with predictive probabilities for one cell.

    Draws ``n`` predictive samples for a single cell of multinomial data,
    evaluates the predictive probability at each distinct sampled value,
    prints the sum of those probabilities, plots the normalized sample
    counts against them, and waits for Enter.

    The print call and the input prompt are written so the function runs
    unchanged on both Python 2 and Python 3.

    :param n: number of predictive samples to draw
    :type n: int
    :param observed: if true, query row index 10; otherwise query row index
        ``n_rows`` (40)
    :type observed: bool
    """
    n_rows = 40

    query_row = 10 if observed else n_rows
    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = generate_multinomial_data(get_next_seed(), 2, n_rows, 1)

    state = State.p_State(M_c, T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q,
                                          get_next_seed, n=n)
    X_array = numpy.sort(numpy.array(samples))

    # distinct sampled values, computed once and reused for the bin edges
    unique_values = numpy.unique(X_array)
    X = unique_values.tolist()

    # build the queries: same cell, one query per distinct value
    Qs = [(query_row, query_column, x) for x in X]

    # get pdf values (the returned values are exponentiated)
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Sum of densities (should be 1): %f" % (numpy.sum(densities)))

    pylab.clf()

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    # One bin per distinct value; the extra edge closes the last bin.
    mbins = numpy.append(unique_values, max(unique_values)+1)

    pdf, bins = numpy.histogram(X_array, mbins)

    pdf = pdf/float(numpy.sum(pdf))
    pylab.bar(mbins[0:-1], pdf, label="samples", alpha=.5)
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter("Press Enter when finished...")
# move stuff around a little bit for i in range(100): p_State.transition(which_transitions=[ 'column_partition_assignments', 'row_partition_assignments' ]) # quick test just to make sure things output what they're supposed to x = 0.0 query_row = len(row[0]) # tests unobserved # query_row = 3; # tests observed Q = [(query_row, 0, x)] Y = [] # no contraints # Y = [(1,0,.1),(3,0,.1),(22,0,105),(30,0,100)] # generic constraints p = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q) n = 1000 samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed, n=n) X = [sample[0] for sample in samples] pylab.figure(facecolor='white') pdf, bins, patches = pylab.hist(X, 50, normed=True, histtype='bar', label='samples', edgecolor='none') pylab.show()
def test_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture
    the distribution of the data.

    1. Generates N (= 250) points from the default data parameters of the
       model type
    2. Initializes a component model with that data (and drawn
       hyperparameters) and samples model parameters from it
    3. Draws N points from those model parameters
    4. Initializes a crosscat state with that data
    5. Gets one sample (X_L, X_D) after 100 transitions
    6. Draws N predictive samples at row index N, column 0
    7. Builds the support with ``generate_discrete_support``
    8. Calculates the predictive probability given the sample for each
       point in the support
    9. Calculates goodness of fit stats: a two-sample KS test for continuous
       types, a Pearson chi-square test for discrete types
    10. (OPTIONAL) Plots the original data, predictive samples, true pdf and
        predictive probabilities, saved as ``<model_type>_single.png``

    :param component_model_type: component model class providing the
        generation, hyperparameter and pdf routines used above
    :param show_plot: when true, render and save the diagnostic plot
    :type show_plot: bool
    :returns: the p value of the goodness-of-fit test
    """
    N = 250

    def get_next_seed():
        return random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(X)[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so the samples are flattened to 1-D. ``flatten(1)`` (an integer order
    # argument) is rejected by current NumPy; element order is irrelevant
    # for a single-column result, so the default order is used.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c, X_L, X_D, [], [(N, 0)],
                                    get_next_seed, n=N)).flatten()

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point; no constraints
    # (the previous ``[] * len(Q)`` always evaluated to ``[]`` as well)
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    T = numpy.array(T)

    discrete = is_discrete[component_model_type.model_type]

    # get histogram. Different behavior for discrete and continuous types:
    # discrete histograms are normalized by hand so they sum to 1.
    if discrete:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # ``density=True`` replaces the ``normed`` keyword, which newer
        # NumPy no longer accepts.
        T_hist, edges = numpy.histogram(
            T, bins=min(20, len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous (2-sample test
        # between the predictive samples and the generated data)
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp,
                                         lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(component_model_type.log_pdf(
                          numpy.array(discrete_support), model_parameters)),
                      c="blue", s=100, label="true pdf", alpha=1)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities),
                      c="red", s=100, label="predictive probability",
                      alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def test_one_feature_mixture(component_model_type, num_clusters=3,
                             show_plot=False, seed=None):
    """Goodness-of-fit test for a single column drawn from a mixture.

    Generates N (= 300) points from ``num_clusters`` equally weighted
    components, runs 200 crosscat transitions on a state built from that
    data, draws a predictive column and evaluates the predictive
    probability on the mixture support. Continuous types are scored with a
    two-sample KS test, discrete types with a Pearson chi-square test.

    :param component_model_type: component model class; its ``cctype`` and
        ``model_type`` attributes select the data generator and the test
    :param num_clusters: number of mixture components
    :type num_clusters: int
    :param show_plot: when true, plot histograms, the true pdf and the
        predictive probabilities and save the figure as
        ``<model_type>_mixtrue.png``
    :type show_plot: bool
    :param seed: seed passed to ``random.seed``
    :returns: the p value of the goodness-of-fit test
    """
    random.seed(seed)

    N = 300
    separation = .9

    def get_next_seed():
        return random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                                     [separation], seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    # Keep a list copy for the state; the array form is used for the
    # histogram and the KS test below.
    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype, component_model_type, structure['component_params'][0],
        nbins=250)

    # one query per support point, at row index N, column 0
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so the samples are flattened to 1-D. ``flatten(1)`` (an integer order
    # argument) is rejected by current NumPy; element order is irrelevant
    # for a single-column result, so the default order is used.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten()

    # calculate simple predictive probability for each point; no constraints
    # (the previous ``[] * len(Q)`` always evaluated to ``[]`` as well)
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    discrete = is_discrete[component_model_type.model_type]

    # get histogram. Different behavior for discrete and continuous types:
    # discrete counts are normalized by hand so they sum to 1.
    if discrete:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # ``density=True`` replaces the ``normed`` keyword, which newer
        # NumPy no longer accepts.
        T_hist, edges = numpy.histogram(
            T, bins=min(50, len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous (2-sample test
        # between the predictive samples and the generated data)
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp,
                                         lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
                      edgecolor="none", s=100, label="true pdf", alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
                      edgecolor="none", s=100,
                      label="predictive probability", alpha=1, zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        # NOTE(review): "_mixtrue" looks like a typo for "_mixture"; left
        # unchanged because other tooling may look for this file name.
        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p