def KL_divergence(component_model_class, parameters_list, component_weights,
    M_c, X_L, X_D, n_samples=1000, true_log_pdf=None, support=None):
    """ FIXME: Add doc
    """

    # FIXME: Add validation code

    cctype = component_model_class.cctype

    # get support (X)
    if support is None:
        support = get_mixture_support(cctype, component_model_class, parameters_list, 
                nbins=n_samples, support=.995)
    elif not isinstance(support, numpy.ndarray):
        raise TypeError("support must be a numpy array (vector)")

    # get true pdf
    if true_log_pdf is None:
        true_log_pdf = get_mixture_pdf(support, component_model_class, parameters_list,
                    component_weights)
    elif not isinstance(true_log_pdf, numpy.ndarray):
        raise TypeError("true_log_pdf should be a numpy array (vector)")

    # query the row just past the end of the data (an unobserved row)
    row = len(X_D[0])
    Q = [(row, 0, x) for x in support]

    # get predictive probabilities (no constraints)
    pred_probs = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    kld = KL_divergence_arrays(support, pred_probs, true_log_pdf,
            is_discrete[cctype])

    return float(kld)
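# NOTE: KL_divergence_arrays is not shown in this example. A minimal sketch of
# what it plausibly computes, inferred only from how it is called above (log
# densities on a shared support grid plus a discreteness flag); the real
# crosscat implementation may differ.
import numpy

def kl_divergence_arrays_sketch(support, pred_log_pdf, true_log_pdf, discrete):
    """Estimate KL(true || predictive) from log densities on a support grid."""
    true_log_pdf = numpy.asarray(true_log_pdf, dtype=float)
    pred_log_pdf = numpy.asarray(pred_log_pdf, dtype=float).flatten()
    # pointwise KL integrand: p(x) * (log p(x) - log q(x))
    integrand = numpy.exp(true_log_pdf) * (true_log_pdf - pred_log_pdf)
    if discrete:
        # discrete support: the KL divergence is a plain sum
        return float(numpy.sum(integrand))
    # continuous support: integrate over the grid with the trapezoid rule
    return float(numpy.trapz(integrand, numpy.asarray(support, dtype=float)))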
Example #2
    def simple_predictive_probability(self,
                                      M_c,
                                      X_L,
                                      X_D,
                                      Y,
                                      Q,
                                      epsilon=0.001):
        """Calculate the probability of a cell taking a value within epsilon of 
        the specified values given a latent state

        :param M_c: The column metadata
        :type M_c: dict
        :param X_L: the latent variables associated with the latent state
        :type X_L: dict
        :param X_D: the particular cluster assignments of each row in each view
        :type X_D: list of lists
        :param Y: A list of constraints to apply when sampling.  Each constraint
                  is a triplet of (r, d, v): r is the row index, d is the column
                  index and v is the value of the constraint
        :type Y: list of lists
        :param Q: A list of values to query.  Each value is a triplet of
                  (r, d, v): r is the row index, d is the column index, and v
                  is the value at which the density is evaluated
        :type Q: list of lists
        :param epsilon: the width of the window around the specified value over
                        which the change in cdf is taken
        :type epsilon: float
        :returns: list of floats -- probabilities of the values specified by Q

        """
        return su.simple_predictive_probability(M_c, X_L, X_D, Y, Q, epsilon)
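# Hedged usage sketch for the method above. `engine`, M_c, X_L, and X_D are
# assumed to come from a previously initialized and analyzed crosscat model
# (hypothetical names, not part of this example).
import numpy

Y = [(1, 0, 0.5)]  # constraint: row 1, column 0 takes the value 0.5
Q = [(3, 0, 0.2)]  # query: probability of row 3, column 0 near the value 0.2

logps = engine.simple_predictive_probability(M_c, X_L, X_D, Y, Q, epsilon=0.001)
print(numpy.exp(logps))  # probabilities of the queried values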
Example #3
def plot(results, filename=None):
    n_samples = results['config']['n_samples']
    samples = sorted(results['samples'])
    conf = results['conf']
    X_L = results['X_L_list'][0]
    X_D = results['X_D_list'][0]

    hgrm, _ = np.histogram(X_D[0], len(set(X_D[0])))
    max_mass_mode = np.argmax(hgrm)
    suffstats = X_L['view_state'][0]['column_component_suffstats'][0][max_mass_mode]

    counts = suffstats['N']
    sum_x = suffstats['sum_x']
    sum_x_sq = suffstats['sum_x_squared']
    scale = counts / float(n_samples)
    component_model = ccm.p_ContinuousComponentModel(
        X_L['column_hypers'][0], counts, sum_x, sum_x_sq)

    plt.figure(facecolor='white')

    ax = plt.subplot(1, 2, 1)
    ax.hist(samples, min(31, int(n_samples/10)), density=True, label='Samples',
            ec='none', fc='gray')
    T = [[x] for x in samples]
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    xvals = np.linspace(np.min(samples), np.max(samples), 300)
    Q = [(n_samples, 0, x) for x in xvals]
    p = [su.simple_predictive_probability(M_c, X_L, X_D, [], [q]) for q in Q]
    p = np.array(p)
    ax.plot(xvals, np.exp(p), c='#bbbbbb',
            label='Predictive probability', lw=3)
    p = [component_model.calc_element_predictive_logp(x) for x in xvals]
    ax.plot(xvals, np.exp(p)*scale, c='#222222', label='Summary mode',
            lw=3)
    plt.xlabel('Samples')
    plt.legend(loc=0)

    ax = plt.subplot(1, 2, 2)
    ax.bar([0, 1], [conf, 1.0-conf], fc='#333333', ec='none')
    ax.set_ylim([0, 1])
    ax.set_xlim([-.25, 2])
    ax.set_xticks([.5, 1.5])
    plt.ylabel('Probability mass')
    ax.set_xticklabels(['Summary mode', 'All other modes'])

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
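# plot() consumes a `results` dict that is not constructed in this snippet.
# Hypothetical shape, inferred only from the accesses above; field values are
# illustrative, and X_L/X_D would come from an actual crosscat analysis.
import numpy as np

results = {
    'config': {'n_samples': 1000},              # experiment configuration
    'samples': np.random.randn(1000).tolist(),  # predictive samples
    'conf': 0.9,                                # mass on the summary mode
    'X_L_list': [X_L],                          # latent states from analysis
    'X_D_list': [X_D],                          # row assignments per state
}
plot(results, filename='summary_mode.png')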
Example #4
    def simple_predictive_probability(self, M_c, X_L, X_D, Y, Q):
        """Calculate probability of a cell taking a value given a latent state.

        :param Y: A list of constraints to apply when querying.  Each constraint
            is a triplet of (r, d, v): r is the row index, d is the column
            index and v is the value of the constraint
        :type Y: list of lists
        :param Q: A list of values to query.  Each value is a triplet of (r, d, v):
            r is the row index, d is the column index, and v is the value at
            which the density is evaluated.
        :type Q: list of lists

        :returns: list of floats -- probabilities of the values specified by Q
        """
        return su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)
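# Since this variant evaluates the density at v directly, a common pattern is
# to sweep v over a grid to trace the predictive density of one cell. A sketch:
# the grid bounds are arbitrary, and M_c, X_L, X_D and the `su` module are
# assumed as in the examples above.
import numpy as np

row, col = 0, 0
grid = np.linspace(-3.0, 3.0, 100)
Q = [(row, col, v) for v in grid]
logps = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)
density = np.exp(logps)  # density values along the grid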
Example #5
def KL_divergence(component_model_class,
                  parameters_list,
                  component_weights,
                  M_c,
                  X_L,
                  X_D,
                  n_samples=1000,
                  true_log_pdf=None,
                  support=None):
    """ FIXME: Add doc
    """

    # FIXME: Add validation code

    cctype = component_model_class.cctype

    # get support (X)
    if support is None:
        support = get_mixture_support(cctype,
                                      component_model_class,
                                      parameters_list,
                                      nbins=n_samples,
                                      support=.995)
    elif not isinstance(support, numpy.ndarray):
        raise TypeError("support must be a numpy array (vector)")

    # get true pdf
    if true_log_pdf is None:
        true_log_pdf = get_mixture_pdf(support, component_model_class,
                                       parameters_list, component_weights)
    elif not isinstance(true_log_pdf, numpy.ndarray):
        raise TypeError("true_log_pdf should be a numpy array (vector)")

    # query the row just past the end of the data (an unobserved row)
    row = len(X_D[0])
    Q = [(row, 0, x) for x in support]

    # get predictive probabilities (no constraints)
    pred_probs = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    kld = KL_divergence_arrays(support, pred_probs, true_log_pdf,
                               is_discrete[cctype])

    return float(kld)
Example #6
    def simple_predictive_probability(self, M_c, X_L, X_D, Y, Q):
        """Calculate the probability of a cell taking a value given a latent state

        :param M_c: The column metadata
        :type M_c: dict
        :param X_L: the latent variables associated with the latent state
        :type X_L: dict
        :param X_D: the particular cluster assignments of each row in each view
        :type X_D: list of lists
        :param Y: A list of constraints to apply when sampling.  Each constraint
                  is a triplet of (r, d, v): r is the row index, d is the column
                  index and v is the value of the constraint
        :type Y: list of lists
        :param Q: A list of values to query.  Each value is a triplet of
                  (r, d, v): r is the row index, d is the column index, and v
                  is the value at which the density is evaluated
        :type Q: list of lists
        :returns: list of floats -- probabilities of the values specified by Q

        """
        return su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)
Example #7
    def simple_predictive_probability(self, M_c, X_L, X_D, Y, Q):
        """Calculate the probability of a cell taking a value given a latent state

        :param M_c: The column metadata
        :type M_c: dict
        :param X_L: the latent variables associated with the latent state
        :type X_L: dict
        :param X_D: the particular cluster assignments of each row in each view
        :type X_D: list of lists
        :param Y: A list of constraints to apply when querying.  Each constraint
                  is a triplet of (r, d, v): r is the row index, d is the column
                  index and v is the value of the constraint
        :type Y: list of lists
        :param Q: A list of values to query.  Each value is a triplet of (r, d, v):
                  r is the row index, d is the column index, and v is the value at
                  which the density is evaluated.
        :type Q: list of lists
        :returns: list of floats -- probabilities of the values specified by Q

        """
        return su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)
Example #8
def run_test_continuous(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2, n_rows,
                                                1)

    state = State.p_State(M_c, T)

    T_array = numpy.array(T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c,
                                          X_L,
                                          X_D,
                                          Y,
                                          Q,
                                          get_next_seed,
                                          n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values
    X_filter_low = numpy.nonzero(X_array < mean_X - 2. * std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X + 2. * std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries
    Qs = []
    for x in X:
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under Ps2 and pdfs is about 1
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X) - 1):
        area_density += (X[i + 1] - X[i]) * (densities[i + 1] +
                                             densities[i]) / 2.0

    print("Area of PDF (should be close to, but not greater than, 1): " +
          str(area_density))
    print(
        "*Note: The area will be less than one because the range (integral) is truncated."
    )

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pdf, bins, patches = pylab.hist(X,
                                    100,
                                    density=True,
                                    histtype='stepfilled',
                                    label='samples',
                                    alpha=.5,
                                    color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()
    fd, fig_filename = tempfile.mkstemp(prefix='run_test_continuous_',
                                        suffix='.png',
                                        dir='.')
    pylab.savefig(fig_filename)
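    # Equivalent, vectorized form of the trapezoid-rule area check above,
    # using numpy's built-in trapezoid integration over the same X grid.
    area_trapz = numpy.trapz(densities, X)
    print("Area of PDF via numpy.trapz:", area_trapz)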
Example #9
def run_test_multinomial(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = generate_multinomial_data(get_next_seed(), 2, n_rows, 1)

    state = State.p_State(M_c, T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []

    # pull n samples
    samples = su.simple_predictive_sample(M_c,
                                          X_L,
                                          X_D,
                                          Y,
                                          Q,
                                          get_next_seed,
                                          n=n)
    X_array = numpy.sort(numpy.array(samples))
    X = numpy.unique(X_array)
    X = X.tolist()

    # build the queries
    Qs = []
    for x in X:
        # Qtmp = (query_row, query_column, x[0])
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    print("Sum of densities (should be 1): %f" % (numpy.sum(densities)))

    pylab.clf()

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    mbins = numpy.unique(X_array)

    mbins = numpy.append(mbins, max(mbins) + 1)

    pdf, bins = numpy.histogram(X_array, mbins)

    pdf = pdf / float(numpy.sum(pdf))
    pylab.bar(mbins[0:-1], pdf, label="samples", alpha=.5)
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()

    fd, fig_filename = tempfile.mkstemp(prefix='run_test_multinomial_',
                                        suffix='.png',
                                        dir='.')
    pylab.savefig(fig_filename)
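    # Stricter sketch of the sum-to-one check: the check above only covers
    # values that were actually sampled, so query every category directly.
    # Assumption: K, the number of multinomial categories, is known from data
    # generation (K=2 matches the generate_multinomial_data call above).
    K = 2
    Qs_all = [(query_row, query_column, float(k)) for k in range(K)]
    probs = numpy.exp(su.simple_predictive_probability(M_c, X_L, X_D, [], Qs_all))
    print("Sum over all categories:", numpy.sum(probs))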
def check_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of a component model of component_model_type to capture
    the distribution of the data.
    1. Draws N random points from the default distribution for the model type
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initializes a crosscat state with that data
    5. Gets one sample after 100 transitions
    6. Draws predictive samples
    7. Calculates the 95 percent support of the continuous distribution or the
        entire support of the discrete distribution
    8. Calculates the true pdf for each point in the support
    9. Calculates the predictive probability given the sample for each point in
        the support
    10. (OPTIONAL) Plots the original data, predictive samples, pdf, and
        predictive probabilities
    11. Calculates goodness-of-fit stats (returns p value)
    """
    N = 250
    
    get_next_seed = lambda : random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]
    
    X = component_model_type.generate_data_from_parameters(data_params, N, gen_seed=get_next_seed())
    
    hyperparameters = component_model_type.draw_hyperparameters(X, gen_seed=get_next_seed())[0]
    
    component_model = component_model_type.from_data(X, hyperparameters)
    
    model_parameters = component_model.sample_parameters_given_hyper()
    
    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state 
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])
    
    state = State.p_State(M_c, T)
    
    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)
    
    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    
    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(su.simple_predictive_sample(M_c, X_L, X_D, [], [(N,0)], get_next_seed, n=N)).flatten()
    
    # get support
    discrete_support = component_model_type.generate_discrete_support(model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)
    
    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the density flag doesn't normalize the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    
    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, 
            numpy.exp(component_model_type.log_pdf(numpy.array(discrete_support), 
            model_parameters)), 
            c="blue", 
            s=100, 
            label="true pdf", 
            alpha=1)

        # pylab.ylim([0,2])
                
        # plot predictive probability of support points
        pylab.scatter(discrete_support, 
            numpy.exp(probabilities), 
            c="red", 
            s=100, 
            label="predictive probability", 
            alpha=1)
            
        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
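# Usage sketch for the test above. The import path is an assumption modeled on
# crosscat's component-model extensions; adjust it to wherever
# p_ContinuousComponentModel lives in your checkout.
import crosscat.tests.component_model_extensions.ContinuousComponentModel as ccmext

p = check_one_feature_sampler(ccmext.p_ContinuousComponentModel, show_plot=False)
print("goodness-of-fit p value:", p)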
# move stuff around a little bit
for i in range(100):
    p_State.transition(which_transitions=['column_partition_assignments', 'row_partition_assignments'])

# quick test just to make sure things output what they're supposed to 
x = 0.0
query_row = len(row[0])  # tests unobserved
# query_row = 3  # tests observed
Q = [(query_row, 0, x)]


Y = []  # no constraints
# Y = [(1,0,.1),(3,0,.1),(22,0,105),(30,0,100)] # generic constraints

p = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)

n = 1000
samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed, n=n)

X = [sample[0] for sample in samples]

pylab.figure(facecolor='white')
pdf, bins, patches = pylab.hist(X, 50, density=True, histtype='bar', label='samples', edgecolor='none')
pylab.show()

pdf_max = max(pdf)

Qs = []
for i in range(n):
    Qtmp = (query_row, 0, X[i])
    Qs.append(Qtmp)  # assumed: build the query list at each sampled value
def test_one_feature_mixture(component_model_type, num_clusters=3, show_plot=False, seed=None):
    """

    """
    random.seed(seed)

    N = 1000
    separation = .9
    
    get_next_seed = lambda : random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                        [separation], seed=get_next_seed(),
                        distargs=[distargs[cctype]],
                        return_structure=True)

    T_list = list(T)
    T = numpy.array(T)
    
    # create a crosscat state 
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    
    state = State.p_State(M_c, T_list)
    
    # transitions
    state.transition(n_steps=200)
    
    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    
    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(M_c, X_L, X_D, [0],
                            seed=get_next_seed()).flatten()
    
    # Get support over all component models
    discrete_support = qtu.get_mixture_support(cctype, component_model_type,
                         structure['component_params'][0], nbins=500)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)
    
    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the density flag doesn't normalize the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    
    if show_plot:
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type, 
                structure['component_params'][0], [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, 
            numpy.exp(lpdf), 
            c="blue", 
            edgecolor="none",
            s=100, 
            label="true pdf", 
            alpha=1,
            zorder=3)
                
        # plot predictive probability of support points
        pylab.scatter(discrete_support, 
            numpy.exp(probabilities), 
            c="red", 
            edgecolor="none",
            s=100, 
            label="predictive probability", 
            alpha=1,
            zorder=4)
            
        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        pylab.show()

    return p
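# Sketch of a driver that repeats the mixture test over several seeds, since
# any single p value can be unlucky. The import path is the same assumption as
# in the earlier driver sketch.
import crosscat.tests.component_model_extensions.ContinuousComponentModel as ccmext

p_values = [test_one_feature_mixture(ccmext.p_ContinuousComponentModel,
                                     num_clusters=3, seed=s)
            for s in range(5)]
n_pass = sum(p > 0.05 for p in p_values)
print("passed %i of %i runs" % (n_pass, len(p_values)))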
def run_test_continuous(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2, n_rows, 1)

    state = State.p_State(M_c, T)

    T_array = numpy.array(T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = [] # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values
    X_filter_low = numpy.nonzero(X_array < mean_X-2.*std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X+2.*std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries
    Qs = []
    for x in X:
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under Ps2 and pdfs is about 1 
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X)-1):
        area_density += (X[i+1]-X[i])*(densities[i+1]+densities[i])/2.0

    print "Area of PDF (should be close to, but not greater than, 1): " + str(area_density)
    print "*Note: The area will be less than one because the range (integral) is truncated."

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pdf, bins, patches = pylab.hist(X, 100, density=True, histtype='stepfilled', label='samples', alpha=.5, color=[.5,.5,.5])
    pylab.scatter(X,densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left',fontsize='x-small')
    pylab.xlabel('value') 
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()

    raw_input("Press Enter when finished...")
def run_test_multinomial(n, observed):
	n_rows = 40
	n_cols = 40

	if observed:
		query_row = 10
	else:
		query_row = n_rows

	query_column = 1

	Q = [(query_row, query_column)]

	# do the test with multinomial data
	T, M_r, M_c = generate_multinomial_data(get_next_seed(),2,n_rows,1)
	
	state = State.p_State(M_c, T)

	X_L = state.get_X_L()
	X_D = state.get_X_D()

	Y = []

	# pull n samples
	samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,n=n)
	X_array = numpy.sort(numpy.array(samples))
	X = numpy.unique(X_array)
	X = X.tolist()

	# build the queries
	Qs = []
	for x in X:
	    # Qtmp = (query_row, query_column, x[0])
	    Qtmp = (query_row, query_column, x)
	    Qs.append(Qtmp)

	# get pdf values
	densities = numpy.exp(su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

	print "Sum of densities (should be 1): %f" % (numpy.sum(densities))

	pylab.clf()

	# PLOT: probability vs samples distribution
	# scale all histograms to be valid PDFs (area=1)
	mbins = numpy.unique(X_array)

	mbins = numpy.append(mbins,max(mbins)+1)

	pdf, bins = numpy.histogram(X_array,mbins)

	pdf = pdf/float(numpy.sum(pdf))
	pylab.bar(mbins[0:-1],pdf,label="samples",alpha=.5)
	pylab.scatter(X,densities, c="red", label="pdf", edgecolor='none')

	pylab.legend(loc='upper left',fontsize='x-small')
	pylab.xlabel('value') 
	pylab.ylabel('frequency/density')
	pylab.title('TEST: PDF (not scaled)')

	pylab.show()

	raw_input("Press Enter when finished...")
Example #15
# move stuff around a little bit
for i in range(100):
    p_State.transition(which_transitions=[
        'column_partition_assignments', 'row_partition_assignments'
    ])

# quick test just to make sure things output what they're supposed to
x = 0.0
query_row = len(row[0])  # tests unobserved
# query_row = 3;		# tests observed
Q = [(query_row, 0, x)]

Y = []  # no constraints
# Y = [(1,0,.1),(3,0,.1),(22,0,105),(30,0,100)] # generic constraints

p = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)

n = 1000
samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed, n=n)

X = [sample[0] for sample in samples]

pylab.figure(facecolor='white')
pdf, bins, patches = pylab.hist(X,
                                50,
                                density=True,
                                histtype='bar',
                                label='samples',
                                edgecolor='none')
pylab.show()
def test_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of a component model of component_model_type to capture
    the distribution of the data.
    1. Draws N random points from the default distribution for the model type
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initializes a crosscat state with that data
    5. Gets one sample after 100 transitions
    6. Draws predictive samples
    7. Calculates the 95 percent support of the continuous distribution or the
        entire support of the discrete distribution
    8. Calculates the true pdf for each point in the support
    9. Calculates the predictive probability given the sample for each point in
        the support
    10. (OPTIONAL) Plots the original data, predictive samples, pdf, and
        predictive probabilities
    11. Calculates goodness-of-fit stats (returns p value)
    """
    N = 250

    get_next_seed = lambda: random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(X)[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c,
                                    X_L,
                                    X_D, [], [(N, 0)],
                                    get_next_seed,
                                    n=N)).flatten()

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the density flag doesn't normalize the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T,
                                        bins=min(20, len(discrete_support)),
                                        density=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data')
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(
                          component_model_type.log_pdf(
                              numpy.array(discrete_support),
                              model_parameters)),
                      c="blue",
                      s=100,
                      label="true pdf",
                      alpha=1)

        # pylab.ylim([0,2])

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      s=100,
                      label="predictive probability",
                      alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def test_one_feature_mixture(component_model_type,
                             num_clusters=3,
                             show_plot=False,
                             seed=None):
    """

    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype],
                                     N, [0],
                                     cluster_weights, [separation],
                                     seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # pdb.set_trace()
    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype,
        component_model_type,
        structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten()

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the density flag doesn't normalize the multinomial histogram to 1.
    # T = T[:,0]
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T,
                                        bins=min(50, len(discrete_support)),
                                        density=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data',
                  zorder=1)
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples',
                  zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(lpdf),
                      c="blue",
                      edgecolor="none",
                      s=100,
                      label="true pdf",
                      alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      edgecolor="none",
                      s=100,
                      label="predictive probability",
                      alpha=1,
                      zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p