Example #1
def GenDataFromPartitions(col_part, row_parts, mean_gen, std_gen, std_data,
                          seed):
    n_cols = len(col_part)
    n_rows = row_parts.shape[1]

    rng = np.random.RandomState(seed)

    T = np.zeros((n_rows, n_cols))

    for col in range(n_cols):
        view = col_part[col]
        row_part = row_parts[view, :]
        cats = max(row_part) + 1
        for cat in range(cats):
            row_dex = np.nonzero(row_part == cat)[0]
            n_rows_cat = len(row_dex)
            mean = rng.normal(mean_gen, std_gen)
            X = rng.normal(mean, std_data, (n_rows_cat, 1))
            i = 0
            for row in row_dex:
                T[row, col] = X[i]
                i += 1

    T = T.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    return T, M_r, M_c
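A minimal usage sketch for GenDataFromPartitions (the values below are illustrative assumptions; np and du are assumed to be imported as in the surrounding module). col_part assigns each column to a view, and row_parts[v, :] assigns each row to a category within view v:

col_part = [0, 0, 1]                   # three columns spread over two views
row_parts = np.array([[0, 0, 1, 1],    # view 0: rows 0-1 vs. rows 2-3
                      [0, 1, 0, 1]])   # view 1: alternating rows
T, M_r, M_c = GenDataFromPartitions(col_part, row_parts, mean_gen=0.0,
                                    std_gen=1.0, std_data=0.1, seed=0)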
def generate_multinomial_data(next_seed,n_cols,n_rows,n_views):
	# generate the partitions
	random.seed(next_seed)
	
	cols_to_views = [0 for _ in range(n_cols)]
	rows_in_views_to_cols = []
	for view in range(n_views):
		partition = eu.CRP(n_rows,2.0)
		random.shuffle(partition)
		rows_in_views_to_cols.append(partition)

	# generate the data
	data = numpy.zeros((n_rows,n_cols),dtype=float)
	for col in range(n_cols):
		view = cols_to_views[col]
		for row in range(n_rows):
			cluster = rows_in_views_to_cols[view][row]
			data[row,col] = cluster

	T = data.tolist()
	M_r = du.gen_M_r_from_T(T)
	M_c = du.gen_M_c_from_T(T)

	T, M_c = du.convert_columns_to_multinomial(T, M_c, range(n_cols))

	return T, M_r, M_c
Example #3
def GenDataFromPartitions(col_part,row_parts,mean_gen,std_gen,std_data):
	n_cols = len(col_part)
	n_rows = row_parts.shape[1]

	seed = int(time()*100)
	np.random.seed(seed)

	T = np.zeros((n_rows,n_cols))

	for col in range(n_cols):
		view = col_part[col]
		row_part = row_parts[view,:]
		cats = max(row_part)+1
		for cat in range(cats):
			row_dex = np.nonzero(row_part==cat)[0]
			n_rows_cat = len(row_dex)
			mean = np.random.normal(mean_gen,std_gen)
			X = np.random.normal(mean,std_data,(n_rows_cat,1))
			i = 0
			for row in row_dex:
				T[row,col] = X[i]
				i += 1

	
	T = T.tolist()
	M_r = du.gen_M_r_from_T(T)
	M_c = du.gen_M_c_from_T(T)

	return T, M_r, M_c
Example #4
def generate_multinomial_data(next_seed, n_cols, n_rows, n_views):
    # generate the partitions
    random.seed(next_seed)

    cols_to_views = [0 for _ in range(n_cols)]
    rows_in_views_to_cols = []
    for view in range(n_views):
        partition = eu.CRP(n_rows, 2.0)
        random.shuffle(partition)
        rows_in_views_to_cols.append(partition)

    # generate the data
    data = numpy.zeros((n_rows, n_cols), dtype=float)
    for col in range(n_cols):
        view = cols_to_views[col]
        for row in range(n_rows):
            cluster = rows_in_views_to_cols[view][row]
            data[row, col] = cluster

    T = data.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    T, M_c = du.convert_columns_to_multinomial(T, M_c, list(range(n_cols)))

    return T, M_r, M_c
def do_test(which_plot,
            max_plots,
            n,
            burn_in,
            cc_samples,
            which_test,
            correlation=0,
            do_plot=False):
    if which_test == "correlated":
        X = correlated(correlation, n=n)
    elif which_test == "square":
        X = square(n=n)
    elif which_test == "ring":
        X = ring(n=n)
    elif which_test == "circle":
        X = circle(n=n)
    elif which_test == "diamond":
        X = diamond(n=n)
    elif which_test == "blob":
        X = correlated(0.0, n=n)
    elif which_test == "dots":
        X = four_dots(n=n)
    elif which_test == "mixed":
        X = numpy.vstack((correlated(.95, n=n // 2), correlated(0, n=n // 2)))

    get_next_seed = lambda: random.randrange(32000)

    # Build a state
    M_c = du.gen_M_c_from_T(X.tolist())
    state = State.p_State(M_c, X.tolist())
    X_Ls = []
    X_Ds = []

    # collect crosscat samples
    for _ in range(cc_samples):
        state = State.p_State(M_c, X.tolist())
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    SX = sample_data_from_crosscat(M_c, X_Ls, X_Ds, get_next_seed, n)

    if do_plot:
        pl.subplot(2, max_plots, which_plot)
        pl.scatter(X[:, 0], X[:, 1], c='blue', alpha=.5)
        pl.title("Original data")
        pl.subplot(2, max_plots, max_plots + which_plot)
        pl.scatter(SX[:, 0], SX[:, 1], c='red', alpha=.5)
        pl.title("Sampled data")
        pl.show()

    return M_c, X_Ls, X_Ds
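A hedged usage sketch for do_test; the argument values below are illustrative assumptions, not taken from the original driver script:

M_c, X_Ls, X_Ds = do_test(which_plot=1, max_plots=1, n=250, burn_in=200,
                          cc_samples=5, which_test="ring", do_plot=False)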
Example #6
def plot(results, filename=None):
    n_samples = results['config']['n_samples']
    samples = sorted(results['samples'])
    conf = results['conf']
    X_L = results['X_L_list'][0]
    X_D = results['X_D_list'][0]

    hgrm, _ = np.histogram(X_D[0], len(set(X_D[0])))
    max_mass_mode = np.argmax(hgrm)
    suffstats = X_L['view_state'][0]['column_component_suffstats'][0][max_mass_mode]

    counts = suffstats['N']
    sum_x = suffstats['sum_x']
    sum_x_sq = suffstats['sum_x_squared']
    scale = counts/results['config']['n_samples']
    component_model = ccm.p_ContinuousComponentModel(
        X_L['column_hypers'][0], counts, sum_x, sum_x_sq)

    plt.figure(facecolor='white')

    ax = plt.subplot(1, 2, 1)
    ax.hist(samples, min(31, int(n_samples/10)), normed=True, label='Samples',
            ec='none', fc='gray')
    T = [[x] for x in samples]
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    xvals = np.linspace(np.min(samples), np.max(samples), 300)
    Q = [(n_samples, 0, x) for i, x in enumerate(xvals)]
    p = [su.simple_predictive_probability(M_c, X_L, X_D, [], [q]) for q in Q]
    p = np.array(p)
    ax.plot(xvals, np.exp(p), c='#bbbbbb',
            label='Predictive probability', lw=3)
    p = [component_model.calc_element_predictive_logp(x) for x in xvals]
    ax.plot(xvals, np.exp(p)*scale, c='#222222', label='Summary mode',
            lw=3)
    plt.xlabel('Samples')
    plt.legend(loc=0)

    ax = plt.subplot(1, 2, 2)
    ax.bar([0, 1], [conf, 1.0-conf], fc='#333333', ec='none')
    ax.set_ylim([0, 1])
    ax.set_xlim([-.25, 2])
    ax.set_xticks([.5, 1.5])
    plt.ylabel('Probability mass')
    ax.set_xticklabels(['Summary mode', 'All other modes'])

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
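A sketch of the results dict that plot() expects, reconstructed from the accesses above; the field descriptions and placeholder values are assumptions, and a real dict would come from the surrounding test harness:

results = {
    'config': {'n_samples': 500},  # number of predictive samples drawn
    'samples': [],                 # the predictive samples themselves
    'conf': 0.8,                   # imputation confidence (summary-mode mass)
    'X_L_list': [],                # crosscat latent states, one X_L per chain
    'X_D_list': [],                # crosscat row partitions, one X_D per chain
}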
Example #7
def gen_data_crosscat(mode, T):
    # edit transition list according to

    all_transitions = []

    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'] * 2)

    state = State.p_State(M_c, T)
    if mode == 'crp_mixture':
        # fix the views
        X_D = state.get_X_D()
        X_L = state.get_X_L()
        X_D = [X_D[0]]
        X_L['column_partition']['assignments'] = [1, 1]
        state = State.p_State(M_c, T, X_L=X_L, X_D=X_D)
Example #9
def do_test(which_plot, max_plots, n, burn_in, cc_samples, which_test, correlation=0, do_plot=False):
    if which_test == "correlated":
        X = correlated(correlation, n=n)
    elif which_test == "square":
        X = square(n=n)
    elif which_test == "ring":
        X = ring(n=n)
    elif which_test == "circle":
        X = circle(n=n)
    elif which_test == "diamond":
        X = diamond(n=n)
    elif which_test == "blob":
        X = correlated(0.0, n=n)
    elif which_test == "dots":
        X = four_dots(n=n)
    elif which_test == "mixed":
        X = numpy.vstack((correlated(0.95, n=n // 2), correlated(0, n=n // 2)))

    get_next_seed = lambda: random.randrange(32000)

    # Build a state
    M_c = du.gen_M_c_from_T(X.tolist())
    state = State.p_State(M_c, X.tolist())
    X_Ls = []
    X_Ds = []

    # collect crosscat samples
    for _ in range(cc_samples):
        state = State.p_State(M_c, X.tolist())
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    SX = sample_data_from_crosscat(M_c, X_Ls, X_Ds, get_next_seed, n)

    if do_plot:
        pl.subplot(2, max_plots, which_plot)
        pl.scatter(X[:, 0], X[:, 1], c="blue", alpha=0.5)
        pl.title("Original data")
        pl.subplot(2, max_plots, max_plots + which_plot)
        pl.scatter(SX[:, 0], SX[:, 1], c="red", alpha=0.5)
        pl.title("Sampled data")
        pl.show()

    return M_c, X_Ls, X_Ds
Example #10
pl.figure()
burn_in = 400

mi_ests = numpy.zeros(len(widths))

datas = []

nr = 0
for w in widths:
    T, mi_est = gen_ring(n, w, SEED=get_next_seed())

    datas.append(T)

    print("num_samples: %i, width: %f" % (n, w))

    M_c = du.gen_M_c_from_T(T, cctypes)
    X_Ls = []
    X_Ds = []

    for ns in range(n_samples):
        state = State.p_State(M_c, T)
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    MI, Linfoot = iu.mutual_information(M_c,
                                        X_Ls,
                                        X_Ds, [(0, 1)],
                                        n_samples=5000)

    data_d = numpy.transpose(MI)
def check_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture the
    distribution of the data.
    1. Draws 100 random points from a standard normal distribution
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initialize a crosscat state with that data
    5. Get one sample after 100 transitions
    6. Draw predictive samples
    7. Calculates the 95 percent support of the continuous distribution or the
        entire support of the discrete distribution
    8. Calculate the true pdf for each point in the support
    9. Calculate the predictive probability given the sample for each point in
        the support
    10. (OPTIONAL) Plot the original data, predictive samples, pdf, and 
        predictive probabilities 
    11. Calculate goodness of fit stats (returns p value)
    """
    N = 250
    
    get_next_seed = lambda : random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]
    
    X = component_model_type.generate_data_from_parameters(data_params, N, gen_seed=get_next_seed())
    
    hyperparameters = component_model_type.draw_hyperparameters(X, gen_seed=get_next_seed())[0]
    
    component_model = component_model_type.from_data(X, hyperparameters)
    
    model_parameters = component_model.sample_parameters_given_hyper()
    
    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state 
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])
    
    state = State.p_State(M_c, T)
    
    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)
    
    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    
    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(su.simple_predictive_sample(M_c, X_L, X_D, [], [(N,0)], get_next_seed, n=N)).flatten(1)
    
    # get support
    discrete_support = component_model_type.generate_discrete_support(model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, []*len(Q), Q,)
    
    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), normed=True)
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    
    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, 
            numpy.exp(component_model_type.log_pdf(numpy.array(discrete_support), 
            model_parameters)), 
            c="blue", 
            s=100, 
            label="true pdf", 
            alpha=1)

        # pylab.ylim([0,2])
                
        # plot predictive probability of support points
        pylab.scatter(discrete_support, 
            numpy.exp(probabilities), 
            c="red", 
            s=100, 
            label="predictive probability", 
            alpha=1)
            
        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
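A hedged call sketch for check_one_feature_sampler; the import path for the continuous component model extension is an assumption and may need adjusting to your crosscat checkout:

# assumed import path for the p_ContinuousComponentModel extension
import crosscat.tests.component_model_extensions.ContinuousComponentModel as ccmext

p = check_one_feature_sampler(ccmext.p_ContinuousComponentModel, show_plot=False)
print("single-feature goodness-of-fit p-value: %f" % p)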
def test_one_feature_mixture(component_model_type, num_clusters=3, show_plot=False, seed=None):
    """Tests whether a single-feature crosscat state recovers a mixture of
    num_clusters components of the given component_model_type; returns a
    goodness-of-fit p-value.
    """
    random.seed(seed)

    N = 1000
    separation = .9
    
    get_next_seed = lambda : random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                        [separation], seed=get_next_seed(),
                        distargs=[distargs[cctype]],
                        return_structure=True)

    T = numpy.array(T)
    T_list = T
    
    # create a crosscat state 
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    
    state = State.p_State(M_c, T_list)
    
    # transitions
    state.transition(n_steps=200)
    
    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    
    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(M_c, X_L, X_D, [0],
                            seed=get_next_seed()).flatten(1)
    
    # Get support over all component models
    discrete_support = qtu.get_mixture_support(cctype, component_model_type,
                         structure['component_params'][0], nbins=500)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, []*len(Q), Q)
    
    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), normed=True)
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    
    if show_plot:
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type, 
                structure['component_params'][0], [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, 
            numpy.exp(lpdf), 
            c="blue", 
            edgecolor="none",
            s=100, 
            label="true pdf", 
            alpha=1,
            zorder=3)
                
        # plot predictive probability of support points
        pylab.scatter(discrete_support, 
            numpy.exp(probabilities), 
            c="red", 
            edgecolor="none",
            s=100, 
            label="predictive probability", 
            alpha=1,
            zorder=4)
            
        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        pylab.show()

    return p
Example #13
# create the data
if True:
    T, M_r, M_c = du.gen_factorial_data_objects(
        gen_seed, num_clusters,
        num_cols, num_rows, num_splits,
        max_mean=max_mean, max_std=max_std,
        )
else:
    with open('SynData2.csv') as fh:
        import numpy
        import csv
        T = numpy.array([
                row for row in csv.reader(fh)
                ], dtype=float).tolist()
        M_r = du.gen_M_r_from_T(T)
        M_c = du.gen_M_c_from_T(T)


# create the state
p_State = State.p_State(M_c, T, N_GRID=N_GRID, SEED=inf_seed)
p_State.plot_T(filename='T')

# transition the sampler
print("p_State.get_marginal_logp():", p_State.get_marginal_logp())
for transition_idx in range(num_transitions):
    print("transition #: %s" % transition_idx)
    p_State.transition()
    counts = [
        view_state['row_partition_model']['counts']
        for view_state in p_State.get_X_L()['view_state']
        ]
Example #14
def continuous_imputation_confidence(samples,
                                     imputed,
                                     column_component_suffstats_i,
                                     n_steps=100,
                                     n_chains=1,
                                     return_metadata=False):
    # XXX: the confidence in continuous imputation is "the probability that
    # there exists a unimodal summary" which is defined as the proportion of
    # probability mass in the largest mode of a DPMM inferred from the simulated
    # samples. We use crosscat on the samples for a given number of iterations,
    # then calculate the proportion of mass in the largest mode.
    #
    # NOTE: The definition of confidence and its implementation do not agree.
    # The probability of a unimodal summary is P(k=1|X), where k is the number
    # of components in some infinite mixture model. I would describe the
    # current implementation as "Is there a mode with sufficient enough mass
    # that we can ignore the other modes". If this second formulation is to be
    # used, it means that we need to not use the median of all the samples as
    # the imputed value, but the median of the samples of the summary mode,
    # because the summary (the imputed value) should come from the summary
    # mode.
    #
    # There are a lot of problems with this second formulation.
    # 0. SLOW. Like, for real.
    # 1. Non-deterministic. The answer will be different given the same
    #   samples.
    # 2. Inaccurate. Approximate inference about approximate inferences.
    #   In practice, confidences on the same samples could be significantly
    #   different because the Gibbs sampler that underlies crosscat is
    #   susceptible to getting stuck in local maxima. Of course, this could be
    #   mitigated to some extent by using more chains, but things are slow
    #   enough as it is.
    # 3. Confidence (interval) has a distinct meaning to the people who will
    #   be using this software. A unimodal summary does not necessarily mean
    #   that inferences are within an acceptable range. We are going to need to
    #   be loud about this. Maybe there should be a notion of tolerance?
    #
    # An alternative: mutual predictive coverage
    # ------------------------------------------
    # Divide the number of samples in the intersection of the 90% CI's of each
    # component model by the number of samples in the union of the 90% CI's of
    # each component model.
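    # (A standalone sketch of this coverage ratio appears after this function.)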

    from crosscat.cython_code import State

    # XXX: assumes samples comes in as a 1-D numpy.array or 1-D list
    num_samples = float(len(samples))
    T = [[x] for x in samples]

    # XXX: This is a highly problematic consequence of the current definition of
    # confidence. If the number of samples is 1, then the confidence is always
    # 1 because there will be exactly 1 mode in the DPMM (recall the DPMM can
    # have, at maximum, as many modes at data points). I figure if we're going
    # to give a bad answer, we should give it quickly.
    if num_samples == 1:
        return 1.0

    confs = []
    tlist = [
        'column_hyperparameters', 'row_partition_hyperparameters',
        'row_partition_assignments'
    ]
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    if return_metadata:
        X_L_list = []
        X_D_list = []

    for _ in range(n_chains):
        ccstate = State.p_State(M_c, T)
        ccstate.transition(which_transitions=tlist, n_steps=n_steps)

        X_D = ccstate.get_X_D()

        assignment = X_D[0]
        num_cats = max(assignment) + 1
        props = numpy.histogram(assignment, num_cats)[0] / num_samples
        confs.append(max(props))

        if return_metadata:
            X_L_list.append(ccstate.get_X_L())
            X_D_list.append(X_D)

    conf = numpy.mean(confs)
    if return_metadata:
        return conf, X_L_list, X_D_list
    else:
        return conf
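A minimal sketch of the "mutual predictive coverage" alternative described in the comment above. The per-component 90% credible intervals are assumed to be supplied by the caller; nothing shown here computes them:

def mutual_predictive_coverage(samples, intervals):
    # samples: 1-D array of simulated values.
    # intervals: list of (low, high) 90% CI tuples, one per component model.
    samples = numpy.asarray(samples)
    in_all = numpy.ones(len(samples), dtype=bool)
    in_any = numpy.zeros(len(samples), dtype=bool)
    for low, high in intervals:
        inside = (samples >= low) & (samples <= high)
        in_all &= inside   # sample lies in every interval (intersection)
        in_any |= inside   # sample lies in at least one interval (union)
    n_union = in_any.sum()
    return float(in_all.sum()) / n_union if n_union else 0.0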
def generate_correlated_state(num_rows, num_cols, num_views, num_clusters, mean_range, corr, seed=0):
    #

    assert(num_clusters <= num_rows)
    assert(num_views <= num_cols)
    T = numpy.zeros((num_rows, num_cols))

    random.seed(seed)
    numpy.random.seed(seed=seed)
    get_next_seed = lambda : random.randrange(2147483647)

    # generate an assignment of columns to views (uniform)
    cols_to_views = list(range(num_views))
    view_counts = numpy.ones(num_views, dtype=int)
    for i in range(num_views, num_cols):
        r = random.randrange(num_views)
        cols_to_views.append(r)
        view_counts[r] += 1

    random.shuffle(cols_to_views)

    assert(len(cols_to_views) == num_cols)
    assert(max(cols_to_views) == num_views-1)

    # for each view, generate an assignment of rows to num_clusters
    row_to_clusters = []
    cluster_counts = []
    for view in range(num_views):
        row_to_cluster = list(range(num_clusters))
        cluster_counts_i = numpy.ones(num_clusters,dtype=int)
        for i in range(num_clusters, num_rows):
            r = random.randrange(num_clusters)
            row_to_cluster.append(r)
            cluster_counts_i[r] += 1

        random.shuffle(row_to_cluster)

        assert(len(row_to_cluster) == num_rows)
        assert(max(row_to_cluster) == num_clusters-1)

        row_to_clusters.append(row_to_cluster)
        cluster_counts.append(cluster_counts_i)

    assert(len(row_to_clusters) == num_views)

    # generate the correlated data
    for view in range(num_views):
        for cluster in range(num_clusters):
            cell_cols = view_counts[view]
            cell_rows = cluster_counts[view][cluster]
            means = numpy.random.uniform(-mean_range/2.0,mean_range/2.0,cell_cols)
            X =  generate_correlated_data(cell_rows, cell_cols, means, corr, seed=get_next_seed())
            # get the indices of the columns in this view
            col_indices = numpy.nonzero(numpy.array(cols_to_views)==view)[0]
            # get the indices of the rows in this view and this cluster
            row_indices = numpy.nonzero(numpy.array(row_to_clusters[view])==cluster)[0]
            # insert the data
            for col in range(cell_cols):
                for row in range(cell_rows):
                    r = row_indices[row]
                    c = col_indices[col]
                    T[r,c] = X[row,col]


    M_c = du.gen_M_c_from_T(T)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = generate_X_L_and_X_D(T, M_c, cols_to_views, row_to_clusters, seed=get_next_seed())

    return  T, M_c, M_r, X_L, X_D, cols_to_views
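A hedged usage sketch for generate_correlated_state with illustrative arguments: a 100 x 4 table, two views, three clusters per view, cluster means drawn uniformly from [-5, 5], and within-cluster correlation 0.9:

T, M_c, M_r, X_L, X_D, cols_to_views = generate_correlated_state(
    100, 4, 2, 3, mean_range=10.0, corr=0.9, seed=0)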
def test_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture the
    distribution of the data.
    1. Draws 100 random points from a standard normal distribution
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initialize a crosscat state with that data
    5. Get one sample after 100 transitions
    6. Draw predictive samples
    7. Calculates the 95 percent support of the continuous distribution or the
        entire support of the discrete distribution
    8. Calculate the true pdf for each point in the support
    9. Calculate the predictive probability given the sample for each point in
        the support
    10. (OPTIONAL) Plot the original data, predictive samples, pdf, and 
        predictive probabilities 
    11. Calculate goodness of fit stats (returns p value)
    """
    N = 250

    get_next_seed = lambda: random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(X)[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c,
                                    X_L,
                                    X_D, [], [(N, 0)],
                                    get_next_seed,
                                    n=N)).flatten(1)

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(
        M_c,
        X_L,
        X_D,
        [] * len(Q),
        Q,
    )

    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T,
                                        bins=min(20, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data')
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(
                          component_model_type.log_pdf(
                              numpy.array(discrete_support),
                              model_parameters)),
                      c="blue",
                      s=100,
                      label="true pdf",
                      alpha=1)

        # pylab.ylim([0,2])

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      s=100,
                      label="predictive probability",
                      alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
Example #17
def test_impute_vs_column_average_single(component_model_type,
                                         num_clusters,
                                         seed=0):
    """	tests predictive row generation vs column average
		Note: This test does not make sense for categorical data
		Inputs:
			- component_model_type: main class from datatype. Ex:
				ccmext.p_ContinuousComponentModel 
			- num_clusters: the number of clusters in the data
			- seed: (optional) int to seed the RNG 
		Returns:
			- the mean square error of the predictive sample column
			- the mean square error of the column average column
	"""

    random.seed(seed)

    N = 100

    get_next_seed = lambda: random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # generate a partition of rows to clusters (evenly-weighted)
    Z = list(range(num_clusters))
    for z in range(N - num_clusters):
        Z.append(random.randrange(num_clusters))

    random.shuffle(Z)

    # generate the data
    T = numpy.array([[0]] * N, dtype=float)

    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a row from the sample
    T_generated = sdg.predictive_columns(M_c,
                                         X_L,
                                         X_D, [0],
                                         seed=get_next_seed())

    # generate a row of column averages
    T_colave = numpy.ones(T.shape) * numpy.mean(T)

    # get the mean squared error
    err_sample = numpy.mean((T_generated - T)**2.0)
    err_colave = numpy.mean((T_colave - T)**2.0)

    return err_sample, err_colave
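A hedged call sketch (ccmext.p_ContinuousComponentModel as in the docstring above, with the same assumed import as the earlier check_one_feature_sampler sketch). The comparison noted in the comment is the property this test probes, not a guarantee:

err_sample, err_colave = test_impute_vs_column_average_single(
    ccmext.p_ContinuousComponentModel, num_clusters=3, seed=0)
# with well-separated clusters, err_sample is typically below err_colave
print(err_sample, err_colave)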
def test_one_feature_mixture(component_model_type,
                             num_clusters=3,
                             show_plot=False,
                             seed=None):
    """Tests whether a single-feature crosscat state recovers a mixture of
    num_clusters components of the given component_model_type; returns a
    goodness-of-fit p-value.
    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype],
                                     N, [0],
                                     cluster_weights, [separation],
                                     seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # pdb.set_trace()
    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype,
        component_model_type,
        structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten(1)

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
                                                     [] * len(Q), Q)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    # T = T[:,0]
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T,
                                        bins=min(50, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data',
                  zorder=1)
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples',
                  zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(lpdf),
                      c="blue",
                      edgecolor="none",
                      s=100,
                      label="true pdf",
                      alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      edgecolor="none",
                      s=100,
                      label="predictive probability",
                      alpha=1,
                      zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def test_impute_vs_column_average_single(component_model_type, num_clusters, seed=0):
	"""	tests predictive row generation vs column average
		Note: This test does not make sense for categorical data
		Inputs:
			- component_model_type: main class from datatype. Ex:
				ccmext.p_ContinuousComponentModel 
			- num_clusters: the number of clusters in the data
			- seed: (optional) int to seed the RNG 
		Returns:
			- the mean square error of the predictive sample column
			- the mean square error of the column average column
	"""

	random.seed(seed)

	N = 100

	get_next_seed = lambda : random.randrange(2147483647)

	C = .9 # highly-separated clusters

	cctype = component_model_type.cctype

	component_model_parameters = sdg.generate_separated_model_parameters(
						cctype, C, num_clusters, get_next_seed,
						distargs=distargs[cctype])

	# generate a partition of rows to clusters (evenly-weighted)
	Z = list(range(num_clusters))
	for z in range(N-num_clusters):
		Z.append(random.randrange(num_clusters))

	random.shuffle(Z)

	# generate the data
	T = numpy.array([[0]]*N, dtype=float)

	for x in range(N):
		z = Z[x]
		T[x] = component_model_type.generate_data_from_parameters(
				component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

	T_list = T.tolist()

	# initialize the state
	M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

	state = State.p_State(M_c, T)

	# transitions
	state.transition(n_steps=100)

	# get the sample
	X_L = state.get_X_L()
	X_D = state.get_X_D()

	# generate a row from the sample
	T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0], seed=get_next_seed())

	# generate a row of column averages
	T_colave = numpy.ones(T.shape)*numpy.mean(T)

	# get the mean squared error
	err_sample = numpy.mean( (T_generated-T)**2.0 )
	err_colave = numpy.mean( (T_colave-T)**2.0 )

	return err_sample, err_colave
Example #20
def run_test(args):

    rho = args.rho
    num_times = args.num_times
    min_num_rows = args.min_num_rows
    max_num_rows = args.max_num_rows
    n_grid = args.n_grid
    filename = args.filename
    discrete = args.discrete

    num_samples = []
    for ns in log_linspace(min_num_rows, max_num_rows, n_grid).tolist():
        num_samples.append(int(ns))

    variances = []

    burn_in = 200

    MIs = numpy.zeros((num_times, len(num_samples)))

    mi_diff = numpy.zeros((len(num_samples), num_times))

    if not discrete:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ['continuous'] * 2
    else:
        T, true_mi, external_mi = gen_correlated_data_discrete(
            num_samples[-1], rho)
        cctypes = ['multinomial'] * 2

    data_subs = []

    n_index = 0
    for n in num_samples:
        T_sub = numpy.copy(T[0:n - 1, :])

        data = []

        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()

            MI, Linfoot = iu.mutual_information(M_c, [X_L], [X_D], [(0, 1)],
                                                n_samples=5000)

            mi_diff[n_index, t] = true_mi - MI[0][0]

            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, MI[0][0]))

            MIs[t, n_index] = MI[0][0]

        n_index += 1

    if discrete:
        dtype_str = "discrete"
    else:
        dtype_str = "continuous"

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"

    pl.figure()

    # plot data
    # pl.subplot(1,2,1)
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    i = 0
    for T_s in data_subs:
        pl.subplot(1, len(data_subs), i + 1)
        num_rows = num_samples[i]
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(T_s[:, 0],
                                                        T_s[:, 1],
                                                        bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=.3, s=81)
        pl.title('#r: ' + str(num_rows))

        i += 1

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(datname)
    pl.clf()

    pl.figure(tight_layout=True, figsize=(5, 4))
    # plot convergence
    # pl.subplot(1,2,2)
    # standard deviation
    stderr = numpy.std(MIs, axis=0)  #/(float(num_times)**.5)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=stderr, c='blue')
    pl.plot(num_samples, mean, c="blue", alpha=.8, label='mean MI')
    pl.plot(num_samples, [true_mi] * len(num_samples),
            color='red',
            alpha=.8,
            label='true MI')
    pl.plot(num_samples, [external_mi] * len(num_samples),
            color=(0, .5, .5),
            alpha=.8,
            label='external MI')
    pl.title('convergence')
    pl.xlabel('#rows in X (log)')
    pl.ylabel('CrossCat MI - true MI')

    pl.legend(loc=0, prop={'size': 8})
    pl.gca().set_xscale('log')

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(figname)
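A minimal sketch of how the args namespace for run_test could be assembled; the attribute names come from the accesses above, while the defaults are assumptions:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--rho', type=float, default=0.75)
parser.add_argument('--num_times', type=int, default=5)
parser.add_argument('--min_num_rows', type=int, default=100)
parser.add_argument('--max_num_rows', type=int, default=10000)
parser.add_argument('--n_grid', type=int, default=5)
parser.add_argument('--filename', type=str, default='mi_convergence')
parser.add_argument('--discrete', action='store_true')
run_test(parser.parse_args())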
Example #21
def gen_data(cctypes,
             n_rows,
             cols_to_views,
             cluster_weights,
             separation,
             seed=0,
             distargs=None,
             return_structure=False):
    """	Generates synthetic data.
		Inputs:
			- cctypes: List of strings. Each entry, i, is the cctype of the 
			column i. ex: cctypes = ['continuous','continuous', 'multinomial']
			- n_rows: integer. the number of rows
			- cols_to_views: List of integers. Each entry, i, is the view, v, 
			to which columns i is assigned. v \in [0,...,n_cols-1].
			ex: cols_to_views = [0, 0, 1]
			- cluster_weights: List of lists of floats. A num_views length list
			of list. Each sublist, W, is a list of cluster weights for the 
			view, thus W should always sum to 1.
			ex (two views, first view has 2 clusters, second view has 3 
			clusters):
			cluster_weights = [[.3, .7], [.25, .5, .25]]
			- separation: list of floats. Each entry, i, is the separation, C,
			of the clusters in view i. C \in [0,1] where 0 is no separation and
			1 is well-separated.
			ex (2 views): separation = [ .5, .7]
			- seed: optional
			- distargs: optional (only if continuous). distargs is n_columns
			length list where each entry is either None or a dict appropriate 
			for the cctype in that column. For a normal feature, the entry 
			should be None, for a multinomial feature, the entry should be a 
			dict with the entry K (the number of categories). 
			- return_structure: (bool, optional). Also returns a dict with the
			data generation structure included. A dict with keys:
				- component_params:  a n_cols length list of lists. Where each 
				list is a set of component model parameters for each cluster in
				the view to which that column belongs
				- cols_to_views: a list assigning each column to a view
				- rows_to_clusters: a n_views length list of list. Each entry,
				rows_to_clusters[v][r] is the cluster to which all rows in 
				columns belonging to view v are assigned
		Returns:
			T, M_c
		Example:
			>>> cctypes = ['continuous','continuous','multinomial','continuous','multinomial']
			>>> distargs = [None, None, dict(K=5), None, dict(K=2)]
			>>> n_rows = 10
			>>> cols_to_views = [0, 0, 1, 1, 2]
			>>> cluster_weights = [[.3, .7],[.5, .5],[.2, .3, .5]]
			>>> separation = [.9, .6, .9]
			>>> T, M_c = gen_data(cctypes, n_rows, cols_to_views, cluster_weights,
				separation, seed=0, distargs=distargs)
	"""

    # check Inputs
    if not isinstance(n_rows, int):
        raise TypeError("n_rows should be an integer")

    if not isinstance(cctypes, list):
        raise TypeError("cctypes should be a list")

    n_cols_cctypes = len(cctypes)
    for cctype in cctypes:
        if not isinstance(cctype, str):
            raise TypeError("cctypes should be a list of strings")

        # NOTE: will have to update when new component models are added
        if cctype not in ['continuous', 'multinomial', 'cyclic']:
            raise ValueError("invalid cctype in cctypes: %s." % cctype)

    if not isinstance(cols_to_views, list):
        raise TypeError("cols_to_views should be a list")

    if len(cols_to_views) != n_cols_cctypes:
        raise ValueError("number of columns in cctypes does not match "
                         "number of columns in cols_to_views")

    if min(cols_to_views) != 0:
        raise ValueError("min value of cols_to_views should be 0")

    n_views_cols_to_views = max(cols_to_views) + 1

    set_ctv = set(cols_to_views)
    if len(set_ctv) != n_views_cols_to_views:
        raise ValueError("View indices skipped in cols_to_views")

    # check cluster weights
    if not isinstance(cluster_weights, list):
        raise TypeError("cluster_weights should be a list")

    if n_views_cols_to_views != len(cluster_weights):
        raise ValueError("The number of views in cols_to_views and "
                         "cluster_weights do not agree.")

    # check each set of weights
    for W in cluster_weights:
        if not isinstance(W, list):
            raise TypeError("cluster_weights should be a list of lists")
        if math.fabs(sum(W) - 1.0) > .0000001:
            raise ValueError("each vector of weights should sum to 1")

    if not isinstance(separation, list):
        raise TypeError("separation should be a list")

    if len(separation) != n_views_cols_to_views:
        raise ValueError(
            "number of view in separation and cols_to_views do not agree")

    for c in separation:
        if not isinstance(c, float) or c > 1.0 or c < 0.0:
            raise ValueError(
                "each value in separation should be a float from 0 to 1")

    num_views = len(separation)
    n_cols = len(cols_to_views)

    # check the cctypes vs the distargs
    if distargs is None:
        distargs = [None for i in range(n_cols)]

    if not isinstance(distargs, list):
        raise TypeError("distargs should be a list")

    if len(distargs) != n_cols:
        raise ValueError("distargs should have an entry for each column")

    for i in range(n_cols):
        if cctypes[i] == 'continuous' or cctypes[i] == 'cyclic':
            if distargs[i] is not None:
                raise ValueError(
                    "distargs entry for 'continuous' cctype should be None")
        elif cctypes[i] == 'multinomial':
            if not isinstance(distargs[i], dict):
                raise TypeError(
                    "distargs for cctype 'multinomial' should be a dict")
            if len(distargs[i].keys()) != 1:
                raise KeyError(
                    "distargs for cctype 'multinomial' should have one key, 'K'"
                )
            if 'K' not in distargs[i].keys():
                raise KeyError(
                    "distargs for cctype 'multinomial' should have the key 'K'"
                )
        else:
            raise ValueError("invalid cctype in cctypes: %s." % cctypes[i])

    random.seed(seed)
    numpy.random.seed(seed)

    # Generate the rows-to-categories partitions (multinomial)
    rows_to_clusters = []
    for W in cluster_weights:

        cW = list(W)
        for i in range(1, len(cW)):
            cW[i] += cW[i - 1]

        K = len(cW)

        rows_to_clusters_view = list(range(K))
        for r in range(K, n_rows):
            rows_to_clusters_view.append(p_draw(cW))

        random.shuffle(rows_to_clusters_view)
        assert len(rows_to_clusters_view) == n_rows

        rows_to_clusters.append(rows_to_clusters_view)

    get_next_seed = lambda: random.randrange(2147483647)

    # start generating the data
    data_table = numpy.zeros((n_rows, n_cols))
    component_params = []
    for col in range(n_cols):

        view = cols_to_views[col]

        # get the number of cluster in view
        num_clusters = len(cluster_weights[view])

        cctype = cctypes[col]

        C = separation[view]

        # generate a set of C-separated component model parameters
        component_parameters = generate_separated_model_parameters(
            cctype, C, num_clusters, get_next_seed, distargs=distargs[col])

        component_params.append(component_parameters)

        # get the data generation function
        gen = get_data_generator[cctype]
        for row in range(n_rows):
            # get the cluster this
            cluster = rows_to_clusters[view][row]
            params = component_parameters[cluster]
            x = gen(params, 1, gen_seed=get_next_seed())[0]
            data_table[row, col] = x

    T = data_table.tolist()
    M_c = du.gen_M_c_from_T(T, cctypes=cctypes)

    if return_structure:
        structure = dict()
        structure['component_params'] = component_params
        structure['cols_to_views'] = cols_to_views
        structure['rows_to_clusters'] = rows_to_clusters
        structure['cluster_weights'] = cluster_weights
        return T, M_c, structure
    else:
        return T, M_c
def generate_correlated_state(num_rows,
                              num_cols,
                              num_views,
                              num_clusters,
                              mean_range,
                              corr,
                              seed=0):
    #

    assert (num_clusters <= num_rows)
    assert (num_views <= num_cols)
    T = numpy.zeros((num_rows, num_cols))

    random.seed(seed)
    numpy.random.seed(seed=seed)
    get_next_seed = lambda: random.randrange(2147483647)

    # generate an assignment of columns to views (uniform)
    cols_to_views = list(range(num_views))
    view_counts = numpy.ones(num_views, dtype=int)
    for i in range(num_views, num_cols):
        r = random.randrange(num_views)
        cols_to_views.append(r)
        view_counts[r] += 1

    random.shuffle(cols_to_views)

    assert (len(cols_to_views) == num_cols)
    assert (max(cols_to_views) == num_views - 1)

    # for each view, generate an assignment of rows to num_clusters
    row_to_clusters = []
    cluster_counts = []
    for view in range(num_views):
        row_to_cluster = list(range(num_clusters))
        cluster_counts_i = numpy.ones(num_clusters, dtype=int)
        for i in range(num_clusters, num_rows):
            r = random.randrange(num_clusters)
            row_to_cluster.append(r)
            cluster_counts_i[r] += 1

        random.shuffle(row_to_cluster)

        assert (len(row_to_cluster) == num_rows)
        assert (max(row_to_cluster) == num_clusters - 1)

        row_to_clusters.append(row_to_cluster)
        cluster_counts.append(cluster_counts_i)

    assert (len(row_to_clusters) == num_views)

    # generate the correlated data
    for view in range(num_views):
        for cluster in range(num_clusters):
            cell_cols = view_counts[view]
            cell_rows = cluster_counts[view][cluster]
            means = numpy.random.uniform(-mean_range / 2.0, mean_range / 2.0,
                                         cell_cols)
            X = generate_correlated_data(cell_rows,
                                         cell_cols,
                                         means,
                                         corr,
                                         seed=get_next_seed())
            # get the indices of the columns in this view
            col_indices = numpy.nonzero(numpy.array(cols_to_views) == view)[0]
            # get the indices of the rows in this view and this cluster
            row_indices = numpy.nonzero(
                numpy.array(row_to_clusters[view]) == cluster)[0]
            # insert the data
            for col in range(cell_cols):
                for row in range(cell_rows):
                    r = row_indices[row]
                    c = col_indices[col]
                    T[r, c] = X[row, col]

    M_c = du.gen_M_c_from_T(T)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = generate_X_L_and_X_D(T,
                                    M_c,
                                    cols_to_views,
                                    row_to_clusters,
                                    seed=get_next_seed())

    return T, M_c, M_r, X_L, X_D, cols_to_views
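A minimal usage sketch for generate_correlated_state; the argument values below are illustrative assumptions, not taken from the source:

# hedged example: 2 views, 3 clusters per view, strong within-cluster correlation
T, M_c, M_r, X_L, X_D, cols_to_views = generate_correlated_state(
    num_rows=100, num_cols=4, num_views=2, num_clusters=3,
    mean_range=10.0, corr=0.9, seed=0)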
pl.figure()
burn_in = 400

mi_ests = numpy.zeros(len(widths))
	
datas = []

nr = 0
for w in widths:
	T, mi_est = gen_ring( n, w, SEED=get_next_seed())

	datas.append(T)

	print("num_samples: %i, width: %f" % (n, w))

	M_c = du.gen_M_c_from_T(T,cctypes)
	X_Ls = []
	X_Ds = []

	for ns in range(n_samples):
		state = State.p_State(M_c, T)
		state.transition(n_steps=burn_in)
		X_Ds.append(state.get_X_D())
		X_Ls.append(state.get_X_L())
	
	MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0,1)], n_samples=5000)

	data_d = numpy.transpose(MI)

	if nr == 0:
		data = data_d
Example #24
0
def continuous_imputation_confidence(
        samples, imputed, column_component_suffstats_i, n_steps=100,
        n_chains=1, return_metadata=False):
    # XXX: the confidence in continuous imputation is "the probability that
    # there exists a unimodal summary", which is defined as the proportion of
    # probability mass in the largest mode of a DPMM inferred from the
    # simulated samples. We run crosscat on the samples for a given number of
    # iterations, then calculate the proportion of mass in the largest mode.
    #
    # NOTE: The definition of confidence and its implementation do not agree.
    # The probability of a unimodal summary is P(k=1|X), where k is the number
    # of components in some infinite mixture model. I would describe the
    # current implementation as "Is there a mode with sufficient mass
    # that we can ignore the other modes". If this second formulation is to be
    # used, it means that we need to not use the median of all the samples as
    # the imputed value, but the median of the samples of the summary mode,
    # because the summary (the imputed value) should come from the summary
    # mode.
    #
    # There are a lot of problems with this second formulation.
    # 0. SLOW. Like, for real.
    # 1. Non-deterministic. The answer will be different given the same
    #   samples.
    # 2. Inaccurate. Approximate inference about approximate inferences.
    #   In practice, confidences on the same samples could be significantly
    #   different because the Gibbs sampler that underlies crosscat is
    #   susceptible to getting stuck in local maxima. Of course, this could be
    #   mitigated to some extent by using more chains, but things are slow
    #   enough as it is.
    # 3. Confidence (interval) has a distinct meaning to the people who will
    #   be using this software. A unimodal summary does not necessarily mean
    #   that inferences are within an acceptable range. We are going to need to
    #   be loud about this. Maybe there should be a notion of tolerance?
    #
    # An alternative: mutual predictive coverage
    # ------------------------------------------
    # Divide the number of samples in the intersection of the 90% CI's of each
    # component model by the number of samples in the union of the 90% CI's of
    # each component model.

    from crosscat.cython_code import State

    # XXX: assumes samples come in as a 1-D numpy.array or a 1-D list
    num_samples = float(len(samples))
    T = [[x] for x in samples]

    # XXX: This is a highly problematic consequence of the current definition
    # of confidence. If the number of samples is 1, then the confidence is
    # always 1 because there will be exactly 1 mode in the DPMM (recall the
    # DPMM can have, at maximum, as many modes as data points). I figure if
    # we're going to give a bad answer, we should give it quickly.
    if num_samples == 1:
        return 1.0

    confs = []
    tlist = ['column_hyperparameters',
             'row_partition_hyperparameters',
             'row_partition_assignments']
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    if return_metadata:
        X_L_list = []
        X_D_list = []

    for _ in range(n_chains):
        ccstate = State.p_State(M_c, T)
        ccstate.transition(which_transitions=tlist, n_steps=n_steps)

        X_D = ccstate.get_X_D()

        assignment = X_D[0]
        num_cats = max(assignment)+1
        props = numpy.histogram(assignment, num_cats)[0]/num_samples
        confs.append(max(props))

        if return_metadata:
            X_L_list.append(ccstate.get_X_L())
            X_D_list.append(X_D)

    conf = numpy.mean(confs)
    if return_metadata:
        return conf, X_L_list, X_D_list
    else:
        return conf
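The "mutual predictive coverage" alternative described in the comments above is not implemented here. The sketch below is one possible reading of that definition (pooled samples inside the intersection of each component's 90% CI, divided by pooled samples inside the union); the function name and its arguments are assumptions, it relies on the module-level numpy import used throughout, and it does not call any crosscat API.

def mutual_predictive_coverage(sample_sets, ci=0.90):
    # sample_sets: list of 1-D arrays of simulated values, one per component model
    lo_p, hi_p = 100 * (1 - ci) / 2.0, 100 * (1 + ci) / 2.0
    # per-component credible intervals from empirical percentiles
    intervals = [numpy.percentile(s, [lo_p, hi_p]) for s in sample_sets]
    pooled = numpy.concatenate([numpy.asarray(s, dtype=float) for s in sample_sets])
    in_intersection = numpy.ones(len(pooled), dtype=bool)
    in_union = numpy.zeros(len(pooled), dtype=bool)
    for lo, hi in intervals:
        inside = (pooled >= lo) & (pooled <= hi)
        in_intersection &= inside
        in_union |= inside
    n_union = in_union.sum()
    return float(in_intersection.sum()) / n_union if n_union > 0 else 0.0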
def gen_data(cctypes, n_rows, cols_to_views, cluster_weights, separation, seed=0, distargs=None, return_structure=False):
	"""	Generates a synthetic data.
		Inputs:
			- cctypes: List of strings. Each entry, i, is the cctype of the 
			column i. ex: cctypes = ['continuous','continuous', 'multinomial']
			- n_rows: integer. the number of rows
			- cols_to_views: List of integers. Each entry, i, is the view, v, 
			to which columns i is assigned. v \in [0,...,n_cols-1].
			ex: cols_to_views = [0, 0, 1]
			- cluster_weights: List of lists of floats. A num_views length list
			of list. Each sublist, W, is a list of cluster weights for the 
			view, thus W should always sum to 1.
			ex (two views, first view has 2 clusters, second view has 3 
			clusters):
			cluster_weights = [[.3, .7], [.25, .5, .25]]
			- separation: list of floats. Each entry, i, is the separation, C,
			of the clusters in view i. C \in [0,1] where 0 is no separation and
			1 is well-separated.
			ex (2 views): separation = [ .5, .7]
			- seed: optional
			- distargs: optional (only if continuous). distargs is n_columns
			length list where each entry is either None or a dict appropriate 
			for the cctype in that column. For a normal feature, the entry 
			should be None, for a multinomial feature, the entry should be a 
			dict with the entry K (the number of categories). 
			- return_structure: (bool, optional). Returns also a dict withe the
			data generation structure included. A dict with keys:
				- component_params:  a n_cols length list of lists. Where each 
				list is a set of component model parameters for each cluster in
				the view to which that column belongs
				- cols_to_views: a list assigning each column to a view
				- rows_to_clusters: a n_views length list of list. Each entry,
				rows_to_clusters[v][r] is the cluster to which all rows in 
				columns belonging to view v are assigned
		Returns:
			T, M_c
		Example:
			>>> cctypes = ['continuous','continuous','multinomial','continuous','multinomial']
			>>> disargs = [None, None, dict(K=5), None, dict(K=2)]
			>>> n_rows = 10
			>>> cols_to_views = [0, 0, 1, 1, 2]
			>>> cluster_weights = [[.3, .7],[.5, .5],[.2, .3, .5]]
			>>> separation = [.9, .6, .9]
			>>> T, M_c = gen_data(cctypes, n_rows, cols_to_views, cluster_weights,
				separation, seed=0, distargs=distargs)
	"""

	# check Inputs
	if not isinstance(n_rows, int):
		raise TypeError("n_rows should be an integer")

	if not isinstance(cctypes, list):
		raise TypeError("cctypes should be a list")

	n_cols_cctypes = len(cctypes)
	for cctype in cctypes:
		if not isinstance(cctype, str):
			raise TypeError("cctypes should be a list of strings")

		# NOTE: will have to update when new component models are added
		if cctype not in ['continuous', 'multinomial']:
			raise ValueError("invalid cctypein cctypes: %s." % cctype)

	if not isinstance(cols_to_views, list):
		raise TypeError("cols_to_views should be a list")

	if len(cols_to_views) != n_cols_cctypes:
		raise ValueError("number of columns in cctypes does not match number\
		 of columns in cols_to_views")

	if min(cols_to_views) != 0:
		raise ValueError("min value of cols_to_views should be 0")

	n_views_cols_to_views = max(cols_to_views) + 1

	set_ctv = set(cols_to_views)
	if len(set_ctv) != n_views_cols_to_views:
		raise ValueError("View indices skipped in cols_to_views")

	# check cluster weights
	if not isinstance(cluster_weights, list):
		raise TypeError("cluster_weights should be a list")

	if n_views_cols_to_views != len(cluster_weights):
		raise ValueError("The number of views in cols_to_views and \
			cluster_weights do not agree.")

	# check each set of weights
	for W in cluster_weights:
		if not isinstance(W, list):
			raise TypeError("cluster_weights should be a list of lists")
		if math.fabs(sum(W)-1.0) > .0000001:
			raise ValueError("each vector of weights should sum to 1")

	if not isinstance(separation, list):
		raise TypeError("separation should be a list")

	if len(separation) != n_views_cols_to_views:
		raise ValueError("number of view in separation and cols_to_views do not agree")

	for c in separation:
		if not isinstance(c, float) or c > 1.0 or c < 0.0:
			raise ValueError("each value in separation should be a float from 0 to 1")

	num_views = len(separation)
	n_cols = len(cols_to_views)

	# check the cctypes vs the distargs
	if distargs is None:
		distargs = [None for i in range(n_cols)]

	if not isinstance(distargs, list):
		raise TypeError("distargs should be a list")

	if len(distargs) != n_cols:
		raise ValueError("distargs should have an entry for each column")

	for i in range(n_cols):
		if cctypes[i] == 'continuous':
			if distargs[i] is not None:
				raise ValueError("distargs entry for 'continuous' cctype should be None")
		elif cctypes[i] == 'multinomial':
			if not isinstance(distargs[i], dict):
				raise TypeError("ditargs for cctype 'multinomial' should be a dict")
			if len(distargs[i].keys()) != 1:
				raise KeyError("distargs for cctype 'multinomial' should have one key, 'K'")
			if 'K' not in distargs[i].keys():
				raise KeyError("distargs for cctype 'multinomial' should have the key 'K'")
		else:
			raise ValueError("invalid cctypein cctypes: %s." % cctypes[i])

	random.seed(seed)
	numpy.random.seed(seed)

	# Generate the rows-to-categories partitions (multinomial)
	rows_to_clusters = []
	for W in cluster_weights:

		# convert the weights to cumulative weights for p_draw
		# (a sketch of p_draw follows this function)
		cW = list(W)
		for i in range(1, len(cW)):
			cW[i] += cW[i-1]

		K = len(cW)

		# seed each cluster with one row, then draw the rest from the weights
		rows_to_clusters_view = list(range(K))
		for r in range(K, n_rows):
			rows_to_clusters_view.append(p_draw(cW))

		random.shuffle(rows_to_clusters_view)
		assert len(rows_to_clusters_view) == n_rows

		rows_to_clusters.append(rows_to_clusters_view)


	get_next_seed = lambda : random.randrange(2147483647)

	# start generating the data
	data_table = numpy.zeros((n_rows, n_cols))
	component_params = []
	for col in range(n_cols):
	
		view = cols_to_views[col]

		# get the number of cluster in view
		num_clusters = len(cluster_weights[view])

		cctype = cctypes[col]

		C = separation[view]

		# generate a set of C-separated component model parameters 
		component_parameters = generate_separated_model_parameters(cctype, C,
			num_clusters, get_next_seed, distargs=distargs[col])

		component_params.append(component_parameters)

		# get the data generation function
		gen = get_data_generator[cctype]
		for row in range(n_rows):
			# get the cluster this row belongs to
			cluster = rows_to_clusters[view][row]
			params = component_parameters[cluster]
			x = gen(params, 1, gen_seed=get_next_seed())[0]
			data_table[row,col] = x


	T = data_table.tolist()
	M_c = du.gen_M_c_from_T(T, cctypes=cctypes)

	if return_structure:
		structure = dict()
		structure['component_params'] = component_params
		structure['cols_to_views'] = cols_to_views
		structure['rows_to_clusters'] = rows_to_clusters
		return T, M_c, structure
	else:
		return T, M_c
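p_draw is defined elsewhere in the source. For reference, a minimal sketch consistent with how it is called above (it receives cumulative weights that end at 1.0 and returns a drawn cluster index) could look like the following; this is an assumption about its behavior, not the actual implementation, and it relies on the module-level random import used throughout.

def p_draw(cum_weights):
	# draw an index from a discrete distribution given its cumulative weights
	r = random.random()
	for index, threshold in enumerate(cum_weights):
		if r <= threshold:
			return index
	return len(cum_weights) - 1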
Example #26
0
def run_test(args):

    rho = args.rho
    num_times = args.num_times
    min_num_rows = args.min_num_rows
    max_num_rows = args.max_num_rows
    n_grid = args.n_grid
    filename = args.filename
    discrete = args.discrete

    num_samples = []
    for ns in log_linspace(min_num_rows, max_num_rows, n_grid).tolist():
        num_samples.append(int(ns))

    variances = []

    burn_in = 200

    MIs = numpy.zeros((num_times, len(num_samples)))

    mi_diff = numpy.zeros((len(num_samples), num_times))

    if not discrete:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ["continuous"] * 2
    else:
        T, true_mi, external_mi = gen_correlated_data_discrete(num_samples[-1], rho)
        cctypes = ["multinomial"] * 2

    data_subs = []

    n_index = 0
    for n in num_samples:
        T_sub = numpy.copy(T[0:n, :])  # first n rows of the full data set

        data = []

        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()

            MI, Linfoot = iu.mutual_information(M_c, [X_L], [X_D], [(0, 1)], n_samples=5000)

            mi_diff[n_index, t] = true_mi - MI[0][0]

            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, MI[0][0]))

            MIs[t, n_index] = MI[0][0]

        n_index += 1

    if discrete:
        dtype_str = "discrete"
    else:
        dtype_str = "continuous"

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"


    # plot data
    # pl.subplot(1,2,1)
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    i = 0
    for T_s in data_subs:
        pl.subplot(1, len(data_subs), i + 1)
        num_rows = num_samples[i]
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(T_s[:, 0], T_s[:, 1], bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=0.3, s=81)
        pl.title("#r: " + str(num_rows))

        i += 1

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(datname)
    pl.clf()

    pl.figure(tight_layout=True, figsize=(5, 4))
    # plot convergence
    # pl.subplot(1,2,2)
    # standard deviation across runs (divide by sqrt(num_times) for the standard error)
    stderr = numpy.std(MIs, axis=0)  # /(float(num_times)**.5)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=stderr, c="blue")
    pl.plot(num_samples, mean, c="blue", alpha=0.8, label="mean MI")
    pl.plot(num_samples, [true_mi] * len(num_samples), color="red", alpha=0.8, label="true MI")
    pl.plot(num_samples, [external_mi] * len(num_samples), color=(0, 0.5, 0.5), alpha=0.8, label="external MI")
    pl.title("convergence")
    pl.xlabel("#rows in X (log)")
    pl.ylabel("MI")

    pl.legend(loc=0, prop={"size": 8})
    pl.gca().set_xscale("log")

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(figname)