Example #1
def GenDataFromPartitions(col_part,row_parts,mean_gen,std_gen,std_data):
	n_cols = len(col_part)
	n_rows = row_parts.shape[1]

	# time() returns a float of seconds; scale, truncate, and wrap into
	# numpy's valid 32-bit seed range (a raw int(time()*100) exceeds 2**32-1
	# and raises ValueError; also requires `from time import time`)
	seed = int(time()*100) % (2**32)
	np.random.seed(seed)

	T = np.zeros((n_rows,n_cols))

	for col in range(n_cols):
		view = col_part[col]
		row_part = row_parts[view,:]
		cats = max(row_part)+1
		for cat in range(cats):
			row_dex = np.nonzero(row_part==cat)[0]
			n_rows_cat = len(row_dex)
			mean = np.random.normal(mean_gen,std_gen)
			X = np.random.normal(mean,std_data,(n_rows_cat,1))
			i = 0
			for row in row_dex:
				T[row,col] = X[i]
				i += 1

	
	T = T.tolist()
	M_r = du.gen_M_r_from_T(T)
	M_c = du.gen_M_c_from_T(T)

	return T, M_r, M_c
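Note that seeding from `time()` makes each call non-reproducible (see Example #4 for a seeded variant). A minimal usage sketch, assuming `np` is NumPy, `du` is the accompanying data_utils module, and the partitions below are illustrative inputs rather than anything from the original source:

import numpy as np

col_part = [0, 0, 1]                 # three columns assigned to two views
row_parts = np.array([[0, 0, 1, 1],  # view 0: rows split into two categories
                      [0, 1, 0, 1]]) # view 1: a different two-way split
T, M_r, M_c = GenDataFromPartitions(col_part, row_parts,
                                    mean_gen=0.0, std_gen=1.0, std_data=0.1)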
Example #2
def generate_multinomial_data(next_seed, n_cols, n_rows, n_views):
    # generate the partitions
    random.seed(next_seed)

    cols_to_views = [0 for _ in range(n_cols)]
    rows_in_views_to_cols = []
    for view in range(n_views):
        partition = eu.CRP(n_rows, 2.0)
        random.shuffle(partition)
        rows_in_views_to_cols.append(partition)

    # generate the data
    data = numpy.zeros((n_rows, n_cols), dtype=float)
    for col in range(n_cols):
        view = cols_to_views[col]
        for row in range(n_rows):
            cluster = rows_in_views_to_cols[view][row]
            data[row, col] = cluster

    T = data.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    T, M_c = du.convert_columns_to_multinomial(T, M_c, list(range(n_cols)))

    return T, M_r, M_c
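`eu.CRP(n_rows, 2.0)` is external to this listing; from its use here it evidently returns a length-`n_rows` list of cluster labels drawn from a Chinese Restaurant Process with concentration 2.0. A self-contained sketch of such a sampler (an assumption about `eu.CRP`'s behavior, not its actual code):

import random

def crp_partition(n, alpha, seed=None):
    # Chinese Restaurant Process: item i joins an existing cluster with
    # probability count/(i + alpha), or opens a new one with alpha/(i + alpha).
    rng = random.Random(seed)
    counts = []   # cluster sizes
    labels = []   # cluster label assigned to each item
    for i in range(n):
        if rng.random() < alpha / (i + alpha):
            labels.append(len(counts))  # open a new cluster
            counts.append(1)
        else:
            j = rng.choices(range(len(counts)), weights=counts)[0]
            labels.append(j)
            counts[j] += 1
    return labels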
Example #3
def _forward_sample_from_prior(inf_seed_and_n_samples, M_c, T,
        probe_columns=(0,),
        ROW_CRP_ALPHA_GRID=(), COLUMN_CRP_ALPHA_GRID=(),
        S_GRID=(), MU_GRID=(),
        N_GRID=default_n_grid,
        ):
    inf_seed, n_samples = inf_seed_and_n_samples
    T = numpy.zeros(numpy.array(T).shape).tolist()
    M_r = du.gen_M_r_from_T(T)
    engine = LE.LocalEngine(inf_seed)
    diagnostics_data = collections.defaultdict(list)
    diagnostics_funcs = None
    for sample_idx in range(n_samples):
        X_L, X_D = engine.initialize(M_c, M_r, T,
                ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
                COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
                S_GRID=S_GRID,
                MU_GRID=MU_GRID,
                N_GRID=N_GRID,
                )
        if diagnostics_funcs is None:
            diagnostics_funcs = generate_diagnostics_funcs(X_L, probe_columns)
        diagnostics_data = collect_diagnostics(X_L, diagnostics_data,
                diagnostics_funcs)
        pass
    return diagnostics_data
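The first parameter packs `inf_seed` and `n_samples` into a single tuple, the usual shape for a function handed to a process pool's one-argument `map`. A hypothetical driver along those lines, assuming `M_c` and `T` are already defined and the seed list is illustrative:

from functools import partial
from multiprocessing import Pool

work = [(seed, 10) for seed in range(4)]  # (inf_seed, n_samples) pairs
sampler = partial(_forward_sample_from_prior, M_c=M_c, T=T)
with Pool(processes=4) as pool:
    per_seed_diagnostics = pool.map(sampler, work)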
Example #4
def GenDataFromPartitions(col_part, row_parts, mean_gen, std_gen, std_data,
                          seed):
    n_cols = len(col_part)
    n_rows = row_parts.shape[1]

    rng = np.random.RandomState(seed)

    T = np.zeros((n_rows, n_cols))

    for col in range(n_cols):
        view = col_part[col]
        row_part = row_parts[view, :]
        cats = max(row_part) + 1
        for cat in range(cats):
            row_dex = np.nonzero(row_part == cat)[0]
            n_rows_cat = len(row_dex)
            mean = rng.normal(mean_gen, std_gen)
            X = rng.normal(mean, std_data, (n_rows_cat, 1))
            i = 0
            for row in row_dex:
                T[row, col] = X[i]
                i += 1

    T = T.tolist()
    M_r = du.gen_M_r_from_T(T)
    M_c = du.gen_M_c_from_T(T)

    return T, M_r, M_c
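Unlike Example #1, this variant takes an explicit seed and confines it to a private `RandomState`, so it neither mutates NumPy's global RNG nor varies between runs. A quick determinism check, reusing the illustrative `col_part` and `row_parts` from the sketch under Example #1:

T1, _, _ = GenDataFromPartitions(col_part, row_parts, 0.0, 1.0, 0.1, seed=42)
T2, _, _ = GenDataFromPartitions(col_part, row_parts, 0.0, 1.0, 0.1, seed=42)
assert T1 == T2  # same seed, same private RNG stream, identical data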
Example #5
def generate_multinomial_data(next_seed,n_cols,n_rows,n_views):
	# generate the partitions
	random.seed(next_seed)
	
	cols_to_views = [0 for _ in range(n_cols)]
	rows_in_views_to_cols = []
	for view in range(n_views):
		partition = eu.CRP(n_rows,2.0)
		random.shuffle(partition)
		rows_in_views_to_cols.append(partition)

	# generate the data
	data = numpy.zeros((n_rows,n_cols),dtype=float)
	for col in range(n_cols):
		view = cols_to_views[col]
		for row in range(n_rows):
			cluster = rows_in_views_to_cols[view][row]
			data[row,col] = cluster

	T = data.tolist()
	M_r = du.gen_M_r_from_T(T)
	M_c = du.gen_M_c_from_T(T)

	T, M_c = du.convert_columns_to_multinomial(T, M_c, list(range(n_cols)))

	return T, M_r, M_c
Example #6
def run_posterior_chain(seed, M_c, T, num_iters,
        probe_columns=(0,),
        ROW_CRP_ALPHA_GRID=(), COLUMN_CRP_ALPHA_GRID=(),
        S_GRID=(), MU_GRID=(),
        N_GRID=default_n_grid,
        plot_rand_idx=None,
        ):
    plot_rand_idx = arbitrate_plot_rand_idx(plot_rand_idx, num_iters)
    engine = LE.LocalEngine(seed)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = engine.initialize(M_c, M_r, T, 'from_the_prior',
            ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
            COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
            S_GRID=S_GRID,
            MU_GRID=MU_GRID,
            N_GRID=N_GRID,
            )
    diagnostics_funcs = generate_diagnostics_funcs(X_L, probe_columns)
    diagnostics_data = collections.defaultdict(list)
    for idx in range(num_iters):
        M_c, T, X_L, X_D = run_posterior_chain_iter(engine, M_c, T, X_L, X_D, diagnostics_data,
                diagnostics_funcs,
                ROW_CRP_ALPHA_GRID,
                COLUMN_CRP_ALPHA_GRID,
                S_GRID, MU_GRID,
                N_GRID=N_GRID,
                )
        if idx == plot_rand_idx:
            # This DOESN'T work with multithreading
            filename = 'T_%s' % idx
            pu.plot_views(numpy.array(T), X_D, X_L, M_c, filename=filename,
                    dir='./', close=True, format=image_format)
            pass
        pass
    return diagnostics_data
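`generate_diagnostics_funcs` and `collect_diagnostics` do not appear in this listing. Judging from the `defaultdict(list)` accumulator and the call sites, the collector presumably appends each probe's current value to a per-name trace; a sketch under that assumption (the dict-of-callables shape is a guess):

def collect_diagnostics_sketch(X_L, diagnostics_data, diagnostics_funcs):
    # Append the current value of every named probe to its trace.
    for name, func in diagnostics_funcs.items():
        diagnostics_data[name].append(func(X_L))
    return diagnostics_data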
Example #7
def run_posterior_chain(
        seed,
        M_c,
        T,
        num_iters,
        probe_columns=(0, ),
        ROW_CRP_ALPHA_GRID=(),
        COLUMN_CRP_ALPHA_GRID=(),
        S_GRID=(),
        MU_GRID=(),
        N_GRID=default_n_grid,
        CT_KERNEL=0,
        plot_rand_idx=None,
):
    plot_rand_idx = arbitrate_plot_rand_idx(plot_rand_idx, num_iters)
    engine = LE.LocalEngine(seed)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = engine.initialize(
        M_c,
        M_r,
        T,
        'from_the_prior',
        ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
        S_GRID=S_GRID,
        MU_GRID=MU_GRID,
        N_GRID=N_GRID,
    )
    diagnostics_funcs = generate_diagnostics_funcs(X_L, probe_columns)
    diagnostics_data = collections.defaultdict(list)
    for idx in range(num_iters):
        M_c, T, X_L, X_D = run_posterior_chain_iter(
            engine,
            M_c,
            T,
            X_L,
            X_D,
            diagnostics_data,
            diagnostics_funcs,
            ROW_CRP_ALPHA_GRID,
            COLUMN_CRP_ALPHA_GRID,
            S_GRID,
            MU_GRID,
            N_GRID=N_GRID,
            CT_KERNEL=CT_KERNEL,
        )
        if idx == plot_rand_idx:
            # This DOESN'T work with multithreading
            filename = 'T_%s' % idx
            pu.plot_views(numpy.array(T),
                          X_D,
                          X_L,
                          M_c,
                          filename=filename,
                          dir='./',
                          close=True,
                          format=image_format)
            pass
        pass
    return diagnostics_data
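This variant differs from Example #6 mainly by threading a `CT_KERNEL` argument through to each transition. A hypothetical call, with `M_c` and `T` as produced by any of the generators above and purely illustrative iteration and kernel values:

diagnostics = run_posterior_chain(0, M_c, T, num_iters=100, CT_KERNEL=1)
for name, trace in diagnostics.items():
    print(name, trace[-1])  # final recorded value of each diagnostic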
Example #8
# create the data
if True:
    T, M_r, M_c = du.gen_factorial_data_objects(
        gen_seed, num_clusters,
        num_cols, num_rows, num_splits,
        max_mean=max_mean, max_std=max_std,
        )
else:
    with open('SynData2.csv') as fh:
        import numpy
        import csv
        T = numpy.array([
                row for row in csv.reader(fh)
                ], dtype=float).tolist()
        M_r = du.gen_M_r_from_T(T)
        M_c = du.gen_M_c_from_T(T)


# create the state
p_State = State.p_State(M_c, T, N_GRID=N_GRID, SEED=inf_seed)
p_State.plot_T(filename='T')

# transition the sampler
print("p_State.get_marginal_logp():", p_State.get_marginal_logp())
for transition_idx in range(num_transitions):
    print("transition #: %s" % transition_idx)
    p_State.transition()
    counts = [
        view_state['row_partition_model']['counts']
        for view_state in p_State.get_X_L()['view_state']
    ]
Example #9
def test_kl_divergence_as_a_function_of_N_and_transitions():

    n_clusters = 3
    n_chains = 8
    do_times = 4

    # N_list = [25, 50, 100, 250, 500, 1000, 2000]
    N_list = [25, 50, 100, 175, 250, 400, 500]

    # max_transitions = 500
    max_transitions = 500
    transition_interval = 50
    t_iterations = max_transitions // transition_interval

    cctype = "continuous"
    cluster_weights = [1.0 / float(n_clusters)] * n_clusters
    separation = 0.5

    get_next_seed = lambda: random.randrange(2147483647)

    # data grid
    KLD = numpy.zeros((len(N_list), t_iterations + 1))

    for _ in range(do_times):
        for n in range(len(N_list)):
            N = N_list[n]
            T, M_c, struc = sdg.gen_data(
                [cctype],
                N,
                [0],
                [cluster_weights],
                [separation],
                seed=get_next_seed(),
                distargs=[None],
                return_structure=True,
            )

            M_r = du.gen_M_r_from_T(T)

            # precompute the support and pdf to speed up calculation of KL divergence
            support = qtu.get_mixture_support(
                cctype, ccmext.p_ContinuousComponentModel, struc["component_params"][0], nbins=1000, support=0.995
            )
            true_log_pdf = qtu.get_mixture_pdf(
                support, ccmext.p_ContinuousComponentModel, struc["component_params"][0], cluster_weights
            )

            # initialize a multiprocessing engine
            mstate = mpe.MultiprocessingEngine(cpu_count=8)
            X_L_list, X_D_list = mstate.initialize(M_c, M_r, T, n_chains=n_chains)

            # kl_divergences
            klds = numpy.zeros(len(X_L_list))

            for i in range(len(X_L_list)):
                X_L = X_L_list[i]
                X_D = X_D_list[i]
                KLD[n, 0] += qtu.KL_divergence(
                    ccmext.p_ContinuousComponentModel,
                    struc["component_params"][0],
                    cluster_weights,
                    M_c,
                    X_L,
                    X_D,
                    n_samples=1000,
                    support=support,
                    true_log_pdf=true_log_pdf,
                )

            # run transition_interval steps, then take a reading; rinse and repeat
            for t in range(t_iterations):
                X_L_list, X_D_list = mstate.analyze(M_c, T, X_L_list, X_D_list, n_steps=transition_interval)

                for i in range(len(X_L_list)):
                    X_L = X_L_list[i]
                    X_D = X_D_list[i]
                    KLD[n, t + 1] += qtu.KL_divergence(
                        ccmext.p_ContinuousComponentModel,
                        struc["component_params"][0],
                        cluster_weights,
                        M_c,
                        X_L,
                        X_D,
                        n_samples=1000,
                        support=support,
                        true_log_pdf=true_log_pdf,
                    )

    KLD /= float(n_chains * do_times)

    pylab.subplot(1, 3, 1)
    pylab.contourf(list(range(0, max_transitions + 1, transition_interval)), N_list, KLD)
    pylab.title("KL divergence")
    pylab.ylabel("N")
    pylab.xlabel("# transitions")

    pylab.subplot(1, 3, 2)
    m_N = numpy.mean(KLD, axis=1)
    e_N = numpy.std(KLD, axis=1) / float(KLD.shape[1]) ** 0.5
    pylab.errorbar(N_list, m_N, yerr=e_N)
    pylab.title("KL divergence by N")
    pylab.xlabel("N")
    pylab.ylabel("KL divergence")

    pylab.subplot(1, 3, 3)
    m_t = numpy.mean(KLD, axis=0)
    e_t = numpy.std(KLD, axis=0) / float(KLD.shape[0]) ** 0.5
    pylab.errorbar(list(range(0, max_transitions + 1, transition_interval)), m_t, yerr=e_t)
    pylab.title("KL divergence by transitions")
    pylab.xlabel("trasition")
    pylab.ylabel("KL divergence")

    pylab.show()

    return KLD
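`qtu.KL_divergence` is external to this listing; generically, such an estimator averages log-density ratios over samples from the true mixture. A minimal sketch of that idea (not the `qtu` implementation; `draw_from_p`, `log_p`, and `log_q` are assumed vectorized callables):

import numpy

def kl_divergence_mc(draw_from_p, log_p, log_q, n_samples=1000):
    # KL(p || q) = E_p[log p(X) - log q(X)], estimated by Monte Carlo.
    xs = draw_from_p(n_samples)
    return float(numpy.mean(log_p(xs) - log_q(xs)))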
Example #10
def generate_correlated_state(num_rows,
                              num_cols,
                              num_views,
                              num_clusters,
                              mean_range,
                              corr,
                              seed=0):
    #

    assert (num_clusters <= num_rows)
    assert (num_views <= num_cols)
    T = numpy.zeros((num_rows, num_cols))

    random.seed(seed)
    numpy.random.seed(seed=seed)
    get_next_seed = lambda: random.randrange(2147483647)

    # generate an assignment of columns to views (uniform)
    cols_to_views = list(range(num_views))
    view_counts = numpy.ones(num_views, dtype=int)
    for i in range(num_views, num_cols):
        r = random.randrange(num_views)
        cols_to_views.append(r)
        view_counts[r] += 1

    random.shuffle(cols_to_views)

    assert (len(cols_to_views) == num_cols)
    assert (max(cols_to_views) == num_views - 1)

    # for each view, generate an assignment of rows to num_clusters
    row_to_clusters = []
    cluster_counts = []
    for view in range(num_views):
        row_to_cluster = list(range(num_clusters))
        cluster_counts_i = numpy.ones(num_clusters, dtype=int)
        for i in range(num_clusters, num_rows):
            r = random.randrange(num_clusters)
            row_to_cluster.append(r)
            cluster_counts_i[r] += 1

        random.shuffle(row_to_cluster)

        assert (len(row_to_cluster) == num_rows)
        assert (max(row_to_cluster) == num_clusters - 1)

        row_to_clusters.append(row_to_cluster)
        cluster_counts.append(cluster_counts_i)

    assert (len(row_to_clusters) == num_views)

    # generate the correlated data
    for view in range(num_views):
        for cluster in range(num_clusters):
            cell_cols = view_counts[view]
            cell_rows = cluster_counts[view][cluster]
            means = numpy.random.uniform(-mean_range / 2.0, mean_range / 2.0,
                                         cell_cols)
            X = generate_correlated_data(cell_rows,
                                         cell_cols,
                                         means,
                                         corr,
                                         seed=get_next_seed())
            # get the indices of the columns in this view
            col_indices = numpy.nonzero(numpy.array(cols_to_views) == view)[0]
            # get the indices of the rows in this view and this cluster
            row_indices = numpy.nonzero(
                numpy.array(row_to_clusters[view]) == cluster)[0]
            # insert the data
            for col in range(cell_cols):
                for row in range(cell_rows):
                    r = row_indices[row]
                    c = col_indices[col]
                    T[r, c] = X[row, col]

    M_c = du.gen_M_c_from_T(T)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = generate_X_L_and_X_D(T,
                                    M_c,
                                    cols_to_views,
                                    row_to_clusters,
                                    seed=get_next_seed())

    return T, M_c, M_r, X_L, X_D, cols_to_views
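`generate_correlated_data` is not shown in this listing. One standard way to draw `cell_rows` rows whose columns share a constant pairwise correlation `corr` is a multivariate normal with unit variances and `corr` off the diagonal; a sketch of that approach, offered as an assumption about what the helper does:

import numpy

def correlated_normal_sketch(n_rows, n_cols, means, corr, seed=0):
    # Covariance matrix: 1.0 on the diagonal, `corr` everywhere else.
    rng = numpy.random.RandomState(seed)
    cov = numpy.full((n_cols, n_cols), corr)
    numpy.fill_diagonal(cov, 1.0)
    return rng.multivariate_normal(means, cov, size=n_rows)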
Example #11
def generate_correlated_state(num_rows, num_cols, num_views, num_clusters, mean_range, corr, seed=0):
    #

    assert(num_clusters <= num_rows)
    assert(num_views <= num_cols)
    T = numpy.zeros((num_rows, num_cols))

    random.seed(seed)
    numpy.random.seed(seed=seed)
    get_next_seed = lambda : random.randrange(2147483647)

    # generate an assignment of columns to views (uniform)
    cols_to_views = list(range(num_views))
    view_counts = numpy.ones(num_views, dtype=int)
    for i in range(num_views, num_cols):
        r = random.randrange(num_views)
        cols_to_views.append(r)
        view_counts[r] += 1

    random.shuffle(cols_to_views)

    assert(len(cols_to_views) == num_cols)
    assert(max(cols_to_views) == num_views-1)

    # for each view, generate an assignment of rows to num_clusters
    row_to_clusters = []
    cluster_counts = []
    for view in range(num_views):
        row_to_cluster = list(range(num_clusters))
        cluster_counts_i = numpy.ones(num_clusters,dtype=int)
        for i in range(num_clusters, num_rows):
            r = random.randrange(num_clusters)
            row_to_cluster.append(r)
            cluster_counts_i[r] += 1

        random.shuffle(row_to_cluster)

        assert(len(row_to_cluster) == num_rows)
        assert(max(row_to_cluster) == num_clusters-1)

        row_to_clusters.append(row_to_cluster)
        cluster_counts.append(cluster_counts_i)

    assert(len(row_to_clusters) == num_views)

    # generate the correlated data
    for view in range(num_views):
        for cluster in range(num_clusters):
            cell_cols = view_counts[view]
            cell_rows = cluster_counts[view][cluster]
            means = numpy.random.uniform(-mean_range/2.0,mean_range/2.0,cell_cols)
            X =  generate_correlated_data(cell_rows, cell_cols, means, corr, seed=get_next_seed())
            # get the indices of the columns in this view
            col_indices = numpy.nonzero(numpy.array(cols_to_views)==view)[0]
            # get the indices of the rows in this view and this cluster
            row_indices = numpy.nonzero(numpy.array(row_to_clusters[view])==cluster)[0]
            # insert the data
            for col in range(cell_cols):
                for row in range(cell_rows):
                    r = row_indices[row]
                    c = col_indices[col]
                    T[r,c] = X[row,col]


    M_c = du.gen_M_c_from_T(T)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = generate_X_L_and_X_D(T, M_c, cols_to_views, row_to_clusters, seed=get_next_seed())

    return T, M_c, M_r, X_L, X_D, cols_to_views
Example #12
def test_kl_divergence_as_a_function_of_N_and_transitions():

	n_clusters = 3
	n_chains = 8
	do_times = 4

	# N_list = [25, 50, 100, 250, 500, 1000, 2000]
	N_list = [25, 50, 100, 175, 250, 400, 500]

	# max_transitions = 500
	max_transitions = 500
	transition_interval = 50
	t_iterations = max_transitions//transition_interval

	cctype = 'continuous'
	cluster_weights = [1.0/float(n_clusters)]*n_clusters
	separation = .5

	get_next_seed = lambda : random.randrange(2147483647)

	# data grid
	KLD = numpy.zeros((len(N_list), t_iterations+1))

	for _ in range(do_times):
		for n in range(len(N_list)):
			N = N_list[n]
			T, M_c, struc = sdg.gen_data([cctype], N, [0], [cluster_weights], 
							[separation], seed=get_next_seed(), distargs=[None],
							return_structure=True)

			M_r = du.gen_M_r_from_T(T)

			# precompute the support and pdf to speed up calculation of KL divergence
			support = qtu.get_mixture_support(cctype, 
						ccmext.p_ContinuousComponentModel, 
						struc['component_params'][0], nbins=1000, support=.995)
			true_log_pdf = qtu.get_mixture_pdf(support,
						ccmext.p_ContinuousComponentModel, 
						struc['component_params'][0],cluster_weights)

			# initialize a multiprocessing engine
			mstate = mpe.MultiprocessingEngine(cpu_count=8)
			X_L_list, X_D_list = mstate.initialize(M_c, M_r, T, n_chains=n_chains)

			# kl_divergences
			klds = numpy.zeros(len(X_L_list))

			for i in range(len(X_L_list)):
				X_L = X_L_list[i]
				X_D = X_D_list[i]
				KLD[n,0] += qtu.KL_divergence(ccmext.p_ContinuousComponentModel,
						struc['component_params'][0], cluster_weights, M_c, 
						X_L, X_D, n_samples=1000, support=support, 
						true_log_pdf=true_log_pdf)


			# run transition_interval then take a reading. Rinse and repeat.
			for t in range( t_iterations ):
				X_L_list, X_D_list = mstate.analyze(M_c, T, X_L_list, X_D_list,
							n_steps=transition_interval)

				for i in range(len(X_L_list)):
					X_L = X_L_list[i]
					X_D = X_D_list[i]
					KLD[n,t+1] += qtu.KL_divergence(ccmext.p_ContinuousComponentModel,
							struc['component_params'][0], cluster_weights, M_c, 
							X_L, X_D, n_samples=1000, support=support, 
							true_log_pdf=true_log_pdf)


	KLD /= float(n_chains*do_times)

	pylab.subplot(1,3,1)
	pylab.contourf(list(range(0,max_transitions+1,transition_interval)), N_list, KLD)
	pylab.title('KL divergence')
	pylab.ylabel('N')
	pylab.xlabel('# transitions')


	pylab.subplot(1,3,2)
	m_N = numpy.mean(KLD,axis=1)
	e_N = numpy.std(KLD,axis=1)/float(KLD.shape[1])**.5
	pylab.errorbar(N_list,  m_N, yerr=e_N)
	pylab.title('KL divergence by N')
	pylab.xlabel('N')
	pylab.ylabel('KL divergence')

	pylab.subplot(1,3,3)
	m_t = numpy.mean(KLD,axis=0)
	e_t = numpy.std(KLD,axis=0)/float(KLD.shape[0])**.5
	pylab.errorbar(list(range(0,max_transitions+1,transition_interval)), m_t, yerr=e_t)
	pylab.title('KL divergence by transitions')
	pylab.xlabel('transition')
	pylab.ylabel('KL divergence')

	pylab.show()

	return KLD