def quick_le(seed, n_chains=1):
    # Specify synthetic dataset structure.
    cctypes = ['continuous', 'continuous', 'multinomial', 'multinomial',
               'continuous']
    distargs = [None, None, dict(K=9), dict(K=7), None]
    cols_to_views = [0, 0, 0, 1, 1]
    separation = [0.6, 0.9]
    cluster_weights = [[.2, .3, .5], [.9, .1]]

    # Obtain the generated dataset and metadata.
    T, M_c, M_r = sdg.gen_data(
        cctypes, N_ROWS, cols_to_views, cluster_weights, separation,
        seed=seed, distargs=distargs, return_structure=True)

    # Create and initialize the engine; the seed is passed explicitly to
    # initialize in this variant.
    engine = LocalEngine()
    X_L, X_D = engine.initialize(M_c, M_r, T, seed, n_chains=n_chains)

    return T, M_r, M_c, X_L, X_D, engine
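# Hedged usage sketch for quick_le: it assumes the module-level names the
# helper relies on (sdg, N_ROWS, LocalEngine) are already defined by this
# file's imports.
T, M_r, M_c, X_L, X_D, engine = quick_le(seed=0, n_chains=4)

# With n_chains > 1, X_L and X_D are per-chain lists, which analyze accepts
# directly; the seed is passed positionally, matching the analyze calls
# later in this file.
X_L, X_D = engine.analyze(M_c, T, X_L, X_D, 0, n_steps=10)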
def runner(config):
    generate_args, analyze_args, inf_seed = _munge_config(config)

    # Generate synthetic data.
    T, M_c, M_r, X_L, X_D = du.generate_clean_state(
        max_mean=10, max_std=1, **generate_args)
    table_shape = map(len, (T, T[0]))
    start_dims = du.get_state_shape(X_L)

    # Run the engine with do_timing=True to capture elapsed seconds.
    engine = LocalEngine(inf_seed)
    X_L, X_D, (elapsed_secs,) = engine.analyze(
        M_c, T, X_L, X_D, do_timing=True, **analyze_args)

    # Summarize the run.
    end_dims = du.get_state_shape(X_L)
    same_shape = start_dims == end_dims
    summary = dict(
        elapsed_secs=elapsed_secs,
        same_shape=same_shape,
    )
    ret_dict = dict(
        config=config,
        summary=summary,
        table_shape=table_shape,
        start_dims=start_dims,
        end_dims=end_dims,
    )
    return ret_dict
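# The _munge_config helper is not shown above. A hypothetical sketch of the
# shape it must produce for this runner; the key names here are illustrative
# assumptions, not the real config schema.
def _munge_config(config):
    generate_args = dict(
        gen_seed=config['gen_seed'],
        num_rows=config['num_rows'],
        num_cols=config['num_cols'],
        num_clusters=config['num_clusters'],
        num_splits=config['num_splits'],
    )
    analyze_args = dict(n_steps=config['n_steps'])
    return generate_args, analyze_args, config['inf_seed']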
def get_generative_clustering(M_c, M_r, T,
                              data_inverse_permutation_indices,
                              num_clusters, num_views):
    from crosscat.LocalEngine import LocalEngine
    import crosscat.cython_code.State as State
    # NOTE: this function only works because State.p_State doesn't use
    # column_component_suffstats
    num_rows = len(T)
    num_cols = len(T[0])
    X_D_helper = numpy.repeat(range(num_clusters), (num_rows / num_clusters))
    gen_X_D = [
        X_D_helper[numpy.argsort(data_inverse_permutation_index)]
        for data_inverse_permutation_index in data_inverse_permutation_indices
    ]
    gen_X_L_assignments = numpy.repeat(range(num_views),
                                       (num_cols / num_views))
    # Initialize to generate an X_L to manipulate.
    local_engine = LocalEngine()
    bad_X_L, bad_X_D = local_engine.initialize(M_c, M_r, T,
                                               initialization='apart')
    bad_X_L['column_partition']['assignments'] = gen_X_L_assignments
    # Manually construct the state in its generative configuration.
    state = State.p_State(M_c, T, bad_X_L, gen_X_D)
    gen_X_L = state.get_X_L()
    gen_X_D = state.get_X_D()
    # Run inference on hyperparameters to leave them in a reasonable state.
    kernel_list = (
        'row_partition_hyperparameters',
        'column_hyperparameters',
        'column_partition_hyperparameter',
    )
    gen_X_L, gen_X_D = local_engine.analyze(M_c, T, gen_X_L, gen_X_D,
                                            n_steps=1,
                                            kernel_list=kernel_list)
    #
    return gen_X_L, gen_X_D
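# Self-contained illustration of the argsort trick above, using 6 rows in
# 2 clusters (pure numpy; no crosscat state required).
import numpy

# Cluster labels for the rows in generation order: [0 0 0 1 1 1].
labels_in_gen_order = numpy.repeat(range(2), 3)
# Hypothetical inverse permutation: entry j is the position in T where
# generated row j ended up.
inverse_permutation = numpy.array([1, 3, 5, 0, 2, 4])
# argsort of the inverse permutation recovers the forward permutation,
# aligning the block labels with T's row order.
print(labels_in_gen_order[numpy.argsort(inverse_permutation)])
# [1 0 1 0 1 0]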
def runner(config):
    # helpers
    def munge_config(config):
        kwargs = config.copy()
        kwargs['num_splits'] = kwargs.pop('num_views')
        n_steps = kwargs.pop('n_steps')
        n_test = kwargs.pop('n_test')
        return kwargs, n_steps, n_test

    def calc_ll(T, p_State):
        log_likelihoods = map(p_State.calc_row_predictive_logp, T)
        mean_log_likelihood = numpy.mean(log_likelihoods)
        return mean_log_likelihood

    def gen_data(**kwargs):
        T, M_c, M_r, gen_X_L, gen_X_D = du.generate_clean_state(**kwargs)
        #
        engine = LocalEngine()
        sampled_T = gu.sample_T(engine, M_c, T, gen_X_L, gen_X_D)
        T_test = random.sample(sampled_T, n_test)
        gen_data_ll = ctu.calc_mean_test_log_likelihood(
            M_c, T, gen_X_L, gen_X_D, T)
        gen_test_set_ll = ctu.calc_mean_test_log_likelihood(
            M_c, T, gen_X_L, gen_X_D, T_test)
        #
        return T, M_c, M_r, T_test, gen_data_ll, gen_test_set_ll

    kwargs, n_steps, n_test = munge_config(config)
    T, M_c, M_r, T_test, gen_data_ll, gen_test_set_ll = gen_data(**kwargs)

    # set up to run inference
    calc_data_ll = partial(calc_ll, T)
    calc_test_set_ll = partial(calc_ll, T_test)
    diagnostic_func_dict = dict(
        data_ll=calc_data_ll,
        test_set_ll=calc_test_set_ll,
    )

    # run inference
    engine = LocalEngine()
    X_L, X_D = engine.initialize(M_c, M_r, T)
    X_L, X_D, diagnostics_dict = engine.analyze(
        M_c, T, X_L, X_D,
        do_diagnostics=diagnostic_func_dict, n_steps=n_steps)

    # package result
    final_data_ll = diagnostics_dict['data_ll'][-1][-1]
    final_test_set_ll = diagnostics_dict['test_set_ll'][-1][-1]
    summary = dict(
        gen_data_ll=gen_data_ll,
        gen_test_set_ll=gen_test_set_ll,
        final_data_ll=final_data_ll,
        final_test_set_ll=final_test_set_ll,
    )
    result = dict(
        config=config,
        summary=summary,
        diagnostics_dict=diagnostics_dict,
    )
    return result
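# Hedged usage sketch for this runner: 'num_views', 'n_steps', and 'n_test'
# are the keys munge_config pops; the remaining keys are assumed to be
# accepted by du.generate_clean_state and may differ in practice.
config = dict(
    gen_seed=0,
    num_rows=100,
    num_cols=8,
    num_clusters=2,
    num_views=2,
    n_steps=50,
    n_test=20,
)
result = runner(config)
print(result['summary'])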
def quick_le(seed, n_chains=1):
    # Specify synthetic dataset structure.
    cctypes = ['continuous', 'continuous', 'multinomial', 'multinomial',
               'continuous']
    distargs = [None, None, dict(K=9), dict(K=7), None]
    cols_to_views = [0, 0, 0, 1, 1]
    separation = [0.6, 0.9]
    cluster_weights = [[.2, .3, .5], [.9, .1]]

    # Obtain the generated dataset and metadata.
    T, M_c, M_r = sdg.gen_data(
        cctypes, N_ROWS, cols_to_views, cluster_weights, separation,
        seed=seed, distargs=distargs, return_structure=True)

    # Create and initialize the engine. This variant seeds the engine at
    # construction instead of passing the seed to initialize.
    engine = LocalEngine(seed=seed)
    X_L, X_D = engine.initialize(M_c, M_r, T, n_chains=n_chains)

    return T, M_r, M_c, X_L, X_D, engine
from crosscat.LocalEngine import LocalEngine
import crosscat.utils.data_utils as data_utils

data_filename = 'T.csv'
inference_seed = 0
num_full_transitions = 10

# Read the data table into the internal JSON representation.
data_table, row_metadata, column_metadata, header = \
    data_utils.read_data_objects(data_filename)

# Create an engine to run analysis and inference.
engine = LocalEngine(seed=inference_seed)

# Initialize the Markov chain samples.
initial_latent_state, initial_latent_state_clustering = \
    engine.initialize(column_metadata, row_metadata, data_table)

# Run Markov chain transition kernels on the samples.
latent_state, latent_state_clustering = engine.analyze(
    column_metadata, data_table, initial_latent_state,
    initial_latent_state_clustering, n_steps=num_full_transitions)
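# Hedged follow-up sketch: query the fitted latent state for posterior
# samples of one cell. simple_predictive_sample is a LocalEngine method, but
# the exact argument order used here (the seed position in particular) is an
# assumption.
Y = []         # conditioning constraints: (row, column, value) triples
Q = [(0, 1)]   # query cells: (row, column) pairs
samples = engine.simple_predictive_sample(
    column_metadata, latent_state, latent_state_clustering, Y, Q, 0, n=10)
print(samples)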
def transition(
        state, N=None, S=None, kernels=None, rowids=None, cols=None,
        seed=None, checkpoint=None, progress=None):
    """Run full Gibbs sweeps of the requested kernels on the
    cgpm.state.State object.

    Permissible kernels:
        'column_partition_hyperparameter'
        'column_partition_assignments'
        'column_hyperparameters'
        'row_partition_hyperparameters'
        'row_partition_assignments'
    """
    if seed is None:
        seed = 1
    if kernels is None:
        kernels = ()
    if (progress is None) or progress:
        progress = _progress

    # Resolve the step count and time budget from N and S.
    if N is None and S is None:
        n_steps = 1
        max_time = -1
    elif N is not None and S is None:
        n_steps = N
        max_time = -1
    elif S is not None and N is None:
        # This is a hack; lovecat has no way to specify just max_seconds.
        n_steps = 150000
        max_time = S
    elif S is not None and N is not None:
        n_steps = N
        max_time = S
    else:
        assert False

    # Map user-facing column identifiers to internal indices.
    if cols is None:
        cols = ()
    else:
        cols = [state.outputs.index(i) for i in cols]
    if rowids is None:
        rowids = ()

    # Convert the cgpm state into crosscat's metadata and latent-state
    # representations.
    M_c = _crosscat_M_c(state)
    T = _crosscat_T(state, M_c)
    X_D = _crosscat_X_D(state, M_c)
    X_L = _crosscat_X_L(state, M_c, X_D)

    from crosscat.LocalEngine import LocalEngine
    LE = LocalEngine(seed=seed)

    if checkpoint is None:
        X_L_new, X_D_new = LE.analyze(
            M_c, T, X_L, X_D, seed, kernel_list=kernels, n_steps=n_steps,
            max_time=max_time, c=cols, r=rowids, progress=progress)
        diagnostics_new = dict()
    else:
        X_L_new, X_D_new, diagnostics_new = LE.analyze(
            M_c, T, X_L, X_D, seed, kernel_list=kernels, n_steps=n_steps,
            max_time=max_time, c=cols, r=rowids, do_diagnostics=True,
            diagnostics_every_N=checkpoint, progress=progress)

    # Write the updated latent state (and any diagnostics) back into the
    # cgpm state.
    _update_state(state, M_c, X_L_new, X_D_new)
    if diagnostics_new:
        _update_diagnostics(state, diagnostics_new)
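# Usage sketch: five sweeps of two kernels on an existing
# cgpm.crosscat.state.State instance, with diagnostics checkpointed every
# second step. `state` is assumed to have been constructed and populated
# with observations elsewhere.
transition(
    state, N=5,
    kernels=('column_hyperparameters', 'row_partition_assignments'),
    checkpoint=2, seed=42)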
def transition_cpp(
        crosscat, N=None, S=None, kernels=None, rowids=None, cols=None,
        Cd=None, Ci=None, seed=None, progress=None):
    """Run full Gibbs sweeps of the requested kernels on the crosscat trace.

    Permissible kernels:
        'column_partition_hyperparameter'
        'column_partition_assignments'
        'column_hyperparameters'
        'row_partition_hyperparameters'
        'row_partition_assignments'
    """
    if seed is None:
        seed = 1
    if kernels is None:
        kernels = ()
    if (progress is None) or progress:
        progress = _progress

    # Resolve the step count and time budget from N and S.
    if N is None and S is None:
        n_steps = 1
        max_time = -1
    elif N is not None and S is None:
        n_steps = N
        max_time = -1
    elif S is not None and N is None:
        # This is a hack; lovecat has no way to specify just max_seconds.
        n_steps = 150000
        max_time = S
    elif S is not None and N is not None:
        n_steps = N
        max_time = S
    else:
        assert False

    # Map user-facing column identifiers to internal indices.
    if cols is None:
        cols = ()
    else:
        outputs = get_distribution_outputs(crosscat)
        outputs_mapping_inverse = {c: i for i, c in enumerate(outputs)}
        cols = [outputs_mapping_inverse[c] for c in cols]

    observations = get_crosscat_dataset(crosscat)
    if not observations:
        return crosscat

    # Map user-facing row identifiers to internal indices.
    if rowids is None:
        rowids = ()
    else:
        rowids_mapping = observations_to_rowids_mapping(observations)
        rowids_mapping_inverse = {r: i for i, r in rowids_mapping.iteritems()}
        rowids = [rowids_mapping_inverse[r] for r in rowids]

    # Convert the trace into crosscat's metadata and latent-state
    # representations.
    M_c = _get_crosscat_M_c(crosscat, observations)
    T = _get_crosscat_T(crosscat, M_c, observations)
    X_D = _get_crosscat_X_D(crosscat)
    X_L = _get_crosscat_X_L(crosscat, M_c, X_D, Cd, Ci)

    LE = LocalEngine(seed=seed)
    X_L_new, X_D_new = LE.analyze(
        M_c=M_c, T=T, X_L=X_L, X_D=X_D, seed=seed, kernel_list=kernels,
        n_steps=n_steps, max_time=max_time, c=cols, r=rowids,
        progress=progress,
    )

    # XXX This reconstruction is wasteful: we could find the diff in the
    # trace and apply only that, but getting it right takes some work.
    return _get_crosscat_updated(crosscat, observations, M_c, X_L_new, X_D_new)
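# Usage sketch: run the C++ backend for at most 10 seconds by passing S
# alone, so n_steps falls back to the large internal cap noted above.
# `crosscat` is assumed to be an existing trace that already holds
# observations.
crosscat = transition_cpp(crosscat, S=10, seed=7)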