Example #1
def quick_le(seed, n_chains=1):
    # Specify synthetic dataset structure.
    cctypes = [
        'continuous', 'continuous', 'multinomial', 'multinomial', 'continuous'
    ]
    distargs = [None, None, dict(K=9), dict(K=7), None]
    cols_to_views = [0, 0, 0, 1, 1]
    separation = [0.6, 0.9]
    cluster_weights = [[.2, .3, .5], [.9, .1]]

    # Obtain the generated dataset and metadata.
    T, M_c, M_r = sdg.gen_data(cctypes,
                               N_ROWS,
                               cols_to_views,
                               cluster_weights,
                               separation,
                               seed=seed,
                               distargs=distargs,
                               return_structure=True)

    # Create, initialize, and analyze the engine.
    engine = LocalEngine()
    X_L, X_D = engine.initialize(M_c, M_r, T, seed, n_chains=n_chains)

    return T, M_r, M_c, X_L, X_D, engine
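
For context, quick_le depends on module-level state that the excerpt does not show. A minimal driving sketch, where N_ROWS is a hypothetical stand-in for the module constant and sdg is assumed to be the synthetic data generator already imported by that module:

# Sketch only: N_ROWS is a hypothetical value for the module constant
# referenced above, and quick_le is assumed to be importable alongside it.
N_ROWS = 100

T, M_r, M_c, X_L, X_D, engine = quick_le(seed=0, n_chains=4)
print(len(T), len(T[0]))  # expect N_ROWS rows and 5 columns (len(cctypes))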
Example #2
def runner(config):
    generate_args, analyze_args, inf_seed = _munge_config(config)
    # generate synthetic data
    T, M_c, M_r, X_L, X_D = du.generate_clean_state(max_mean=10, max_std=1,
            **generate_args)
    table_shape = map(len, (T, T[0]))
    start_dims = du.get_state_shape(X_L)
    # run engine with do_timing = True
    engine = LocalEngine(inf_seed)
    X_L, X_D, (elapsed_secs,) = engine.analyze(M_c, T, X_L, X_D,
            do_timing=True,
            **analyze_args
            )
    #
    end_dims = du.get_state_shape(X_L)
    same_shape = start_dims == end_dims
    summary = dict(
        elapsed_secs=elapsed_secs,
        same_shape=same_shape,
        )
    ret_dict = dict(
        config=config,
        summary=summary,
        table_shape=table_shape,
        start_dims=start_dims,
        end_dims=end_dims,
        )
    return ret_dict
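
Because _munge_config is private and not shown, the exact keys runner expects are unknown; the following config is a hypothetical illustration of its shape, with every key name an assumption:

# Hypothetical config: _munge_config is not shown, so the key names
# below are guesses at what it splits into generate_args, analyze_args,
# and inf_seed.
config = dict(
    num_rows=100, num_cols=4, num_clusters=5, num_views=2,  # generate_args?
    n_steps=10,                                             # analyze_args?
    seed=0,                                                 # inf_seed?
)
ret_dict = runner(config)
print(ret_dict['summary']['elapsed_secs'], ret_dict['summary']['same_shape'])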
Example #3
def get_generative_clustering(M_c, M_r, T,
                              data_inverse_permutation_indices,
                              num_clusters, num_views):
    from crosscat.LocalEngine import LocalEngine
    import crosscat.cython_code.State as State
    # NOTE: this function only works because State.p_State doesn't use
    #       column_component_suffstats
    num_rows = len(T)
    num_cols = len(T[0])
    X_D_helper = numpy.repeat(range(num_clusters), (num_rows / num_clusters))
    gen_X_D = [
        X_D_helper[numpy.argsort(data_inverse_permutation_index)]
        for data_inverse_permutation_index in data_inverse_permutation_indices
        ]
    gen_X_L_assignments = numpy.repeat(range(num_views), (num_cols / num_views))
    # initialize to generate an X_L to manipulate
    local_engine = LocalEngine()
    bad_X_L, bad_X_D = local_engine.initialize(M_c, M_r, T,
                                               initialization='apart')
    bad_X_L['column_partition']['assignments'] = gen_X_L_assignments
    # manually construct state in generative configuration
    state = State.p_State(M_c, T, bad_X_L, gen_X_D)
    gen_X_L = state.get_X_L()
    gen_X_D = state.get_X_D()
    # run inference on hyperparameters to leave them in a reasonable state
    kernel_list = (
        'row_partition_hyperparameters',
        'column_hyperparameters',
        'column_partition_hyperparameter',
        )
    gen_X_L, gen_X_D = local_engine.analyze(M_c, T, gen_X_L, gen_X_D, n_steps=1,
                                            kernel_list=kernel_list)
    #
    return gen_X_L, gen_X_D
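
A hedged sketch of invoking get_generative_clustering; the caller that produces T, M_r, M_c, and the inverse permutation indices is not shown above, so this usage is hypothetical:

# Sketch only: T, M_r, M_c, and data_inverse_permutation_indices are
# assumed to come from a synthetic-data helper not shown here.
gen_X_L, gen_X_D = get_generative_clustering(
    M_c, M_r, T,
    data_inverse_permutation_indices,
    num_clusters=4, num_views=2)
# num_clusters and num_views should divide num_rows and num_cols evenly:
# numpy.repeat above builds assignment vectors whose lengths only match
# the table when the division is exact.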
Example #4
def runner(config):
    # helpers
    def munge_config(config):
        kwargs = config.copy()
        kwargs['num_splits'] = kwargs.pop('num_views')
        n_steps = kwargs.pop('n_steps')
        n_test = kwargs.pop('n_test')
        return kwargs, n_steps, n_test

    def calc_ll(T, p_State):
        log_likelihoods = map(p_State.calc_row_predictive_logp, T)
        mean_log_likelihood = numpy.mean(log_likelihoods)
        return mean_log_likelihood

    def gen_data(**kwargs):
        T, M_c, M_r, gen_X_L, gen_X_D = du.generate_clean_state(**kwargs)
        #
        engine = LocalEngine()
        sampled_T = gu.sample_T(engine, M_c, T, gen_X_L, gen_X_D)
        T_test = random.sample(sampled_T, n_test)
        gen_data_ll = ctu.calc_mean_test_log_likelihood(
            M_c, T, gen_X_L, gen_X_D, T)
        gen_test_set_ll = ctu.calc_mean_test_log_likelihood(
            M_c, T, gen_X_L, gen_X_D, T_test)
        #
        return T, M_c, M_r, T_test, gen_data_ll, gen_test_set_ll

    kwargs, n_steps, n_test = munge_config(config)
    T, M_c, M_r, T_test, gen_data_ll, gen_test_set_ll = gen_data(**kwargs)
    # set up to run inference
    calc_data_ll = partial(calc_ll, T)
    calc_test_set_ll = partial(calc_ll, T_test)
    diagnostic_func_dict = dict(
        data_ll=calc_data_ll,
        test_set_ll=calc_test_set_ll,
    )
    # run inference
    engine = LocalEngine()
    X_L, X_D = engine.initialize(M_c, M_r, T)
    X_L, X_D, diagnostics_dict = engine.analyze(
        M_c, T, X_L, X_D, do_diagnostics=diagnostic_func_dict, n_steps=n_steps)
    # package result
    final_data_ll = diagnostics_dict['data_ll'][-1][-1]
    final_test_set_ll = diagnostics_dict['test_set_ll'][-1][-1]
    summary = dict(
        gen_data_ll=gen_data_ll,
        gen_test_set_ll=gen_test_set_ll,
        final_data_ll=final_data_ll,
        final_test_set_ll=final_test_set_ll,
    )

    result = dict(
        config=config,
        summary=summary,
        diagnostics_dict=diagnostics_dict,
    )
    return result
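
The [-1][-1] indexing above works because analyze records each diagnostic per transition step and per chain; a small sketch of walking the full trace under that inferred layout:

# Inferred from the [-1][-1] indexing above:
# diagnostics_dict['data_ll'][step][chain] -> mean log likelihood.
for step, per_chain in enumerate(diagnostics_dict['data_ll']):
    print(step, max(per_chain))  # best chain at each recorded step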
Example #5
def quick_le(seed, n_chains=1):
    # Specify synthetic dataset structure.
    cctypes = ['continuous', 'continuous', 'multinomial', 'multinomial',
        'continuous']
    distargs = [None, None, dict(K=9), dict(K=7), None]
    cols_to_views = [0, 0, 0, 1, 1]
    separation = [0.6, 0.9]
    cluster_weights = [[.2, .3, .5], [.9, .1]]

    # Obtain the generated dataset and metadata.
    T, M_c, M_r = sdg.gen_data(cctypes, N_ROWS, cols_to_views, cluster_weights,
        separation, seed=seed, distargs=distargs, return_structure=True)

    # Create, initialize, and analyze the engine.
    engine = LocalEngine(seed=seed)
    X_L, X_D = engine.initialize(M_c, M_r, T, n_chains=n_chains)

    return T, M_r, M_c, X_L, X_D, engine
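
Note the difference from Example #1: here the seed goes to the LocalEngine constructor rather than to initialize. The two snippets otherwise match, so use whichever idiom your installed crosscat version's signatures accept.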
Example #6
def gen_data(**kwargs):
    T, M_c, M_r, gen_X_L, gen_X_D = du.generate_clean_state(**kwargs)
    #
    engine = LocalEngine()
    sampled_T = gu.sample_T(engine, M_c, T, gen_X_L, gen_X_D)
    T_test = random.sample(sampled_T, n_test)
    gen_data_ll = ctu.calc_mean_test_log_likelihood(M_c, T, gen_X_L, gen_X_D, T)
    gen_test_set_ll = ctu.calc_mean_test_log_likelihood(M_c, T, gen_X_L, gen_X_D, T_test)
    #
    return T, M_c, M_r, T_test, gen_data_ll, gen_test_set_ll
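
Note that gen_data is excerpted from the runner in Example #4: it closes over n_test from that enclosing scope and is not self-contained as shown here.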
Example #7
from crosscat.LocalEngine import LocalEngine
import crosscat.utils.data_utils as data_utils


data_filename = 'T.csv'
inference_seed = 0
num_full_transitions = 10

# read the data table into internal json representation
data_table, row_metadata, column_metadata, header = \
        data_utils.read_data_objects(data_filename)

# create an engine to run analysis, inference
engine = LocalEngine(seed=inference_seed)

# initialize markov chain samples
initial_latent_state, initial_latent_state_clustering = \
        engine.initialize(column_metadata, row_metadata, data_table)

# run markov chain transition kernels on samples
latent_state, latent_state_clustering = engine.analyze(column_metadata,
        data_table, initial_latent_state, initial_latent_state_clustering,
        n_steps=num_full_transitions)
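
read_data_objects expects a headed CSV on disk; the following snippet writes a hypothetical T.csv whose column names and values are invented purely for illustration:

# Hypothetical T.csv contents -- any headed table of numeric and
# categorical columns will do; these names and values are invented.
with open('T.csv', 'w') as f:
    f.write('height,weight,color\n'
            '71.2,160.4,red\n'
            '64.9,130.0,blue\n')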

Example #8
def transition(
        state, N=None, S=None, kernels=None, rowids=None, cols=None,
        seed=None, checkpoint=None, progress=None):
    """Runs full Gibbs sweeps of all kernels on the cgpm.state.State object.

    Permissible kernels:
       'column_partition_hyperparameter'
       'column_partition_assignments'
       'column_hyperparameters'
       'row_partition_hyperparameters'
       'row_partition_assignments'
    """

    if seed is None:
        seed = 1
    if kernels is None:
        kernels = ()
    if (progress is None) or progress:
        progress = _progress

    if N is None and S is None:
        n_steps = 1
        max_time = -1
    elif N is not None and S is None:
        n_steps = N
        max_time = -1
    elif S is not None and N is None:
        # This is a hack; lovecat has no way to specify just max_seconds.
        n_steps = 150000
        max_time = S
    elif S is not None and N is not None:
        n_steps = N
        max_time = S
    else:
        assert False

    if cols is None:
        cols = ()
    else:
        cols = [state.outputs.index(i) for i in cols]
    if rowids is None:
        rowids = ()

    M_c = _crosscat_M_c(state)
    T = _crosscat_T(state, M_c)
    X_D = _crosscat_X_D(state, M_c)
    X_L = _crosscat_X_L(state, M_c, X_D)

    from crosscat.LocalEngine import LocalEngine
    LE = LocalEngine(seed=seed)

    if checkpoint is None:
        X_L_new, X_D_new = LE.analyze(
            M_c, T, X_L, X_D, seed,
            kernel_list=kernels, n_steps=n_steps, max_time=max_time,
            c=cols, r=rowids, progress=progress)
        diagnostics_new = dict()
    else:
        X_L_new, X_D_new, diagnostics_new = LE.analyze(
            M_c, T, X_L, X_D, seed,
            kernel_list=kernels, n_steps=n_steps, max_time=max_time,
            c=cols, r=rowids, do_diagnostics=True,
            diagnostics_every_N=checkpoint, progress=progress)

    _update_state(state, M_c, X_L_new, X_D_new)

    if diagnostics_new:
        _update_diagnostics(state, diagnostics_new)
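
A hedged sketch of driving transition, using kernel names from the docstring; the state argument is assumed to be an already-constructed cgpm state, which is out of scope here:

# Sketch only: 100 sweeps of two kernels, recording diagnostics every
# 10 steps; 'state' is assumed to exist already.
transition(
    state, N=100,
    kernels=('column_partition_assignments', 'row_partition_assignments'),
    checkpoint=10)
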
def transition_cpp(crosscat, N=None, S=None, kernels=None, rowids=None,
        cols=None, Cd=None, Ci=None, seed=None, progress=None):
    """Runs full Gibbs sweeps of all kernels on the cgpm.state.State object.

    Permissible kernels:
       'column_partition_hyperparameter'
       'column_partition_assignments'
       'column_hyperparameters'
       'row_partition_hyperparameters'
       'row_partition_assignments'
    """
    if seed is None:
        seed = 1
    if kernels is None:
        kernels = ()
    if (progress is None) or progress:
        progress = _progress

    if N is None and S is None:
        n_steps = 1
        max_time = -1
    elif N is not None and S is None:
        n_steps = N
        max_time = -1
    elif S is not None and N is None:
        # This is a hack; lovecat has no way to specify just max_seconds.
        n_steps = 150000
        max_time = S
    elif S is not None and N is not None:
        n_steps = N
        max_time = S
    else:
        assert False

    if cols is None:
        cols = ()
    else:
        outputs = get_distribution_outputs(crosscat)
        outputs_mapping_inverse = {c:i for i,c in enumerate(outputs)}
        cols = [outputs_mapping_inverse[c] for c in cols]

    observations = get_crosscat_dataset(crosscat)
    if not observations:
        return crosscat

    if rowids is None:
        rowids = ()
    else:
        rowids_mapping = observations_to_rowids_mapping(observations)
        rowids_mapping_inverse = {r:i for i, r in rowids_mapping.iteritems()}
        rowids = [rowids_mapping_inverse[r] for r in rowids]

    M_c = _get_crosscat_M_c(crosscat, observations)
    T = _get_crosscat_T(crosscat, M_c, observations)
    X_D = _get_crosscat_X_D(crosscat)
    X_L = _get_crosscat_X_L(crosscat, M_c, X_D, Cd, Ci)
    LE = LocalEngine(seed=seed)
    X_L_new, X_D_new = LE.analyze(
        M_c=M_c,
        T=T,
        X_L=X_L,
        X_D=X_D,
        seed=seed,
        kernel_list=kernels,
        n_steps=n_steps,
        max_time=max_time,
        c=cols,
        r=rowids,
        progress=progress,
    )
    # XXX This reconstruction is wasteful: can find the diff in the trace
    # and apply those, but it is some work to get that right.
    return _get_crosscat_updated(crosscat, observations, M_c, X_L_new, X_D_new)
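
And a corresponding sketch for transition_cpp, which returns the updated crosscat record instead of mutating its argument; the time budget, column ids, and seed below are invented for illustration:

# Sketch only: up to 60 seconds of sweeps restricted to two columns.
crosscat = transition_cpp(crosscat, S=60, cols=[0, 1], seed=42)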