Example #1
gen_seed = args.gen_seed
num_transitions = args.num_transitions
N_GRID = args.N_GRID
max_rows = args.max_rows
num_clusters = args.num_clusters
num_views = args.num_views
num_cols = args.num_cols
numChains = args.numChains
block_size = args.block_size


engine = ccc.get_CrossCatClient('hadoop', seed=inf_seed)

if filename is not None:
    # Load the data from table and sub-sample entities to max_rows
    T, M_r, M_c = du.read_model_data_from_csv(filename, max_rows, gen_seed)
    truth_flag = 0
else:
    T, M_r, M_c, data_inverse_permutation_indices = \
        du.gen_factorial_data_objects(gen_seed, num_clusters,
                                      num_cols, max_rows, num_views,
                                      max_mean=100, max_std=1,
                                      send_data_inverse_permutation_indices=True)
    view_assignment_truth, X_D_truth = ctu.truth_from_permute_indices(
        data_inverse_permutation_indices, max_rows, num_cols, num_views,
        num_clusters)
    truth_flag = 1

        
num_rows = len(T)
num_cols = len(T[0])

ari_table = []
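The snippet above refers to several helper modules only by alias, without showing the imports. A minimal import block matching those aliases might look like the following; the module paths are assumptions based on the crosscat package layout and may differ by version:

import crosscat.CrossCatClient as ccc                 # assumed home of get_CrossCatClient
import crosscat.utils.data_utils as du                # assumed home of read_model_data_from_csv
import crosscat.utils.convergence_test_utils as ctu   # assumed home of truth_from_permute_indices
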
Example #2
    chunk_dest_dir = args.chunk_dest_dir
    max_time = args.max_time
    table_filename = args.table_filename
    resume_filename = args.resume_filename
    pkl_filename = args.pkl_filename
    #
    command = args.command
    # assert command in set(gu.get_method_names(HadoopEngine))
    #
    cctypes_filename = args.cctypes_filename
    cctypes = None
    if cctypes_filename is not None:
        cctypes = fu.unpickle(cctypes_filename)

    hdfs_uri, jobtracker_uri = hu.get_uris(base_uri, hdfs_uri, jobtracker_uri)
    T, M_r, M_c = du.read_model_data_from_csv(table_filename, gen_seed=0,
                                              cctypes=cctypes)
    he = HadoopEngine(which_engine_binary=which_engine_binary,
                      which_hadoop_binary=which_hadoop_binary,
                      which_hadoop_jar=which_hadoop_jar,
                      hdfs_dir=hdfs_dir, hdfs_uri=hdfs_uri,
                      jobtracker_uri=jobtracker_uri)
    
    X_L_list, X_D_list = None, None
    if command == 'initialize':
        hadoop_output = he.initialize(M_c, M_r, T,
                                      initialization='from_the_prior',
                                      n_chains=n_chains)
        if hadoop_output is not None:
            X_L_list, X_D_list = hadoop_output
    elif command == 'analyze':
        assert resume_filename is not None
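The args.* fields read at the top of this example imply a command-line parser that is not shown. A hypothetical parser covering just the arguments used in the snippet (names inferred from the snippet itself; the original script may define more options and different defaults) could be:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('command', type=str)
parser.add_argument('--table_filename', type=str, required=True)
parser.add_argument('--pkl_filename', type=str, default=None)
parser.add_argument('--resume_filename', type=str, default=None)
parser.add_argument('--cctypes_filename', type=str, default=None)
parser.add_argument('--chunk_dest_dir', type=str, default=None)
parser.add_argument('--max_time', type=int, default=-1)
args = parser.parse_args()
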
Example #3
def read_and_pickle_table_data(table_data_filename, pkl_filename):
    T, M_r, M_c = du.read_model_data_from_csv(table_data_filename, gen_seed=0)
    table_data = dict(T=T, M_r=M_r, M_c=M_c)
    fu.pickle(table_data, pkl_filename)
    return table_data
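# Hypothetical call of the helper above (file names are placeholders,
# not from the original script):
#   table_data = read_and_pickle_table_data('table.csv', 'table_data.pkl.gz')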

def do_initialize(SEED):
    # draw one initial latent state from the prior; relies on the globals
    # M_c, M_r, T defined in the setup section below
    _do_initialize = crosscat.LocalEngine._do_initialize
    return _do_initialize(M_c, M_r, T, 'from_the_prior', SEED)


def do_analyze((SEED, state_tuple)):
    # run num_transitions MCMC transitions on a single chain's latent state
    # (the tuple-unpacking signature is Python 2 only)
    X_L, X_D = state_tuple
    _do_analyze = crosscat.LocalEngine._do_analyze
    return _do_analyze(M_c, T, X_L, X_D, (), num_transitions, (), (), -1, -1,
                       SEED)


# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array(
    [M_c['idx_to_name'][str(col_idx)] for col_idx in range(num_cols)])

## set up parallel
from IPython.parallel import Client
c = Client(ipython_parallel_config)
dview = c[:]
with dview.sync_imports():
    import crosscat
    import crosscat.LocalEngine
    import sys
if path_append is not None:
    dview.apply_sync(lambda: sys.path.append(path_append))
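The helpers above depend on module-level M_c, M_r, T, and num_transitions, so those names have to exist on each engine before the functions are mapped. One hedged sketch of how the chains might then be driven in parallel (the seed handling and the num_chains variable are assumptions; the original script may organize this differently):

dview.push(dict(M_c=M_c, M_r=M_r, T=T, num_transitions=num_transitions))
seeds = range(num_chains)  # num_chains assumed to come from the script's arguments
initial_states = dview.map_sync(do_initialize, seeds)
analyzed_states = dview.map_sync(do_analyze, zip(seeds, initial_states))
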
Example #5
    Q = [(row_idx, col_idx) for col_idx in query_col_indices]
    return Q

def determine_unobserved_Y(num_rows, M_c, condition_tuples):
    name_to_idx = M_c['name_to_idx']
    row_idx = num_rows + 1
    Y = []
    for col_name, col_value in condition_tuples:
        col_idx = name_to_idx[col_name]
        col_code = du.convert_value_to_code(M_c, col_idx, col_value)
        y = (row_idx, col_idx, col_code)
        Y.append(y)
    return Y
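# Hypothetical usage of the helper above (the column name and value are
# placeholders, not from the original table):
#   Y = determine_unobserved_Y(num_rows, M_c, [('column_a', 1.0)])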

# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array([M_c['idx_to_name'][str(col_idx)] for col_idx in range(num_cols)])

# initialize and transition chains
engine = LE.LocalEngine(inf_seed)
X_L_list, X_D_list = engine.initialize(M_c, M_r, T, get_next_seed(),
                                       initialization='from_the_prior',
                                       n_chains=num_chains)
X_L_list, X_D_list = engine.analyze(M_c, T, X_L_list, X_D_list,
                                    get_next_seed(), n_steps=num_transitions)

# save the progress
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)

# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
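This example calls get_next_seed() without showing its definition. A minimal stand-in with the same shape (the helper name and seeding scheme are taken from the calls above; gen_seed is assumed to be defined as in the other examples, and the real script's version may differ) could be:

import random

seeder = random.Random(gen_seed)

def get_next_seed(max_seed=32767):
    # return a fresh positive seed for each chain operation
    return seeder.randint(1, max_seed)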