def chunk_analyze_helper(table_data, data_dict, command_dict):
    original_n_steps = command_dict['n_steps']
    original_SEED = data_dict['SEED']
    chunk_size = command_dict['chunk_size']
    chunk_filename_prefix = command_dict['chunk_filename_prefix']
    chunk_dest_dir = command_dict['chunk_dest_dir']
    #
    steps_done = 0
    while steps_done < original_n_steps:
        steps_remaining = original_n_steps - steps_done
        command_dict['n_steps'] = min(chunk_size, steps_remaining)
        ith_chunk = steps_done // chunk_size  # integer index of this chunk
        dict_out = analyze_helper(table_data, data_dict, command_dict)
        data_dict.update(dict_out)
        # write to hdfs
        chunk_filename = '%s_seed_%s_chunk_%s.pkl.gz' \
            % (chunk_filename_prefix, original_SEED, ith_chunk)
        fu.pickle(dict_out, chunk_filename)
        hu.put_hdfs(None, chunk_filename, chunk_dest_dir)
        #
        steps_done += chunk_size
    chunk_filename = '%s_seed_%s_chunk_%s.pkl.gz' \
        % (chunk_filename_prefix, original_SEED, 'FINAL')
    fu.pickle(dict_out, chunk_filename)
    hu.put_hdfs(None, chunk_filename, chunk_dest_dir)
    return dict_out
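The loop above splits the total n_steps into fixed-size chunks and names each checkpoint by its chunk index; a self-contained sketch of just that bookkeeping, using plain Python and made-up step counts (no crosscat helpers):

def chunk_plan(n_steps, chunk_size):
    """Yield (chunk_index, steps_in_chunk) pairs, mirroring the loop above."""
    steps_done = 0
    while steps_done < n_steps:
        steps_remaining = n_steps - steps_done
        yield steps_done // chunk_size, min(chunk_size, steps_remaining)
        steps_done += chunk_size

# 10 total steps in chunks of 4 -> [(0, 4), (1, 4), (2, 2)]
print(list(chunk_plan(10, 4)))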
Example #3
            X_L_list, X_D_list = hadoop_output
    elif command == 'chunk_analyze':
        assert resume_filename is not None
        if fu.is_pkl(resume_filename):
            resume_dict = fu.unpickle(resume_filename)
            X_L_list = resume_dict['X_L_list']
            X_D_list = resume_dict['X_D_list']
        else:
            X_L_list, X_D_list = hu.read_hadoop_output(resume_filename)
        hadoop_output = he.analyze(M_c, T, X_L_list, X_D_list,
                                   n_steps=n_steps, max_time=max_time,
                                   chunk_size=chunk_size,
                                   chunk_filename_prefix=chunk_filename_prefix,
                                   chunk_dest_dir=chunk_dest_dir)
        if hadoop_output is not None:
            X_L_list, X_D_list = hadoop_output
    else:
        print 'Unknown command: %s' % command
        import sys
        sys.exit()
        
    if pkl_filename is not None:
        to_pkl_dict = dict(
            T=T,
            M_c=M_c,
            M_r=M_r,
            X_L_list=X_L_list,
            X_D_list=X_D_list,
            )
        fu.pickle(to_pkl_dict, filename=pkl_filename)
    # Hard code the parameter values for now

    parameter_list = [num_rows_list, num_cols_list, num_clusters_list, num_splits_list]

    # Iterate over the parameter values and write each run as a line in the hadoop_input file
    take_product_of = [num_rows_list, num_cols_list, num_clusters_list, num_splits_list]
    for num_rows, num_cols, num_clusters, num_splits \
            in itertools.product(*take_product_of):
        if numpy.mod(num_rows, num_clusters) == 0 and numpy.mod(num_cols, num_splits) == 0:
            timing_run_parameters = dict(num_rows=num_rows, num_cols=num_cols, num_views=num_splits, num_clusters=num_clusters)
            write_hadoop_input(input_filename, timing_run_parameters, n_steps, SEED=gen_seed)

    n_tasks = len(num_rows_list)*len(num_cols_list)*len(num_clusters_list)*len(num_splits_list)*5
    # Create a dummy table data file
    table_data = dict(T=[], M_c=[], X_L=[], X_D=[])
    fu.pickle(table_data, table_data_filename)

    if do_local:
        xu.run_script_local(input_filename, script_filename, output_filename, table_data_filename)
        print('Local Engine for automated timing runs has not been completely implemented/tested')
    elif do_remote:
        hadoop_engine = HE.HadoopEngine(which_engine_binary=which_engine_binary,
                output_path=output_path,
                input_filename=input_filename,
                table_data_filename=table_data_filename)
        xu.write_support_files(table_data, hadoop_engine.table_data_filename,
                              dict(command='time_analyze'), hadoop_engine.command_dict_filename)
        hadoop_engine.send_hadoop_command(n_tasks=n_tasks)
        was_successful = hadoop_engine.get_hadoop_results()
        if was_successful:
            hu.copy_hadoop_output(hadoop_engine.output_path, output_filename)
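A self-contained sketch of the parameter-grid filtering above, with made-up list values (the real lists are hard-coded elsewhere in the script): only combinations where the rows divide evenly across clusters and the columns across splits are kept.

import itertools

num_rows_list = [100, 250]   # illustration values, not the script's actual grid
num_cols_list = [4, 8]
num_clusters_list = [5, 10]
num_splits_list = [2, 4]

valid_combinations = [
    (num_rows, num_cols, num_clusters, num_splits)
    for num_rows, num_cols, num_clusters, num_splits in itertools.product(
        num_rows_list, num_cols_list, num_clusters_list, num_splits_list)
    if num_rows % num_clusters == 0 and num_cols % num_splits == 0
]
print(valid_combinations)   # each entry corresponds to one hadoop input line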
Example #5
n_steps = min(block_size, num_transitions)
print 'Analyzing ...'
while (completed_transitions < num_transitions):
    # We won't be limiting by time in the convergence runs
    X_L_list, X_D_list = engine.analyze(M_c, T, X_L_list, X_D_list, kernel_list=(),
                                        n_steps=n_steps, max_time=-1)
    
    if truth_flag:
        tmp_ari_table, tmp_ari_views = ctu.multi_chain_ARI(X_L_list,X_D_list, view_assignment_truth, X_D_truth)
        ari_table.append(tmp_ari_table)
        ari_views.append(tmp_ari_views)
        
    else:
        # Not sure we want to save the models for convergence testing 
        saved_dict = {'T':T, 'M_c':M_c, 'X_L_list':X_L_list, 'X_D_list': X_D_list}
        pkl_filename = 'model_{!s}.pkl.gz'.format(str(completed_transitions))
        f_utils.pickle(saved_dict, filename = pkl_filename)

    completed_transitions += block_size
    print completed_transitions
    
# Always save the last model
saved_dict = {'T':T, 'M_c':M_c, 'X_L_list':X_L_list, 'X_D_list': X_D_list}
pkl_filename = 'model_{!s}.pkl.gz'.format('last')
f_utils.pickle(saved_dict, filename = pkl_filename)

if truth_flag:
    with open(ari_logfile, 'a') as outfile:
        csvwriter = csv.writer(outfile, delimiter=',')
        csvwriter.writerow([time.ctime(), num_transitions, block_size, max_rows,
                            num_cols, num_views, num_clusters, ari_views,
                            ari_table])
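The f_utils.pickle / fu.pickle helpers used throughout these examples are not shown; assuming they simply write a gzip-compressed pickle (the '.pkl.gz' filenames suggest as much, but this is an assumption), a minimal standalone equivalent would be:

import gzip
import pickle

def pickle_gz(obj, filename):
    # Assumed stand-in for fu.pickle / f_utils.pickle: gzip-compressed pickle.
    with gzip.open(filename, 'wb') as fh:
        pickle.dump(obj, fh)

def unpickle_gz(filename):
    # Assumed stand-in for fu.unpickle: read a gzip-compressed pickle back.
    with gzip.open(filename, 'rb') as fh:
        return pickle.load(fh)

pickle_gz({'X_L_list': [], 'X_D_list': []}, 'model_last.pkl.gz')
print(unpickle_gz('model_last.pkl.gz'))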
Example #6
def read_and_pickle_table_data(table_data_filename, pkl_filename):
    T, M_r, M_c = du.read_model_data_from_csv(table_data_filename, gen_seed=0)
    table_data = dict(T=T, M_r=M_r, M_c=M_c)
    fu.pickle(table_data, pkl_filename)
    return table_data
Example #7
def write_support_files(table_data, table_data_filename, command_dict,
                        command_dict_filename):
    fu.pickle(table_data, table_data_filename)
    fu.pickle(command_dict, command_dict_filename)
    return
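A standalone sketch of how these two support files pair up in the Hadoop runs shown earlier (e.g. the dict(command='time_analyze') command dict), using the stdlib pickle module as an assumed stand-in for fu.pickle and hypothetical filenames:

import pickle

def write_support_files_sketch(table_data, table_data_filename,
                               command_dict, command_dict_filename):
    # Same shape as write_support_files above, with plain pickle standing in
    # for fu.pickle (an assumption about what that helper does).
    with open(table_data_filename, 'wb') as fh:
        pickle.dump(table_data, fh)
    with open(command_dict_filename, 'wb') as fh:
        pickle.dump(command_dict, fh)

write_support_files_sketch(dict(T=[], M_c=[], X_L=[], X_D=[]), 'table_data.pkl',
                           dict(command='time_analyze'), 'command_dict.pkl')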
Example #8
    pkl_filename = args.pkl_filename
    initialize_input_filename = args.initialize_input_filename
    initialize_output_filename = args.initialize_output_filename
    analyze_input_filename = args.analyze_input_filename
    n_steps = args.n_steps
    n_chains = args.n_chains

    if do_what == 'read_and_pickle_table_data':
        read_and_pickle_table_data(table_filename, pkl_filename)
    elif do_what == 'write_initialization_files':
        write_initialization_files(initialize_input_filename,
                                   n_chains=n_chains)
    elif do_what == 'link_initialize_to_analyze':
        analyze_args_dict = default_analyze_args_dict.copy()
        analyze_args_dict['n_steps'] = n_steps
        link_initialize_to_analyze(initialize_output_filename,
                                   analyze_input_filename, analyze_args_dict)
    elif do_what == 'assert_vpn_is_connected':
        assert_vpn_is_connected()
    elif do_what == 'parse_hadoop_lines':
        assert hadoop_filename is not None
        parsed_lines = []
        with open(hadoop_filename) as fh:
            for line in fh:
                parsed_lines.append(parse_hadoop_line(line))
                print(len(parsed_lines))
        if pkl_filename != default_table_data_filename:
            fu.pickle(parsed_lines, pkl_filename)
    else:
        print('unknown do_what: %s' % do_what)
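The dispatcher above reads its fields from an args namespace built elsewhere; a minimal standalone argparse sketch of the same pattern (the option names here are illustrative, not the script's actual command line):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('do_what', type=str)
parser.add_argument('--pkl_filename', type=str, default=None)
parser.add_argument('--n_steps', type=int, default=100)
parser.add_argument('--n_chains', type=int, default=10)

# Parse a hypothetical command line and dispatch on do_what.
args = parser.parse_args(['parse_hadoop_lines', '--n_steps', '50'])
if args.do_what == 'parse_hadoop_lines':
    print('would parse hadoop output with n_steps=%s' % args.n_steps)
else:
    print('unknown do_what: %s' % args.do_what)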
    dview.apply_sync(lambda: sys.path.append(path_append))
#
dview.push(dict(M_c=M_c, M_r=M_r, T=T, num_transitions=num_transitions))
seeds = range(num_chains)
async_result = dview.map_async(do_intialize, seeds)
initialized_states = async_result.get()
#
async_result = dview.map_async(do_analyze, zip(seeds, initialized_states))
chain_tuples = async_result.get()

# visualize the column cooccurrence matrix
X_L_list, X_D_list = map(list, zip(*chain_tuples))

# save the progress
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)

# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
# X_D_list = to_pickle['X_D_list']

# can we recreate a row given some of its values?
query_cols = [2, 6, 9]
query_names = col_names[query_cols]
Q = determine_Q(M_c, query_names, num_rows)
#
condition_cols = [3, 4, 10]
condition_names = col_names[condition_cols]
samples_list = []
engine = LE.LocalEngine(inf_seed)
for actual_row_idx in [1, 10, 100]:
Example #10
    return Y

# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array([M_c['idx_to_name'][str(col_idx)] for col_idx in range(num_cols)])

# initialize and transition chains
engine = LE.LocalEngine(inf_seed)
X_L_list, X_D_list = engine.initialize(M_c, M_r, T, get_next_seed(), initialization='from_the_prior', n_chains=num_chains)
X_L_list, X_D_list = engine.analyze(M_c, T, X_L_list, X_D_list, get_next_seed(), n_steps=num_transitions)

# save the progress
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)

# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
# X_D_list = to_pickle['X_D_list']

engine = LE.LocalEngine(inf_seed)
# can we recreate a row given some of its values?
query_cols = [2, 6, 9]
query_names = col_names[query_cols]
Q = determine_Q(M_c, query_names, num_rows)
#
condition_cols = [3, 4, 10]
condition_names = col_names[condition_cols]
samples_list = []
for actual_row_idx in [1, 10, 100]:
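Both row-recreation fragments above select column names with numpy fancy indexing (col_names[query_cols]); a self-contained illustration with made-up column names:

import numpy

col_names = numpy.array(['age', 'height', 'weight', 'income'])  # made-up names
query_cols = [0, 2]
condition_cols = [1, 3]
print(col_names[query_cols])      # -> ['age' 'weight']
print(col_names[condition_cols])  # -> ['height' 'income']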
Example #15
    hadoop_filename = args.hadoop_filename
    table_filename = args.table_filename
    pkl_filename = args.pkl_filename
    initialize_input_filename = args.initialize_input_filename
    initialize_output_filename = args.initialize_output_filename
    analyze_input_filename = args.analyze_input_filename
    n_steps = args.n_steps
    n_chains = args.n_chains

    if do_what == "read_and_pickle_table_data":
        read_and_pickle_table_data(table_filename, pkl_filename)
    elif do_what == "write_initialization_files":
        write_initialization_files(initialize_input_filename, n_chains=n_chains)
    elif do_what == "link_initialize_to_analyze":
        analyze_args_dict = default_analyze_args_dict.copy()
        analyze_args_dict["n_steps"] = n_steps
        link_initialize_to_analyze(initialize_output_filename, analyze_input_filename, analyze_args_dict)
    elif do_what == "assert_vpn_is_connected":
        assert_vpn_is_connected()
    elif do_what == "parse_hadoop_lines":
        assert hadoop_filename is not None
        parsed_lines = []
        with open(hadoop_filename) as fh:
            for line in fh:
                parsed_lines.append(parse_hadoop_line(line))
                print len(parsed_lines)
        if pkl_filename != default_table_data_filename:
            fu.pickle(parsed_lines, pkl_filename)
    else:
        print "unknown do_what: %s" % do_what