def chunk_analyze_helper(table_data, data_dict, command_dict):
    original_n_steps = command_dict['n_steps']
    original_SEED = data_dict['SEED']
    chunk_size = command_dict['chunk_size']
    chunk_filename_prefix = command_dict['chunk_filename_prefix']
    chunk_dest_dir = command_dict['chunk_dest_dir']
    #
    # run the analysis in chunks of chunk_size steps, checkpointing each chunk
    steps_done = 0
    while steps_done < original_n_steps:
        steps_remaining = original_n_steps - steps_done
        command_dict['n_steps'] = min(chunk_size, steps_remaining)
        ith_chunk = steps_done // chunk_size
        dict_out = analyze_helper(table_data, data_dict, command_dict)
        data_dict.update(dict_out)
        # write the chunk to hdfs
        chunk_filename = '%s_seed_%s_chunk_%s.pkl.gz' \
            % (chunk_filename_prefix, original_SEED, ith_chunk)
        fu.pickle(dict_out, chunk_filename)
        hu.put_hdfs(None, chunk_filename, chunk_dest_dir)
        #
        steps_done += chunk_size
    # write the final state to hdfs
    chunk_filename = '%s_seed_%s_chunk_%s.pkl.gz' \
        % (chunk_filename_prefix, original_SEED, 'FINAL')
    fu.pickle(dict_out, chunk_filename)
    hu.put_hdfs(None, chunk_filename, chunk_dest_dir)
    return dict_out
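
# A minimal, hypothetical usage sketch (not from the original source): it only
# exercises the dict keys chunk_analyze_helper reads above. The 'example_*'
# names, step counts, and the HDFS destination path are assumptions, and
# analyze_helper/fu/hu are expected to be importable exactly as in this module.
def _example_chunk_analyze_usage(table_data, example_data_dict):
    # example_data_dict must at least carry the 'SEED' used in chunk filenames
    example_command_dict = dict(
        n_steps=100,                            # total transitions to run
        chunk_size=10,                          # transitions per checkpoint
        chunk_filename_prefix='example_run',
        chunk_dest_dir='/user/example/chunks',  # assumed HDFS destination
        )
    return chunk_analyze_helper(table_data, example_data_dict,
                                example_command_dict)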
    X_L_list, X_D_list = hadoop_output
elif command == 'chunk_analyze':
    assert resume_filename is not None
    # resume either from a local pickle or from raw hadoop output
    if fu.is_pkl(resume_filename):
        resume_dict = fu.unpickle(resume_filename)
        X_L_list = resume_dict['X_L_list']
        X_D_list = resume_dict['X_D_list']
    else:
        X_L_list, X_D_list = hu.read_hadoop_output(resume_filename)
    hadoop_output = he.analyze(M_c, T, X_L_list, X_D_list,
                               n_steps=n_steps, max_time=max_time,
                               chunk_size=chunk_size,
                               chunk_filename_prefix=chunk_filename_prefix,
                               chunk_dest_dir=chunk_dest_dir)
    if hadoop_output is not None:
        X_L_list, X_D_list = hadoop_output
else:
    print 'Unknown command: %s' % command
    import sys
    sys.exit()

if pkl_filename is not None:
    to_pkl_dict = dict(
        T=T, M_c=M_c, M_r=M_r,
        X_L_list=X_L_list, X_D_list=X_D_list,
        )
    fu.pickle(to_pkl_dict, filename=pkl_filename)
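
# Hypothetical follow-up sketch (not in the original source): the pickle written
# above can be reloaded later to inspect or resume the chains; pkl_filename is
# the same name passed to fu.pickle above, and 'saved' is an assumed name.
# saved = fu.unpickle(pkl_filename)
# X_L_list, X_D_list = saved['X_L_list'], saved['X_D_list']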
# Hard code the parameter values for now
parameter_list = [num_rows_list, num_cols_list, num_clusters_list,
                  num_splits_list]

# Iterate over the parameter values and write each run as a line in the
# hadoop_input file
take_product_of = [num_rows_list, num_cols_list, num_clusters_list,
                   num_splits_list]
for num_rows, num_cols, num_clusters, num_splits \
        in itertools.product(*take_product_of):
    if numpy.mod(num_rows, num_clusters) == 0 \
            and numpy.mod(num_cols, num_splits) == 0:
        timing_run_parameters = dict(num_rows=num_rows, num_cols=num_cols,
                                     num_views=num_splits,
                                     num_clusters=num_clusters)
        write_hadoop_input(input_filename, timing_run_parameters, n_steps,
                           SEED=gen_seed)

n_tasks = len(num_rows_list) * len(num_cols_list) * len(num_clusters_list) \
    * len(num_splits_list) * 5

# Create a dummy table data file
table_data = dict(T=[], M_c=[], X_L=[], X_D=[])
fu.pickle(table_data, table_data_filename)

if do_local:
    xu.run_script_local(input_filename, script_filename, output_filename,
                        table_data_filename)
    print('Local Engine for automated timing runs has not been completely '
          'implemented/tested')
elif do_remote:
    hadoop_engine = HE.HadoopEngine(which_engine_binary=which_engine_binary,
                                    output_path=output_path,
                                    input_filename=input_filename,
                                    table_data_filename=table_data_filename)
    xu.write_support_files(table_data, hadoop_engine.table_data_filename,
                           dict(command='time_analyze'),
                           hadoop_engine.command_dict_filename)
    hadoop_engine.send_hadoop_command(n_tasks=n_tasks)
    was_successful = hadoop_engine.get_hadoop_results()
    if was_successful:
        hu.copy_hadoop_output(hadoop_engine.output_path, output_filename)
n_steps = min(block_size, num_transitions)
print 'Analyzing ...'
while completed_transitions < num_transitions:
    # We won't be limiting by time in the convergence runs
    X_L_list, X_D_list = engine.analyze(M_c, T, X_L_list, X_D_list,
                                        kernel_list=(), n_steps=n_steps,
                                        max_time=-1)
    if truth_flag:
        tmp_ari_table, tmp_ari_views = ctu.multi_chain_ARI(
            X_L_list, X_D_list, view_assignment_truth, X_D_truth)
        ari_table.append(tmp_ari_table)
        ari_views.append(tmp_ari_views)
    else:
        # Not sure we want to save the models for convergence testing
        saved_dict = {'T': T, 'M_c': M_c,
                      'X_L_list': X_L_list, 'X_D_list': X_D_list}
        pkl_filename = 'model_{!s}.pkl.gz'.format(str(completed_transitions))
        f_utils.pickle(saved_dict, filename=pkl_filename)
    completed_transitions = completed_transitions + block_size
    print completed_transitions

# Always save the last model
saved_dict = {'T': T, 'M_c': M_c, 'X_L_list': X_L_list, 'X_D_list': X_D_list}
pkl_filename = 'model_{!s}.pkl.gz'.format('last')
f_utils.pickle(saved_dict, filename=pkl_filename)

if truth_flag:
    with open(ari_logfile, 'a') as outfile:
        csvwriter = csv.writer(outfile, delimiter=',')
        csvwriter.writerow([time.ctime(), num_transitions, block_size,
                            max_rows, num_cols, num_views, num_clusters,
                            ari_views, ari_table])
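
# Hypothetical sketch (not from the original source): the ARI log appended above
# is a plain comma-delimited file, so it can be read back for later inspection
# or plotting with the same csv module; 'rows' is an assumed name.
# with open(ari_logfile) as infile:
#     rows = list(csv.reader(infile, delimiter=','))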
def read_and_pickle_table_data(table_data_filename, pkl_filename):
    T, M_r, M_c = du.read_model_data_from_csv(table_data_filename, gen_seed=0)
    table_data = dict(T=T, M_r=M_r, M_c=M_c)
    fu.pickle(table_data, pkl_filename)
    return table_data
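
# Hypothetical usage sketch (both filenames are placeholders, not from the
# source): read a CSV once and cache the T/M_r/M_c objects as a gzipped pickle
# that downstream jobs can load as their table_data file.
# table_data = read_and_pickle_table_data('my_table.csv', 'table_data.pkl.gz')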
def write_support_files(table_data, table_data_filename, command_dict,
                        command_dict_filename):
    fu.pickle(table_data, table_data_filename)
    fu.pickle(command_dict, command_dict_filename)
    return
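
# Usage sketch mirroring the timing-run driver elsewhere in this section, which
# calls xu.write_support_files before sending the hadoop command; the
# hadoop_engine variable is assumed to be an HE.HadoopEngine instance.
# write_support_files(table_data, hadoop_engine.table_data_filename,
#                     dict(command='time_analyze'),
#                     hadoop_engine.command_dict_filename)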
pkl_filename = args.pkl_filename
initialize_input_filename = args.initialize_input_filename
initialize_output_filename = args.initialize_output_filename
analyze_input_filename = args.analyze_input_filename
n_steps = args.n_steps
n_chains = args.n_chains

if do_what == 'read_and_pickle_table_data':
    read_and_pickle_table_data(table_filename, pkl_filename)
elif do_what == 'write_initialization_files':
    write_initialization_files(initialize_input_filename, n_chains=n_chains)
elif do_what == 'link_initialize_to_analyze':
    analyze_args_dict = default_analyze_args_dict.copy()
    analyze_args_dict['n_steps'] = n_steps
    link_initialize_to_analyze(initialize_output_filename,
                               analyze_input_filename,
                               analyze_args_dict)
elif do_what == 'assert_vpn_is_connected':
    assert_vpn_is_connected()
elif do_what == 'parse_hadoop_lines':
    assert hadoop_filename is not None
    parsed_lines = []
    with open(hadoop_filename) as fh:
        for line in fh:
            parsed_lines.append(parse_hadoop_line(line))
    print(len(parsed_lines))
    if pkl_filename != default_table_data_filename:
        fu.pickle(parsed_lines, pkl_filename)
else:
    print('unknown do_what: %s' % do_what)
dview.apply_sync(lambda: sys.path.append(path_append))
#
dview.push(dict(M_c=M_c, M_r=M_r, T=T, num_transitions=num_transitions))
seeds = range(num_chains)
async_result = dview.map_async(do_intialize, seeds)
initialized_states = async_result.get()
#
async_result = dview.map_async(do_analyze, zip(seeds, initialized_states))
chain_tuples = async_result.get()

# visualize the column cooccurrence matrix
X_L_list, X_D_list = map(list, zip(*chain_tuples))

# save the progress
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)

# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
# X_D_list = to_pickle['X_D_list']

# can we recreate a row given some of its values?
query_cols = [2, 6, 9]
query_names = col_names[query_cols]
Q = determine_Q(M_c, query_names, num_rows)
#
condition_cols = [3, 4, 10]
condition_names = col_names[condition_cols]
samples_list = []
engine = LE.LocalEngine(inf_seed)
for actual_row_idx in [1, 10, 100]:
    return Y


# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array([M_c['idx_to_name'][str(col_idx)]
                         for col_idx in range(num_cols)])

# initialize and transition the chains
engine = LE.LocalEngine(inf_seed)
X_L_list, X_D_list = engine.initialize(M_c, M_r, T, get_next_seed(),
                                       initialization='from_the_prior',
                                       n_chains=num_chains)
X_L_list, X_D_list = engine.analyze(M_c, T, X_L_list, X_D_list,
                                    get_next_seed(),
                                    n_steps=num_transitions)

# save the progress
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)

# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
# X_D_list = to_pickle['X_D_list']

engine = LE.LocalEngine(inf_seed)

# can we recreate a row given some of its values?
query_cols = [2, 6, 9]
query_names = col_names[query_cols]
Q = determine_Q(M_c, query_names, num_rows)
#
condition_cols = [3, 4, 10]
condition_names = col_names[condition_cols]
samples_list = []
for actual_row_idx in [1, 10, 100]:
if numpy.mod(num_rows, num_clusters) == 0 \
        and numpy.mod(num_cols, num_splits) == 0:
    timing_run_parameters = dict(num_rows=num_rows, num_cols=num_cols,
                                 num_views=num_splits,
                                 num_clusters=num_clusters)
    write_hadoop_input(input_filename, timing_run_parameters, n_steps,
                       SEED=gen_seed)

n_tasks = len(num_rows_list) * len(num_cols_list) * len(num_clusters_list) \
    * len(num_splits_list) * 5

# Create a dummy table data file
table_data = dict(T=[], M_c=[], X_L=[], X_D=[])
fu.pickle(table_data, table_data_filename)

if do_local:
    xu.run_script_local(input_filename, script_filename, output_filename,
                        table_data_filename)
    print('Local Engine for automated timing runs has not been completely '
          'implemented/tested')
elif do_remote:
    hadoop_engine = HE.HadoopEngine(which_engine_binary=which_engine_binary,
                                    output_path=output_path,
                                    input_filename=input_filename,
                                    table_data_filename=table_data_filename)
    xu.write_support_files(table_data, hadoop_engine.table_data_filename,
                           dict(command='time_analyze'),
hadoop_filename = args.hadoop_filename
table_filename = args.table_filename
pkl_filename = args.pkl_filename
initialize_input_filename = args.initialize_input_filename
initialize_output_filename = args.initialize_output_filename
analyze_input_filename = args.analyze_input_filename
n_steps = args.n_steps
n_chains = args.n_chains

if do_what == "read_and_pickle_table_data":
    read_and_pickle_table_data(table_filename, pkl_filename)
elif do_what == "write_initialization_files":
    write_initialization_files(initialize_input_filename, n_chains=n_chains)
elif do_what == "link_initialize_to_analyze":
    analyze_args_dict = default_analyze_args_dict.copy()
    analyze_args_dict["n_steps"] = n_steps
    link_initialize_to_analyze(initialize_output_filename,
                               analyze_input_filename,
                               analyze_args_dict)
elif do_what == "assert_vpn_is_connected":
    assert_vpn_is_connected()
elif do_what == "parse_hadoop_lines":
    assert hadoop_filename is not None
    parsed_lines = []
    with open(hadoop_filename) as fh:
        for line in fh:
            parsed_lines.append(parse_hadoop_line(line))
    print len(parsed_lines)
    if pkl_filename != default_table_data_filename:
        fu.pickle(parsed_lines, pkl_filename)
else:
    print "unknown do_what: %s" % do_what