def save_current_figure(filename, dir='./', close=True, format=None):
    if filename is not None:
        fu.ensure_dir(dir)
        full_filename = os.path.join(dir, filename)
        pylab.savefig(full_filename, format=format)
    if close:
        pylab.close()
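# Hedged usage sketch (not part of the original module): shows how
# save_current_figure might be called after drawing with pylab. The helper
# name, filename, and directory below are illustrative placeholders, and the
# example relies on the module-level pylab import used above.
def _example_save_current_figure():
    pylab.figure()
    pylab.plot([0, 1, 2, 3], [0, 1, 4, 9])
    pylab.title('example figure')
    # writes ./figures/example_figure.png, then closes the current figure
    save_current_figure('example_figure.png', dir='./figures/', format='png',
                        close=True)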
def send_hadoop_command(hdfs_uri, hdfs_dir, jobtracker_uri,
                        which_engine_binary, which_hadoop_binary,
                        which_hadoop_jar, input_filename,
                        table_data_filename, command_dict_filename,
                        output_path, n_tasks=1, one_map_task_per_line=True,
                        task_timeout=60000000, DEBUG=False):
    # make sure output_path doesn't exist
    rm_hdfs(hdfs_uri, output_path, hdfs_base_dir=hdfs_dir)
    # send up input
    put_hdfs(hdfs_uri, input_filename, hdfs_base_dir=hdfs_dir)
    # actually send
    hadoop_cmd_str = create_hadoop_cmd_str(
        hdfs_uri, hdfs_dir, jobtracker_uri,
        which_engine_binary, which_hadoop_binary, which_hadoop_jar,
        input_filename, table_data_filename, command_dict_filename,
        output_path, n_tasks, one_map_task_per_line, task_timeout)
    was_successful = None
    if DEBUG:
        print(hadoop_cmd_str)
        return hadoop_cmd_str
    else:
        fu.ensure_dir(output_path)
        output_path_dotdot = os.path.split(output_path)[0]
        out_filename = os.path.join(output_path_dotdot, 'out')
        err_filename = os.path.join(output_path_dotdot, 'err')
        redirect_str = '>>%s 2>>%s'
        redirect_str %= (out_filename, err_filename)
        # could I nohup and check hdfs for presence of _SUCCESS every N seconds?
        # cmd_str = ' '.join(['nohup', hadoop_cmd_str, redirect_str, '&'])
        cmd_str = ' '.join([hadoop_cmd_str, redirect_str])
        os.system(cmd_str)
    return
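# Hedged sketch (not in the original source) of the idea raised in the comment
# inside send_hadoop_command: launch the job in the background, then poll HDFS
# for the _SUCCESS marker Hadoop writes into the output directory on
# completion. The helper name, polling interval, and URI construction are
# assumptions made for illustration; 'hadoop fs -test -e <path>' is the
# standard CLI check for path existence (exit status 0 when it exists).
def _wait_for_hdfs_success(which_hadoop_binary, hdfs_uri, output_path,
                           poll_seconds=30, max_polls=240):
    import time
    success_uri = hdfs_uri + os.path.join(output_path, '_SUCCESS')
    test_cmd = ' '.join([which_hadoop_binary, 'fs', '-test', '-e', success_uri])
    for _ in range(max_polls):
        if os.system(test_cmd) == 0:
            return True
        time.sleep(poll_seconds)
    return False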
def savefig_legend_outside(filename, ax=None, bbox_inches='tight', dir='./'):
    if ax is None:
        ax = pylab.gca()
    lgd = ax.get_legend()
    fu.ensure_dir(dir)
    full_filename = os.path.join(dir, filename)
    pylab.savefig(full_filename, bbox_extra_artists=(lgd,),
                  bbox_inches=bbox_inches)
    return
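# Hedged usage sketch (not part of the original module): place a legend outside
# the axes with bbox_to_anchor, then call savefig_legend_outside so that
# bbox_inches='tight' plus bbox_extra_artists keeps the legend from being
# clipped. The helper name and output filename are illustrative placeholders.
def _example_savefig_legend_outside():
    pylab.figure()
    pylab.plot([0, 1, 2], [0, 1, 4], label='quadratic')
    pylab.plot([0, 1, 2], [0, 1, 2], label='linear')
    # put the legend to the right of the axes, outside the plot area
    pylab.legend(loc='upper left', bbox_to_anchor=(1.02, 1.0))
    savefig_legend_outside('example_legend.png', dir='./figures/')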
num_cols_list = args.num_cols_list
num_clusters_list = args.num_clusters_list
num_splits_list = args.num_splits_list
which_engine_binary = args.which_engine_binary
# print('using num_rows_list: %s' % num_rows_list)
print('using num_cols_list: %s' % num_cols_list)
print('using num_clusters_list: %s' % num_clusters_list)
print('using num_splits_list: %s' % num_splits_list)
print('using engine_binary: %s' % which_engine_binary)
time.sleep(2)

script_filename = 'hadoop_line_processor.py'
# some hadoop processing related settings
dirname = 'runtime_analysis'
fu.ensure_dir(dirname)
temp_dir = tempfile.mkdtemp(prefix='runtime_analysis_', dir=dirname)
print('using dir: %s' % temp_dir)
# table_data_filename = os.path.join(temp_dir, 'table_data.pkl.gz')
input_filename = os.path.join(temp_dir, 'hadoop_input')
output_filename = os.path.join(temp_dir, 'hadoop_output')
output_path = os.path.join(temp_dir, 'output')
parsed_out_file = os.path.join(temp_dir, 'parsed_output.csv')

# Hard code the parameter values for now
parameter_list = [num_rows_list, num_cols_list, num_clusters_list,
                  num_splits_list]

# Iterate over the parameter values and write each run as a line in the
# hadoop_input file
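# Hedged sketch (not the original implementation) of the loop described in the
# comment above: enumerate the cross product of the parameter lists and write
# one line per configuration into the hadoop_input file. The exact line format
# expected by hadoop_line_processor.py is not shown in this excerpt, so a
# simple "key<TAB>json-dict" layout is assumed here purely for illustration.
import itertools
import json

with open(input_filename, 'w') as fh:
    for key, (num_rows, num_cols, num_clusters, num_splits) in enumerate(
            itertools.product(*parameter_list)):
        config = dict(num_rows=num_rows, num_cols=num_cols,
                      num_clusters=num_clusters, num_splits=num_splits)
        fh.write('%s\t%s\n' % (key, json.dumps(config)))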