def analyze_experiment(info, experiments_dir, tmp_data_dir, date_str, exp_name):
    """Invoke an experiment's analyze.sh and archive its output.

    Runs the analysis script against the raw data collected in the temporary
    data dir; on success, stamps the produced data.json with the run
    timestamp and timing info and writes it to the permanent data dir.

    Returns True iff the analysis stage reported success.
    """
    script_dir = os.path.join(experiments_dir, exp_name)
    raw_data_dir = os.path.join(tmp_data_dir, exp_name)
    scratch_dir = os.path.join(raw_data_dir, 'analysis')
    idemp_mkdir(scratch_dir)

    dest_dir = info.exp_data_dir(exp_name)
    if not os.path.exists(dest_dir):
        idemp_mkdir(dest_dir)

    subprocess.call([os.path.join(script_dir, 'analyze.sh'),
                     info.exp_config_dir(exp_name), raw_data_dir, scratch_dir],
                    cwd=script_dir)

    status = validate_status(scratch_dir)
    # read the analyzed data, append a timestamp field, and copy over to the
    # permanent data dir
    if status['success']:
        if check_file_exists(scratch_dir, 'data.json'):
            # collect data to dump to data_*.json
            dump_data = {'timestamp': date_str}
            dump_data.update(read_json(scratch_dir, 'data.json'))
            # fetch time spent on the experiment
            dump_data.update(get_timing_info(info, exp_name))
            write_json(dest_dir, 'data_{}.json'.format(date_str), dump_data)
        else:
            status = {
                'success': False,
                'message': 'No data.json file produced by {}'.format(exp_name)
            }

    info.report_exp_status(exp_name, 'analysis', status)
    return status['success']
def run_experiment(info, experiments_dir, tmp_data_dir, exp_name):
    """Run a single experiment's run.sh, logging timing and status.

    Returns True iff the experiment reported success.
    """
    # fix: a lambda bound to a name (PEP 8 E731) — use a nested def instead
    def to_local_time(sec):
        return time.asctime(time.localtime(sec))

    exp_dir = os.path.join(experiments_dir, exp_name)
    exp_conf = info.exp_config_dir(exp_name)

    # set up a temporary data directory for that experiment
    exp_data_dir = os.path.join(tmp_data_dir, exp_name)
    idemp_mkdir(exp_data_dir)

    # Mark the start and the end of an experiment
    start_time = time.time()
    start_msg = f'Experiment {exp_name} starts @ {to_local_time(start_time)}'
    print_log(start_msg)

    # run the run.sh file on the configs directory and the destination directory
    subprocess.call([os.path.join(exp_dir, 'run.sh'), exp_conf, exp_data_dir],
                    cwd=exp_dir)

    end_time = time.time()
    delta = datetime.timedelta(seconds=end_time - start_time)

    # collect the status file from the destination directory, copy to status dir
    status = validate_status(exp_data_dir)

    # show experiment status to terminal
    if status['success']:
        end_msg = f'Experiment {exp_name} ends @ {to_local_time(end_time)}\nTime Delta: {delta}'
        print_log(end_msg)
    else:
        print_log(f'*** {exp_name} FAILED ***\n*** Reason: {status["message"]} ***')

    # record start & end & duration of an experiment
    status['start_time'] = to_local_time(start_time)
    status['end_time'] = to_local_time(end_time)
    status['time_delta'] = str(delta)

    # not literally copying because validate may have produced a status that generated an error
    info.report_exp_status(exp_name, 'run', status)
    return status['success']
def prepare_exp_data_pages(info, out_dir):
    """Dump each successfully analyzed experiment's data as a JSON page."""
    idemp_mkdir(out_dir)
    for exp in info.all_present_experiments():
        stage_statuses = info.exp_stage_statuses(exp)
        analysis_status = stage_statuses.get('analysis')
        # only experiments whose analysis stage ran and succeeded
        if analysis_status is None or not analysis_status['success']:
            continue

        all_exp_data = sort_data(info.exp_data_dir(exp))
        # customize the formatting here so that it's at
        # least somewhat human-readable (newest entries first)
        page_path = os.path.join(out_dir, '{}.json'.format(exp))
        with open(page_path, 'w') as f:
            json.dump(list(reversed(all_exp_data)), f, indent=1)
def set_up_out_dir(info, out_dir):
    """Populate the website output dir with graphs and per-experiment data pages."""
    idemp_mkdir(out_dir)

    web_graph_dir = os.path.join(out_dir, 'graph')
    web_data_dir = os.path.join(out_dir, 'data')

    # start from a clean slate for both generated trees
    for stale_dir in (web_graph_dir, web_data_dir):
        shutil.rmtree(stale_dir, ignore_errors=True)

    shutil.copytree(info.exp_graphs, web_graph_dir)
    if score_successful(info):
        # pull in the score subsystem's graph subdirectories as well
        score_graphs = os.path.join(info.subsys_output_dir('score'), 'graphs')
        for subdir in os.listdir(score_graphs):
            full_path = os.path.join(score_graphs, subdir)
            if os.path.isdir(full_path):
                shutil.copytree(full_path, os.path.join(web_graph_dir, subdir))

    prepare_exp_data_pages(info, web_data_dir)
def main(home_dir, experiments_dir, subsystem_dir, telemetry_script_dir):
    """
    Home directory: Where config info for experiments, etc., is
    Experiments directory: Where experiment implementations are
    Both should be given as absolute directories

    Returns 1 when the dashboard config is missing; otherwise runs all
    experiments and then all subsystems.
    """
    time_str = get_timestamp()

    if not check_file_exists(home_dir, 'config.json'):
        print('Dashboard config (config.json) is missing in {}'.format(home_dir))
        return 1
    dash_config = read_json(home_dir, 'config.json')

    # must expand all tildes in the config to avoid future errors
    for path_field in ['tmp_data_dir', 'setup_dir', 'backup_dir']:
        dash_config[path_field] = os.path.expanduser(dash_config[path_field])

    tmp_data_dir = os.path.join(dash_config['tmp_data_dir'],
                                'benchmarks_' + time_str)
    data_archive = os.path.join(dash_config['tmp_data_dir'],
                                'benchmarks_' + time_str + '_data.tar.gz')
    setup_dir = dash_config['setup_dir']
    backup_archive = os.path.join(dash_config['backup_dir'],
                                  'dashboard_' + time_str + '.tar.gz')

    idemp_mkdir(tmp_data_dir)
    idemp_mkdir(os.path.dirname(backup_archive))
    idemp_mkdir(setup_dir)

    info = DashboardInfo(home_dir)

    # make a backup of the previous dashboard files if they exist
    if os.path.exists(home_dir):
        subprocess.call(['tar', '-zcf', backup_archive, home_dir])

    # directories whose contents should not change between runs of the dashboard
    persistent_dirs = {info.exp_data, info.exp_configs,
                       info.subsys_configs, info.subsys_output}
    all_dashboard_dirs = info.all_experiment_dirs() + info.all_subsystem_dirs()

    # instantiate necessary dashboard dirs and clean any that should be empty
    for dashboard_dir in all_dashboard_dirs:
        if dashboard_dir not in persistent_dirs:
            subprocess.call(['rm', '-rf', dashboard_dir])
        idemp_mkdir(dashboard_dir)

    # optional config knobs; fix: dict.get replaces the manual
    # 'randomize' in dash_config membership check
    randomize_exps = dash_config.get('randomize', True)
    telemetry_rate = dash_config.get('telemetry_rate', 15)
    run_cpu_telemetry = dash_config.get('run_cpu_telemetry', False)
    run_gpu_telemetry = dash_config.get('run_gpu_telemetry', False)

    run_all_experiments(info, experiments_dir, setup_dir, tmp_data_dir,
                        data_archive, time_str, telemetry_script_dir,
                        run_cpu_telemetry=run_cpu_telemetry,
                        run_gpu_telemetry=run_gpu_telemetry,
                        telemetry_interval=telemetry_rate,
                        randomize=randomize_exps)

    run_all_subsystems(info, subsystem_dir, time_str)
def main(config_dir, home_dir, output_dir):
    """Generate CPU/GPU telemetry graphs for each eligible experiment.

    For every present experiment whose prerequisites and run stage
    succeeded, renders the most recent telemetry snapshot into the
    experiment's graph dirs and writes a per-experiment status.
    """
    info = DashboardInfo(home_dir)
    idemp_mkdir(output_dir)
    for exp_name in info.all_present_experiments():
        exp_status = info.exp_status_dir(exp_name)
        run_status = validate_json(exp_status, 'run_cpu_telemetry',
                                   'run_gpu_telemetry', filename='run.json')

        # skip experiments whose prerequisites or run stage did not succeed
        if (check_prerequisites(info, {exp_name: {}}) != (True, 'success')
                or not run_status.get('success', False)):
            continue

        telemetry_folder = info.subsys_telemetry_dir(exp_name)
        if not os.path.exists(telemetry_folder):
            write_status(output_dir, False, 'No telemetry data found')
            return

        exp_graph_folder = os.path.join(telemetry_folder, 'graph')
        cpu_data = sort_data(info.exp_cpu_telemetry(exp_name))
        gpu_data = sort_data(info.exp_gpu_telemetry(exp_name))
        # fix: was os.path.join(graph_folder) — a single-arg, no-op join
        website_include_dir = info.exp_graph_dir(exp_name)
        try:
            # graph only the most recent snapshot of each telemetry kind
            if cpu_data and run_status.get('run_cpu_telemetry', False):
                visualize(
                    'cpu', process_cpu_telemetry(cpu_data[-1]),
                    exp_graph_folder,
                    os.path.join(website_include_dir, 'cpu_telemetry'),
                    f'Visualizing CPU telemetry for {exp_name}',
                    lambda adapter, title, *rest: f'{adapter}-{title}')
            if gpu_data and run_status.get('run_gpu_telemetry', False):
                visualize(
                    'gpu', process_gpu_telemetry(gpu_data[-1]),
                    exp_graph_folder,
                    os.path.join(website_include_dir, 'gpu_telemetry'),
                    f'Visualizing GPU telemetry for {exp_name}',
                    lambda _, title, *rest: title)
        except Exception as e:
            write_status(output_dir, False,
                         f'Encountered err while generating graphs: {e}')
            return
        write_status(output_dir, True, 'success')
def setup_experiment(info, experiments_dir, setup_dir, exp_name):
    """Run an experiment's setup.sh from a clean per-experiment setup dir.

    Returns True iff setup reported success.
    """
    exp_dir = os.path.join(experiments_dir, exp_name)
    exp_setup_dir = os.path.join(setup_dir, exp_name)

    # remove the existing setup dir before running the script again
    subprocess.call(['rm', '-rf', exp_setup_dir])
    idemp_mkdir(exp_setup_dir)

    setup_script = os.path.join(exp_dir, 'setup.sh')
    subprocess.call([setup_script, info.exp_config_dir(exp_name), exp_setup_dir],
                    cwd=exp_dir)

    status = validate_status(exp_setup_dir)
    info.report_exp_status(exp_name, 'setup', status)

    succeeded = status['success']
    # if setup succeeded, touch a marker file so we know what time to check for changes
    if succeeded:
        subprocess.call(['touch', '.last_setup'], cwd=exp_setup_dir)
    return succeeded
def visualize(device, data, exp_graph_dir, website_copy_dir, msg='',
              get_title=lambda *arg: '-'.join(arg)):
    """Render one telemetry snapshot as a set of graphs.

    Args:
        device: device label ('cpu' or 'gpu'), used as a subdirectory name.
        data: sequence whose first element is a timestamp string and whose
            remaining elements are (adapter, title, unit, series) tuples —
            TODO confirm the exact shape against the telemetry parsers.
        exp_graph_dir: root dir under which <ts>/<device> graph dirs are made.
        website_copy_dir: extra dir each graph is copied into.
        msg: progress message printed before graphing.
        get_title: callable mapping a reading tuple to the graph's title.
    """
    # fix: the original unpacked the tail back into `data` and then also used
    # `data` as the loop target — three meanings for one name
    ts, *readings = data
    current_ts_dir = os.path.join(exp_graph_dir, ts)
    graph_dir = os.path.join(current_ts_dir, device)
    idemp_mkdir(graph_dir)
    idemp_mkdir(website_copy_dir)
    print(msg)
    for adapter, title, unit, series in readings:
        generate_graph(ts, get_title(adapter, title, unit, series), title,
                       series, graph_dir,
                       y_label=unit if unit else '',
                       copy_to=[website_copy_dir])
def run_subsystem(info, subsystem_dir, subsys_name):
    """Execute one subsystem's run.sh and record the resulting status.

    Returns True iff the subsystem reported success.
    """
    subsys_dir = os.path.join(subsystem_dir, subsys_name)
    output_dir = info.subsys_output_dir(subsys_name)
    idemp_mkdir(output_dir)

    # remove the old status if one is hanging around
    # (subsystem output dirs remain around between runs)
    if check_file_exists(output_dir, 'status.json'):
        stale_status = os.path.join(output_dir, 'status.json')
        subprocess.call(['rm', '-f', stale_status])

    # run the run.sh file on the configs directory and the output directory
    run_script = os.path.join(subsys_dir, 'run.sh')
    subprocess.call([run_script, info.subsys_config_dir(subsys_name),
                     info.home_dir, output_dir],
                    cwd=subsys_dir)

    # collect the status file from the destination directory, copy to status dir
    status = validate_status(output_dir)
    # not literally copying because validate may have produced a status that generated an error
    info.report_subsys_status(subsys_name, 'run', status)
    return status['success']
def main(interval, output_dir, exp_name, run_cpu_telemetry, run_gpu_telemetry):
    '''Poll CPU/GPU telemetry for one experiment indefinitely.

    # directory structure:
    # ./output_dir
    # -> telemetry
    #    -> char_rnn
    #    -> treelstm ...

    interval, run_cpu_telemetry, and run_gpu_telemetry arrive as strings
    (command-line arguments); the flags are the literal 'True'/'False'.
    '''
    out_dir = os.path.join(output_dir, 'telemetry')
    log_dir = os.path.join(out_dir, exp_name)
    idemp_mkdir(os.path.join(log_dir, 'cpu'))
    idemp_mkdir(os.path.join(log_dir, 'gpu'))

    nvidia_fields = 'timestamp,clocks.gr,clocks.current.memory,utilization.gpu,utilization.memory,memory.used,pstate,power.limit,temperature.gpu,fan.speed'.split(',')

    # fix: hoist the loop-invariant string-flag comparisons out of the
    # infinite loop instead of re-evaluating them every iteration
    cpu_enabled = run_cpu_telemetry == 'True'
    gpu_enabled = run_gpu_telemetry == 'True'

    # initial sample; NOTE(review): this duplicates the time_run == 0 sample
    # taken by the first loop iteration — presumably an intentional warm-up
    start_job(log_dir, nvidia_fields, int(interval), 0, cpu_enabled, gpu_enabled)

    time_run = 0
    interval = float(interval)
    while True:
        start_job(log_dir, nvidia_fields, interval, time_run,
                  cpu_enabled, gpu_enabled)
        time_run += 1
        time.sleep(interval)
def process_telemetry_statistics(info, exp_name, output_dir, time_str,
                                 cpu_stat_parser=parse_cpu_stat,
                                 gpu_stat_parser=parse_gpu_stat):
    '''
    Collect data of telemetry statistics and write to results directory

    Note: The "parsing" logic procedure written in this file is specialized
    to deal with telemetry collected at pipsqueak. They are not guaranteed to
    work on other platforms.
    '''
    telemetry_output_dir = info.subsys_telemetry_dir(exp_name)
    if not os.path.exists(telemetry_output_dir):
        idemp_mkdir(telemetry_output_dir)

    data_dir = os.path.join(output_dir, f'telemetry/{exp_name}')
    # same parse-and-dump step for both devices
    for device, parser in (('gpu', gpu_stat_parser), ('cpu', cpu_stat_parser)):
        raw_dir = os.path.join(data_dir, device)
        write_json(os.path.join(telemetry_output_dir, device),
                   f'{device}-{time_str}.json',
                   parser(raw_dir, time_str))
def main(config_dir, home_dir, output_dir):
    """Compute and report the configured dashboard score metrics.

    Returns 0 on success (including "nothing to score"), 1 on any failure.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)

    data_dir = os.path.join(output_dir, 'data')
    graph_dir = os.path.join(output_dir, 'graphs')
    idemp_mkdir(data_dir)
    idemp_mkdir(graph_dir)

    timestamp = get_timestamp()

    score_confs = conf['score_confs']
    # only score metrics that are both configured and implemented
    metrics = set(score_confs.keys()).intersection(set(SCORE_METRICS.keys()))
    if not metrics:
        write_status(output_dir, True, 'No scores to report')
        return 0

    # fix: dropped the unused `score_data = {}` local
    score_reports = {}
    for metric in metrics:
        score_metric = SCORE_METRICS[metric](score_confs[metric])
        valid, msg = check_prerequisites(info, score_metric.prereq())
        if not valid:
            write_status(output_dir, False, msg)
            return 1

        score_data_dir = os.path.join(data_dir, metric)
        score_graph_dir = os.path.join(graph_dir, metric)
        idemp_mkdir(score_data_dir)
        idemp_mkdir(score_graph_dir)

        try:
            report = process_score(info, score_metric, score_data_dir,
                                   score_graph_dir, timestamp)
            score_reports[metric] = report
        except Exception as e:
            write_status(
                output_dir, False,
                'Encountered exception while scoring {}:\n{}'.format(
                    metric, render_exception(e)))
            return 1

    report = {'title': 'Metric Scores', 'value': format_scores(score_reports)}
    write_json(output_dir, 'report.json', report)
    write_status(output_dir, True, 'success')
    # fix: the success path previously fell off the end returning None,
    # unlike every other exit which returns an explicit 0/1
    return 0
def export_mxnet_model(cell_type, setup_dir):
    """Build, hybridize, and export an MXNet RNN model of the given cell type.

    Args:
        cell_type: one of 'rnn', 'gru', or 'lstm'.
        setup_dir: directory under which the 'mxnet' export dir is created.

    Returns the exported model's filename prefix.

    Raises:
        ValueError: for an unsupported cell_type (previously this fell
            through and crashed later with a NameError on `states`).
    """
    # batch and seq_len are placeholder, and don't affect the exported model
    ctx = mx.context.cpu()
    dtype = 'float32'
    model = RNNModel(cell_type, INPUT_SIZE, HIDDEN_SIZE)
    if cell_type in ('rnn', 'gru'):
        states = [mx.nd.zeros((BATCH, HIDDEN_SIZE), dtype=dtype, ctx=ctx)]
    elif cell_type == 'lstm':
        # LSTM cells carry both a hidden state and a cell state
        states = [
            mx.nd.zeros((BATCH, HIDDEN_SIZE), dtype=dtype, ctx=ctx),
            mx.nd.zeros((BATCH, HIDDEN_SIZE), dtype=dtype, ctx=ctx)
        ]
    else:
        raise ValueError('Unsupported cell_type: {}'.format(cell_type))

    xs = mx.nd.random.uniform(shape=(SEQ_LEN, BATCH, INPUT_SIZE),
                              dtype=dtype, ctx=ctx)
    model.collect_params().initialize(ctx=ctx)
    model.hybridize()
    # run one forward pass so the hybridized graph is built before export
    model(xs, states)

    idemp_mkdir(os.path.join(setup_dir, 'mxnet'))
    fname = os.path.join(setup_dir, 'mxnet', model_filename(cell_type))
    model.export(fname, epoch=1)
    print('Export MXNet model to %s' % fname)
    return fname