def run_trials(method, task_name, dry_run, times_per_input, n_input,
               trial, trial_setup, trial_teardown,
               parameter_names, parameter_ranges,
               path_prefix='', append_to_csv=False):
    """
    Runs `trial` over the cartesian product of `parameter_ranges` and
    records the results in `<path_prefix>/<method>-<task_name>.csv`.

    For every parameter combination, `n_input` inputs are processed: each
    one calls `trial_setup(*args)`, hands the result to `_score_loop`
    (which receives the CSV writer) and finally calls `trial_teardown`
    on the setup result. There is a 4 second pause between consecutive
    inputs of the same combination.

    The CSV header is `parameter_names + ['rep', 'run', 'time']`; it is
    only written when `append_to_csv` is false (the file is then
    truncated, otherwise it is appended to).

    Returns a (success, message) tuple; exceptions are rendered into the
    message instead of being propagated.
    """
    try:
        filename = os.path.join(path_prefix,
                                '{}-{}.csv'.format(method, task_name))
        # With the default path_prefix ('') there is no directory part and
        # os.makedirs('') would raise, so only create a directory when one
        # is actually named. exist_ok avoids the check-then-create race.
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        mode = 'a' if append_to_csv else 'w'
        with open(filename, mode, newline='') as csvfile:
            fieldnames = parameter_names + ['rep', 'run', 'time']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if not append_to_csv:
                writer.writeheader()

            for args in product(*parameter_ranges):
                costs = []
                for t in range(n_input):
                    score = 0.0
                    try:
                        trial_args = trial_setup(*args)
                        score = _score_loop(t, trial, trial_args, list(args),
                                            times_per_input, dry_run,
                                            writer, fieldnames)
                        trial_teardown(*trial_args)
                    except Exception as e:
                        # can provide more detailed summary if
                        # it happened inside a trial
                        return (False,
                                'Encountered exception in trial on inputs {}:\n'.format(args)
                                + render_exception(e))

                    # pause between inputs, but not after the last one
                    if t != n_input - 1:
                        time.sleep(4)
                    costs.append(score)

                print(method, task_name, args, ["%.6f" % x for x in costs])
        return (True, 'success')
    except Exception as e:
        return (False, 'Encountered exception:\n' + render_exception(e))
def main(config_dir, output_dir):
    """
    Validates the config, honours the optional early-exit hook, seeds the
    run and then executes the generated trials. The outcome is always
    recorded through `write_status`.

    Returns 0 on success (including an early exit or no trial being
    defined) and 1 on any failure.
    """
    try:
        config, msg = validate_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        # an experiment may provide a hook that short-circuits the run
        if check_early_exit is not None:
            should_stop, msg = check_early_exit(config)
            if should_stop:
                write_status(output_dir, True, msg)
                return 0

        configure_seed(config)

        if gen_trial_params is None:
            write_status(output_dir, True, 'No trial to run')
            return 0

        ok, msg = run_trials(*gen_trial_params(config),
                             path_prefix=output_dir)
        write_status(output_dir, ok, msg)
        if ok:
            return 0
        return 1
    except Exception as err:
        write_status(output_dir, False, render_exception(err))
        return 1
def eval_command(model, exp_config, config, config_dir, output_dir, cmd_id):
    """
    Runs every configuration that `unfold_settings` produces for one
    experiment command.

    For commands of kind 'ratio' a baseline run is performed first and its
    result is stored in `exp_config['memory_budget']` (the dict passed in
    is modified in place).

    Returns (True, 'success') if all configurations ran, otherwise
    (False, message) for the first failure. Exceptions are rendered into
    the message rather than propagated.
    """
    try:
        if exp_config.get('kind') == 'ratio':
            success, result = run_baseline(model, exp_config, config,
                                           config_dir, output_dir)
            if not success:
                return False, result

            # the actual memory budget calculation is
            # in `unfold_settings`
            exp_config['memory_budget'] = result

        # conf_cnt identifies the configuration within this command
        for conf_cnt, combo in enumerate(unfold_settings(exp_config)):
            success, msg = run_trials(config_dir,
                                      python_command(combo['type'], config),
                                      combo['type'], model, combo,
                                      config['n_inputs'], config['n_reps'],
                                      output_dir,
                                      report_errors=config['report_errors'],
                                      append_to_csv=False,
                                      trial_run=False,
                                      cmd_id=cmd_id,
                                      conf_cnt=conf_cnt)
            if not success:
                return False, msg
        return True, 'success'
    except Exception as e:
        return (False,
                'Encountered outer iteration exception:\n' + render_exception(e))
def write_generic_summary(data_dir, output_dir, title, devices,
                          networks=None, use_networks=False):
    """
    Summarizes the last entry returned by `sort_data(data_dir)` and writes
    both the summary and a status to `output_dir`.

    The entry is summarized per device, or per device and network when
    `use_networks` is set. Any exception is reported as a failed status
    instead of being raised.
    """
    try:
        latest = sort_data(data_dir)[-1]
        if use_networks:
            summary = summary_by_dev_and_network(latest, devices, networks)
        else:
            summary = summary_by_dev(latest, devices)
        write_summary(output_dir, title, summary)
        write_status(output_dir, True, 'success')
        # TODO do something about comparisons to previous days
    except Exception as err:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(err))
def post_message(client, channel, message, **kargs):
    """
    Posts `message` to one Slack channel or to each channel in a list.

    Returns (success, responses, text): `responses` holds one
    `chat_postMessage` result per channel, or is None when an exception
    occurred (the rendered exception is then part of `text`).
    """
    try:
        # treat a single channel as a one-element list
        targets = channel if isinstance(channel, list) else [channel]
        responses = [
            client.chat_postMessage(
                channel=target,
                text=message.get('text', '*No Message Content*'),
                attachments=message.get('attachments', ''),
                **kargs)
            for target in targets
        ]
        return (True, responses, 'success')
    except Exception as err:
        return (False, None,
                'Encountered exception:\n' + render_exception(err))
def main(data_dir, config_dir, output_dir):
    """
    Parses the data file of every (model, command, configuration) and
    writes the collected statistics to `data.json` in `output_dir`.

    A status is always written to `output_dir`. Returns 1 on any failure
    (invalid config, unparsable data file, or unexpected exception) and
    None on success.
    """
    try:
        config, msg = validate(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        summary = {}
        for model in sorted(config['models']):
            summary[model] = []
            # the script will not be run if there is an error
            cmd_id = 0
            for _, _, exp_config in parse_commands(model, config):
                for combo in unfold_settings(exp_config):
                    stats, msg = parse_data_file(exp_config['type'], model,
                                                 config, combo, data_dir,
                                                 cmd_id)
                    if stats is None:
                        write_status(output_dir, False, msg)
                        return 1
                    stats['command_id'] = cmd_id
                    summary[model].append(stats)
                # one id per command, shared by all of its configurations
                cmd_id += 1
        write_json(output_dir, 'data.json', summary)
        write_status(output_dir, True, 'success')
    except Exception as e:
        write_status(output_dir, False, render_exception(e))
        # report failure through the return code as well, like the
        # explicit failure paths above
        return 1
def main(config_dir, setup_dir):
    """
    Exports the 'rnn', 'gru' and 'lstm' MxNet models into `setup_dir` and
    writes a status there.

    Returns 1 if an export raised (after writing a failed status) and
    None on success.
    """
    try:
        for network in ('rnn', 'gru', 'lstm'):
            export_mxnet_model(network, setup_dir)
        write_status(setup_dir, True, 'success')
    except Exception as e:
        write_status(setup_dir, False, render_exception(e))
        # make the failure visible in the return code too
        return 1
def post_message(webhook_url, message):
    """
    Attempts posting the given message object to the Slack webhook URL.

    Returns (success, text). A non-2xx HTTP response counts as a failure,
    just like a connection error; in both cases `text` contains the
    rendered exception.
    """
    try:
        r = requests.post(webhook_url, json=message)
        # requests does not raise on 4xx/5xx by itself; without this
        # check a rejected message would be reported as delivered
        r.raise_for_status()
        return (True, 'success')
    except Exception as e:
        return (False, 'Encountered exception:\n' + render_exception(e))
def render_fixed(model_name, output_dir, x_axis, dtr_entries, failed_trials):
    """
    Renders the GPU time of a model against its memory budget and saves
    the figure as `<display name>-fixed-gpu-time.png` in `output_dir`.

    `dtr_entries` are plotted over `x_axis`; if there are failed trials, a
    dashed vertical line marks the largest failed budget.

    Returns (True, message). Nothing is drawn when both `dtr_entries` and
    `failed_trials` are empty. Exceptions raised while plotting propagate
    to the caller (the previous handler only re-raised them and had an
    unreachable return after the raise).
    """
    if not (dtr_entries or failed_trials):
        return (True, 'nothing to render')

    display_name = name_dict.get(model_name, model_name)
    color = color_scheme.get(model_name, 'black')
    filename = prepare_out_file(output_dir,
                                f'{display_name}-fixed-gpu-time.png')

    plt.clf()
    plt.style.use('seaborn-paper')
    plt.rcParams["font.size"] = 30
    fig = plt.figure()
    fig.add_subplot(111, frameon=False)
    fig.set_size_inches(12, 7)
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)
    plt.xlabel('Memory Budget (MB)', fontsize=15, labelpad=10)
    plt.ylabel(r'Compute Time (ms)', fontsize=15, labelpad=10)
    plt.title(f'{display_name} GPU Time', fontsize=18)
    plt.grid(True)

    ax = plt.gca()
    if dtr_entries:
        # the same data is drawn twice: a solid line plus translucent markers
        lin, = ax.plot(x_axis, dtr_entries, color=color, linewidth=4)
        mk, = ax.plot(x_axis, dtr_entries, label=display_name,
                      linewidth=4,
                      marker=marker_scheme.get(model_name, '+'),
                      ms=12, alpha=.6, color=color)
        ax.legend([(lin, mk)], ['merged'])

    if failed_trials:
        plt.axvline(x=max(failed_trials), color=color, linestyle='dashed')

    plt.legend(bbox_to_anchor=(0.5, 0.01), loc='lower center',
               bbox_transform=fig.transFigure, ncol=7,
               borderaxespad=0, prop={'size': 15})
    plt.tight_layout()
    plt.savefig(filename, bbox_inches='tight')
    return (True, 'success')
def main(config_dir, home_dir, output_dir):
    """
    Generates one pass-comparison graph per device from the last entry of
    the sorted 'pass_comparison' experiment data.

    For every pass combination and network the plotted value is the
    baseline ('0;') measurement divided by the measurement for that pass
    combination.

    Writes a status to `output_dir`. Returns 1 when the prerequisites are
    not met or graph generation raises, and None on success.
    """
    info = DashboardInfo(home_dir)
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    pass_spec_name_map = {
        '3;FuseOps': 'Op Fusion',
        '3;FoldConstant|FuseOps': '... + Constant Folding',
        '3;EliminateCommonSubexpr|FoldConstant|FuseOps': '... + Common Subexpr Elim',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldConstant|FuseOps': '... + Parallel Conv Comb',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|FoldConstant|FuseOps': '... + Axis Scale Folding',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|FoldConstant|FuseOps': '... + Cast Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|FoldConstant|FuseOps': '... + Op Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|AlterOpLayout|FoldConstant|FuseOps': '... + Op Layout Alteration'
    }

    prereqs, msg = check_prerequisites(info, {
        'pass_comparison': {
            'networks': networks,
            'passes': [
                parse_combo(combo) for combo in pass_spec_name_map.keys()
            ]
        }
    })
    # the prerequisite result used to be ignored; bail out with a proper
    # status instead of failing later on missing data
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    all_data = sort_data(info.exp_data_dir('pass_comparison'))
    raw_data = all_data[-1]

    baseline = '0;'

    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    # drop the metadata fields so that only per-device entries remain
    del raw_data['timestamp']
    del raw_data['tvm_hash']

    try:
        for (dev, raw_dev_data) in raw_data.items():
            plot_data = OrderedDict([
                (pass_spec_name_map[pass_spec], {
                    network_name_map[network]:
                    raw_dev_data[baseline][network] / raw_dev_data[pass_spec][network]
                    for network in networks})
                for pass_spec in pass_spec_name_map.keys()
            ])
            generate_pass_comparisons(plot_data, output_dir,
                                      f'pass-comp-{dev}.png')
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
def main(config_dir, home_dir, output_dir):
    """
    Generates the vision comparison graph from the last 'gpu' entry of the
    sorted 'cnn_comp' experiment data.

    Each plotted value is another framework's measurement divided by the
    'Relay' measurement for the same network. The display name defaults
    to 'Relay' and can be overridden with 'our_name' in the config.

    Writes a status to `output_dir`; returns 1 on failure, None on success.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)
    our_name = conf['our_name'] if 'our_name' in conf else 'Relay'

    conf_fws = ['relay', 'pt', 'tf', 'mxnet', 'nnvm']
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    requirements = {
        'cnn_comp': {
            'devices': ['gpu'],
            'use_xla': True,
            'networks': networks,
            'frameworks': conf_fws
        }
    }
    prereqs, msg = check_prerequisites(info, requirements)
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    raw_data = sort_data(info.exp_data_dir('cnn_comp'))[-1]['gpu']

    our_fw = 'Relay'
    other_fws = ['TensorFlow', 'Pytorch', 'MxNet', 'NNVM', 'TF XLA']
    # display names match the data keys, except for the PyTorch spelling
    fw_name_map = {fw: fw for fw in other_fws}
    fw_name_map['Pytorch'] = 'PyTorch'

    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    plot_data = OrderedDict()
    for fw in other_fws:
        ratios = {}
        for network in networks:
            ratios[network_name_map[network]] = (
                raw_data[fw][network] / raw_data[our_fw][network])
        plot_data[fw_name_map[fw]] = ratios

    try:
        generate_vision_comparisons(our_name, plot_data, output_dir)
    except Exception as err:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(err))
        return 1

    write_status(output_dir, True, 'success')
def render_field(model_name, output_dir, title, filename, x_label, y_label,
                 x_axis, baseline_entries, dtr_entries, failed_trials,
                 confidence=None, suptitle=''):
    """
    Draws one model's entries onto the current matplotlib axes.

    `dtr_entries` are plotted over `x_axis` (with error bars when
    `confidence` is given); if there are failed trials, a dashed vertical
    line marks the largest one. The figure is not saved here.

    Returns (True, message). Nothing is drawn when `dtr_entries`,
    `baseline_entries` and `failed_trials` are all empty. Exceptions
    raised while plotting propagate to the caller (the previous handler
    only re-raised them and had an unreachable return after the raise).
    """
    if not (dtr_entries or baseline_entries or failed_trials):
        return (True, 'nothing to render')

    # the result is unused because saving is disabled, but the call is
    # kept in case prepare_out_file has side effects callers rely on
    prepare_out_file(output_dir, filename)

    color = COLOR_SCHEME.get(model_name, 'black')
    ax = plt.gca()
    if dtr_entries:
        # the same data is drawn twice: a solid line plus translucent markers
        lin, = ax.plot(x_axis, dtr_entries, color=color, linewidth=4)
        mk, = ax.plot(x_axis, dtr_entries,
                      label=NAME_DICT.get(model_name, model_name),
                      linewidth=4,
                      marker=MARKER_SCHEME.get(model_name, '+'),
                      ms=12, alpha=.6, color=color)
        if confidence:
            render_errorbars(ax, x_axis, dtr_entries, confidence)
        ax.legend([(lin, mk)], ['merged'])

    if failed_trials:
        plt.axvline(x=max(failed_trials), color=color, linestyle='dashed')

    return (True, 'success')
def main(data_dir, config_dir, output_dir):
    """
    Parses the data file of every (model, command, configuration) and
    writes the collected statistics to `data.json` in `output_dir`.

    Baseline configurations are remembered per model and batch size, and
    passed to `parse_data_file` for later non-baseline configurations
    with the same batch size.

    A status is always written to `output_dir`. Returns 1 on any failure
    (invalid config, unparsable data file, or unexpected exception) and
    None on success.
    """
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        summary = {}
        baseline_dict = {}

        for model in sorted(config['models']):
            summary[model] = []
            baseline_dict[model] = {}
            # the script will not be run if there is an error
            cmd_id = 0
            for _, _, exp_config in parse_commands(model, config):
                for specific_params in unfold_settings(exp_config):
                    batch_size = specific_params['batch_size']
                    is_baseline = specific_params['type'] == 'baseline'
                    if is_baseline:
                        baseline_dict[model][batch_size] = {
                            'type': 'baseline',
                            'specific_params': specific_params,
                            'cmd_id': cmd_id
                        }

                    # if there is a corresponding baseline,
                    # let's match using the dict
                    baseline_params = None
                    if batch_size in baseline_dict[model] and not is_baseline:
                        baseline_params = baseline_dict[model][batch_size]

                    stats, msg = parse_data_file(
                        exp_config['type'], model, config, specific_params,
                        data_dir, cmd_id, baseline_params=baseline_params)
                    if stats is None:
                        write_status(output_dir, False, msg)
                        return 1
                    stats['command_id'] = cmd_id
                    summary[model].append(stats)
                # one id per command, shared by all of its configurations
                cmd_id += 1
        write_json(output_dir, 'data.json', summary)
        write_status(output_dir, True, 'success')
    except Exception as e:
        write_status(output_dir, False, render_exception(e))
        # report failure through the return code as well, like the
        # explicit failure paths above
        return 1
def process_score(info, score_metric, data_dir, graph_dir, timestamp):
    """
    Computes a score, stores it as `data_<timestamp>.json` in `data_dir`,
    renders the score graphs into `graph_dir` and returns the textual
    report produced by `score_metric.score_text`.

    Failures while graphing are printed and otherwise ignored; failures
    while computing or writing the score propagate to the caller.
    """
    data = score_metric.compute_score(info)
    data['timestamp'] = timestamp
    write_json(data_dir, 'data_{}.json'.format(timestamp), data)

    # graphs failing is not a fatal error, just an inconvenience
    try:
        score_metric.score_graph(data, graph_dir)
        all_data = sort_data(data_dir)
        score_metric.longitudinal_graphs(all_data, graph_dir)
    except Exception as e:
        print(render_exception(e))

    # Deliberately not inside a `finally`: returning from `finally` would
    # silently discard KeyboardInterrupt/SystemExit raised while graphing.
    return score_metric.score_text(data)
def upload_image(client, channels, file_path, description, **kargs):
    """
    Uploads the file at `file_path` through `client.files_upload`, using
    `description` as its title.

    `channels` may be a single channel or a list, which is joined with
    commas. Returns (success, response, text); the response is None when
    an exception occurred.
    """
    try:
        destination = channels
        if isinstance(destination, list):
            destination = ','.join(destination)
        response = client.files_upload(channels=destination,
                                       file=file_path,
                                       title=description,
                                       **kargs)
        return (True, response, 'success')
    except Exception as err:
        return (False, None,
                'Encountered exception:\n' + render_exception(err))
def main(data_dir, config_dir, output_dir):
    """
    Renders the graphs for the last entry of the sorted data in `data_dir`
    and writes the resulting status to `output_dir`.

    Returns 1 on any failure (invalid config, `render_graph` reporting
    failure, or an exception) and None on success. The current matplotlib
    figure is closed in every case.
    """
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        all_data = sort_data(data_dir)
        most_recent = all_data[-1]
        success, msg = render_graph(config, most_recent, output_dir)
        write_status(output_dir, success, msg)
        # a failed render is a failure of this script as well
        if not success:
            return 1
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered: ' + render_exception(e))
        return 1
    finally:
        plt.close()
def main(config_dir, home_dir, output_dir):
    """
    Computes every configured score metric and writes a combined report.

    Only metrics that appear both in the config's 'score_confs' and in
    SCORE_METRICS are processed. Each gets its own subdirectory under
    `<output_dir>/data` and `<output_dir>/graphs`. The formatted reports
    are written to `report.json`.

    Returns 0 when there is nothing to report, 1 on failure and None after
    a successful run; a status is written in every case.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)

    data_dir = os.path.join(output_dir, 'data')
    graph_dir = os.path.join(output_dir, 'graphs')
    for directory in (data_dir, graph_dir):
        idemp_mkdir(directory)

    timestamp = get_timestamp()

    score_confs = conf['score_confs']
    metrics = set(score_confs.keys()).intersection(set(SCORE_METRICS.keys()))
    if not metrics:
        write_status(output_dir, True, 'No scores to report')
        return 0

    score_reports = {}
    for metric in metrics:
        score_metric = SCORE_METRICS[metric](score_confs[metric])
        valid, msg = check_prerequisites(info, score_metric.prereq())
        if not valid:
            write_status(output_dir, False, msg)
            return 1

        metric_data_dir = os.path.join(data_dir, metric)
        metric_graph_dir = os.path.join(graph_dir, metric)
        idemp_mkdir(metric_data_dir)
        idemp_mkdir(metric_graph_dir)

        try:
            score_reports[metric] = process_score(info, score_metric,
                                                  metric_data_dir,
                                                  metric_graph_dir,
                                                  timestamp)
        except Exception as err:
            failure = 'Encountered exception while scoring {}:\n{}'.format(
                metric, render_exception(err))
            write_status(output_dir, False, failure)
            return 1

    write_json(output_dir, 'report.json', {
        'title': 'Metric Scores',
        'value': format_scores(score_reports)
    })
    write_status(output_dir, True, 'success')
def main(data_dir, config_dir, output_dir):
    """
    Summarizes the last entry of the sorted data in `data_dir` and writes
    the summary under the title 'Pareto Curve Trial'.

    A status is written to `output_dir`; returns 1 on failure and None on
    success.
    """
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        latest = sort_data(data_dir)[-1]
        write_summary(output_dir, 'Pareto Curve Trial',
                      summarize(config, latest))
        write_status(output_dir, True, 'success')
    except Exception as err:
        write_status(output_dir, False,
                     'Exception encountered: ' + render_exception(err))
        return 1
def main(data_dir, config_dir, output_dir):
    """
    Generates the longitudinal graphs over all data in `data_dir` and the
    ARM/VTA comparison for the last entry of the sorted data.

    A status is written to `output_dir`; returns 1 on failure and None on
    success.
    """
    config, msg = validate(config_dir)
    if config is None:
        write_status(output_dir, False, msg)
        return 1

    try:
        # read in data, output graphs of most recent data, and output
        # longitudinal graphs; loading happens inside the try so that
        # missing or empty data yields a failed status instead of a crash
        all_data = sort_data(data_dir)
        most_recent = all_data[-1]

        generate_longitudinal_comparisons(all_data, output_dir)
        generate_arm_vta_comparisons(most_recent, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
def trials_stat_summary(data_dir, framework, task_name, num_reps,
                        parameter_names, params_to_match):
    """
    Summarizes the statistics of the given framework and task over all
    reps, restricted to the rows whose parameters match `params_to_match`.

    Returns (summary, success, message); on failure the summary is -1 and
    the message contains the rendered exception.
    """
    try:
        rows = obtain_data_rows(data_dir, framework, task_name,
                                parameter_names, params_to_match)
        return (summarize_over_reps(rows, num_reps), True, 'success')
    except Exception as err:
        failure = 'Encountered exception on {}, {} using params {}:\n{}'.format(
            framework, task_name, params_to_match, render_exception(err))
        return (-1, False, failure)
def main(config_dir, home_dir, output_dir):
    """
    Generates the optimization-level comparison graph from the last 'gpu'
    entry of the sorted 'relay_opt' experiment data.

    Each plotted value is the 'O0' measurement divided by the measurement
    at the given optimization level for the same network.

    Writes a status to `output_dir`; returns 1 on failure, None on success.
    """
    info = DashboardInfo(home_dir)

    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    requirements = {
        'relay_opt': {
            'devices': ['gpu'],
            'opt_levels': [0, 1, 2, 3, 4],
            'networks': networks
        }
    }
    prereqs, msg = check_prerequisites(info, requirements)
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    raw_data = sort_data(info.exp_data_dir('relay_opt'))[-1]['gpu']

    baseline = 'O0'
    opts = ['O1', 'O2', 'O3', 'O4']

    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    plot_data = OrderedDict()
    for opt in opts:
        ratios = {}
        for network in networks:
            ratios[network_name_map[network]] = (
                raw_data[baseline][network] / raw_data[opt][network])
        plot_data[opt] = ratios

    try:
        generate_opt_comparisons(plot_data, output_dir)
    except Exception as err:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(err))
        return 1

    write_status(output_dir, True, 'success')
def eval_command(model, exp_config, config, config_dir, output_dir, cmd_id):
    """
    Runs every configuration that `unfold_settings` produces for one
    experiment command.

    For commands of kind 'ratio' a baseline run is performed first; its
    result is stored in `exp_config['memory_budget']`, and a sampling
    cutoff given as a ratio (other than -1) is converted into
    `exp_config['no_sampling_below_budget']`. The dict is modified in
    place.

    Returns (True, 'success') if all configurations ran, otherwise
    (False, message) for the first failure.
    """
    try:
        if exp_config.get('kind') == 'ratio':
            baseline_ok, baseline_result = run_baseline(
                model, exp_config, config, config_dir, output_dir)
            if not baseline_ok:
                return False, baseline_result

            # the actual memory budget calculation is
            # in `unfold_settings`
            exp_config['memory_budget'] = baseline_result

            # if there is a sampling cutoff and it's given as a ratio,
            # convert it to a budget cutoff
            threshold_ratio = exp_config.get('no_sampling_below_ratio', -1)
            if threshold_ratio != -1:
                exp_config['no_sampling_below_budget'] = threshold_ratio*baseline_result

        for conf_cnt, combo in enumerate(unfold_settings(exp_config)):
            experiment = combo['type']
            ok, msg = run_trials(config_dir,
                                 python_command(experiment, config),
                                 experiment, model, combo,
                                 config['n_inputs'],
                                 output_dir,
                                 report_errors=config['report_errors'],
                                 append_to_csv=False,
                                 trial_run=False,
                                 cmd_id=cmd_id,
                                 conf_cnt=conf_cnt,
                                 sync_gpu=config['sync_gpu'])
            if not ok:
                return False, msg
        return True, 'success'
    except Exception as err:
        return (False,
                'Encountered outer iteration exception:\n' + render_exception(err))
def main(data_dir, config_dir, output_dir):
    """
    Generates longitudinal graphs over all data and over the entries less
    than 14 days apart from the last sorted entry, plus the individual
    comparisons for that last entry.

    A status is written to `output_dir`; returns 1 on failure and None on
    success.
    """
    try:
        config, msg = validate_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        all_data = sort_data(data_dir)
        latest = all_data[-1]

        recent_entries = []
        for entry in all_data:
            if time_difference(latest, entry).days < 14:
                recent_entries.append(entry)

        generate_longitudinal_comparisons(all_data, output_dir, 'all_time')
        generate_longitudinal_comparisons(recent_entries, output_dir,
                                          'two_weeks')
        generate_individual_comparisons(config, latest, output_dir)
    except Exception as err:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(err))
        return 1

    write_status(output_dir, True, 'success')
def render_fixed(ax, model_name, output_dir, x_axis, dtr_entries,
                 baseline_data, failed_trials, batch_size=None,
                 confidence=None):
    """
    Draws a stacked-bar time breakdown for one model onto `ax`: one bar
    per entry of `x_axis`, plus a final bar for the baseline.

    Entries whose 'cpu_time' exceeds three times the baseline's 'cpu_time'
    are treated as failed: they are appended to `failed_trials` (the list
    passed in is modified) and replaced by all-zero entries. Failed trials
    are marked with dashed red vertical lines.

    Returns (True, message). Nothing is drawn when both `dtr_entries` and
    `failed_trials` are empty. Exceptions raised while plotting are
    re-raised to the caller.

    NOTE(review): `baseline_data['mem']` is read unconditionally for the
    tick labels, so an empty baseline dict raises KeyError - confirm that
    callers always supply it.
    """
    if not (dtr_entries or failed_trials):
        return (True, 'nothing to render')
    # NOTE(review): `filename` is unused because saving is disabled below;
    # the call is kept in case prepare_out_file has side effects.
    filename = prepare_out_file(
        output_dir,
        f'{NAME_DICT.get(model_name, model_name)}-fixed-gpu-time.png')
    try:
        # plt.style.use('seaborn-paper')
        # plt.rcParams["font.size"] = 30
        # fig = plt.figure()
        # fig.add_subplot(111, frameon=False)
        # fig.set_size_inches(12, 7)
        # plt.xticks(fontsize=13)
        # plt.yticks(fontsize=13)
        # plt.xlabel('Memory Budget (MB)', fontsize=15, labelpad=10)
        # plt.ylabel(r'Compute Time (ms)', fontsize=15, labelpad=10)
        # plt.title(f'{NAME_DICT.get(model_name, model_name)} GPU Time', fontsize=18)
        # plt.grid(True)
        # ax = plt.gca()
        width = 0.0
        # one slot per budget (successful or failed) plus one for the baseline
        all_axis = sorted(x_axis + failed_trials)
        ind = np.arange(len(all_axis) + 1)
        # budget value -> slot position
        ind_index = dict(zip(all_axis, ind))
        # NOTE(review): ind_pos is never read afterwards
        ind_pos = dict([(ind[i], i) for i in range(len(ind))])
        ax.set_xticks(ind + width / 2)
        # tick labels are the values scaled by 1e-9 and rounded to one
        # decimal; the baseline's 'mem' is multiplied by 1e+6 first
        ax.set_xticklabels(
            map(lambda x: f'{round(x * 1e-9, 1)}',
                all_axis + [baseline_data['mem'] * 1e+6]))
        ax.tick_params(axis='both', labelsize=20)

        filtered_entries = []
        if baseline_data and 'cpu_time' in baseline_data:
            for (x, datum) in zip(x_axis, dtr_entries):
                # more than 3x the baseline CPU time counts as a failure
                if not datum.get(
                        'error', False) and 'cpu_time' in datum and datum[
                            'cpu_time'] > 3 * baseline_data['cpu_time']:
                    failed_trials.append(x)
                    filtered_entries.append({key: 0 for key in datum.keys()})
                else:
                    filtered_entries.append(datum)
            dtr_entries = filtered_entries

        if failed_trials:
            for x in failed_trials:
                ax.axvline(x=ind_index[x],
                           color='red',
                           linestyle='dashed',
                           label='OOM')

        # restrict the bar positions to the slots of x_axis, keeping the
        # last slot for the baseline bar
        new_ind = []
        for x in x_axis:
            new_ind.append(ind_index[x])
        new_ind.append(ind[-1])
        ind = np.array(new_ind)

        ax.grid(True, axis='y')
        ax.set_title(
            f'{NAME_DICT.get(model_name, model_name)} ({batch_size})\n{input_sizes.get(model_name, "")}',
            fontsize=15)
        # zero-height bars at the failed positions
        for x in failed_trials:
            ax.bar(ind_index[x], 0)

        if dtr_entries:
            # lin, = ax.plot(x_axis, dtr_entries, color=COLOR_SCHEME.get(model_name, 'black'), linewidth=4)
            # mk, = ax.plot(x_axis, dtr_entries, label=NAME_DICT.get(model_name, model_name),
            #               linewidth=4, marker=MARKER_SCHEME.get(model_name, '+'), ms=12,
            #               alpha=.6, color=COLOR_SCHEME.get(model_name, 'black'))
            data_collection = {key: [] for key in timed_keys}
            data_collection['dispatch_overhead'] = []
            for entry in dtr_entries:
                acc = 0
                for (k, v) in entry.items():
                    if k != 'cpu_time':
                        data_collection[k].append(v)
                        acc += v
                # whatever part of cpu_time the other keys do not account for
                data_collection['dispatch_overhead'].append(entry['cpu_time'] - acc)

            # running bottom of the stacked bars
            acc = np.zeros(len(x_axis))
            for k in timed_keys + ['dispatch_overhead']:
                # print(ind[:-1], data_collection[k])
                ax.bar(ind[:-1],
                       data_collection[k],
                       label=breakdown_namedict.get(k, k),
                       color=breakdown_color_scheme.get(k, 'red'),
                       bottom=acc)
                acc = acc + data_collection[k]

            if baseline_data and 'cpu_time' in baseline_data:
                ax.bar([ind[-1]],
                       baseline_data['cpu_time'],
                       label='Unmodified\nPyTorch',
                       color='blue')
            else:
                # no baseline timing: empty bar plus a failure marker
                ax.bar([ind[-1]], 0, label='Unmodified PyTorch', color='blue')
                ax.axvline(ind[-1], color='red', linestyle='dashed', label='OOM')

            # `and False` disables the error bars unconditionally
            if confidence and False:
                render_errorbars(ax, x_axis, dtr_entries, confidence)
            ax.invert_xaxis()
            # ax.legend([(lin, mk)], ['merged'])
        # plt.legend(
        #         bbox_to_anchor=(0.5,0.01),
        #         loc='lower center',
        #         bbox_transform=fig.transFigure,
        #         ncol=7,
        #         borderaxespad=0,
        #         prop={'size': 15}
        #     )
        # plt.tight_layout()
        # plt.savefig(filename, bbox_inches = 'tight')
        return (True, 'success')
    except Exception as e:
        raise e
        # NOTE(review): unreachable - the raise above always exits first
        return (False, render_exception(e))
def main(config_dir, home_dir, output_dir):
    """
    Generates the NLP comparison graph from the last sorted entries of the
    'treelstm', 'char_rnn' and 'gluon_rnns' experiments (CPU data).

    Each plotted value is another framework's measurement divided by the
    'Aot' measurement; combinations without a measurement are plotted as
    0.0. The display name defaults to 'Relay' and can be overridden with
    'our_name' in the config.

    Writes a status to `output_dir`; returns 1 on failure, None on success.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)
    our_name = conf['our_name'] if 'our_name' in conf else 'Relay'

    requirements = {
        'treelstm': {
            'devices': ['cpu'],
            'frameworks': ['relay', 'pt'],
            'relay_methods': ['aot']
        },
        'char_rnn': {
            'devices': ['cpu'],
            'frameworks': ['relay', 'pt'],
            'relay_methods': ['aot'],
            'relay_configs': ['loop']
        },
        'gluon_rnns': {
            'devices': ['cpu'],
            'frameworks': ['relay', 'mxnet'],
            'networks': ['rnn', 'lstm', 'gru'],
            'relay_methods': ['aot']
        }
    }
    prereqs, msg = check_prerequisites(info, requirements)
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    raw_data = {}
    for exp in ['treelstm', 'char_rnn', 'gluon_rnns']:
        raw_data[exp] = sort_data(info.exp_data_dir(exp))[-1]

    gluon_cpu = raw_data['gluon_rnns']['cpu']
    mxnet_ratios = {
        label: gluon_cpu['MxNet'][network] / gluon_cpu['Aot'][network]
        for label, network in (('RNN', 'rnn'), ('GRU', 'gru'), ('LSTM', 'lstm'))
    }
    mxnet_ratios['CharRNN'] = 0.0
    mxnet_ratios['TreeLSTM'] = 0.0

    char_cpu = raw_data['char_rnn']['cpu']
    tree_cpu = raw_data['treelstm']['cpu']
    pytorch_ratios = {
        'RNN': 0.0,
        'GRU': 0.0,
        'LSTM': 0.0,
        'CharRNN': char_cpu['Pytorch'] / char_cpu['Aot'],
        'TreeLSTM': tree_cpu['Pytorch'] / tree_cpu['Aot'],
    }

    plot_data = OrderedDict([
        ('MxNet', mxnet_ratios),
        ('PyTorch', pytorch_ratios),
    ])

    try:
        generate_nlp_comparisons(our_name, plot_data, output_dir)
    except Exception as err:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(err))
        return 1

    write_status(output_dir, True, 'success')
def render_throughput_breakdown(metadata, output_dir):
    """
    Renders two bar charts per model into `output_dir`:
    `throughput-comparison-<model>.png` (baseline and per-budget
    throughput for each batch size) and `time-breakdown-<model>.png`
    (stacked runtime breakdown per budget and batch size).

    The 'param_sweep' data is collected from `metadata` through
    `traverse_field`. Returns (True, 'success'), or (False, message) with
    the rendered exception if plotting any model fails.
    """
    # model -> 'dtr' | 'baseline' -> batch_size -> data
    throughput_metadata = {}

    # Gather data to render
    # a mapping that has the type model -> exp_type -> batch_size -> data dict
    def get_throughput_metadata(model, batch_size, dtr_dict, baseline_dict,
                                output_dir):
        # callback handed to traverse_field; fills throughput_metadata
        if model not in throughput_metadata:
            throughput_metadata[model] = {'dtr': {}, 'baseline': {}}
        throughput_metadata[model]['dtr'][batch_size] = []
        for datum in dtr_dict[batch_size]['param_sweep']:
            throughput_metadata[model]['dtr'][batch_size].append({
                'memory_budget': datum.get('memory_budget', -1),
                'error': datum['error'],
                **{key: datum.get(key) for key in used_keys}
            })
        if batch_size in baseline_dict:
            throughput_metadata[model]['baseline'][batch_size] = {
                key: baseline_dict[batch_size][key] for key in used_keys
            }
        else:
            # no baseline for this batch size: record zeros for every key
            throughput_metadata[model]['baseline'][batch_size] = {
                key: 0 for key in used_keys
            }
        return True, 'success'

    traverse_field(metadata, 'param_sweep', get_throughput_metadata,
                   output_dir)

    # flip(f)(x)(y) == f(y, x): swaps the two arguments and curries them
    flip = lambda f: lambda x: lambda y: f(y, x)

    # Plot throughput and time breakdown of a model
    def plot_model(model):
        filename = prepare_out_file(output_dir,
                                    f'throughput-comparison-{model}.png')
        plt.clf()
        plt.grid(True)
        plt.title(f'Throughput Comparison of {NAME_DICT.get(model, model)}')
        plt.xlabel('Batch Size', fontsize=15, labelpad=10)
        plt.ylabel('Throughput (Batch Size / Avg GPU Time (s))')
        num_batch_size = len(throughput_metadata[model]['dtr'].keys())
        # NOTE(review): this value is overwritten below before it is read
        baseline_data = metadata[model]['baseline']
        width = 0.15
        ind = np.arange(num_batch_size)
        x_axis = list(sorted(throughput_metadata[model]['dtr'].keys()))

        # Wish we had currying !!!
        # If baseline data does not contain a batch size, then we fill 0 into the data, since it means baseline failed (OOMed)
        baseline_data = list(
            map(flip(throughput_metadata[model]['baseline'].get)(0), x_axis))

        # Bar for baseline
        plt.bar(ind, [datum['throughput'] for datum in baseline_data],
                width,
                label='Baseline')

        dtr_data = {'throughput': {}, 'breakdown': {}}

        # Gather information collected
        # the structure of dtr_data:
        # Level 0: 'breakdown' | 'throughput'
        # Level 1: data dictionary | computed throughput (float)
        # Level 3: same as dictionaries processed in fill_data
        for x in x_axis:
            for datum in throughput_metadata[model]['dtr'][x]:
                if datum['memory_budget'] not in dtr_data['throughput']:
                    dtr_data['throughput'][datum['memory_budget']] = []
                    dtr_data['breakdown'][datum['memory_budget']] = []
                # errored entries contribute 0 throughput / no breakdown
                dtr_data['throughput'][datum['memory_budget']].append(
                    datum['throughput'] if not datum['error'] else 0)
                dtr_data['breakdown'][datum['memory_budget']].append(
                    dict(filter(lambda x: x[0] != 'throughput', datum.items())
                         ) if not datum['error'] else None)

        num_budget = len(dtr_data['throughput'].keys())
        plt.xticks(ind + width * (num_budget / 2), map(str, x_axis))
        # one bar group per budget, largest budget first
        for (i, (budget, throughput)) in enumerate(
                sorted(dtr_data['throughput'].items(), key=lambda x: -x[0])):
            plt.bar(ind + width * (i + 1),
                    throughput,
                    width,
                    label=f'{round(budget * 1e-9, 1)} GiB')
        plt.legend(loc='best')
        plt.tight_layout()
        plt.savefig(filename, bbox_inches='tight')

        # Plot runtime profiling breakdown
        filename = prepare_out_file(output_dir, f'time-breakdown-{model}.png')
        plt.clf()
        plt.title(f'Runtime Breakdown of {NAME_DICT.get(model, model)}')
        plt.xlabel('Batch Size')
        plt.ylabel('Time / Batch (ms)')
        # tick position -> label; starts with the batch size labels centred
        # under each group and is extended with one label per bar below
        x_ticks_loc = {
            ind[i] + width * (num_budget / 2): '\n\n' + str(x_axis[i])
            for i in range(num_batch_size)
        }
        plt.grid(True, axis='y')
        for (i, (budget, datum)) in enumerate(
                sorted(dtr_data['breakdown'].items(), key=lambda x: -x[0])):
            locs = ind + width * (i + 1)
            for loc in locs:
                x_tick = f'{round(budget * 1e-9, 1)}\nGiB'
                # a bar that shares its position with a batch size label
                # gets that label appended
                if loc in x_ticks_loc.keys():
                    x_tick += f'\n{x_ticks_loc[loc]}'
                x_ticks_loc[loc] = x_tick

            if datum is None:
                continue

            gathered_data = {key: [] for key in (timed_keys + ['cpu_time'])}
            gathered_data['dispatch_overhead'] = []
            for e in datum:
                time_acc = 0
                for key in gathered_data.keys():
                    if key != 'dispatch_overhead':
                        # errored entries (None) are recorded as 0
                        if e is None:
                            gathered_data[key].append(0)
                        else:
                            gathered_data[key].append(e[key])
                        if key != 'cpu_time' and e is not None:
                            time_acc += e[key]
                if e is not None:
                    # cpu_time not accounted for by the timed keys
                    gathered_data['dispatch_overhead'].append(
                        gathered_data['cpu_time'][-1] - time_acc)
                else:
                    gathered_data['dispatch_overhead'].append(0)

            # running bottom of the stacked bars
            height_acc = np.zeros(len(datum))
            for key in timed_keys:  # + ['dispatch_overhead']:
                # only the first budget's bars carry labels, so that each
                # key appears once in the legend
                if i == 0:
                    plt.bar(ind + width * (i + 1),
                            gathered_data[key],
                            width=width,
                            label=breakdown_namedict[key],
                            color=breakdown_color_scheme[key],
                            bottom=height_acc)
                else:
                    plt.bar(ind + width * (i + 1),
                            gathered_data[key],
                            width=width,
                            color=breakdown_color_scheme[key],
                            bottom=height_acc)
                height_acc += gathered_data[key]

        xticks_data = list(sorted(x_ticks_loc.items(), key=lambda x: -x[0]))
        ticks = list(map(lambda x: x[0], xticks_data))
        labels = list(map(lambda x: x[1], xticks_data))
        plt.xticks(ticks, labels)
        plt.legend(loc='best')
        plt.tight_layout()
        plt.savefig(filename, bbox_inches='tight')

    try:
        for model in throughput_metadata.keys():
            plot_model(model)
    except Exception as e:
        return False, render_exception(e)
    return True, 'success'
def run_trials(config_dir, python_cmd, experiment_name, model_name,
               specific_params, n_inputs, n_reps, path_prefix,
               report_errors=False, append_to_csv=False,
               trial_run=False, trial_run_outfile='',
               cmd_id=0, conf_cnt=0):
    """
    Runs `run_torch_trial.py` (located next to this file) in a subprocess
    once per input, passing the settings through a temporary
    `specific_params.json` file in the current working directory.

    :params:
        trial_run: When set to true, no persistent experiment data will
                   be saved. It is used to run a baseline trial and record
                   how much memory is used, then set the memory budget
                   for `ratio` commands of DTR experiments

        trial_run_outfile: the temporary file that stores the memory usage
                           data of the baseline run

        cmd_id: the command id for current model, starting from 0 by default

        conf_cnt: the id of the configuration generated from
                  `unfold_settings`; this is used for tracking which exact
                  configuration caused errors. (Not used in this function.)

    `n_reps` is accepted but not used in this function. Each subprocess is
    given `specific_params['timeout']` seconds (60 by default), and there
    is a 4 second pause after each successful input.

    Returns a (success, message) tuple. When `report_errors` is set, a
    failing or timed-out subprocess is logged with `log_error` and
    reported as (True, 'successfully caught error'), except during a
    trial run, where it yields (False, 'Baseline failed: ...').
    """
    try:
        cwd = os.getcwd()
        params_file = 'specific_params.json'
        try:
            write_json(cwd, params_file, specific_params)
            if not trial_run:
                filename = prepare_out_file(
                    path_prefix, '{}-{}.csv'.format(
                        get_report_prefix(experiment_name, specific_params,
                                          cmd_id), model_name))
                mode = 'a' if append_to_csv else 'w'
                # only sets up the file and its header; the rows are
                # written by the subprocess through --out-file
                with open(filename, mode, newline='') as csvfile:
                    writer = create_csv_writer(csvfile, specific_params)
                    if not append_to_csv:
                        writer.writeheader()
            else:
                filename = ''

            shared_dir = os.path.dirname(os.path.abspath(__file__))
            run_script = os.path.join(shared_dir, 'run_torch_trial.py')

            for i in range(n_inputs):
                try:
                    subprocess.run([
                        python_cmd, run_script,
                        '--config-dir', config_dir,
                        '--experiment-mode', experiment_name,
                        '--model-name', model_name,
                        '--input-idx', str(i),
                        '--params-file', params_file,
                        '--out-file', filename,
                        '--trial-run', str(trial_run),
                        '--trial-run-outfile', trial_run_outfile
                    ], check=True, timeout=specific_params.get('timeout', 60))
                except (subprocess.CalledProcessError,
                        subprocess.TimeoutExpired) as e:
                    if not report_errors:
                        raise
                    if trial_run:
                        return (False, 'Baseline failed: {}'.format(
                            render_exception(e)))
                    log_error(experiment_name, model_name, specific_params,
                              i, render_exception(e), path_prefix)
                    return (True, 'successfully caught error')
                time.sleep(4)
            return (True, 'success')
        finally:
            # Best-effort cleanup. If the params file was never written
            # (e.g. write_json failed), a FileNotFoundError raised here
            # would replace the real error in the returned message.
            try:
                os.remove(params_file)
            except FileNotFoundError:
                pass
    except Exception as e:
        return (False, 'Encountered exception on ({}, {}, {}):\n'.format(
            experiment_name, model_name, specific_params) + render_exception(e))
def parse_data_file(experiment_name,
                    model,
                    config,
                    specific_params,
                    path_prefix,
                    cmd_id=0):
    """
    Summarize the measurements stored in one experiment's CSV data file.

    The file name is built from the report prefix (experiment name,
    specific settings and command id) and the model name, and is looked up
    under `path_prefix`. Rows are grouped by their `input` column; for
    every group, summary statistics are computed for each key in
    MEASURED_KEYS. For `ratio` commands, the `memory_budget` value of the
    first row is written back into `specific_params`.

    Returns `(summary, 'success')`, or `(None, error message)` if the data
    file does not exist or any step fails.
    """
    try:
        report_prefix = get_report_prefix(experiment_name, specific_params,
                                          cmd_id)
        filename = '{}-{}.csv'.format(report_prefix, model)
        if not check_file_exists(path_prefix, filename):
            missing_msg = 'Data file {} does not exist at {}'.format(
                filename, path_prefix)
            return (None, missing_msg)

        full_path = os.path.join(path_prefix, filename)
        report_errors = config['report_errors']

        per_input = {}
        memory_budget = None

        with open(full_path, 'r', newline='') as csvfile:
            for row in csv.DictReader(csvfile):
                # Several rows may share an input index (e.g. commands for
                # the same model with identical configuration values);
                # their measurements are pooled under that index.
                input_idx = int(row['input'])
                values = {key: float(row[key]) for key in MEASURED_KEYS}

                # Only the first row's budget is recorded for ratio runs.
                if (memory_budget is None
                        and specific_params.get('kind') == 'ratio'):
                    memory_budget = float(row['memory_budget'])
                    specific_params['memory_budget'] = memory_budget

                bucket = per_input.setdefault(
                    input_idx, {key: [] for key in MEASURED_KEYS})
                for key in MEASURED_KEYS:
                    bucket[key].append(values[key])

        summary = {'specific_params': specific_params}

        # If everything errored out, make sure the error is what gets
        # recorded in the summary.
        if report_errors and check_error(experiment_name, model,
                                         specific_params, path_prefix):
            summary['summary'] = 'error'
            return summary, 'success'

        summary['summary'] = [{
            key: compute_summary_stats(stat[key])
            for key in MEASURED_KEYS
        } for stat in per_input.values()]
        return (summary, 'success')

    except Exception as e:
        error_prefix = 'Encountered exception on ({}, {}): '.format(
            experiment_name, model)
        return (None, error_prefix + render_exception(e))
def render_graph(config, data, output_dir):
    """
    Render the combined comparison figures for every model in
    `config['models']`.

    `data[model]` is a list of stat entries; each entry is sorted into a
    baseline or a DTR dictionary (via `fill_data`) depending on
    `stat['specific_params']['type']`. Two passes over that metadata
    follow: one for the `ratio` field drawn on a single shared plot, and
    one for the `fixed` field drawn on a 2x4 grid of subplots that is
    saved as `combined-breakdown-comparison.png`. Finally the throughput
    breakdown is rendered.

    Returns `(True, 'success')`, or `(False, message)` when a rendering
    step reports failure or an exception is raised.
    """
    try:
        plt.style.use('seaborn-paper')
        plt.rcParams["font.size"] = 30
        fig = plt.figure()
        fig.add_subplot(111, frameon=False)
        fig.set_size_inches(12, 7)
        plt.xticks(fontsize=13)
        plt.yticks(fontsize=13)
        plt.xlabel('Memory Budget (Ratio)', fontsize=15, labelpad=10)
        plt.ylabel(r'Overhead Slow Down ($\times$)',
                   fontsize=15,
                   labelpad=10)
        plt.title('GPU Time Comparisons', fontsize=18)
        plt.grid(True)

        # The combined ratio figure is currently not saved, so the returned
        # path is unused; the call is kept in case prepare_out_file has
        # side effects (e.g. creating the output directory).
        prepare_out_file(output_dir, 'combined-comparison-ratio.png')

        # Split each model's stats into baseline and DTR measurements.
        metadata = {}
        for model in config['models']:
            dtr_dict = {}
            baseline_dict = {}
            for stat in data[model]:
                if stat['specific_params']['type'] == 'baseline':
                    baseline_dict = fill_data(baseline_dict, stat)
                else:
                    dtr_dict = fill_data(dtr_dict, stat)
            metadata[model] = {'baseline': baseline_dict, 'dtr': dtr_dict}

        def _render_ratio(model, batch_size, dtr_dict, baseline_dict,
                          output_dir):
            # Callback for traverse_field: draw one ratio comparison.
            return render_time_comparison(model, batch_size, 'ratio',
                                          dtr_dict[batch_size]['ratio'],
                                          baseline_dict.get(batch_size, {}),
                                          output_dir)

        success, msg = traverse_field(metadata, 'ratio', _render_ratio,
                                      output_dir)
        if not success:
            return (False, msg)

        plt.hlines(y=1,
                   xmin=0.0,
                   xmax=1.0,
                   linewidth=3,
                   label='Baseline',
                   color='blue',
                   linestyles='dashed')
        plt.legend(bbox_to_anchor=(0.5, 0.01),
                   loc='lower center',
                   bbox_transform=fig.transFigure,
                   ncol=7,
                   borderaxespad=0,
                   prop={'size': 15})
        plt.tight_layout()
        # Saving of the ratio figure is disabled; clear it before drawing
        # the breakdown grid.
        plt.clf()

        plt.rcParams["font.size"] = 30
        figure, axs = plt.subplots(2, 4, figsize=(20, 8))
        # Subplots are handed out one per rendered comparison, in reverse
        # order of the flattened grid.
        axs = reversed(flatten(axs))

        def _render_fixed(model, batch_size, dtr_dict, baseline_dict,
                          output_dir):
            # Callback for traverse_field: draw one fixed-budget comparison
            # on the next free subplot.
            return render_time_comparison(model,
                                          batch_size,
                                          'fixed',
                                          dtr_dict[batch_size]['fixed'],
                                          baseline_dict.get(batch_size, {}),
                                          output_dir,
                                          plt_ax=next(axs))

        success, msg = traverse_field(metadata, 'fixed', _render_fixed,
                                      output_dir)
        filename = prepare_out_file(output_dir,
                                    'combined-breakdown-comparison.png')
        figure.text(0.5,
                    0.02,
                    r'\textbf{\Huge Memory Budget (GiB)}',
                    ha='center')
        figure.text(0.09,
                    0.5,
                    r'\textbf{\Huge Time (ms) / Batch}',
                    ha='center',
                    va='center',
                    rotation='vertical')
        # NOTE(review): this legend is positioned with the transform of the
        # first figure (`fig`), not of the subplot grid (`figure`). Left
        # unchanged because the anchor coordinates may have been tuned
        # against it - confirm whether `figure.transFigure` was intended.
        plt.legend(bbox_to_anchor=(0.17, 0.075),
                   loc='upper left',
                   bbox_transform=fig.transFigure,
                   ncol=6,
                   borderaxespad=0,
                   prop={'size': 15})
        plt.subplots_adjust(hspace=0.4)
        # The figure is written even when the 'fixed' traversal reported a
        # failure; the failure is returned right after saving.
        plt.savefig(filename, bbox_inches='tight', pad_inches=0.4)
        if not success:
            return (False, msg)

        success, msg = render_throughput_breakdown(metadata, output_dir)
        if not success:
            return False, msg

        return (True, 'success')
    except Exception as e:
        # Report the failure through the (success, message) return value,
        # like every other exit path of this function. A leftover
        # `raise e` previously made this return unreachable.
        return (False,
                'Exception encountered while rendering graphs: {}'.format(
                    render_exception(e)))
def parse_data_file(experiment_name,
                    model,
                    config,
                    specific_params,
                    path_prefix,
                    cmd_id=0,
                    baseline_params=None):
    """
    Given an experiment name, model name, directory, and number of inputs,
    parses the corresponding data file if it exists and computes summary
    statistics for the (wall-clock) time, GPU time, and memory used in
    that data file for choice of specific settings

    config: must provide `report_errors`; `n_inputs` is read when
            slowdowns or throughputs are computed.

    specific_params: settings of the command being summarized. For `ratio`
                     commands with a recorded budget, `memory_budget` is
                     written back into this dict.

    baseline_params: If the command is a ratio command, this will use
                     the baseline to compute the slowdown per data point
                     in order to better measure its distribution. When
                     given, its `type`, `specific_params` and `cmd_id`
                     entries locate the baseline measurements.

    Returns `(summary, 'success')` on success.
    Returns None and an error message if it fails
    """
    try:
        report_errors = config['report_errors']
        metrics, budget, msg = collect_raw_measurements(
            experiment_name, model, specific_params, path_prefix, cmd_id)
        if metrics is None:
            return (None, msg)

        if budget is not None and specific_params.get('kind') == 'ratio':
            specific_params['memory_budget'] = float(budget)

        summary = {'specific_params': specific_params}

        # in case everything errored out, this ensures that we will have a
        # record of the error
        if report_errors and check_error(experiment_name, model,
                                         specific_params, path_prefix):
            summary['summary'] = 'error'
            return summary, 'success'

        # if this was a ratio experiment
        # and we have a baseline available, let's compute
        # the slowdown per data point, head to head
        # and bootstrap confidence intervals
        if (specific_params.get('type') != 'baseline'
                and specific_params.get('kind') == 'ratio'
                and baseline_params is not None):

            baseline_metrics, _, baseline_msg = collect_raw_measurements(
                baseline_params['type'], model,
                baseline_params['specific_params'], path_prefix,
                baseline_params['cmd_id'])
            if baseline_metrics is None:
                return (None, baseline_msg)

            # compute slowdown in metrics
            for i in range(config['n_inputs']):
                dtr_times = metrics[i]['gpu_time']
                baseline_times = baseline_metrics[i]['gpu_time']
                # Slowdowns are computed head to head, so both runs must
                # have the same number of samples. This is an explicit
                # check rather than an assert so that it still runs under
                # `python -O` and yields a readable error message.
                if len(dtr_times) != len(baseline_times):
                    raise ValueError(
                        'Input {}: {} DTR measurements but {} baseline '
                        'measurements; cannot compute slowdowns'.format(
                            i, len(dtr_times), len(baseline_times)))
                metrics[i]['slowdown'] = compute_slowdowns(
                    dtr_times, baseline_times)

        # Compute throughputs for baseline and param_sweep commands
        if (specific_params.get('kind') == 'param_sweep'
                or specific_params.get('type') == 'baseline'):
            for i in range(config['n_inputs']):
                metrics[i]['throughput'] = compute_throughputs(
                    specific_params['batch_size'], metrics[i]['gpu_time'])

        summary_stats = []
        for stat in metrics.values():
            # Bootstrapping is requested only for the time-based keys.
            summary_dict = {
                key: compute_summary_stats(stat[key],
                                           bootstrap=('time' in key))
                for key in MEASURED_KEYS
            }
            if 'slowdown' in stat:
                summary_dict['slowdown'] = compute_summary_stats(
                    stat['slowdown'], bootstrap=True)

            if 'throughput' in stat:
                summary_dict['throughput'] = compute_summary_stats(
                    stat['throughput'], bootstrap=True)

            summary_stats.append(summary_dict)

        summary['summary'] = summary_stats
        return (summary, 'success')

    except Exception as e:
        return (None, 'Encountered exception on ({}, {}): '.format(
            experiment_name, model) + render_exception(e))