Example #1
def run_trials(method,
               task_name,
               dry_run,
               times_per_input,
               n_input,
               trial,
               trial_setup,
               trial_teardown,
               parameter_names,
               parameter_ranges,
               path_prefix='',
               append_to_csv=False):
    try:
        filename = os.path.join(path_prefix,
                                '{}-{}.csv'.format(method, task_name))
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))
        mode = 'a' if append_to_csv else 'w'
        with open(filename, mode, newline='') as csvfile:
            fieldnames = parameter_names + ['rep', 'run', 'time']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if not append_to_csv:
                writer.writeheader()

            for args in product(*parameter_ranges):
                costs = []
                for t in range(n_input):
                    score = 0.0
                    try:
                        trial_args = trial_setup(*args)
                        score = _score_loop(t, trial, trial_args, list(args),
                                            times_per_input, dry_run, writer,
                                            fieldnames)
                        trial_teardown(*trial_args)
                    except Exception as e:
                        # we can provide a more detailed summary when the
                        # failure happened inside a trial
                        return (
                            False,
                            'Encountered exception in trial on inputs {}:\n'.
                            format(args) + render_exception(e))

                    if t != n_input - 1:
                        time.sleep(4)
                    costs.append(score)

                print(method, task_name, args, ["%.6f" % x for x in costs])
        return (True, 'success')
    except Exception as e:
        return (False, 'Encountered exception:\n' + render_exception(e))
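
A minimal usage sketch for the driver above (not from the original code): the setup, trial, and teardown callables and the parameter values below are hypothetical stand-ins, and the `_score_loop` helper is assumed to call `trial` on the arguments produced by `trial_setup`.

def dummy_setup(n):
    # returns the tuple of arguments passed to the trial and teardown callables
    return (list(range(n)),)

def dummy_trial(xs):
    return sum(x * x for x in xs)

def dummy_teardown(xs):
    pass

success, message = run_trials(
    method='example-method',
    task_name='square-sum',
    dry_run=False,
    times_per_input=3,
    n_input=2,
    trial=dummy_trial,
    trial_setup=dummy_setup,
    trial_teardown=dummy_teardown,
    parameter_names=['n'],
    parameter_ranges=[[10, 100]],   # one iterable of values per parameter
    path_prefix='results')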
Example #2
    def main(config_dir, output_dir):
        try:
            config, msg = validate_config(config_dir)
            if config is None:
                write_status(output_dir, False, msg)
                return 1

            if check_early_exit is not None:
                early_exit, msg = check_early_exit(config)
                if early_exit:
                    write_status(output_dir, True, msg)
                    return 0

            configure_seed(config)

            if gen_trial_params is None:
                write_status(output_dir, True, 'No trial to run')
                return 0

            trial_params = gen_trial_params(config)
            success, msg = run_trials(*trial_params, path_prefix=output_dir)
            write_status(output_dir, success, msg)
            return 0 if success else 1
        except Exception as e:
            write_status(output_dir, False, render_exception(e))
            return 1
Example #3
def eval_command(model, exp_config, config, config_dir, output_dir, cmd_id):
    try:
        if exp_config.get('kind') == 'ratio':
            success, result = run_baseline(model, exp_config, config,
                                           config_dir, output_dir)
            if not success:
                return False, result

            # the actual memory budget calculation is
            # in `unfold_settings`
            exp_config['memory_budget'] = result

        first_time = True
        conf_cnt = 0
        for combo in unfold_settings(exp_config):
            success, msg = run_trials(config_dir,
                                      python_command(combo['type'], config),
                                      combo['type'],
                                      model,
                                      combo,
                                      config['n_inputs'],
                                      config['n_reps'],
                                      output_dir,
                                      report_errors=config['report_errors'],
                                      append_to_csv=False,
                                      trial_run=False,
                                      cmd_id=cmd_id,
                                      conf_cnt=conf_cnt)
            if not success:
                return False, msg
            conf_cnt += 1
        return True, 'success'
    except Exception as e:
        return (False, 'Encountered outer iteration exception:\n' +
                render_exception(e))
Example #4
def write_generic_summary(data_dir,
                          output_dir,
                          title,
                          devices,
                          networks=None,
                          use_networks=False):
    """
    Given a data directory and output directory, this function writes
    a generic summary assuming that the data has a field keyed by device
    (cpu/gpu) and optionally by network. It writes a summary and status to the output dir.
    """
    try:
        all_data = sort_data(data_dir)
        most_recent = all_data[-1]

        summary = None
        if use_networks:
            summary = summary_by_dev_and_network(most_recent, devices,
                                                 networks)
        else:
            summary = summary_by_dev(most_recent, devices)
        write_summary(output_dir, title, summary)
        write_status(output_dir, True, 'success')

        # TODO do something about comparisons to previous days
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
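
A hedged usage sketch for the helper above; the directory paths, title, and device/network lists are placeholders rather than values from the original code.

write_generic_summary(
    data_dir='dashboard/data/cnn_comp',       # placeholder path
    output_dir='dashboard/results/cnn_comp',  # placeholder path
    title='CNN Comparison',
    devices=['cpu', 'gpu'],
    networks=['resnet-18', 'vgg-16'],
    use_networks=True)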
Example #5
def post_message(client, channel, message, **kargs):
    """
    Attempts posting the given message object to the
    Slack channel.
    Returns whether it was successful and a message.
    """
    try:
        resp = []
        if isinstance(channel, list):
            for ch in channel:
                resp.append(
                    client.chat_postMessage(
                        channel=ch,
                        text=message.get('text', '*No Message Content*'),
                        attachments=message.get('attachments', ''),
                        **kargs))
        else:
            resp.append(
                client.chat_postMessage(
                    channel=channel,
                    text=message.get('text', '*No Message Content*'),
                    attachments=message.get('attachments', ''),
                    **kargs))
        return (True, resp, 'success')
    except Exception as e:
        return (False, None, 'Encountered exception:\n' + render_exception(e))
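
A hedged usage sketch, assuming a Slack WebClient (here taken from `slack_sdk`; the original may use a different Slack package) and a bot token in the environment. The channel names and message content are placeholders.

import os
from slack_sdk import WebClient

client = WebClient(token=os.environ['SLACK_BOT_TOKEN'])
message = {
    'text': 'Nightly dashboard finished',
    'attachments': [{'text': 'All experiments succeeded'}]
}
success, responses, msg = post_message(client, ['#dashboard', '#alerts'], message)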
Example #6
def main(data_dir, config_dir, output_dir):
    try:
        config, msg = validate(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        summary = {}
        for model in sorted(config['models']):
            summary[model] = []
            # the script will not be run if there is an error
            cmd_id = 0
            for _, _, exp_config in parse_commands(model, config):
                for combo in unfold_settings(exp_config):
                    stats, msg = parse_data_file(exp_config['type'], model,
                                                 config, combo, data_dir,
                                                 cmd_id)
                    if stats is None:
                        write_status(output_dir, False, msg)
                        return 1
                    stats['command_id'] = cmd_id
                    summary[model].append(stats)
                cmd_id += 1
        write_json(output_dir, 'data.json', summary)
        write_status(output_dir, True, 'success')
    except Exception as e:
        write_status(output_dir, False, render_exception(e))
Example #7
def main(config_dir, setup_dir):
    try:
        export_mxnet_model('rnn', setup_dir)
        export_mxnet_model('gru', setup_dir)
        export_mxnet_model('lstm', setup_dir)
        write_status(setup_dir, True, 'success')
    except Exception as e:
        write_status(setup_dir, False, render_exception(e))
Example #8
def post_message(webhook_url, message):
    """
    Attempts posting the given message object to the
    Slack webhook URL.
    Returns whether it was successful and a message.
    """
    try:
        r = requests.post(webhook_url, json=message)
        # surface HTTP errors so the caller learns about failed posts
        r.raise_for_status()
        return (True, 'success')
    except Exception as e:
        return (False, 'Encountered exception:\n' + render_exception(e))
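
A minimal usage sketch; the webhook URL below is an obviously fake placeholder.

webhook_url = 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXX'
success, msg = post_message(webhook_url, {'text': 'Benchmark run complete'})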
Example #9
def render_fixed(model_name, output_dir, x_axis, dtr_entries, failed_trials):
    if not (dtr_entries or failed_trials):
        return (True, 'nothing to render')
    filename = prepare_out_file(
        output_dir,
        f'{name_dict.get(model_name, model_name)}-fixed-gpu-time.png')
    try:
        plt.clf()
        plt.style.use('seaborn-paper')
        plt.rcParams["font.size"] = 30
        fig = plt.figure()
        fig.add_subplot(111, frameon=False)
        fig.set_size_inches(12, 7)
        plt.xticks(fontsize=13)
        plt.yticks(fontsize=13)
        plt.xlabel('Memory Budget (MB)', fontsize=15, labelpad=10)
        plt.ylabel(r'Compute Time (ms)', fontsize=15, labelpad=10)
        plt.title(f'{name_dict.get(model_name, model_name)} GPU Time',
                  fontsize=18)
        plt.grid(True)

        ax = plt.gca()
        if dtr_entries:
            lin, = ax.plot(x_axis,
                           dtr_entries,
                           color=color_scheme.get(model_name, 'black'),
                           linewidth=4)
            mk, = ax.plot(x_axis,
                          dtr_entries,
                          label=name_dict.get(model_name, model_name),
                          linewidth=4,
                          marker=marker_scheme.get(model_name, '+'),
                          ms=12,
                          alpha=.6,
                          color=color_scheme.get(model_name, 'black'))
            ax.legend([(lin, mk)], ['merged'])

        if failed_trials:
            plt.axvline(x=max(failed_trials),
                        color=color_scheme.get(model_name, 'black'),
                        linestyle='dashed')

        plt.legend(bbox_to_anchor=(0.5, 0.01),
                   loc='lower center',
                   bbox_transform=fig.transFigure,
                   ncol=7,
                   borderaxespad=0,
                   prop={'size': 15})
        plt.tight_layout()
        plt.savefig(filename, bbox_inches='tight')
        return (True, 'success')
    except Exception as e:
        return (False, render_exception(e))
Example #10
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    pass_spec_name_map = {
        '3;FuseOps': 'Op Fusion',
        '3;FoldConstant|FuseOps': '... + Constant Folding',
        '3;EliminateCommonSubexpr|FoldConstant|FuseOps': '... + Common Subexpr Elim',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldConstant|FuseOps': '... + Parallel Conv Comb',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|FoldConstant|FuseOps': '... + Axis Scale Folding',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|FoldConstant|FuseOps': '... + Cast Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|FoldConstant|FuseOps': '... + Op Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|AlterOpLayout|FoldConstant|FuseOps': '... + Op Layout Alteration'
    }

    prereqs, msg = check_prerequisites(info, {
        'pass_comparison': {
            'networks': networks,
            'passes': [
                parse_combo(combo) for combo in pass_spec_name_map.keys()
            ]
        }
    })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    all_data = sort_data(info.exp_data_dir('pass_comparison'))
    raw_data = all_data[-1]

    baseline = '0;'

    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    del raw_data['timestamp']
    del raw_data['tvm_hash']

    try:
        for (dev, raw_dev_data) in raw_data.items():
            plot_data = OrderedDict([
                (pass_spec_name_map[pass_spec], {
                    network_name_map[network]:
                    raw_dev_data[baseline][network] / raw_dev_data[pass_spec][network]
                    for network in networks})
                for pass_spec in pass_spec_name_map.keys()
            ])
            generate_pass_comparisons(plot_data, output_dir, f'pass-comp-{dev}.png')
    except Exception as e:
        write_status(output_dir, False, 'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
Example #11
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)
    our_name = 'Relay'
    if 'our_name' in conf:
        our_name = conf['our_name']

    conf_fws = ['relay', 'pt', 'tf', 'mxnet', 'nnvm']
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    prereqs, msg = check_prerequisites(
        info, {
            'cnn_comp': {
                'devices': ['gpu'],
                'use_xla': True,
                'networks': networks,
                'frameworks': conf_fws
            }
        })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    all_data = sort_data(info.exp_data_dir('cnn_comp'))
    raw_data = all_data[-1]['gpu']

    our_fw = 'Relay'
    other_fws = ['TensorFlow', 'Pytorch', 'MxNet', 'NNVM', 'TF XLA']
    fw_name_map = {fw: fw for fw in other_fws}
    fw_name_map['Pytorch'] = 'PyTorch'

    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    plot_data = OrderedDict([(fw_name_map[fw], {
        network_name_map[network]:
        raw_data[fw][network] / raw_data[our_fw][network]
        for network in networks
    }) for fw in other_fws])

    try:
        generate_vision_comparisons(our_name, plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
Example #12
def render_field(model_name,
                 output_dir,
                 title,
                 filename,
                 x_label,
                 y_label,
                 x_axis,
                 baseline_entries,
                 dtr_entries,
                 failed_trials,
                 confidence=None,
                 suptitle=''):
    if not (dtr_entries or baseline_entries or failed_trials):
        return (True, 'nothing to render')
    file = prepare_out_file(output_dir, filename)
    try:
        # min_x = min(*(x_axis + failed_trials))
        # max_x = max(*(x_axis + failed_trials))
        ax = plt.gca()
        if dtr_entries:
            lin, = ax.plot(x_axis,
                           dtr_entries,
                           color=COLOR_SCHEME.get(model_name, 'black'),
                           linewidth=4)
            mk, = ax.plot(x_axis,
                          dtr_entries,
                          label=NAME_DICT.get(model_name, model_name),
                          linewidth=4,
                          marker=MARKER_SCHEME.get(model_name, '+'),
                          ms=12,
                          alpha=.6,
                          color=COLOR_SCHEME.get(model_name, 'black'))
            if confidence:
                render_errorbars(ax, x_axis, dtr_entries, confidence)
            ax.legend([(lin, mk)], ['merged'])
        # if baseline_entries:
        #     plt.hlines(y=baseline_entries[0], xmin=min_x, xmax=max_x, linewidth=3,
        #                label='Baseline', color='blue', linestyles='dashed')

        if failed_trials:
            plt.axvline(x=max(failed_trials),
                        color=COLOR_SCHEME.get(model_name, 'black'),
                        linestyle='dashed')

        # fig = plt.legend().figure
        # fig.savefig(file)
        return (True, 'success')
    except Exception as e:
        return (False,
                'Exception encountered while rendering graph: {}'.format(
                    render_exception(e)))
Example #13
def main(data_dir, config_dir, output_dir):
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        summary = {}

        baseline_dict = {}

        for model in sorted(config['models']):
            summary[model] = []
            baseline_dict[model] = {}
            # the script will not be run if there is an error
            cmd_id = 0
            for _, _, exp_config in parse_commands(model, config):
                baseline_params = None
                for specific_params in unfold_settings(exp_config):
                    batch_size = specific_params['batch_size']
                    if specific_params['type'] == 'baseline':
                        baseline_dict[model][batch_size] = {
                            'type': 'baseline',
                            'specific_params': specific_params,
                            'cmd_id': cmd_id
                        }

                    # if there is a corresponding baseline,
                    # let's match using the dict
                    baseline_params = None
                    if (batch_size in baseline_dict[model]
                            and specific_params['type'] != 'baseline'):
                        baseline_params = baseline_dict[model][batch_size]

                    stats, msg = parse_data_file(
                        exp_config['type'],
                        model,
                        config,
                        specific_params,
                        data_dir,
                        cmd_id,
                        baseline_params=baseline_params)
                    if stats is None:
                        write_status(output_dir, False, msg)
                        return 1
                    stats['command_id'] = cmd_id
                    summary[model].append(stats)
                cmd_id += 1
        write_json(output_dir, 'data.json', summary)
        write_status(output_dir, True, 'success')
    except Exception as e:
        write_status(output_dir, False, render_exception(e))
Example #14
def process_score(info, score_metric, data_dir, graph_dir, timestamp):
    data = score_metric.compute_score(info)
    data['timestamp'] = timestamp
    write_json(data_dir, 'data_{}.json'.format(timestamp), data)

    # graphs failing is not a fatal error, just an inconvenience
    try:
        score_metric.score_graph(data, graph_dir)
        all_data = sort_data(data_dir)
        score_metric.longitudinal_graphs(all_data, graph_dir)
    except Exception as e:
        print(render_exception(e))
    return score_metric.score_text(data)
Example #15
def upload_image(client, channels, file_path, description, **kargs):
    """
    Attempts to upload an image to a channel
    """
    try:
        if isinstance(channels, list):
            channels = ','.join(channels)
        resp = client.files_upload(channels=channels,
                                   file=file_path,
                                   title=description,
                                   **kargs)
        return (True, resp, 'success')
    except Exception as e:
        return (False, None, 'Encountered exception:\n' + render_exception(e))
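
A hedged usage sketch, reusing a Slack WebClient as in the post_message example; the token source, channel, file path, and description are placeholders.

import os
from slack_sdk import WebClient

client = WebClient(token=os.environ['SLACK_BOT_TOKEN'])
success, resp, msg = upload_image(client, ['#dashboard'],
                                  'graphs/pass-comp-gpu.png',
                                  'Pass comparison (GPU)')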
Example #16
def main(data_dir, config_dir, output_dir):
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        all_data = sort_data(data_dir)
        most_recent = all_data[-1]
        success, msg = render_graph(config, most_recent, output_dir)
        write_status(output_dir, success, msg)
    except Exception as e:
        write_status(output_dir, False, 'Exception encountered: ' + render_exception(e))
        return 1
    finally:
        plt.close()
Example #17
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)

    data_dir = os.path.join(output_dir, 'data')
    graph_dir = os.path.join(output_dir, 'graphs')
    idemp_mkdir(data_dir)
    idemp_mkdir(graph_dir)

    timestamp = get_timestamp()

    score_confs = conf['score_confs']
    metrics = set(score_confs.keys())
    metrics = metrics.intersection(set(SCORE_METRICS.keys()))

    if not metrics:
        write_status(output_dir, True, 'No scores to report')
        return 0

    score_data = {}
    score_reports = {}
    for metric in metrics:
        score_metric = SCORE_METRICS[metric](score_confs[metric])
        valid, msg = check_prerequisites(info, score_metric.prereq())
        if not valid:
            write_status(output_dir, False, msg)
            return 1

        score_data_dir = os.path.join(data_dir, metric)
        score_graph_dir = os.path.join(graph_dir, metric)
        idemp_mkdir(score_data_dir)
        idemp_mkdir(score_graph_dir)

        try:
            report = process_score(info, score_metric, score_data_dir,
                                   score_graph_dir, timestamp)
            score_reports[metric] = report
        except Exception as e:
            write_status(
                output_dir, False,
                'Encountered exception while scoring {}:\n{}'.format(
                    metric, render_exception(e)))
            return 1

    report = {'title': 'Metric Scores', 'value': format_scores(score_reports)}
    write_json(output_dir, 'report.json', report)
    write_status(output_dir, True, 'success')
Example #18
def main(data_dir, config_dir, output_dir):
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        all_data = sort_data(data_dir)
        most_recent = all_data[-1]

        summary = summarize(config, most_recent)
        write_summary(output_dir, 'Pareto Curve Trial', summary)
        write_status(output_dir, True, 'success')

    except Exception as e:
        write_status(output_dir, False, 'Exception encountered: ' + render_exception(e))
        return 1
Example #19
def main(data_dir, config_dir, output_dir):
    config, msg = validate(config_dir)
    if config is None:
        write_status(output_dir, False, msg)
        return 1

    # read in data, output graphs of most recent data, and output longitudinal graphs
    all_data = sort_data(data_dir)
    most_recent = all_data[-1]

    try:
        generate_longitudinal_comparisons(all_data, output_dir)
        generate_arm_vta_comparisons(most_recent, output_dir)
    except Exception as e:
        write_status(output_dir, False, 'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
Example #20
def trials_stat_summary(data_dir, framework, task_name, num_reps,
                        parameter_names, params_to_match):
    """
    Returns a full summary of statistics on the specified framework
    and task across all reps where the specified parameters match.

    Returns (summary, success, message)
    """
    try:
        data = obtain_data_rows(data_dir, framework, task_name,
                                parameter_names, params_to_match)
        summary = summarize_over_reps(data, num_reps)
        return (summary, True, 'success')
    except Exception as e:
        return (-1, False,
                'Encountered exception on {}, {} using params {}:\n{}'.format(
                    framework, task_name, params_to_match,
                    render_exception(e)))
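
A hedged call sketch for the summary helper above; the directory, framework, task, and parameter values are placeholders.

summary, success, msg = trials_stat_summary(
    data_dir='results/raw',
    framework='relay',
    task_name='char_rnn',
    num_reps=3,
    parameter_names=['device', 'hidden_size'],
    params_to_match={'device': 'cpu', 'hidden_size': 16})
if not success:
    print(msg)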
Example #21
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    prereqs, msg = check_prerequisites(
        info, {
            'relay_opt': {
                'devices': ['gpu'],
                'opt_levels': [0, 1, 2, 3, 4],
                'networks': networks
            }
        })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    all_data = sort_data(info.exp_data_dir('relay_opt'))
    raw_data = all_data[-1]['gpu']

    baseline = 'O0'
    opts = ['O1', 'O2', 'O3', 'O4']

    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    plot_data = OrderedDict([(opt, {
        network_name_map[network]:
        raw_data[baseline][network] / raw_data[opt][network]
        for network in networks
    }) for opt in opts])

    try:
        generate_opt_comparisons(plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
Example #22
def eval_command(model, exp_config, config, config_dir, output_dir, cmd_id):
    try:
        if exp_config.get('kind') == 'ratio':
            success, result = run_baseline(model, exp_config, config, config_dir, output_dir)
            if not success:
                return False, result

            # the actual memory budget calculation is
            # in `unfold_settings`
            exp_config['memory_budget'] = result

            # if there is a sampling cutoff and it's given as a ratio,
            # convert it to a budget cutoff
            if 'no_sampling_below_ratio' in exp_config:
                threshold_ratio = exp_config['no_sampling_below_ratio']
                if threshold_ratio != -1:
                    exp_config['no_sampling_below_budget'] = threshold_ratio*result

        conf_cnt = 0
        for combo in unfold_settings(exp_config):
            success, msg = run_trials(config_dir,
                                      python_command(combo['type'], config),
                                      combo['type'], model, combo,
                                      config['n_inputs'],
                                      output_dir,
                                      report_errors=config['report_errors'],
                                      append_to_csv=False,
                                      trial_run=False,
                                      cmd_id=cmd_id,
                                      conf_cnt=conf_cnt,
                                      sync_gpu=config['sync_gpu'])
            if not success:
                return False, msg
            conf_cnt += 1
        return True, 'success'
    except Exception as e:
        return (False,
                'Encountered outer iteration exception:\n' + render_exception(e))
Example #23
    def main(data_dir, config_dir, output_dir):
        try:
            config, msg = validate_config(config_dir)
            if config is None:
                write_status(output_dir, False, msg)
                return 1

            all_data = sort_data(data_dir)
            most_recent = all_data[-1]
            last_two_weeks = [
                entry for entry in all_data
                if time_difference(most_recent, entry).days < 14
            ]

            generate_longitudinal_comparisons(all_data, output_dir, 'all_time')
            generate_longitudinal_comparisons(last_two_weeks, output_dir,
                                              'two_weeks')
            generate_individual_comparisons(config, most_recent, output_dir)
        except Exception as e:
            write_status(output_dir, False,
                         'Exception encountered:\n' + render_exception(e))
            return 1

        write_status(output_dir, True, 'success')
Example #24
def render_fixed(ax,
                 model_name,
                 output_dir,
                 x_axis,
                 dtr_entries,
                 baseline_data,
                 failed_trials,
                 batch_size=None,
                 confidence=None):
    if not (dtr_entries or failed_trials):
        return (True, 'nothing to render')
    filename = prepare_out_file(
        output_dir,
        f'{NAME_DICT.get(model_name, model_name)}-fixed-gpu-time.png')
    try:
        # plt.style.use('seaborn-paper')
        # plt.rcParams["font.size"] = 30
        # fig = plt.figure()
        # fig.add_subplot(111, frameon=False)
        # fig.set_size_inches(12, 7)
        # plt.xticks(fontsize=13)
        # plt.yticks(fontsize=13)
        # plt.xlabel('Memory Budget (MB)', fontsize=15, labelpad=10)
        # plt.ylabel(r'Compute Time (ms)', fontsize=15, labelpad=10)
        # plt.title(f'{NAME_DICT.get(model_name, model_name)} GPU Time', fontsize=18)
        # plt.grid(True)

        # ax = plt.gca()
        width = 0.0
        all_axis = sorted(x_axis + failed_trials)
        ind = np.arange(len(all_axis) + 1)
        ind_index = dict(zip(all_axis, ind))
        ind_pos = dict([(ind[i], i) for i in range(len(ind))])
        ax.set_xticks(ind + width / 2)
        ax.set_xticklabels(
            map(lambda x: f'{round(x * 1e-9, 1)}',
                all_axis + [baseline_data['mem'] * 1e+6]))

        ax.tick_params(axis='both', labelsize=20)

        filtered_entries = []

        if baseline_data and 'cpu_time' in baseline_data:
            for (x, datum) in zip(x_axis, dtr_entries):
                if not datum.get(
                        'error', False) and 'cpu_time' in datum and datum[
                            'cpu_time'] > 3 * baseline_data['cpu_time']:
                    failed_trials.append(x)
                    filtered_entries.append({key: 0 for key in datum.keys()})
                else:
                    filtered_entries.append(datum)

            # only apply the baseline-based filtering when a baseline time exists
            dtr_entries = filtered_entries

        if failed_trials:
            for x in failed_trials:
                ax.axvline(x=ind_index[x],
                           color='red',
                           linestyle='dashed',
                           label='OOM')
        new_ind = []
        for x in x_axis:
            new_ind.append(ind_index[x])
        new_ind.append(ind[-1])
        ind = np.array(new_ind)
        ax.grid(True, axis='y')
        ax.set_title(
            f'{NAME_DICT.get(model_name, model_name)} ({batch_size})\n{input_sizes.get(model_name, "")}',
            fontsize=15)

        for x in failed_trials:
            ax.bar(ind_index[x], 0)
        if dtr_entries:
            # lin, = ax.plot(x_axis, dtr_entries, color=COLOR_SCHEME.get(model_name, 'black'), linewidth=4)
            # mk,  = ax.plot(x_axis, dtr_entries, label=NAME_DICT.get(model_name, model_name),
            #               linewidth=4, marker=MARKER_SCHEME.get(model_name, '+'), ms=12,
            #               alpha=.6, color=COLOR_SCHEME.get(model_name, 'black'))
            data_collection = {key: [] for key in timed_keys}
            data_collection['dispatch_overhead'] = []
            for entry in dtr_entries:
                acc = 0
                for (k, v) in entry.items():
                    if k != 'cpu_time':
                        data_collection[k].append(v)
                        acc += v
                data_collection['dispatch_overhead'].append(entry['cpu_time'] -
                                                            acc)

            acc = np.zeros(len(x_axis))
            for k in timed_keys + ['dispatch_overhead']:
                # print(ind[:-1], data_collection[k])
                ax.bar(ind[:-1],
                       data_collection[k],
                       label=breakdown_namedict.get(k, k),
                       color=breakdown_color_scheme.get(k, 'red'),
                       bottom=acc)
                acc = acc + data_collection[k]

            if baseline_data and 'cpu_time' in baseline_data:
                ax.bar([ind[-1]],
                       baseline_data['cpu_time'],
                       label='Unmodified\nPyTorch',
                       color='blue')
            else:
                ax.bar([ind[-1]], 0, label='Unmodified PyTorch', color='blue')
                ax.axvline(ind[-1],
                           color='red',
                           linestyle='dashed',
                           label='OOM')

            if confidence and False:
                render_errorbars(ax, x_axis, dtr_entries, confidence)

            ax.invert_xaxis()
            # ax.legend([(lin, mk)], ['merged'])

            # plt.legend(
        #         bbox_to_anchor=(0.5,0.01),
        #         loc='lower center',
        #         bbox_transform=fig.transFigure,
        #         ncol=7,
        #         borderaxespad=0,
        #         prop={'size': 15}
        #     )
        # plt.tight_layout()
        # plt.savefig(filename, bbox_inches = 'tight')
        return (True, 'success')
    except Exception as e:
        return (False, render_exception(e))
Example #25
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)
    our_name = 'Relay'
    if 'our_name' in conf:
        our_name = conf['our_name']

    prereqs, msg = check_prerequisites(
        info, {
            'treelstm': {
                'devices': ['cpu'],
                'frameworks': ['relay', 'pt'],
                'relay_methods': ['aot']
            },
            'char_rnn': {
                'devices': ['cpu'],
                'frameworks': ['relay', 'pt'],
                'relay_methods': ['aot'],
                'relay_configs': ['loop']
            },
            'gluon_rnns': {
                'devices': ['cpu'],
                'frameworks': ['relay', 'mxnet'],
                'networks': ['rnn', 'lstm', 'gru'],
                'relay_methods': ['aot']
            }
        })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    raw_data = {}
    for exp in ['treelstm', 'char_rnn', 'gluon_rnns']:
        all_data = sort_data(info.exp_data_dir(exp))
        raw_data[exp] = all_data[-1]

    plot_data = OrderedDict([
        ('MxNet', {
            'RNN':
            raw_data['gluon_rnns']['cpu']['MxNet']['rnn'] /
            raw_data['gluon_rnns']['cpu']['Aot']['rnn'],
            'GRU':
            raw_data['gluon_rnns']['cpu']['MxNet']['gru'] /
            raw_data['gluon_rnns']['cpu']['Aot']['gru'],
            'LSTM':
            raw_data['gluon_rnns']['cpu']['MxNet']['lstm'] /
            raw_data['gluon_rnns']['cpu']['Aot']['lstm'],
            'CharRNN':
            0.0,
            'TreeLSTM':
            0.0,
        }),
        ('PyTorch', {
            'RNN':
            0.0,
            'GRU':
            0.0,
            'LSTM':
            0.0,
            'CharRNN':
            raw_data['char_rnn']['cpu']['Pytorch'] /
            raw_data['char_rnn']['cpu']['Aot'],
            'TreeLSTM':
            raw_data['treelstm']['cpu']['Pytorch'] /
            raw_data['treelstm']['cpu']['Aot'],
        }),
    ])

    try:
        generate_nlp_comparisons(our_name, plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
Example #26
def render_throughput_breakdown(metadata, output_dir):
    throughput_metadata = {}

    # Gather data to render
    # a mapping that has the type model -> exp_type -> batch_size -> data dict
    def get_throughput_metadata(model, batch_size, dtr_dict, baseline_dict,
                                output_dir):
        if model not in throughput_metadata:
            throughput_metadata[model] = {'dtr': {}, 'baseline': {}}
        throughput_metadata[model]['dtr'][batch_size] = []
        for datum in dtr_dict[batch_size]['param_sweep']:
            throughput_metadata[model]['dtr'][batch_size].append({
                'memory_budget':
                datum.get('memory_budget', -1),
                'error':
                datum['error'],
                **{key: datum.get(key)
                   for key in used_keys}
            })

        if batch_size in baseline_dict:
            throughput_metadata[model]['baseline'][batch_size] = {
                key: baseline_dict[batch_size][key]
                for key in used_keys
            }
        else:
            throughput_metadata[model]['baseline'][batch_size] = {
                key: 0
                for key in used_keys
            }
        return True, 'success'

    traverse_field(metadata, 'param_sweep', get_throughput_metadata,
                   output_dir)

    flip = lambda f: lambda x: lambda y: f(y, x)
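    # e.g. flip(d.get)(0) is equivalent to lambda key: d.get(key, 0)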

    # Plot throughput and time breakdown of a model
    def plot_model(model):
        filename = prepare_out_file(output_dir,
                                    f'throughput-comparison-{model}.png')
        plt.clf()
        plt.grid(True)
        plt.title(f'Throughput Comparison of {NAME_DICT.get(model, model)}')
        plt.xlabel('Batch Size', fontsize=15, labelpad=10)
        plt.ylabel('Throughput (Batch Size / Avg GPU Time (s))')
        num_batch_size = len(throughput_metadata[model]['dtr'].keys())
        baseline_data = metadata[model]['baseline']
        width = 0.15
        ind = np.arange(num_batch_size)
        x_axis = list(sorted(throughput_metadata[model]['dtr'].keys()))

        # Wish we had currying !!!
        # If baseline data does not contain a batch size, then we fill 0 into the data, since it means baseline failed (OOMed)
        baseline_data = list(
            map(flip(throughput_metadata[model]['baseline'].get)(0), x_axis))

        # Bar for baseline
        plt.bar(ind, [datum['throughput'] for datum in baseline_data],
                width,
                label='Baseline')
        dtr_data = {'throughput': {}, 'breakdown': {}}

        # Gather the collected information.
        # The structure of dtr_data:
        #   dtr_data['throughput'][memory_budget] -> list of computed throughputs (floats)
        #   dtr_data['breakdown'][memory_budget]  -> list of data dictionaries
        #       (the same shape as the dictionaries processed in fill_data)
        for x in x_axis:
            for datum in throughput_metadata[model]['dtr'][x]:
                if datum['memory_budget'] not in dtr_data['throughput']:
                    dtr_data['throughput'][datum['memory_budget']] = []
                    dtr_data['breakdown'][datum['memory_budget']] = []
                dtr_data['throughput'][datum['memory_budget']].append(
                    datum['throughput'] if not datum['error'] else 0)
                dtr_data['breakdown'][datum['memory_budget']].append(
                    dict(filter(lambda x: x[0] != 'throughput', datum.items())
                         ) if not datum['error'] else None)

        num_budget = len(dtr_data['throughput'].keys())
        plt.xticks(ind + width * (num_budget / 2), map(str, x_axis))

        for (i, (budget, throughput)) in enumerate(
                sorted(dtr_data['throughput'].items(), key=lambda x: -x[0])):
            plt.bar(ind + width * (i + 1),
                    throughput,
                    width,
                    label=f'{round(budget * 1e-9, 1)} GiB')

        plt.legend(loc='best')
        plt.tight_layout()
        plt.savefig(filename, bbox_inches='tight')

        # Plot runtime profiling breakdown
        filename = prepare_out_file(output_dir, f'time-breakdown-{model}.png')
        plt.clf()
        plt.title(f'Runtime Breakdown of {NAME_DICT.get(model, model)}')
        plt.xlabel('Batch Size')
        plt.ylabel('Time / Batch (ms)')
        x_ticks_loc = {
            ind[i] + width * (num_budget / 2): '\n\n' + str(x_axis[i])
            for i in range(num_batch_size)
        }
        plt.grid(True, axis='y')
        for (i, (budget, datum)) in enumerate(
                sorted(dtr_data['breakdown'].items(), key=lambda x: -x[0])):
            locs = ind + width * (i + 1)
            for loc in locs:
                x_tick = f'{round(budget * 1e-9, 1)}\nGiB'
                if loc in x_ticks_loc.keys():
                    x_tick += f'\n{x_ticks_loc[loc]}'
                x_ticks_loc[loc] = x_tick

            if datum is None:
                continue
            gathered_data = {key: [] for key in (timed_keys + ['cpu_time'])}
            gathered_data['dispatch_overhead'] = []
            for e in datum:
                time_acc = 0
                for key in gathered_data.keys():
                    if key != 'dispatch_overhead':
                        if e is None:
                            gathered_data[key].append(0)
                        else:
                            gathered_data[key].append(e[key])
                        if key != 'cpu_time' and e is not None:
                            time_acc += e[key]
                if e is not None:
                    gathered_data['dispatch_overhead'].append(
                        gathered_data['cpu_time'][-1] - time_acc)
                else:
                    gathered_data['dispatch_overhead'].append(0)

            height_acc = np.zeros(len(datum))
            for key in timed_keys:  # + ['dispatch_overhead']:
                if i == 0:
                    plt.bar(ind + width * (i + 1),
                            gathered_data[key],
                            width=width,
                            label=breakdown_namedict[key],
                            color=breakdown_color_scheme[key],
                            bottom=height_acc)
                else:
                    plt.bar(ind + width * (i + 1),
                            gathered_data[key],
                            width=width,
                            color=breakdown_color_scheme[key],
                            bottom=height_acc)

                height_acc += gathered_data[key]
        xticks_data = list(sorted(x_ticks_loc.items(), key=lambda x: -x[0]))
        ticks = list(map(lambda x: x[0], xticks_data))
        labels = list(map(lambda x: x[1], xticks_data))
        plt.xticks(ticks, labels)
        plt.legend(loc='best')
        plt.tight_layout()
        plt.savefig(filename, bbox_inches='tight')

    try:
        for model in throughput_metadata.keys():
            plot_model(model)
    except Exception as e:
        return False, render_exception(e)

    return True, 'success'
Example #27
def run_trials(config_dir,
               python_cmd,
               experiment_name,
               model_name,
               specific_params,
               n_inputs,
               n_reps,
               path_prefix,
               report_errors=False,
               append_to_csv=False,
               trial_run=False,
               trial_run_outfile='',
               cmd_id=0,
               conf_cnt=0):
    """
    Responsible for recording the time and max memory usage
    from running a model (the user must provide a lambda for
    actually running the model because different kinds of models
    need different kinds of setup and a lambda that generates an
    input for running that model)

    :params:
        trial_run: when set to True, no persistent experiment data will be saved. It is used to
                   run a baseline trial and record how much memory it uses, which then sets the
                   memory budget for `ratio` commands of DTR experiments

        trial_run_outfile: the temporary file that stores the memory usage data of the baseline run

        cmd_id: the command id for the current model, starting from 0 by default
        conf_cnt: the id of the configuration generated by `unfold_settings`; used to track
                  which exact configuration caused an error
    """
    try:
        cwd = os.getcwd()
        params_file = 'specific_params.json'
        try:
            write_json(cwd, params_file, specific_params)
            if not trial_run:
                filename = prepare_out_file(
                    path_prefix, '{}-{}.csv'.format(
                        get_report_prefix(experiment_name, specific_params,
                                          cmd_id), model_name))
                mode = 'a' if append_to_csv else 'w'
                with open(filename, mode, newline='') as csvfile:
                    writer = create_csv_writer(csvfile, specific_params)
                    if not append_to_csv:
                        writer.writeheader()
            else:
                filename = ''

            shared_dir = os.path.dirname(os.path.abspath(__file__))
            run_script = os.path.join(shared_dir, 'run_torch_trial.py')

            for i in range(n_inputs):
                try:
                    subprocess.run([
                        python_cmd, run_script, '--config-dir', config_dir,
                        '--experiment-mode', experiment_name, '--model-name',
                        model_name, '--input-idx',
                        str(i), '--params-file', params_file, '--out-file',
                        filename, '--trial-run',
                        str(trial_run), '--trial-run-outfile',
                        trial_run_outfile
                    ],
                                   check=True,
                                   timeout=specific_params.get('timeout', 60))
                except (subprocess.CalledProcessError,
                        subprocess.TimeoutExpired) as e:
                    if not report_errors:
                        raise e
                    if trial_run:
                        return (False, 'Baseline failed: {}'.format(
                            render_exception(e)))
                    log_error(experiment_name, model_name, specific_params, i,
                              render_exception(e), path_prefix)
                    return (True, 'successfully caught error')
                time.sleep(4)
            return (True, 'success')
        finally:
            os.remove(params_file)
    except Exception as e:
        return (False, 'Encountered exception on ({}, {}, {}):\n'.format(
            experiment_name, model_name, specific_params) +
                render_exception(e))
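
A hedged sketch of how this wrapper might be invoked for a single configuration; the config directory, Python command, model, and parameter values are placeholders rather than values from the original code.

success, msg = run_trials(
    config_dir='config/pareto_curve',
    python_cmd='python3',
    experiment_name='dtr',
    model_name='resnet32',
    specific_params={'type': 'dtr', 'kind': 'ratio', 'batch_size': 32,
                     'memory_budget': 4e9, 'timeout': 120},
    n_inputs=2,
    n_reps=10,
    path_prefix='results/pareto_curve',
    report_errors=True,
    cmd_id=0,
    conf_cnt=0)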
Example #28
def parse_data_file(experiment_name,
                    model,
                    config,
                    specific_params,
                    path_prefix,
                    cmd_id=0):
    """
    Given an experiment name, model name, directory, and number of inputs,
    parses the corresponding data file if it exists and computes summary
    statistics for the (wall-clock) time, GPU time, and memory used in that
    data file for the chosen specific settings.

    Returns None and an error message if it fails.
    """
    try:
        filename = '{}-{}.csv'.format(
            get_report_prefix(experiment_name, specific_params, cmd_id), model)
        if not check_file_exists(path_prefix, filename):
            return (None, 'Data file {} does not exist at {}'.format(
                filename, path_prefix))

        full_path = os.path.join(path_prefix, filename)

        report_errors = config['report_errors']

        metrics = {}

        memory_budget = None

        with open(full_path, 'r', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                # in case there are commands for the same model
                # that have the same values for all configurations
                idx = int(row['input'])
                measured = {key: float(row[key]) for key in MEASURED_KEYS}

                if memory_budget is None and specific_params.get(
                        'kind') == 'ratio':
                    memory_budget = float(row['memory_budget'])
                    specific_params['memory_budget'] = memory_budget

                if idx not in metrics.keys():
                    metrics[idx] = {key: [] for key in MEASURED_KEYS}

                for key in MEASURED_KEYS:
                    metrics[idx][key].append(measured[key])

        summary = {'specific_params': specific_params}

        # in case everything errored out, this ensures that we will have a record of the error
        if report_errors:
            if check_error(experiment_name, model, specific_params,
                           path_prefix):
                summary['summary'] = 'error'
                return summary, 'success'

        summary_stats = []
        for (_, stat) in metrics.items():
            summary_stats.append({
                key: compute_summary_stats(stat[key])
                for key in MEASURED_KEYS
            })

        summary['summary'] = summary_stats
        return (summary, 'success')

    except Exception as e:
        return (None, 'Encountered exception on ({}, {}): '.format(
            experiment_name, model) + render_exception(e))
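
A hedged call sketch mirroring how the summarization scripts above use this parser; the config and parameter values are placeholders.

config = {'report_errors': False}
summary, msg = parse_data_file(
    experiment_name='dtr',
    model='resnet32',
    config=config,
    specific_params={'type': 'dtr', 'kind': 'ratio', 'batch_size': 32},
    path_prefix='results/raw',
    cmd_id=0)
if summary is None:
    print(msg)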
Example #29
def render_graph(config, data, output_dir):
    try:
        plt.style.use('seaborn-paper')
        plt.rcParams["font.size"] = 30
        fig = plt.figure()
        fig.add_subplot(111, frameon=False)
        fig.set_size_inches(12, 7)
        plt.xticks(fontsize=13)
        plt.yticks(fontsize=13)
        plt.xlabel('Memory Budget (Ratio)', fontsize=15, labelpad=10)
        plt.ylabel(r'Overhead Slow Down ($\times$)', fontsize=15, labelpad=10)
        plt.title('GPU Time Comparisons', fontsize=18)
        plt.grid(True)
        filename = prepare_out_file(output_dir,
                                    f'combined-comparison-ratio.png')

        metadata = {}
        for model in config['models']:
            dtr_dict = {}
            baseline_dict = {}
            stats = data[model]
            for stat in stats:
                if stat['specific_params']['type'] == 'baseline':
                    baseline_dict = fill_data(baseline_dict, stat)
                else:
                    dtr_dict = fill_data(dtr_dict, stat)

            metadata[model] = {'baseline': baseline_dict, 'dtr': dtr_dict}

        success, msg = traverse_field(metadata, 'ratio',
                lambda model, batch_size, dtr_dict, baseline_dict, output_dir:\
                        render_time_comparison(model, batch_size, 'ratio',
                                                dtr_dict[batch_size]['ratio'],
                                                baseline_dict.get(batch_size, {}),
                                                output_dir), output_dir)

        if not success:
            return (False, msg)

        plt.hlines(y=1,
                   xmin=0.0,
                   xmax=1.0,
                   linewidth=3,
                   label='Baseline',
                   color='blue',
                   linestyles='dashed')
        plt.legend(bbox_to_anchor=(0.5, 0.01),
                   loc='lower center',
                   bbox_transform=fig.transFigure,
                   ncol=7,
                   borderaxespad=0,
                   prop={'size': 15})
        plt.tight_layout()
        # plt.savefig(filename, bbox_inches = 'tight')

        plt.clf()
        plt.rcParams["font.size"] = 30

        figure, axs = plt.subplots(2, 4, figsize=(20, 8))
        # figure.set_size_inches(24, 12)
        axs = reversed(flatten(axs))

        success, msg = traverse_field(metadata, 'fixed',
                lambda model, batch_size, dtr_dict, baseline_dict, output_dir:\
                        render_time_comparison(model, batch_size, 'fixed',
                                                dtr_dict[batch_size]['fixed'],
                                                baseline_dict.get(batch_size, {}), output_dir, plt_ax=next(axs)),
                                                output_dir)

        filename = prepare_out_file(output_dir,
                                    'combined-breakdown-comparison.png')
        # figure.tight_layout()
        # plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
        # plt.xlabel('Memory Budget (GiB)')
        # plt.ylabel("Time (ms)")
        figure.text(0.5,
                    0.02,
                    r'\textbf{\Huge Memory Budget (GiB)}',
                    ha='center')
        figure.text(0.09,
                    0.5,
                    r'\textbf{\Huge Time (ms) / Batch}',
                    ha='center',
                    va='center',
                    rotation='vertical')
        plt.legend(bbox_to_anchor=(0.17, 0.075),
                   loc='upper left',
                   bbox_transform=fig.transFigure,
                   ncol=6,
                   borderaxespad=0,
                   prop={'size': 15})
        # figure.tight_layout()
        # plt.tight_layout()
        # plt.tight_layout(h_pad=0.3)
        plt.subplots_adjust(hspace=0.4)
        plt.savefig(filename, bbox_inches='tight', pad_inches=0.4)

        if not success:
            return (False, msg)

        success, msg = render_throughput_breakdown(metadata, output_dir)
        if not success:
            return False, msg
        return (True, 'success')
    except Exception as e:
        return (False,
                'Exception encountered while rendering graphs: {}'.format(
                    render_exception(e)))
Example #30
def parse_data_file(experiment_name,
                    model,
                    config,
                    specific_params,
                    path_prefix,
                    cmd_id=0,
                    baseline_params=None):
    """
    Given an experiment name, model name, directory, and number of inputs,
    parses the corresponding data file if it exists and computes summary
    statistics for the (wall-clock) time, GPU time, and memory used in that
    data file for the chosen specific settings.

    baseline_params: if the command is a ratio command, this is used to
    compute the slowdown per data point against the baseline,
    in order to better measure its distribution.

    Returns None and an error message if it fails.
    """
    try:
        report_errors = config['report_errors']
        metrics, budget, msg = collect_raw_measurements(
            experiment_name, model, specific_params, path_prefix, cmd_id)
        if metrics is None:
            return (None, msg)

        if budget is not None and specific_params.get('kind') == 'ratio':
            specific_params['memory_budget'] = float(budget)

        summary = {'specific_params': specific_params}

        # in case everything errored out, this ensures that we will have a record of the error
        if report_errors:
            if check_error(experiment_name, model, specific_params,
                           path_prefix):
                summary['summary'] = 'error'
                return summary, 'success'

        # if this was a ratio experiment
        # and we have a baseline available, let's compute
        # the slowdown per data point, head to head
        # and bootstrap confidence intervals
        if (specific_params.get('type') != 'baseline'
                and specific_params.get('kind') == 'ratio'
                and baseline_params is not None):

            baseline_metrics, _, baseline_msg = collect_raw_measurements(
                baseline_params['type'], model,
                baseline_params['specific_params'], path_prefix,
                baseline_params['cmd_id'])
            if baseline_metrics is None:
                return (None, baseline_msg)

            # compute slowdown in metrics
            for i in range(config['n_inputs']):
                dtr_times = metrics[i]['gpu_time']
                baseline_times = baseline_metrics[i]['gpu_time']
                assert len(dtr_times) == len(baseline_times)
                metrics[i]['slowdown'] = compute_slowdowns(
                    dtr_times, baseline_times)

        # Compute throughputs for baseline param_sweep commands
        if specific_params.get('kind') == 'param_sweep' or specific_params.get(
                'type') == 'baseline':
            for i in range(config['n_inputs']):
                metrics[i]['throughput'] = compute_throughputs(
                    specific_params['batch_size'], metrics[i]['gpu_time'])

        summary_stats = []
        for (_, stat) in metrics.items():
            summary_dict = {
                key: compute_summary_stats(stat[key],
                                           bootstrap=('time' in key))
                for key in MEASURED_KEYS
            }
            if 'slowdown' in stat:
                summary_dict['slowdown'] = compute_summary_stats(
                    stat['slowdown'], bootstrap=True)

            if 'throughput' in stat:
                summary_dict['throughput'] = compute_summary_stats(
                    stat['throughput'], bootstrap=True)

            summary_stats.append(summary_dict)

        summary['summary'] = summary_stats
        return (summary, 'success')

    except Exception as e:
        return (None, 'Encountered exception on ({}, {}): '.format(
            experiment_name, model) + render_exception(e))
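
The slowdown and throughput helpers referenced above are not shown; a minimal sketch of what they could look like, assuming elementwise ratios and per-batch GPU times, is:

def compute_slowdowns(dtr_times, baseline_times):
    # elementwise slowdown of each DTR measurement against the matching baseline rep
    return [d / b for (d, b) in zip(dtr_times, baseline_times)]

def compute_throughputs(batch_size, gpu_times):
    # items processed per unit of GPU time (units follow whatever the data file records)
    return [batch_size / t for t in gpu_times]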