Code example #1
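    # This variant streams each measurement back through results_queue and reports
    # success/failure (plus the measured wall-clock time) through heartbeat_queue
    # instead of collecting results locally. The excerpts below assume the
    # surrounding module's imports (gc, time, torch, tqdm) and helpers such as
    # model_util, delete_logs, and run_single_measurement.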
    def timing_loop(model_name, i, dry_run, n_reps, config, use_dtr,
                    specific_params, extra_params, results_queue,
                    heartbeat_queue):
        measurements = []
        print(f'Running {model_name} : {specific_params}')

        # remove any logs hanging around (so we only have to look for one)
        delete_logs()

        # we only save logs for the final input on DTR
        save_log = use_dtr and specific_params.get(
            'save_logs', config['save_logs']) and i == config['n_inputs'] - 1
        if use_dtr:
            torch.toggle_log(False)

        batch_size = specific_params['batch_size']
        use_profiling = use_dtr and specific_params.get('use_profiling', False)
        use_cudnn = model_util.use_cudnn(model_name)

        with torch.backends.cudnn.flags(enabled=use_cudnn,
                                        benchmark=use_cudnn):
            produce_model, gen_input, run_model, teardown = model_util.prepare_model(
                model_name, batch_size, use_dtr=use_dtr)
            criterion = model_util.get_criterion(model_name)
            inp = gen_input(i, extra_params)

            if use_profiling:
                torch.toggle_profile(use_profiling)

            progress = tqdm(range(dry_run + n_reps))
            for j in progress:
                progress.set_description(
                    f'Rep [{j}]' if j >= dry_run else f'Dry run [{j}]')
                gc.collect()
                # Annotate where the final run starts in the log
                if save_log and j == dry_run + n_reps - 1:
                    torch.toggle_log(True)
                    torch.annotate_log('START')

                try:
                    res = run_single_measurement(model_name, produce_model,
                                                 run_model, teardown, inp,
                                                 criterion, extra_params,
                                                 use_dtr, use_profiling)
                except RuntimeError as e:
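                    # report the failure on the heartbeat queue before re-raising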
                    heartbeat_queue.put((False, 0))
                    raise e
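                # report success along with the measured wall-clock time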
                heartbeat_queue.put((True, res["time"]))
                if j >= dry_run:
                    results_queue.put(res)
Code example #2
def run_single_measurement(model_name, produce_model, run_model, teardown, inp,
                           criterion, extra_params, use_dtr, use_profiling):
    """
    This function initializes a model and performs
    a single measurement of the model on the given input.

    While it might seem most reasonable to initialize
    the model outside of the loop, DTR's logs have shown
    that certain constants in the model persist between loop iterations;
    performing these actions in a separate *function scope* turned out to be the only
    way to prevent having those constants hang around.

    Returns a dict of measurements
    """
    torch.cuda.reset_max_memory_allocated()
    # resetting means the count should be reset to
    # only what's in scope, meaning only the input
    input_mem = torch.cuda.max_memory_allocated()
    model = produce_model(extra_params=extra_params)
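    # hold references to every module's parameters (released after teardown below)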
    params = []
    for m in model:
        if hasattr(m, 'parameters'):
            params.extend(m.parameters())

    model_mem = torch.cuda.max_memory_allocated()

    optimizer = torch.optim.SGD(model[0].parameters(),
                                1e-3,
                                momentum=0.9,
                                weight_decay=1e-4)

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # start timing
    torch.cuda.synchronize()
    start_time = time.time()
    if use_dtr:
        torch.reset_profile()
    start.record()
    # with torch.autograd.profiler.profile(use_cuda=True) as prof:
    run_model(criterion, *model, *inp, optimizer=optimizer)
    end.record()
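    # time the final device synchronization separately; reported as 'sync_time'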
    start_sync = time.time()
    torch.cuda.synchronize()
    end_sync = time.time()
    end_time = time.time()
    # end timing

    if use_dtr:
        # operators-only time, tracked by DTR
        cuda_time = torch.compute_time()

    base_compute_time = -1
    remat_compute_time = -1
    search_time = -1
    cost_time = -1
    if use_profiling:
        base_compute_time = torch.base_compute_time()
        remat_compute_time = torch.remat_compute_time()
        search_time = torch.search_time()
        cost_time = torch.cost_time()
        torch.reset_profile()

    total_mem = torch.cuda.max_memory_allocated()
    teardown(*model)
    torch.cuda.reset_max_memory_allocated()

    del model

    if use_dtr:
        torch.toggle_log(False)

    del params

    batch_size = len(inp[0])
    ips = batch_size / (end_time - start_time)

    result = {
        'time': end_time - start_time,
        'sync_time': end_sync - start_sync,
        'gpu_time': start.elapsed_time(end),
        'input_mem': input_mem,
        'model_mem': model_mem,
        'total_mem': total_mem,
        'base_compute_time': base_compute_time,
        'remat_compute_time': remat_compute_time,
        'search_time': search_time,
        'cost_time': cost_time,
        'batch_size': batch_size,
        'ips': ips
    }
    if use_dtr:
        result['cuda_time'] = cuda_time
    else:
        result['cuda_time'] = -1.0

    return result
Code example #3
def timing_loop(model_name,
                i,
                config,
                use_dtr,
                specific_params,
                writer,
                trial_run=False,
                trial_run_outfile=None,
                memory_budget=-1.0):
    dry_run = config['dry_run']
    measurements = []
    print(f'Running {model_name} : {specific_params}')

    # remove any logs hanging around (so we only have to look for one)
    delete_logs()

    # we only save logs for the final input on DTR
    save_log = use_dtr and specific_params.get(
        'save_logs', config['save_logs']) and i == config['n_inputs'] - 1
    if use_dtr:
        torch.toggle_log(False)

    # whether to report profiling info
    use_profiling = use_dtr and specific_params.get('use_profiling', False)

    use_cudnn = model_util.use_cudnn(model_name)
    with torch.backends.cudnn.flags(enabled=use_cudnn, benchmark=use_cudnn):
        criterion = model_util.get_criterion(model_name)
        produce_model, gen_input, run_model, teardown = model_util.prepare_model(
            model_name, specific_params['batch_size'], use_dtr=use_dtr)
        inp = gen_input(i, specific_params.get('extra_params', dict()))

        n_reps = specific_params.get('n_reps', config['n_reps'])

        if use_profiling:
            torch.toggle_profile(use_profiling)

        progress = tqdm(range(dry_run + n_reps))
        for j in progress:
            progress.set_description(
                f'Rep [{j}]' if j >= dry_run else f'Dry run [{j}]')
            gc.collect()
            # Annotate where the final run starts in the log
            if save_log and j == dry_run + n_reps - 1:
                torch.toggle_log(True)
                torch.annotate_log('START')

            res = run_single_measurement(model_name,
                                         produce_model,
                                         run_model,
                                         teardown,
                                         inp,
                                         criterion,
                                         extra_params=specific_params.get(
                                             'extra_params', dict()),
                                         use_dtr=use_dtr,
                                         use_profiling=use_profiling)
            if j >= dry_run:
                measurements.append(res)

        # Dump results
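        # map torchvision model names to the names used in the results file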
        model_name_replace_dict = {
            'tv_resnet152': 'resnet152',
            'tv_resnet50': 'resnet50',
        }

        train_ips_list = []
        batch_size = None
        for res in measurements:
            batch_size = res['batch_size']
            train_ips_list.append(res['ips'])

        out_file = "speed_results.tsv"
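        # each result is appended as a JSON object on its own line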
        with open(out_file, "a") as fout:
            val_dict = {
                'network': model_name_replace_dict.get(model_name, model_name),
                'algorithm': 'dtr',
                'budget': specific_params['memory_budget'],
                'batch_size': batch_size,
                'ips': np.median(train_ips_list) if train_ips_list else -1,
            }
            print(val_dict)
            fout.write(json.dumps(val_dict) + "\n")
        print(f"save results to {out_file}")

    # write to csv file only when this trial is not
    # for getting a baseline memory usage
    if trial_run:
        write_json(
            os.getcwd(), trial_run_outfile,
            {'mem': max(map(lambda data: data['total_mem'], measurements))})
        return

    if save_log:
        save_trial_log(config['log_dest'],
                       config.get('simrd_config', None),
                       model_name,
                       specific_params,
                       is_baseline=specific_params['memory_budget'] == -1)

    # clean up after ourselves
    delete_logs()

    # do all the writing after the trial is over
    for j in range(len(measurements)):
        data = measurements[j]
        # do unit conversions now: times in ms,
        # memory in MB
        writer.writerow({
            'time': data['time'] * 1e3,
            'sync_time': data['sync_time'] * 1e3,
            # pytorch's cuda elapsed time is already in ms
            'gpu_time': float(data['gpu_time']),
            # 'cuda_time' : float(data['cuda_time']) * 1e-6,
            'input_mem': data['input_mem'] * 1e-6,
            'model_mem': data['model_mem'] * 1e-6,
            'total_mem': data['total_mem'] * 1e-6,
            'memory_budget': memory_budget,
            # profiling (reported in nanoseconds)
            'base_compute_time': data['base_compute_time'] * 1e-6,
            'remat_compute_time': data['remat_compute_time'] * 1e-6,
            'search_time': data['search_time'] * 1e-6,
            'cost_time': data['cost_time'] * 1e-6,
            # measurements only holds post-dry-run reps, so j is already the rep index
            'rep': j,
            'input': i,
            **specific_params
        })
Code example #4
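# Variant of run_single_measurement without the profiling counters or the
# optimizer step used in the version above.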
def run_single_measurement(model_name, produce_model, run_model, teardown, inp,
                           criterion, extra_params, use_dtr):
    """
    This function initializes a model and performs
    a single measurement of the model on the given input.

    While it might seem most reasonable to initialize
    the model outside of the loop, DTR's logs have shown
    that certain constants in the model persist between loop iterations;
    performing these actions in a separate *function scope* turned out to be the only
    way to prevent having those constants hang around.

    Returns a dict of measurements
    """
    torch.cuda.reset_max_memory_allocated()
    # resetting means the count should be reset to
    # only what's in scope, meaning only the input
    input_mem = torch.cuda.max_memory_allocated()
    model = produce_model(extra_params=extra_params)
    params = []
    for m in model:
        if hasattr(m, 'parameters'):
            params.extend(m.parameters())

    model_mem = torch.cuda.max_memory_allocated()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # start timing
    torch.cuda.synchronize()
    start_time = time.time()
    if use_dtr:
        torch.reset_compute_time()
    start.record()
    run_model(criterion, *model, *inp)
    end.record()
    torch.cuda.synchronize()
    end_time = time.time()
    # end timing

    if use_dtr:
        # operators-only time, tracked by DTR
        cuda_time = torch.compute_time()

    total_mem = torch.cuda.max_memory_allocated()
    teardown(*model)
    torch.cuda.reset_max_memory_allocated()

    del model

    if use_dtr:
        torch.toggle_log(False)

    del params

    result = {
        'time': end_time - start_time,
        'gpu_time': start.elapsed_time(end),
        'input_mem': input_mem,
        'model_mem': model_mem,
        'total_mem': total_mem
    }
    if use_dtr:
        result['cuda_time'] = cuda_time
    else:
        result['cuda_time'] = -1.0

    return result
Code example #5
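# Variant of timing_loop without the profiling options or the per-trial
# speed_results.tsv dump; measurements are written out only through the
# supplied writer.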
def timing_loop(model_name,
                i,
                config,
                use_dtr,
                specific_params,
                writer,
                trial_run=False,
                trial_run_outfile=None):
    dry_run = config['dry_run']
    measurements = []
    print(f'Running {model_name} : {specific_params}')

    # remove any logs hanging around (so we only have to look for one)
    delete_logs()

    # we only save logs for the final input on DTR
    save_log = use_dtr and config['save_logs'] and i == config['n_inputs'] - 1
    if use_dtr:
        torch.toggle_log(False)

    use_cudnn = model_util.use_cudnn(model_name)
    with torch.backends.cudnn.flags(enabled=use_cudnn, benchmark=use_cudnn):
        criterion = model_util.get_criterion(model_name)
        produce_model, gen_input, run_model, teardown = model_util.prepare_model(
            model_name, specific_params['batch_size'], use_dtr=use_dtr)
        inp = gen_input(i, specific_params.get('extra_params', dict()))

        progress = tqdm(range(dry_run + config['n_reps']))
        for j in progress:
            progress.set_description(
                f'Rep [{j}]' if j >= dry_run else f'Dry run [{j}]')
            gc.collect()
            # Annotate where the final run starts in the log
            if save_log and j == dry_run + config['n_reps'] - 1:
                torch.toggle_log(config['save_logs'])
                torch.annotate_log('START')

            res = run_single_measurement(model_name,
                                         produce_model,
                                         run_model,
                                         teardown,
                                         inp,
                                         criterion,
                                         extra_params=specific_params.get(
                                             'extra_params', dict()),
                                         use_dtr=use_dtr)
            if j >= dry_run:
                measurements.append(res)

    # write to csv file only when this trial is not
    # for getting a baseline memory usage
    if trial_run:
        write_json(
            os.getcwd(), trial_run_outfile,
            {'mem': max(map(lambda data: data['total_mem'], measurements))})
        return

    if save_log:
        save_trial_log(config['log_dest'], model_name, specific_params)

    # clean up after ourselves
    delete_logs()

    # do all the writing after the trial is over
    for j in range(len(measurements)):
        data = measurements[j]
        # do unit conversions now: times in ms,
        # memory in MB
        writer.writerow({
            'time': data['time'] * 1e3,
            # pytorch's cuda elapsed time is already in ms
            'gpu_time': float(data['gpu_time']),
            # 'cuda_time' : float(data['cuda_time']) * 1e-6,
            'input_mem': data['input_mem'] * 1e-6,
            'model_mem': data['model_mem'] * 1e-6,
            'total_mem': data['total_mem'] * 1e-6,
            # measurements only holds post-dry-run reps, so j is already the rep index
            'rep': j,
            'input': i,
            **specific_params
        })