def timing_loop(model_name, i, dry_run, n_reps, config, use_dtr,
                specific_params, extra_params, results_queue, heartbeat_queue):
    print(f'Running {model_name} : {specific_params}')
    # remove any logs hanging around (so we only have to look for one)
    delete_logs()
    # we only save logs for the final input on DTR
    save_log = (use_dtr
                and specific_params.get('save_logs', config['save_logs'])
                and i == config['n_inputs'] - 1)
    if use_dtr:
        torch.toggle_log(False)

    batch_size = specific_params['batch_size']
    use_profiling = use_dtr and specific_params.get('use_profiling', False)
    use_cudnn = model_util.use_cudnn(model_name)
    with torch.backends.cudnn.flags(enabled=use_cudnn, benchmark=use_cudnn):
        produce_model, gen_input, run_model, teardown = model_util.prepare_model(
            model_name, batch_size, use_dtr=use_dtr)
        criterion = model_util.get_criterion(model_name)
        inp = gen_input(i, extra_params)

        if use_profiling:
            torch.toggle_profile(use_profiling)

        progress = tqdm(range(dry_run + n_reps))
        for j in progress:
            progress.set_description(f'Rep [{j}]' if j >= dry_run
                                     else f'Dry run [{j}]')
            gc.collect()
            # Annotate where the final run starts in the log
            if save_log and j == dry_run + n_reps - 1:
                torch.toggle_log(True)
                torch.annotate_log('START')
            try:
                res = run_single_measurement(model_name, produce_model, run_model,
                                             teardown, inp, criterion,
                                             extra_params, use_dtr, use_profiling)
            except RuntimeError as e:
                # tell the parent process that this trial failed before re-raising
                heartbeat_queue.put((False, 0))
                raise e
            heartbeat_queue.put((True, res['time']))
            if j >= dry_run:
                results_queue.put(res)
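# The variant above reports each repetition over results_queue and signals
# liveness (or failure) over heartbeat_queue, which suggests it runs in a child
# process watched by a parent. The sketch below is one way such a driver could
# look; it is illustrative only, and run_trial_with_watchdog, the timeout value,
# and the 'spawn' start method are assumptions, not part of the original harness.
import multiprocessing as mp
import queue


def run_trial_with_watchdog(model_name, i, dry_run, n_reps, config, use_dtr,
                            specific_params, extra_params, timeout_sec=600):
    ctx = mp.get_context('spawn')   # fresh interpreter per trial; avoids CUDA re-init issues
    results_queue = ctx.Queue()
    heartbeat_queue = ctx.Queue()
    proc = ctx.Process(target=timing_loop,
                       args=(model_name, i, dry_run, n_reps, config, use_dtr,
                             specific_params, extra_params,
                             results_queue, heartbeat_queue))
    proc.start()

    # one heartbeat arrives per repetition (dry runs included)
    for _ in range(dry_run + n_reps):
        try:
            ok, _rep_time = heartbeat_queue.get(timeout=timeout_sec)
        except queue.Empty:
            proc.terminate()        # no heartbeat in time: assume the trial hung
            break
        if not ok:
            break                   # the child reported a RuntimeError and will re-raise it
    proc.join()

    # drain whatever measurements the child managed to produce
    measurements = []
    while not results_queue.empty():
        measurements.append(results_queue.get())
    return measurements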
def run_single_measurement(model_name, produce_model, run_model, teardown, inp,
                           criterion, extra_params, use_dtr, use_profiling):
    """
    This function initializes a model and performs a single measurement
    of the model on the given input.

    While it might seem most reasonable to initialize the model outside
    of the loop, DTR's logs have shown that certain constants in the model
    persist between loop iterations; performing these actions in a separate
    *function scope* turned out to be the only way to prevent having those
    constants hang around.

    Returns a dict of measurements
    """
    torch.cuda.reset_max_memory_allocated()
    # resetting means the count should be reset to
    # only what's in scope, meaning only the input
    input_mem = torch.cuda.max_memory_allocated()
    model = produce_model(extra_params=extra_params)
    params = []
    for m in model:
        if hasattr(m, 'parameters'):
            params.extend(m.parameters())

    model_mem = torch.cuda.max_memory_allocated()

    optimizer = torch.optim.SGD(model[0].parameters(), 1e-3,
                                momentum=0.9, weight_decay=1e-4)

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # start timing
    torch.cuda.synchronize()
    start_time = time.time()
    if use_dtr:
        torch.reset_profile()
    start.record()
    # with torch.autograd.profiler.profile(use_cuda=True) as prof:
    run_model(criterion, *model, *inp, optimizer=optimizer)
    end.record()
    start_sync = time.time()
    torch.cuda.synchronize()
    end_sync = time.time()
    end_time = time.time()
    # end timing

    if use_dtr:
        # operators-only time, tracked by DTR
        cuda_time = torch.compute_time()

    base_compute_time = -1
    remat_compute_time = -1
    search_time = -1
    cost_time = -1
    if use_profiling:
        base_compute_time = torch.base_compute_time()
        remat_compute_time = torch.remat_compute_time()
        search_time = torch.search_time()
        cost_time = torch.cost_time()
        torch.reset_profile()

    total_mem = torch.cuda.max_memory_allocated()
    teardown(*model)
    torch.cuda.reset_max_memory_allocated()

    del model

    if use_dtr:
        torch.toggle_log(False)

    del params

    batch_size = len(inp[0])
    ips = batch_size / (end_time - start_time)

    result = {
        'time': end_time - start_time,
        'sync_time': end_sync - start_sync,
        'gpu_time': start.elapsed_time(end),
        'input_mem': input_mem,
        'model_mem': model_mem,
        'total_mem': total_mem,
        'base_compute_time': base_compute_time,
        'remat_compute_time': remat_compute_time,
        'search_time': search_time,
        'cost_time': cost_time,
        'batch_size': batch_size,
        'ips': ips
    }
    if use_dtr:
        result['cuda_time'] = cuda_time
    else:
        result['cuda_time'] = -1.0
    return result
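# run_single_measurement brackets the training step with both wall-clock
# timestamps and CUDA events. The standalone sketch below shows just that
# timing pattern on a throwaway matmul; the workload and sizes are arbitrary
# and only stock PyTorch APIs are used (no DTR extensions).
import time
import torch


def time_gpu_work():
    a = torch.randn(4096, 4096, device='cuda')
    b = torch.randn(4096, 4096, device='cuda')

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    torch.cuda.synchronize()       # drain pending work so timing starts clean
    wall_start = time.time()
    start.record()
    c = a @ b                      # stand-in for run_model(...)
    end.record()
    torch.cuda.synchronize()       # wait for the kernels before reading the clocks
    wall_end = time.time()

    return {
        'time': (wall_end - wall_start) * 1e3,   # ms, includes Python/launch overhead
        'gpu_time': start.elapsed_time(end),     # ms, measured on the device
    }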
def timing_loop(model_name, i, config, use_dtr, specific_params, writer,
                trial_run=False, trial_run_outfile=None, memory_budget=-1.0):
    dry_run = config['dry_run']
    measurements = []
    print(f'Running {model_name} : {specific_params}')

    # remove any logs hanging around (so we only have to look for one)
    delete_logs()

    # we only save logs for the final input on DTR
    save_log = (use_dtr
                and specific_params.get('save_logs', config['save_logs'])
                and i == config['n_inputs'] - 1)
    if use_dtr:
        torch.toggle_log(False)

    # whether to report profiling info
    use_profiling = use_dtr and specific_params.get('use_profiling', False)
    use_cudnn = model_util.use_cudnn(model_name)
    with torch.backends.cudnn.flags(enabled=use_cudnn, benchmark=use_cudnn):
        criterion = model_util.get_criterion(model_name)
        produce_model, gen_input, run_model, teardown = model_util.prepare_model(
            model_name, specific_params['batch_size'], use_dtr=use_dtr)
        inp = gen_input(i, specific_params.get('extra_params', dict()))
        n_reps = specific_params.get('n_reps', config['n_reps'])

        if use_profiling:
            torch.toggle_profile(use_profiling)

        progress = tqdm(range(dry_run + n_reps))
        for j in progress:
            progress.set_description(f'Rep [{j}]' if j >= dry_run
                                     else f'Dry run [{j}]')
            gc.collect()
            # Annotate where the final run starts in the log
            if save_log and j == dry_run + n_reps - 1:
                torch.toggle_log(True)
                torch.annotate_log('START')

            res = run_single_measurement(model_name, produce_model, run_model,
                                         teardown, inp, criterion,
                                         extra_params=specific_params.get('extra_params', dict()),
                                         use_dtr=use_dtr, use_profiling=use_profiling)
            if j >= dry_run:
                measurements.append(res)

    # Dump the throughput summary
    model_name_replace_dict = {
        'tv_resnet152': 'resnet152',
        'tv_resnet50': 'resnet50',
    }
    train_ips_list = []
    batch_size = None
    for res in measurements:
        batch_size = res['batch_size']
        train_ips_list.append(res['ips'])

    out_file = "speed_results.tsv"
    with open(out_file, "a") as fout:
        val_dict = {
            'network': model_name_replace_dict.get(model_name, model_name),
            'algorithm': 'dtr',
            'budget': specific_params['memory_budget'],
            'batch_size': batch_size,
            'ips': np.median(train_ips_list) if train_ips_list else -1,
        }
        print(val_dict)
        fout.write(json.dumps(val_dict) + "\n")
    print(f"save results to {out_file}")

    # if this trial is only for getting a baseline memory usage,
    # dump the peak memory and skip the csv output
    if trial_run:
        write_json(os.getcwd(), trial_run_outfile,
                   {'mem': max(map(lambda data: data['total_mem'], measurements))})
        return

    if save_log:
        save_trial_log(config['log_dest'], config.get('simrd_config', None),
                       model_name, specific_params,
                       is_baseline=specific_params['memory_budget'] == -1)

    # clean up after ourselves
    delete_logs()

    # do all the writing after the trial is over
    for j, data in enumerate(measurements):
        # do unit conversions now: times in ms, memory in MB
        writer.writerow({
            'time': data['time'] * 1e3,
            'sync_time': data['sync_time'] * 1e3,
            # pytorch's cuda elapsed time is already in ms
            'gpu_time': float(data['gpu_time']),
            # 'cuda_time': float(data['cuda_time']) * 1e-6,
            'input_mem': data['input_mem'] * 1e-6,
            'model_mem': data['model_mem'] * 1e-6,
            'total_mem': data['total_mem'] * 1e-6,
            'memory_budget': memory_budget,
            # profiling times are reported in nanoseconds, converted to ms here
            'base_compute_time': data['base_compute_time'] * 1e-6,
            'remat_compute_time': data['remat_compute_time'] * 1e-6,
            'search_time': data['search_time'] * 1e-6,
            'cost_time': data['cost_time'] * 1e-6,
            # measurements only holds post-dry-run reps, so j is already the rep index
            'rep': j,
            'input': i,
            **specific_params
        })
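# The writer passed into this timing_loop is used like a csv.DictWriter whose
# rows carry the fixed measurement fields plus whatever keys are in
# specific_params. One plausible way to construct it is sketched below; the
# make_writer helper, the field ordering, and the example filename are
# assumptions, not part of the original harness.
import csv

MEASUREMENT_FIELDS = [
    'time', 'sync_time', 'gpu_time', 'input_mem', 'model_mem', 'total_mem',
    'memory_budget', 'base_compute_time', 'remat_compute_time',
    'search_time', 'cost_time', 'rep', 'input',
]


def make_writer(csv_file, specific_params):
    # the row dict is built as {fixed fields, **specific_params}, so every
    # specific_params key must also appear in the header
    fieldnames = MEASUREMENT_FIELDS + [k for k in specific_params
                                       if k not in MEASUREMENT_FIELDS]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    return writer


# usage sketch:
# with open('dtr_results.csv', 'w', newline='') as f:
#     writer = make_writer(f, specific_params)
#     timing_loop(model_name, i, config, use_dtr, specific_params, writer)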
def run_single_measurement(model_name, produce_model, run_model, teardown, inp,
                           criterion, extra_params, use_dtr):
    """
    This function initializes a model and performs a single measurement
    of the model on the given input.

    While it might seem most reasonable to initialize the model outside
    of the loop, DTR's logs have shown that certain constants in the model
    persist between loop iterations; performing these actions in a separate
    *function scope* turned out to be the only way to prevent having those
    constants hang around.

    Returns a dict of measurements (CPU time, GPU time, peak memory usage)
    """
    torch.cuda.reset_max_memory_allocated()
    # resetting means the count should be reset to
    # only what's in scope, meaning only the input
    input_mem = torch.cuda.max_memory_allocated()
    model = produce_model(extra_params=extra_params)
    params = []
    for m in model:
        if hasattr(m, 'parameters'):
            params.extend(m.parameters())

    model_mem = torch.cuda.max_memory_allocated()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # start timing
    torch.cuda.synchronize()
    start_time = time.time()
    if use_dtr:
        torch.reset_compute_time()
    start.record()
    run_model(criterion, *model, *inp)
    end.record()
    torch.cuda.synchronize()
    end_time = time.time()
    # end timing

    if use_dtr:
        # operators-only time, tracked by DTR
        cuda_time = torch.compute_time()

    total_mem = torch.cuda.max_memory_allocated()
    teardown(*model)
    torch.cuda.reset_max_memory_allocated()

    del model

    if use_dtr:
        torch.toggle_log(False)

    del params

    result = {
        'time': end_time - start_time,
        'gpu_time': start.elapsed_time(end),
        'input_mem': input_mem,
        'model_mem': model_mem,
        'total_mem': total_mem
    }
    if use_dtr:
        result['cuda_time'] = cuda_time
    else:
        result['cuda_time'] = -1.0
    return result
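# run_single_measurement only touches the model through the four callables
# returned by model_util.prepare_model: produce_model builds the model
# components, gen_input builds one input tuple, run_model performs a single
# forward/backward step over (criterion, *model, *inp), and teardown releases
# the components. The toy implementation below illustrates that assumed
# contract; it is not the real model_util.
import torch
import torch.nn as nn


def prepare_toy_model(batch_size, use_dtr=False):
    def produce_model(extra_params=None):
        # a single-component "model"; returned as a tuple so *model unpacks cleanly
        return (nn.Linear(32, 10).cuda(),)

    def gen_input(i, extra_params=None):
        data = torch.randn(batch_size, 32, device='cuda')
        labels = torch.randint(0, 10, (batch_size,), device='cuda')
        return (data, labels)

    def run_model(criterion, net, data, labels, optimizer=None):
        out = net(data)
        loss = criterion(out, labels)
        loss.backward()
        if optimizer is not None:
            optimizer.step()
            optimizer.zero_grad()

    def teardown(net):
        net.cpu()   # drop the CUDA copy so the peak-memory reset is meaningful

    return produce_model, gen_input, run_model, teardown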
def timing_loop(model_name, i, config, use_dtr, specific_params, writer,
                trial_run=False, trial_run_outfile=None):
    dry_run = config['dry_run']
    measurements = []
    print(f'Running {model_name} : {specific_params}')

    # remove any logs hanging around (so we only have to look for one)
    delete_logs()

    # we only save logs for the final input on DTR
    save_log = use_dtr and config['save_logs'] and i == config['n_inputs'] - 1
    if use_dtr:
        torch.toggle_log(False)

    use_cudnn = model_util.use_cudnn(model_name)
    with torch.backends.cudnn.flags(enabled=use_cudnn, benchmark=use_cudnn):
        criterion = model_util.get_criterion(model_name)
        produce_model, gen_input, run_model, teardown = model_util.prepare_model(
            model_name, specific_params['batch_size'], use_dtr=use_dtr)
        inp = gen_input(i, specific_params.get('extra_params', dict()))

        progress = tqdm(range(dry_run + config['n_reps']))
        for j in progress:
            progress.set_description(f'Rep [{j}]' if j >= dry_run
                                     else f'Dry run [{j}]')
            gc.collect()
            # Annotate where the final run starts in the log
            if save_log and j == dry_run + config['n_reps'] - 1:
                torch.toggle_log(config['save_logs'])
                torch.annotate_log('START')

            res = run_single_measurement(model_name, produce_model, run_model,
                                         teardown, inp, criterion,
                                         extra_params=specific_params.get('extra_params', dict()),
                                         use_dtr=use_dtr)
            if j >= dry_run:
                measurements.append(res)

    # if this trial is only for getting a baseline memory usage,
    # dump the peak memory and skip the csv output
    if trial_run:
        write_json(os.getcwd(), trial_run_outfile,
                   {'mem': max(map(lambda data: data['total_mem'], measurements))})
        return

    if save_log:
        save_trial_log(config['log_dest'], model_name, specific_params)

    # clean up after ourselves
    delete_logs()

    # do all the writing after the trial is over
    for j, data in enumerate(measurements):
        # do unit conversions now: times in ms, memory in MB
        writer.writerow({
            'time': data['time'] * 1e3,
            # pytorch's cuda elapsed time is already in ms
            'gpu_time': float(data['gpu_time']),
            # 'cuda_time': float(data['cuda_time']) * 1e-6,
            'input_mem': data['input_mem'] * 1e-6,
            'model_mem': data['model_mem'] * 1e-6,
            'total_mem': data['total_mem'] * 1e-6,
            # measurements only holds post-dry-run reps, so j is already the rep index
            'rep': j,
            'input': i,
            **specific_params
        })
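# Both timing_loop variants read the same handful of keys out of config and
# specific_params. The driver below gathers those keys into one plausible
# invocation; the concrete values, the output filename, and the field ordering
# are illustrative assumptions rather than the original experiment settings.
import csv

config = {
    'dry_run': 2,         # untimed warm-up repetitions
    'n_reps': 10,         # measured repetitions per input
    'n_inputs': 3,        # distinct generated inputs per model
    'save_logs': False,   # whether to keep DTR logs for the final input
    'log_dest': 'logs/',  # where save_trial_log copies the DTR log
}

specific_params = {
    'batch_size': 32,
    'memory_budget': 8 * 1024 ** 3,   # bytes; -1 denotes an unbudgeted baseline
}

fieldnames = ['time', 'gpu_time', 'input_mem', 'model_mem', 'total_mem',
              'rep', 'input'] + list(specific_params.keys())

with open('results.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(config['n_inputs']):
        timing_loop('tv_resnet50', i, config, use_dtr=True,
                    specific_params=specific_params, writer=writer)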