def run_model(criterion, model, data, target,
              process_model=identity, process_output=identity,
              process_loss=identity, optimizer=None):
    process_model(model)
    if use_dtr:
        data = data.checkpoint()
        target = target.checkpoint()
    output = model(data)
    process_output(output)
    loss = criterion(output, target)
    process_loss(loss)
    if optimizer:
        optimizer.zero_grad()
    if use_dtr:
        torch.annotate_log('BACKWARD')
    loss.backward()
    # we are not actually using the loss here
    # but a real training loop would so we have to decheckpoint
    if use_dtr:
        data = data.decheckpoint()
        loss = loss.decheckpoint()
        target = target.decheckpoint()
    if optimizer:
        optimizer.step()
    # we include these deletions for generating logs;
    # these will ensure these fields are deallocated
    # before the end of the log, so anything still live
    # will be a gradient or weight
    del data
    del loss
    del target
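# NOTE: the run_model variants in this listing call an identity default for the
# process_model/process_output/process_loss hooks and branch on a use_dtr flag,
# neither of which appears above. The definitions below are only a minimal
# sketch of what is assumed, not the harness's actual code: identity is
# inferred from the default arguments, and use_dtr is assumed to be a flag
# supplied by the surrounding harness (e.g. a module-level variable or a value
# closed over when the function is constructed).
def identity(x):
    return x

use_dtr = False  # assumed: set to True when benchmarking the DTR-enabled build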
def run_model(criterion, model, data, target,
              process_model=identity, process_output=identity,
              process_loss=identity, optimizer=None):
    process_model(model)
    if use_dtr:
        data = data.checkpoint()
        target = target.checkpoint()
    output = model(data)
    process_output(output)
    loss = criterion(output, target)
    process_loss(loss)
    if optimizer:
        optimizer.zero_grad()
    if use_dtr:
        torch.annotate_log('BACKWARD')
    loss.backward()
    # we are not actually using the loss here
    # but a real training loop would so we have to decheckpoint
    if use_dtr:
        data = data.decheckpoint()
        target = target.decheckpoint()
        loss = loss.decheckpoint()
    if optimizer:
        optimizer.step()
    del data
    del target
    del loss
    del output
def run_model(criterion, model, ltree, linput, rtree, rinput, target,
              process_model=identity, process_output=identity,
              process_loss=identity, optimizer=None):
    process_model(model)
    if use_dtr:
        linput = linput.checkpoint()
        rinput = rinput.checkpoint()
        target = target.checkpoint()
    output = model(ltree, linput, rtree, rinput)
    process_output(output)
    loss = criterion(output, target)
    process_loss(loss)
    if use_dtr:
        torch.annotate_log('BACKWARD')
    loss.backward()
    if use_dtr:
        # decheckpoint returns a new tensor, so rebind each result
        loss = loss.decheckpoint()
        linput = linput.decheckpoint()
        rinput = rinput.decheckpoint()
        target = target.decheckpoint()
    # free the inputs so only gradients and weights remain live in the log
    del linput
    del rinput
    del target
    del loss
def run_model(criterion, model, hidden, data, targets,
              process_model=identity, process_output=identity,
              process_loss=identity, optimizer=None):
    process_model(model)
    if use_dtr:
        data = data.checkpoint()
        targets = targets.checkpoint()
    output, new_hidden = model(data, hidden)
    process_output(output)
    loss = criterion(output.view(-1, ntokens), targets)
    process_loss(loss)
    if use_dtr:
        torch.annotate_log('BACKWARD')
    loss.backward()
    wlm.main.repackage_hidden(new_hidden)
    # we are not actually using the loss here
    # but a real training loop would so we have to decheckpoint
    if use_dtr:
        loss = loss.decheckpoint()
def timing_loop(model_name, i, dry_run, n_reps, config, use_dtr,
                specific_params, extra_params, results_queue, heartbeat_queue):
    measurements = []
    print(f'Running {model_name} : {specific_params}')
    # remove any logs hanging around (so we only have to look for one)
    delete_logs()
    # we only save logs for the final input on DTR
    save_log = use_dtr and specific_params.get(
        'save_logs', config['save_logs']) and i == config['n_inputs'] - 1
    if use_dtr:
        torch.toggle_log(False)

    batch_size = specific_params['batch_size']
    use_profiling = use_dtr and specific_params.get('use_profiling', False)
    use_cudnn = model_util.use_cudnn(model_name)
    with torch.backends.cudnn.flags(enabled=use_cudnn, benchmark=use_cudnn):
        produce_model, gen_input, run_model, teardown = model_util.prepare_model(
            model_name, batch_size, use_dtr=use_dtr)
        criterion = model_util.get_criterion(model_name)
        inp = gen_input(i, extra_params)

        if use_profiling:
            torch.toggle_profile(use_profiling)

        progress = tqdm(range(dry_run + n_reps))
        for j in progress:
            progress.set_description(
                f'Rep [{j}]' + '' if j > dry_run else f'Dry run [{j}]')
            gc.collect()
            # Annotate where the final run starts in the log
            if save_log and j == dry_run + n_reps - 1:
                torch.toggle_log(True)
                torch.annotate_log('START')
            try:
                res = run_single_measurement(model_name, produce_model, run_model,
                                             teardown, inp, criterion, extra_params,
                                             use_dtr, use_profiling)
            except RuntimeError as e:
                heartbeat_queue.put((False, 0))
                raise e
            heartbeat_queue.put((True, res["time"]))
            if j >= dry_run:
                results_queue.put(res)
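# The timing_loop above reports through multiprocessing queues rather than
# returning values. The driver below is only a sketch of how it might be
# launched in a separate process, reading one heartbeat per repetition and one
# result per timed repetition; launch_trial and its argument handling are
# assumptions, not part of the original harness.
import multiprocessing as mp

def launch_trial(model_name, i, dry_run, n_reps, config, use_dtr,
                 specific_params, extra_params):
    ctx = mp.get_context('spawn')  # CUDA generally requires the spawn start method
    results_queue = ctx.Queue()
    heartbeat_queue = ctx.Queue()
    proc = ctx.Process(target=timing_loop,
                       args=(model_name, i, dry_run, n_reps, config, use_dtr,
                             specific_params, extra_params,
                             results_queue, heartbeat_queue))
    proc.start()
    measurements = []
    for j in range(dry_run + n_reps):
        ok, _elapsed = heartbeat_queue.get()  # (success, time) per repetition
        if not ok:
            break  # the child hit a RuntimeError (e.g. out of memory)
        if j >= dry_run:
            measurements.append(results_queue.get())
    proc.join()
    return measurements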
def run_model(criterion, model, ltree,
              process_model=identity, process_output=identity,
              process_loss=identity, optimizer=None):
    if use_dtr:
        # checkpoint every tensor stored in the tree
        ltree.map_(lambda x: x.detach().checkpoint())
    output = model(ltree)
    output = torch.sum(output)
    if use_dtr:
        torch.annotate_log('BACKWARD')
    output.backward()
    if use_dtr:
        output = output.decheckpoint()
        ltree.map_(lambda x: x.decheckpoint())
    del output
    del ltree
def run_model(criterion, model, data, targets,
              process_model=identity, process_output=identity,
              process_loss=identity, optimizer=None):
    process_model(model)
    if use_dtr:
        data = data.checkpoint()
        targets = targets.checkpoint()
    output = model(data)
    process_output(output)
    loss = criterion(output.view(-1, ntokens), targets)
    process_loss(loss)
    if use_dtr:
        torch.annotate_log('BACKWARD')
    loss.backward()
    del data
    del targets
    del loss
def run_model(criterion, model, data,
              process_model=identity, process_output=identity,
              process_loss=identity, optimizer=None):
    # process_model(model)
    # target = torch.squeeze(target)
    if use_dtr:
        data = list(map(lambda x: x.checkpoint(), data))
    output = model(data)
    output = torch.sum(output[-1])
    if use_dtr:
        torch.annotate_log('BACKWARD')
    output.backward()
    # we are not actually using the loss here
    # but a real training loop would so we have to decheckpoint
    if use_dtr:
        data = list(map(lambda x: x.decheckpoint(), data))
    del output
    del data
def timing_loop(model_name, i, config, use_dtr, specific_params, writer,
                trial_run=False, trial_run_outfile=None, memory_budget=-1.0):
    dry_run = config['dry_run']
    measurements = []
    print(f'Running {model_name} : {specific_params}')
    # remove any logs hanging around (so we only have to look for one)
    delete_logs()
    # we only save logs for the final input on DTR
    save_log = use_dtr and specific_params.get(
        'save_logs', config['save_logs']) and i == config['n_inputs'] - 1
    if use_dtr:
        torch.toggle_log(False)

    # whether to report profiling info
    use_profiling = use_dtr and specific_params.get('use_profiling', False)
    use_cudnn = model_util.use_cudnn(model_name)
    with torch.backends.cudnn.flags(enabled=use_cudnn, benchmark=use_cudnn):
        criterion = model_util.get_criterion(model_name)
        produce_model, gen_input, run_model, teardown = model_util.prepare_model(
            model_name, specific_params['batch_size'], use_dtr=use_dtr)
        inp = gen_input(i, specific_params.get('extra_params', dict()))
        n_reps = specific_params.get('n_reps', config['n_reps'])

        if use_profiling:
            torch.toggle_profile(use_profiling)

        progress = tqdm(range(dry_run + n_reps))
        for j in progress:
            progress.set_description(f'Rep [{j}]' + '' if j > dry_run
                                     else f'Dry run [{j}]')
            gc.collect()
            # Annotate where the final run starts in the log
            if save_log and j == dry_run + n_reps - 1:
                torch.toggle_log(True)
                torch.annotate_log('START')
            res = run_single_measurement(model_name, produce_model, run_model,
                                         teardown, inp, criterion,
                                         extra_params=specific_params.get(
                                             'extra_params', dict()),
                                         use_dtr=use_dtr,
                                         use_profiling=use_profiling)
            if j >= dry_run:
                measurements.append(res)

    # Dump results
    model_name_replace_dict = {
        'tv_resnet152': 'resnet152',
        'tv_resnet50': 'resnet50',
    }
    train_ips_list = []
    batch_size = None
    for res in measurements:
        batch_size = res['batch_size']
        train_ips_list.append(res['ips'])

    out_file = "speed_results.tsv"
    with open(out_file, "a") as fout:
        val_dict = {
            'network': model_name_replace_dict.get(model_name, model_name),
            'algorithm': 'dtr',
            'budget': specific_params['memory_budget'],
            'batch_size': batch_size,
            'ips': np.median(train_ips_list) if train_ips_list else -1,
        }
        print(val_dict)
        fout.write(json.dumps(val_dict) + "\n")
    print(f"save results to {out_file}")

    # write to csv file only when this trial is not
    # for getting a baseline memory usage
    if trial_run:
        write_json(
            os.getcwd(), trial_run_outfile,
            {'mem': max(map(lambda data: data['total_mem'], measurements))})
        return

    if save_log:
        save_trial_log(config['log_dest'], config.get('simrd_config', None),
                       model_name, specific_params,
                       is_baseline=specific_params['memory_budget'] == -1)

    # clean up after ourselves
    delete_logs()

    # do all the writing after the trial is over
    for j in range(len(measurements)):
        data = measurements[j]
        # do unit conversions now: times in ms,
        # memory in MB
        writer.writerow({
            'time': data['time'] * 1e3,
            'sync_time': data['sync_time'] * 1e3,
            # pytorch's cuda elapsed time is already in ms
            'gpu_time': float(data['gpu_time']),
            # 'cuda_time' : float(data['cuda_time']) * 1e-6,
            'input_mem': data['input_mem'] * 1e-6,
            'model_mem': data['model_mem'] * 1e-6,
            'total_mem': data['total_mem'] * 1e-6,
            'memory_budget': memory_budget,
            # profiling (reported in nanoseconds)
            'base_compute_time': data['base_compute_time'] * 1e-6,
            'remat_compute_time': data['remat_compute_time'] * 1e-6,
            'search_time': data['search_time'] * 1e-6,
            'cost_time': data['cost_time'] * 1e-6,
            'rep': j - dry_run,
            'input': i,
            **specific_params
        })
def timing_loop(model_name, i, config, use_dtr, specific_params, writer,
                trial_run=False, trial_run_outfile=None):
    dry_run = config['dry_run']
    measurements = []
    print(f'Running {model_name} : {specific_params}')
    # remove any logs hanging around (so we only have to look for one)
    delete_logs()
    # we only save logs for the final input on DTR
    save_log = use_dtr and config['save_logs'] and i == config['n_inputs'] - 1
    if use_dtr:
        torch.toggle_log(False)

    use_cudnn = model_util.use_cudnn(model_name)
    with torch.backends.cudnn.flags(enabled=use_cudnn, benchmark=use_cudnn):
        criterion = model_util.get_criterion(model_name)
        produce_model, gen_input, run_model, teardown = model_util.prepare_model(
            model_name, specific_params['batch_size'], use_dtr=use_dtr)
        inp = gen_input(i, specific_params.get('extra_params', dict()))

        progress = tqdm(range(dry_run + config['n_reps']))
        for j in progress:
            progress.set_description(f'Rep [{j}]' + '' if j > dry_run
                                     else f'Dry run [{j}]')
            gc.collect()
            # Annotate where the final run starts in the log
            if save_log and j == dry_run + config['n_reps'] - 1:
                torch.toggle_log(config['save_logs'])
                torch.annotate_log('START')
            res = run_single_measurement(model_name, produce_model, run_model,
                                         teardown, inp, criterion,
                                         extra_params=specific_params.get(
                                             'extra_params', dict()),
                                         use_dtr=use_dtr)
            if j >= dry_run:
                measurements.append(res)

    # write to csv file only when this trial is not
    # for getting a baseline memory usage
    if trial_run:
        write_json(
            os.getcwd(), trial_run_outfile,
            {'mem': max(map(lambda data: data['total_mem'], measurements))})
        return

    if save_log:
        save_trial_log(config['log_dest'], model_name, specific_params)

    # clean up after ourselves
    delete_logs()

    # do all the writing after the trial is over
    for j in range(len(measurements)):
        data = measurements[j]
        # do unit conversions now: times in ms,
        # memory in MB
        writer.writerow({
            'time': data['time'] * 1e3,
            # pytorch's cuda elapsed time is already in ms
            'gpu_time': float(data['gpu_time']),
            # 'cuda_time' : float(data['cuda_time']) * 1e-6,
            'input_mem': data['input_mem'] * 1e-6,
            'model_mem': data['model_mem'] * 1e-6,
            'total_mem': data['total_mem'] * 1e-6,
            'rep': j - dry_run,
            'input': i,
            **specific_params
        })
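# Both timing loops depend on run_single_measurement, which is not shown in
# this listing. The sketch below is an assumption about what such a helper
# would minimally do (build a fresh model, run one training step through
# run_model, synchronize, and report elapsed time and peak memory). Only the
# 'time' and 'total_mem' keys consumed above are reproduced, and the
# produce_model/teardown call signatures are guesses.
import time
import torch

def run_single_measurement(model_name, produce_model, run_model, teardown,
                           inp, criterion, extra_params, use_dtr,
                           use_profiling=False):
    model = produce_model(extra_params=extra_params)  # assumed signature
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()
    start = time.time()
    # inp is whatever gen_input produced for this model's run_model signature
    run_model(criterion, model, *inp)
    torch.cuda.synchronize()
    elapsed = time.time() - start
    total_mem = torch.cuda.max_memory_allocated()
    teardown(model)  # assumed to free the model and any cached state
    return {'time': elapsed, 'total_mem': total_mem}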