def train(model, dataloaders, opts):
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    scaler = GradScaler()

    global_step = 0
    if opts.rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps, desc=opts.model)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        # store val predictions
        os.makedirs(join(opts.output_dir, 'results'), exist_ok=True)
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {opts.n_gpu} GPUs *****")
    LOGGER.info(" Num examples = %d", len(dataloaders['train'].dataset))
    LOGGER.info(" Batch size = %d", opts.train_batch_size)
    LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info(" Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()

    n_examples = 0
    n_epoch = 0
    best_ckpt = 0
    best_eval = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()

    while True:
        for step, batch in enumerate(dataloaders['train']):
            targets = batch['targets']
            del batch['gather_index']
            n_examples += targets.size(0)

            with autocast():
                loss = model(**batch, compute_loss=True)
                loss = loss.mean()

            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            scaler.scale(loss).backward()
            if not delay_unscale:
                # gather gradients from every process
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss
                losses = all_gather_list(running_loss)
                running_loss = RunningMeter(
                    'loss', sum(l.val for l in losses) / len(losses))
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    # Unscales the gradients of optimizer's assigned params in-place
                    scaler.unscale_(optimizer)
                    grad_norm = clip_grad_norm_(model.parameters(),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)

                # scaler.step() first unscales gradients of the optimizer's params.
                # If gradients don't contain infs/NaNs, optimizer.step() is then
                # called, otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)
                # Updates the scale for next iteration.
                scaler.update()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{opts.model}: {n_epoch}-{global_step}: '
                                f'{tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s '
                                f'best_acc-{best_eval * 100:.2f}')
                    TB_LOGGER.add_scalar('perf/ex_per_s', ex_per_sec,
                                         global_step)

                if global_step % opts.valid_steps == 0:
                    log = evaluation(
                        model,
                        dict(filter(lambda x: x[0].startswith('val'),
                                    dataloaders.items())),
                        opts, global_step)
                    log_eval = log['val/acc']
                    if log_eval > best_eval:
                        best_ckpt = global_step
                        best_eval = log_eval
                        pbar.set_description(
                            f'{opts.model}: {n_epoch}-{best_ckpt} '
                            f'best_acc-{best_eval * 100:.2f}')
                    model_saver.save(model, global_step)

            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        LOGGER.info(f"Step {global_step}: finished {n_epoch} epochs")
        # if n_epoch >= opts.num_train_epochs:
        #     break

    return best_ckpt
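# The loop above sets the learning rate by calling get_lr_sched(global_step, opts)
# and writing the result into every optimizer param group, so the schedule is a
# pure function of the step counter. get_lr_sched itself is defined elsewhere in
# this codebase; the helper below is only a hypothetical sketch of a typical
# warmup-then-linear-decay schedule for such a setup. The fields
# opts.learning_rate and opts.warmup_steps are assumptions, not taken from this
# file; opts.num_train_steps is the same option used by the training loop.
def _example_warmup_linear_decay_lr(global_step, opts):
    """Sketch of a linear-warmup / linear-decay schedule keyed on global_step."""
    if global_step < opts.warmup_steps:
        # ramp up from 0 to the peak learning rate during warmup
        scale = global_step / max(1, opts.warmup_steps)
    else:
        # then decay linearly, reaching 0 at opts.num_train_steps
        remaining = opts.num_train_steps - global_step
        denom = max(1, opts.num_train_steps - opts.warmup_steps)
        scale = max(0.0, remaining / denom)
    return opts.learning_rate * scale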
def main(opts):
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    opts.size = hvd.size()
    # one process per GPU under Horovod, so the world size is the GPU count
    n_gpu = hvd.size()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    # data loaders
    DatasetCls = DATA_REGISTRY[opts.dataset_cls]
    EvalDatasetCls = DATA_REGISTRY[opts.eval_dataset_cls]
    splits, dataloaders = create_dataloaders(DatasetCls, EvalDatasetCls, opts)

    # Prepare model
    model = build_model(opts)
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    scaler = GradScaler()

    global_step = 0
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps, desc=opts.model)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        # store val predictions
        os.makedirs(join(opts.output_dir, 'results'), exist_ok=True)
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info(" Num examples = %d", len(dataloaders['train'].dataset))
    LOGGER.info(" Batch size = %d", opts.train_batch_size)
    LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info(" Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()

    n_examples = 0
    n_epoch = 0
    best_ckpt = 0
    best_eval = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()

    while True:
        for step, batch in enumerate(dataloaders['train']):
            targets = batch['targets']
            del batch['gather_index']
            n_examples += targets.size(0)

            with autocast():
                original_loss, enlarged_loss = model(**batch,
                                                     compute_loss=True)
                if opts.candidates == 'original':
                    loss = original_loss
                elif opts.candidates == 'enlarged':
                    loss = enlarged_loss
                elif opts.candidates == 'combined':
                    loss = original_loss + enlarged_loss
                else:
                    raise AssertionError("No such loss!")
                loss = loss.mean()

            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            scaler.scale(loss).backward()
            if not delay_unscale:
                # gather gradients from every process
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss
                losses = all_gather_list(running_loss)
                running_loss = RunningMeter(
                    'loss', sum(l.val for l in losses) / len(losses))
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    # Unscales the gradients of optimizer's assigned params in-place
                    scaler.unscale_(optimizer)
                    grad_norm = clip_grad_norm_(model.parameters(),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)

                # scaler.step() first unscales gradients of the optimizer's params.
                # If gradients don't contain infs/NaNs, optimizer.step() is then
                # called, otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)
                # Updates the scale for next iteration.
                scaler.update()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{opts.model}: {n_epoch}-{global_step}: '
                                f'{tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s '
                                f'best_acc-{best_eval * 100:.2f}')
                    TB_LOGGER.add_scalar('perf/ex_per_s', ex_per_sec,
                                         global_step)

                if global_step % opts.valid_steps == 0:
                    log = evaluation(
                        model,
                        dict(filter(lambda x: x[0].startswith('val'),
                                    dataloaders.items())),
                        opts, global_step)
                    if log['val/acc'] > best_eval:
                        best_ckpt = global_step
                        best_eval = log['val/acc']
                        pbar.set_description(
                            f'{opts.model}: {n_epoch}-{best_ckpt} '
                            f'best_acc-{best_eval * 100:.2f}')
                    model_saver.save(model, global_step)

            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        LOGGER.info(f"Step {global_step}: finished {n_epoch} epochs")

    # the collective call acts as a barrier so every process finishes training
    # before the best checkpoint is loaded and evaluated
    sum(all_gather_list(opts.rank))
    best_pt = f'{opts.output_dir}/ckpt/model_step_{best_ckpt}.pt'
    model.load_state_dict(torch.load(best_pt), strict=False)
    evaluation(model,
               dict(filter(lambda x: x[0] != 'train', dataloaders.items())),
               opts, best_ckpt)
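# Minimal launch sketch, assuming this module is run as a script under Horovod
# (e.g. `horovodrun -np <num_gpus> python <this_file>.py --config cfg.json`),
# which is what the hvd.* calls in main() imply. The --config flag and the
# JSON-to-namespace loading below are assumptions for illustration; the real
# option parsing lives wherever `opts` is normally constructed in this codebase.
if __name__ == '__main__':
    import argparse
    import json
    from types import SimpleNamespace

    # Horovod must be initialized before hvd.rank()/hvd.size()/hvd.local_rank()
    hvd.init()

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', required=True,
                        help='JSON file with the training options used above')
    args = parser.parse_args()

    # load the option dict into an attribute-access object so fields such as
    # opts.output_dir, opts.num_train_steps, opts.candidates resolve as in main()
    with open(args.config) as f:
        opts = SimpleNamespace(**json.load(f))
    main(opts)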