def main(opts):
    device = torch.device("cuda")
    set_random_seed(opts.seed)

    val_dataloaders, _ = create_dataloaders(
        'nlvr2/img_db/nlvr2_dev', opts.val_datasets, False, opts)
    test_dataloaders, _ = create_dataloaders(
        'nlvr2/img_db/nlvr2_test', opts.test_datasets, False, opts)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}

    model = UniterForPretraining.from_pretrained(
        opts.model_config, checkpoint,
        img_dim=IMG_DIM, img_label_dim=IMG_LABEL_DIM)
    model.to(device)
    model.train()
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    optimizer.zero_grad()
    optimizer.step()

    validate(model, val_dataloaders, 'val')
    validate(model, test_dataloaders, 'test')
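# Illustrative only: set_dropout(model, opts.dropout) above overrides the
# dropout probability before fine-tuning/evaluation. The helper below is a
# minimal sketch of what such a function typically does (an assumption, not
# necessarily this repo's exact implementation): walk the module tree and
# reset every nn.Dropout layer.
import torch.nn as nn


def set_dropout_sketch(model, drop_p):
    for module in model.modules():
        if isinstance(module, nn.Dropout) and module.p != drop_p:
            module.p = drop_p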
def init_model(self):
    # pretrained_model_file points to the original pretrained UNITER
    # checkpoint - load it and fine-tune from it. If this argument is False,
    # load_model() loads the model file saved after fine-tuning instead.
    if self.pretrained_model_file:
        checkpoint = torch.load(self.pretrained_model_file)
        LOGGER.info('Using pretrained UNITER base model {}'.format(
            self.pretrained_model_file))
        base_model = UniterForPretraining.from_pretrained(
            self.config['config'],
            state_dict=checkpoint['model_state_dict'],
            img_dim=IMG_DIM, img_label_dim=IMG_LABEL_DIM)
        self.model = MemeUniter(
            uniter_model=base_model.uniter,
            hidden_size=base_model.uniter.config.hidden_size,
            n_classes=self.config['n_classes'])
    else:
        self.load_model()
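# Illustrative only: init_model() above expects the checkpoint to be a dict
# with a 'model_state_dict' key. A matching save/restore pair for the
# fine-tuned MemeUniter could look like the sketch below; the function names
# and the extra 'optimizer_state_dict'/'epoch' keys are assumptions, not the
# repo's actual load_model() implementation.
import torch


def save_finetuned_sketch(model, optimizer, epoch, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)


def restore_finetuned_sketch(model, path, device='cpu'):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    return checkpoint.get('epoch', 0)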
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    all_dbs = [db for datasets in [opts.train_datasets, opts.val_datasets]
               for dset in datasets for db in dset['db']]

    tokenizer = json.load(open(f'{all_dbs[0]}/meta.json'))['bert']
    assert all(tokenizer == json.load(open(f'{db}/meta.json'))['bert']
               for db in all_dbs)

    # build data loaders
    train_dataloaders, all_img_dbs = create_dataloaders(
        opts.train_datasets, True, opts)
    val_dataloaders, _ = create_dataloaders(
        opts.val_datasets, False, opts, all_img_dbs)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = UniterForPretraining.from_pretrained(
        opts.model_config, checkpoint,
        img_dim=IMG_DIM, img_label_dim=IMG_LABEL_DIM)
    model.to(device)
    model.train()
    # make sure every process has the same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')

    global_step = 0
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    # to compute training statistics
    task2loss = {task: RunningMeter(f'loss/{task}')
                 for task in train_dataloaders.keys()}
    # ITM w/ OT
    if opts.itm_ot_lambda > 0:
        for task in train_dataloaders.keys():
            if task.startswith('itm'):
                task2loss[f'{task}_xe'] = RunningMeter(f'loss/{task}_xe')
                task2loss[f'{task}_ot'] = RunningMeter(f'loss/{task}_ot')
                task2loss[f'{task}_ot_pos'] = RunningMeter(
                    f'loss/{task}_ot_pos')
                task2loss[f'{task}_ot_neg'] = RunningMeter(
                    f'loss/{task}_ot_neg')

    n_examples = defaultdict(int)
    n_in_units = defaultdict(int)
    n_loss_units = defaultdict(int)
    grad_norm = 0

    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    for step, (name, batch) in enumerate(meta_loader):
        # forward pass
        n_examples[name] += batch['input_ids'].size(0)
        n_in_units[name] += (batch['attn_masks'] == 1).sum().item()
        task = name.split('_')[0]
        loss = model(batch, task=task, compute_loss=True)
        if task.startswith('itm'):
            # OT
            itm_loss, ot_loss = loss
            n_loss_units[name] += itm_loss.size(0)
            itm_loss = itm_loss.mean()
            if ot_loss is not None:
                ot_pos, ot_neg = ot_loss
                ot_loss = (ot_pos.sum() - ot_neg.sum()
                           ) / (ot_pos.size(0) + ot_neg.size(0))

                # NOTE: beware of empty tensor
                ot_pos = ot_pos.mean().item()
                if not math.isnan(ot_pos):
                    task2loss[f'{name}_ot_pos'](ot_pos)
                ot_neg = ot_neg.mean().item()
                if not math.isnan(ot_neg):
                    task2loss[f'{name}_ot_neg'](ot_neg)

                loss = itm_loss + opts.itm_ot_lambda * ot_loss
                task2loss[f'{name}_xe'](itm_loss.item())
                task2loss[f'{name}_ot'](ot_loss.item())
            else:
                loss = itm_loss
        else:
            n_loss_units[name] += loss.size(0)
            loss = loss.mean()  # loss is not normalized in model

        # backward pass
        delay_unscale = (step+1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[name]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every process
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))
        task2loss[name](loss.item())

        # optimizer update and logging
        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            # log loss
            # NOTE: not gathered across GPUs for efficiency
            TB_LOGGER.log_scaler_dict({ll.name: ll.val
                                       for ll in task2loss.values()
                                       if ll.val is not None})
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info(f'==============Step {global_step}===============')
                for t in train_dataloaders.keys():
                    assert all(tt == t for tt in all_gather_list(t))
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time()-start))
                    tot_in = sum(all_gather_list(n_in_units[t]))
                    in_per_sec = int(tot_in / (time()-start))
                    tot_l = sum(all_gather_list(n_loss_units[t]))
                    l_per_sec = int(tot_l / (time()-start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f'perf/{t}_in_per_s', in_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f'perf/{t}_loss_per_s', l_per_sec,
                                         global_step)
                LOGGER.info('===============================================')

            if global_step % opts.valid_steps == 0:
                LOGGER.info(f'Step {global_step}: start validation')
                validate(model, val_dataloaders)
                model_saver.save(model, global_step)
        if global_step >= opts.num_train_steps:
            break

    if global_step % opts.valid_steps != 0:
        LOGGER.info(f'Step {global_step}: start validation')
        validate(model, val_dataloaders)
        model_saver.save(model, global_step)
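# Illustrative only: get_lr_sched(global_step, opts) used in the training
# loop above comes from the repo's optimizer utilities. The sketch below is
# an assumption about its behaviour - linear warmup to opts.learning_rate
# over opts.warmup_steps, then linear decay to zero at opts.num_train_steps -
# and not the repo's actual implementation.
def get_lr_sched_sketch(global_step, opts):
    if global_step < opts.warmup_steps:
        # linear warmup from 0 up to the peak learning rate
        lr = opts.learning_rate * global_step / opts.warmup_steps
    else:
        # linear decay from the peak down to 0 at num_train_steps
        decay_steps = max(opts.num_train_steps - opts.warmup_steps, 1)
        remaining = max(opts.num_train_steps - global_step, 0)
        lr = opts.learning_rate * remaining / decay_steps
    # keep a small positive floor so param groups never get lr == 0
    return max(lr, 1e-8)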