def train_loop_fn(model, loader, device, context):
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer and LR scheduler are cached on the per-device `context`, so
    # they are built once per core and reused on later epochs.
    optimizer = context.getattr_or(
        'optimizer',
        lambda: optim.SGD(
            model.parameters(),
            lr=FLAGS.lr,
            momentum=FLAGS.momentum,
            weight_decay=5e-4))
    lr_scheduler = context.getattr_or(
        'lr_scheduler',
        lambda: schedulers.wrap_optimizer_with_scheduler(
            optimizer,
            scheduler_type=getattr(FLAGS, 'lr_scheduler_type', None),
            scheduler_divisor=getattr(FLAGS, 'lr_scheduler_divisor', None),
            scheduler_divide_every_n_epochs=getattr(
                FLAGS, 'lr_scheduler_divide_every_n_epochs', None),
            num_steps_per_epoch=num_training_steps_per_epoch,
            summary_writer=writer if xm.is_master_ordinal() else None))
    tracker = xm.RateTracker()
    model.train()
    for x, (data, target) in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        xm.optimizer_step(optimizer)
        tracker.add(FLAGS.batch_size)
        if x % FLAGS.log_steps == 0:
            test_utils.print_training_update(device, x, loss.item(),
                                             tracker.rate(),
                                             tracker.global_rate())
        if lr_scheduler:
            lr_scheduler.step()
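# Hedged usage sketch (not part of the snippets above): loop functions with the
# (model, loader, device, context) signature are driven by the legacy torch_xla
# DataParallel API, which replicates the model across the available XLA devices
# and invokes the loop once per core per epoch. `Net`, `train_loader` and
# `FLAGS.num_epochs` are assumed placeholders; the module path varies across
# torch_xla versions (older releases used `torch_xla_py`).
import torch_xla.core.xla_model as xm
import torch_xla.distributed.data_parallel as dp

devices = xm.get_xla_supported_devices()
model_parallel = dp.DataParallel(Net, device_ids=devices)
for epoch in range(1, FLAGS.num_epochs + 1):
    model_parallel(train_loop_fn, train_loader)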
def train_loop_fn(model, loader, device, context):
    loss_fn = nn.CrossEntropyLoss()
    optimizer = context.getattr_or(
        'optimizer',
        lambda: optim.SGD(
            model.parameters(),
            lr=FLAGS.lr,
            momentum=FLAGS.momentum,
            weight_decay=5e-4))
    # LR scheduler
    scheduler = context.getattr_or(
        'scheduler',
        lambda: CosineAnnealingRestartsLR(optimizer, T=2, eta_min=1e-4))
    model.train()
    for x, (data, target) in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        xm.optimizer_step(optimizer)
        if x % FLAGS.log_steps == 0:
            print('[{}]({}) Loss={:.5f}'.format(device, x, loss.item()))
        # Step LR scheduler
        scheduler.step()
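# Minimal sketch of the `context` object the snippets above rely on, assuming
# the legacy torch_xla data_parallel Context semantics: `getattr_or` caches a
# per-device attribute, invoking the default if it is callable, so each TPU
# core builds its optimizer/scheduler once and reuses them on later epochs.
# This is an illustration, not the library source.
class Context:

    def getattr_or(self, name, defval):
        value = getattr(self, name, None)
        if value is None:
            value = defval() if callable(defval) else defval
            setattr(self, name, value)
        return value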
def train_loop_fn(model, loader, device, context):
    relation_network = model
    # relation_network.apply(weights_init)
    # LEARNING_RATE, CLASS_NUM, QUERY_NUM_PER_CLASS and `episode` below are
    # assumed to come from the enclosing scope.
    relation_network_optim = torch.optim.Adam(
        relation_network.parameters(), lr=LEARNING_RATE)
    relation_network_scheduler = StepLR(
        relation_network_optim, step_size=100000, gamma=0.5)
    mse = nn.MSELoss()
    tracker = xm.RateTracker()
    for x, (samples, sample_labels, batches, batch_labels) in loader:
        relation_network_scheduler.step(episode)
        relation_network.zero_grad()
        # relation_network_optim.zero_grad()
        # Variable() is a deprecated no-op wrapper in modern PyTorch.
        relation_scores = relation_network(Variable(samples), Variable(batches))
        relations = relation_scores.view(-1, CLASS_NUM)
        one_hot_labels = Variable(
            torch.zeros(QUERY_NUM_PER_CLASS * CLASS_NUM, CLASS_NUM).scatter_(
                1, batch_labels.view(-1, 1), 1))
        loss = mse(relations, one_hot_labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(relation_network.parameters(), 0.5)
        xm.optimizer_step(relation_network_optim)
        tracker.add(FLAGS.batch_size)
        print('Debug: ', x, loss.item())
        if x % FLAGS.log_steps == 0:
            print('[{}]({}) Loss={:.5f} Rate={:.2f}'.format(
                device, x, loss.item(), tracker.rate()))
def train_loop_fn(model, loader, device=None, context=None):
    criterion = task.build_criterion(args)
    tracker = xm.RateTracker()
    optimizer = build_optimizer(args, model)
    for i, samples in loader:
        print("Processing minibatch:%d" % i)
        task.train_step(samples[0], model, criterion, optimizer, False)
        xm.optimizer_step(optimizer)
def train_loop_fn(loader):
    # `model`, `optimizer`, `loss_fn` and `device` are closed over from the
    # enclosing scope rather than passed as arguments.
    tracker = xm.RateTracker()
    model.train()
    for x, (data, target) in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        xm.optimizer_step(optimizer)
        tracker.add(FLAGS.batch_size)
        if x % FLAGS.log_steps == 0:
            test_utils.print_training_update(device, x, loss.item(),
                                             tracker.rate(),
                                             tracker.global_rate())
def train_loop_fn(model, loader, device, context):
    loss_fn = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    tracker = xm.RateTracker()
    for x, (data, target) in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        xm.optimizer_step(optimizer)
        tracker.add(FLAGS.batch_size)
        print('[{}]({}) Loss={:.5f} Rate={:.2f}'.format(
            device, x, loss.item(), tracker.rate()))
def tpu_training_loop(model, loader, device, context):
    """Called by torch_xla_py.data_parallel. This function is executed on
    each core of the TPU once per epoch."""
    model.zero_grad()
    # Exclude biases and LayerNorm weights from weight decay. Note the
    # substring test: the original's plain `n not in no_decay` compared full
    # parameter names against these fragments and never matched.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params': [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.0
    }]
    optimizer = context.getattr_or(
        'optimizer',
        BertAdam(optimizer_grouped_parameters,
                 lr=args.learning_rate,
                 warmup=args.warmup_proportion,
                 t_total=num_train_steps))
    tr_loss = None
    pbar = None
    if str(pbar_device) == str(device):
        pbar = tqdm(total=int(pbar_steps), desc="training", dynamic_ncols=True)
    tracker = tpu_xm.RateTracker()
    model.train()
    for step, batch in enumerate(loader):
        input_ids, input_mask, segment_ids, label_ids, pos_ids = batch
        loss, _ = model(input_ids, segment_ids, input_mask, label_ids,
                        pos_ids=pos_ids)
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.backward()
        tracker.add(args.train_batch_size)
        tr_loss = (loss * args.gradient_accumulation_steps if step == 0 else
                   tr_loss + loss * args.gradient_accumulation_steps)
        if pbar is not None:
            pbar.update(1)
        # Note: the optimizer steps on every batch here, so the loss scaling
        # above does not actually accumulate gradients across batches.
        tpu_xm.optimizer_step(optimizer)  # optimizer.step()
        optimizer.zero_grad()
    return tr_loss.item() / step
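# For contrast with the loop above, a minimal sketch of gradient accumulation
# carried through end to end, as the later variants in this collection do:
# scale the loss, but only step and zero the optimizer every `accum_steps`
# batches. Names here are illustrative placeholders, and the loader is assumed
# to yield plain (data, target) batches.
import torch_xla.core.xla_model as xm


def accumulate_and_step(loader, model, optimizer, loss_fn, accum_steps):
    model.train()
    optimizer.zero_grad()
    for step, (data, target) in enumerate(loader):
        loss = loss_fn(model(data), target) / accum_steps
        loss.backward()  # gradients accumulate across micro-batches
        if (step + 1) % accum_steps == 0:
            xm.optimizer_step(optimizer)  # all-reduce + step on the XLA device
            optimizer.zero_grad()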
def train_loop_fn(model, loader, device, context):
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        model.parameters(),
        lr=FLAGS.lr,
        momentum=FLAGS.momentum,
        weight_decay=5e-4)
    tracker = xm.RateTracker()
    for x, (data, target) in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        xm.optimizer_step(optimizer)
        tracker.add(FLAGS.batch_size)
        if x % FLAGS.log_steps == 0:
            print('[{}]({}) Loss={:.5f} Rate={:.2f}'.format(
                device, x, loss.item(), tracker.rate()))
def train_loop_fn(model, loader, device, context):
    trainer = trainers[str(device)]
    stats = None
    tracker = xm.RateTracker()
    for i, samples in loader:
        if i and not (i % args.log_steps):
            print(
                log_step(
                    'training',
                    device,
                    i,
                    tracker=tracker,
                    metrics_debug=args.metrics_debug))
        _log_output = trainer.train_step(samples)
        xm.optimizer_step(trainer.optimizer)
        tracker.add(len(samples) * args.max_sentences)  # n_batches * batch_size
    stats = fairseq_train.get_training_stats(trainer)
    return tracker, stats
def train_loop_fn(model, loader, device, context):
    loss_fn = nn.NLLLoss()
    optimizer = context.getattr_or(
        'optimizer',
        lambda: optim.SGD(model.parameters(), lr=lr, momentum=FLAGS.momentum))
    tracker = xm.RateTracker()
    model.train()
    for x, (data, target) in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        xm.optimizer_step(optimizer)
        tracker.add(FLAGS.batch_size)
        if x % FLAGS.log_steps == 0:
            test_utils.print_training_update(device, x, loss.item(),
                                             tracker.rate(),
                                             tracker.global_rate())
def loop_fn(model, loader, device, context):
    # `self` below refers to the enclosing unittest.TestCase; this loop_fn is
    # defined inside a test method.
    loss_fn = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    for x, (data, target) in loader:
        with xu.TimedScope(msg='Training loop: ', printfn=None):
            optimizer.zero_grad()
            output = xu.timed(lambda: model(data), msg='Model: ', printfn=None)
            loss = xu.timed(
                lambda: loss_fn(output, target), msg='Loss: ', printfn=None)
            xu.timed(loss.backward, msg='LossBkw: ', printfn=None)
            xu.timed(
                lambda: xm.optimizer_step(optimizer),
                msg='Step: ',
                printfn=None)
            self.assertLess(loss.cpu().item(), 3.0)
def _train_one_epoch(model, loader, device, context):
    """Called by torch_xla_py.data_parallel. This function is executed on
    each core of the TPU once per epoch."""
    # Model parameters: exclude biases and LayerNorm weights from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    # One optimizer and scheduler per TPU core. Both objects are saved in
    # `context` to be reused the next epoch.
    optimizer = context.getattr_or(
        'optimizer',
        AdamW(optimizer_grouped_parameters,
              lr=args.learning_rate,
              eps=args.adam_epsilon,
              betas=tuple(args.betas)))
    scheduler = context.getattr_or(
        'scheduler',
        WarmupLinearSchedule(optimizer,
                             warmup_steps=warmup_updates,
                             t_total=total_num_updates))
    # Restart. TODO: scheduler resets to 0 each epoch.
    scheduler.step(args.scheduler_last_epoch)
    logging.info(f'Restarting scheduler LR to: {scheduler.get_last_lr()}')
    tr_loss = None
    tracker = tpu_xm.RateTracker()
    model.train()
    for step, batch in loader:
        input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
        outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
        loss = outputs[0]
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.backward()
        tracker.add(args.per_tpu_train_batch_size)
        tr_loss = (loss * args.gradient_accumulation_steps if step == 0 else
                   tr_loss + loss * args.gradient_accumulation_steps)
        if (step + 1) % args.gradient_accumulation_steps == 0:
            tpu_xm.optimizer_step(optimizer)
            scheduler.step()
            optimizer.zero_grad()
            # logging.info(f'Adjusted scheduler LR to {scheduler.get_last_lr()}')
    # Since checkpointing happens each epoch, we only need to save the
    # scheduler state at the end of each epoch.
    logging.info(f'Scheduler last_epoch {scheduler.last_epoch}')
    # `.item()` requires a trip from TPU to CPU, which is very slow.
    # Use it only once per epoch.
    return tr_loss.item() / step
def tpu_training_loop(model, loader, device, context):
    """Called by torch_xla_py.data_parallel. This function is executed on
    each core of the TPU once per epoch."""
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    # One optimizer and scheduler per TPU core. Both objects are saved in
    # `context` to be reused the next epoch.
    optimizer = context.getattr_or(
        'optimizer',
        AdamW(optimizer_grouped_parameters,
              lr=args.learning_rate,
              eps=args.adam_epsilon,
              betas=tuple(args.betas)))
    # Derive warmup info.
    if args.warmup_proportion is not None:
        warmup_steps = int(
            args.warmup_proportion * num_train_optimization_steps + 0.5)
    elif args.warmup_steps is not None:
        warmup_steps = args.warmup_steps
    else:
        raise Exception(
            'What is the warmup?? Specify either warmup proportion or steps')
    scheduler = context.getattr_or(
        'scheduler',
        WarmupLinearSchedule(optimizer,
                             warmup_steps=warmup_steps,
                             t_total=num_train_optimization_steps))
    tr_loss = None
    pbar = None
    # All threads are in sync: use the progress bar only on one of them.
    if str(pbar_device) == str(device):
        pbar = tqdm(total=int(pbar_steps), desc=f"device {device}",
                    dynamic_ncols=True)
    tracker = tpu_xm.RateTracker()
    model.train()
    for step, batch in loader:
        input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
        outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
        loss = outputs[0]
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.backward()
        tracker.add(args.train_batch_size)
        tr_loss = (loss * args.gradient_accumulation_steps if step == 0 else
                   tr_loss + loss * args.gradient_accumulation_steps)
        if pbar is not None:
            pbar.update(1)
            # pbar.set_description(desc=f'LR: {scheduler.get_lr()}')
        if (step + 1) % args.gradient_accumulation_steps == 0:
            tpu_xm.optimizer_step(optimizer)
            prev_lr = scheduler.get_last_lr()[0]
            scheduler.step()
            curr_lr = scheduler.get_last_lr()[0]
            if args.track_learning_rate and pbar is not None:
                pbar.set_description(f"Prev LR: {prev_lr} Curr LR: {curr_lr}")
            optimizer.zero_grad()
    # `.item()` requires a trip from TPU to CPU, which is very slow.
    # Use it only once per epoch.
    return tr_loss.item() / step
def train(model, data_loader, device, context):
    step = 0
    train_batch_size = args.train_batch
    lr_ratio = args.cls_lr_ratio
    base_lr = args.base_lr
    optimizer = context.getattr_or(
        'optimizer',
        lambda: optim.SGD(
            [{'params': base_params},
             {'params': model.classifier.parameters(),
              'lr': lr_ratio * base_lr},
             {'params': model.classifier_swap.parameters(),
              'lr': lr_ratio * base_lr},
             {'params': model.Convmask.parameters(),
              'lr': lr_ratio * base_lr}],
            lr=base_lr,
            momentum=0.9))
    tracker = xm.RateTracker()
    model.train(True)
    for batch_cnt, data in enumerate(data_loader):
        step += 1
        loss = 0
        model.train(True)
        inputs, labels, labels_swap, swap_law, img_names = data[1]
        inputs = Variable(inputs)
        labels_1 = Variable(torch.from_numpy(np.array(labels)))
        labels_swap_1 = Variable(torch.from_numpy(np.array(labels_swap)))
        swap_law_1 = Variable(torch.from_numpy(np.array(swap_law)).float())
        # Relocate tensors from CPU to TPU.
        labels = labels_1.to(device)
        labels_swap = labels_swap_1.to(device)
        swap_law = swap_law_1.to(device)
        optimizer.zero_grad()
        if inputs.size(0) < 2 * train_batch_size:
            outputs = model(inputs, inputs[0:-1:2])
        else:
            outputs = model(inputs, None)
        # Total loss: alpha*ce_loss + beta*swap_loss + gamma*law_loss, where
        # ce_loss is the classification loss, swap_loss the adversarial loss
        # and law_loss the localization loss.
        alpha_ = 1
        beta_ = 1
        gamma_ = 0.01 if Config.dataset in ('STCAR', 'AIR') else 1
        add_loss = nn.L1Loss()
        get_ce_loss = nn.CrossEntropyLoss()
        ce_loss = get_ce_loss(outputs[0], labels) * alpha_
        loss += ce_loss
        swap_loss = get_ce_loss(outputs[1], labels_swap) * beta_
        loss += swap_loss
        law_loss = add_loss(outputs[2], swap_law) * gamma_
        loss += law_loss
        loss.backward()
        xm.optimizer_step(optimizer)
        tracker.add(train_batch_size)
        print(
            '[{}] step: {:-8d} / {:d} loss=ce_loss+swap_loss+law_loss: '
            '{:6.4f} = {:6.4f} + {:6.4f} + {:6.4f}'.format(
                device, step, train_epoch_step, loss.detach().item(),
                ce_loss.detach().item(), swap_loss.detach().item(),
                law_loss.detach().item()),
            flush=True)