Example no. 1
 def train_loop_fn(model, loader, device, context):
     loss_fn = nn.CrossEntropyLoss()
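     # `context` persists across epochs on each core, so getattr_or caches the
     # optimizer and LR scheduler instead of re-creating them every epoch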
     optimizer = context.getattr_or(
         'optimizer', lambda: optim.SGD(model.parameters(),
                                        lr=FLAGS.lr,
                                        momentum=FLAGS.momentum,
                                        weight_decay=5e-4))
     lr_scheduler = context.getattr_or(
         'lr_scheduler', lambda: schedulers.wrap_optimizer_with_scheduler(
             optimizer,
             scheduler_type=getattr(FLAGS, 'lr_scheduler_type', None),
             scheduler_divisor=getattr(FLAGS, 'lr_scheduler_divisor', None),
             scheduler_divide_every_n_epochs=getattr(
                 FLAGS, 'lr_scheduler_divide_every_n_epochs', None),
             num_steps_per_epoch=num_training_steps_per_epoch,
             summary_writer=writer if xm.is_master_ordinal() else None))
     tracker = xm.RateTracker()
     model.train()
     for x, (data, target) in loader:
         optimizer.zero_grad()
         output = model(data)
         loss = loss_fn(output, target)
         loss.backward()
         xm.optimizer_step(optimizer)
         tracker.add(FLAGS.batch_size)
         if x % FLAGS.log_steps == 0:
             test_utils.print_training_update(device, x, loss.item(),
                                              tracker.rate(),
                                              tracker.global_rate())
         if lr_scheduler:
             lr_scheduler.step()
Example no. 2
    def train_loop_fn(model, loader, device, context):
        loss_fn = nn.CrossEntropyLoss()
        optimizer = context.getattr_or(
            'optimizer', lambda: optim.SGD(model.parameters(),
                                           lr=FLAGS.lr,
                                           momentum=FLAGS.momentum,
                                           weight_decay=5e-4))

        # LR scheduler
        scheduler = context.getattr_or(
            'scheduler',
            lambda: CosineAnnealingRestartsLR(optimizer, T=2, eta_min=1e-4))

        model.train()
        for x, (data, target) in loader:
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()
            xm.optimizer_step(optimizer)
            if x % FLAGS.log_steps == 0:
                print('[{}]({}) Loss={:.5f}'.format(device, x, loss.item()))

        # Step the LR scheduler once per epoch (this loop function runs once per epoch on each core)
        scheduler.step()
Example no. 3
    def train_loop_fn(model, loader, device, context):
        relation_network = model
        #relation_network.apply(weights_init)

        relation_network_optim = torch.optim.Adam(
            relation_network.parameters(), lr=LEARNING_RATE)
        relation_network_scheduler = StepLR(relation_network_optim,
                                            step_size=100000,
                                            gamma=0.5)
        mse = nn.MSELoss()
        tracker = xm.RateTracker()

        for x, (samples, sample_labels, batches, batch_labels) in loader:

            # advance the StepLR schedule using the loader step index
            relation_network_scheduler.step(x)

            relation_network.zero_grad()
            #relation_network_optim.zero_grad()
            relation_scores = relation_network(Variable(samples),
                                               Variable(batches))
            relations = relation_scores.view(-1, CLASS_NUM)
            one_hot_labels = Variable(
                torch.zeros(QUERY_NUM_PER_CLASS * CLASS_NUM,
                            CLASS_NUM).scatter_(1, batch_labels.view(-1, 1),
                                                1))
            loss = mse(relations, one_hot_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(relation_network.parameters(), 0.5)
            xm.optimizer_step(relation_network_optim)
            tracker.add(FLAGS.batch_size)
            print('Debug: ', x, loss.item())
            if x % FLAGS.log_steps == 0:
                print('[{}]({}) Loss={:.5f} Rate={:.2f}'.format(
                    device, x, loss.item(), tracker.rate()))
Example no. 4
def train_loop_fn(model, loader, device='cpu?', context=None):
    criterion = task.build_criterion(args)
    tracker = xm.RateTracker()
    optimizer = build_optimizer(args, model)
    for i, samples in loader:
        print("Processing minibatch:%d" % i)
        task.train_step(samples[0], model, criterion, optimizer, False)
        xm.optimizer_step(optimizer)
Example no. 5
  def train_loop_fn(loader):
    tracker = xm.RateTracker()

    model.train()
    for x, (data, target) in loader:
      optimizer.zero_grad()
      output = model(data)
      loss = loss_fn(output, target)
      loss.backward()
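      # xm.optimizer_step() all-reduces gradients across TPU cores, then calls optimizer.step()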
      xm.optimizer_step(optimizer)
      tracker.add(FLAGS.batch_size)
      if x % FLAGS.log_steps == 0:
        test_utils.print_training_update(device, x, loss.item(), tracker.rate(),
                                         tracker.global_rate())
Example no. 6
    def train_loop_fn(model, loader, device, context):
        loss_fn = nn.NLLLoss()
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
        tracker = xm.RateTracker()

        for x, (data, target) in loader:
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()
            xm.optimizer_step(optimizer)
            tracker.add(FLAGS.batch_size)
            print('[{}]({}) Loss={:.5f} Rate={:.2f}'.format(
                device, x, loss.item(), tracker.rate()))
Example no. 7
    def tpu_training_loop(model, loader, device, context):
        """ Called by torch_xla_py.data_parallel. This function is executed on each core of the TPU once per epoch"""
        model.zero_grad()
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        param_optimizer = list(model.named_parameters())

        # exclude biases and LayerNorm params from weight decay (substring match on names)
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.0
        }]
        optimizer = context.getattr_or(
            'optimizer',
            BertAdam(optimizer_grouped_parameters,
                     lr=args.learning_rate,
                     warmup=args.warmup_proportion,
                     t_total=num_train_steps))
        tr_loss = None
        pbar = None
        if str(pbar_device) == str(device):
            pbar = tqdm(total=int(pbar_steps),
                        desc=f"training",
                        dynamic_ncols=True)
        tracker = tpu_xm.RateTracker()
        model.train()
        for step, batch in enumerate(loader):
            input_ids, input_mask, segment_ids, label_ids, pos_ids = batch
            loss, _ = model(input_ids,
                            segment_ids,
                            input_mask,
                            label_ids,
                            pos_ids=pos_ids)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tracker.add(args.train_batch_size)
            tr_loss = loss * args.gradient_accumulation_steps if step == 0 else tr_loss + loss * args.gradient_accumulation_steps
            if pbar is not None:
                pbar.update(1)
            tpu_xm.optimizer_step(optimizer)
            # optimizer.step()
            optimizer.zero_grad()
        return tr_loss.item() / step
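The docstring above notes that this loop function is called by torch_xla_py.data_parallel once per epoch on each TPU core. For context, a minimal sketch of that call site, assuming the legacy DataParallel API (import paths vary across torch_xla releases, and MNIST, train_loader, and num_epochs are placeholder names):

    # Call-site sketch (assumption: legacy torch_xla data_parallel API; names
    # such as MNIST, train_loader and num_epochs are placeholders).
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.data_parallel as dp

    devices = xm.get_xla_supported_devices()
    model_parallel = dp.DataParallel(MNIST, device_ids=devices)

    for epoch in range(1, num_epochs + 1):
        # Replicates the model, runs the loop function once on every core, and
        # returns the per-core results; the per-device `context` object persists
        # across calls, which is what `context.getattr_or(...)` relies on.
        results = model_parallel(tpu_training_loop, train_loader)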
Example no. 8
 def train_loop_fn(model, loader, device, context):
   loss_fn = nn.CrossEntropyLoss()
   optimizer = optim.SGD(
       model.parameters(),
       lr=FLAGS.lr,
       momentum=FLAGS.momentum,
       weight_decay=5e-4)
   tracker = xm.RateTracker()
   for x, (data, target) in loader:
     optimizer.zero_grad()
     output = model(data)
     loss = loss_fn(output, target)
     loss.backward()
     xm.optimizer_step(optimizer)
     tracker.add(FLAGS.batch_size)
     if x % FLAGS.log_steps == 0:
       print('[{}]({}) Loss={:.5f} Rate={:.2f}'.format(device, x, loss.item(),
                                                       tracker.rate()))
Example no. 9
 def train_loop_fn(model, loader, device, context):
   trainer = trainers[str(device)]
   stats = None
   tracker = xm.RateTracker()
   for i, samples in loader:
     if i and not (i % args.log_steps):
       print(
           log_step(
               'training',
               device,
               i,
               tracker=tracker,
               metrics_debug=args.metrics_debug))
     _log_output = trainer.train_step(samples)
     xm.optimizer_step(trainer.optimizer)
     tracker.add(len(samples) * args.max_sentences)  # n_batches * batch_size
   stats = fairseq_train.get_training_stats(trainer)
   return tracker, stats
Example no. 10
    def train_loop_fn(model, loader, device, context):
        loss_fn = nn.NLLLoss()
        optimizer = context.getattr_or(
            'optimizer', lambda: optim.SGD(
                model.parameters(), lr=lr, momentum=FLAGS.momentum))
        tracker = xm.RateTracker()

        model.train()
        for x, (data, target) in loader:
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()
            xm.optimizer_step(optimizer)
            tracker.add(FLAGS.batch_size)
            if x % FLAGS.log_steps == 0:
                test_utils.print_training_update(device, x, loss.item(),
                                                 tracker.rate(),
                                                 tracker.global_rate())
Example no. 11
    def loop_fn(model, loader, device, context):
      loss_fn = nn.NLLLoss()
      optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

      for x, (data, target) in loader:
        with xu.TimedScope(msg='Training loop: ', printfn=None):
          optimizer.zero_grad()
          output = xu.timed(lambda: model(data), msg='Model: ', printfn=None)
          loss = xu.timed(
              lambda: loss_fn(output, target), msg='Loss: ', printfn=None)
          xu.timed(loss.backward, msg='LossBkw: ', printfn=None)
          xu.timed(
              lambda: xm.optimizer_step(optimizer), msg='Step: ', printfn=None)
          self.assertLess(loss.cpu().item(), 3.0)
Example no. 12
    def _train_one_epoch(model, loader, device, context):
        """ Called by torch_xla_py.data_parallel. This function is executed on each core of the TPU once per epoch"""

        # model parameters
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        # one optimizer and scheduler per TPU core. Both objects are saved in `context` to be reused the next epoch
        optimizer = context.getattr_or(
            'optimizer',
            AdamW(optimizer_grouped_parameters,
                  lr=args.learning_rate,
                  eps=args.adam_epsilon,
                  betas=tuple(args.betas)))
        scheduler = context.getattr_or(
            'scheduler',
            WarmupLinearSchedule(optimizer,
                                 warmup_steps=warmup_updates,
                                 t_total=total_num_updates))

        # restart
        # TODO: scheduler reset to 0 each epoch
        scheduler.step(args.scheduler_last_epoch)
        logging.info(f'Restarting scheduler LR to: {scheduler.get_last_lr()}')

        tr_loss = None
        tracker = tpu_xm.RateTracker()

        model.train()
        for step, batch in loader:
            input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
            outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tracker.add(args.per_tpu_train_batch_size)

            tr_loss = loss * args.gradient_accumulation_steps if step == 0 else tr_loss + loss * args.gradient_accumulation_steps
            if (step + 1) % args.gradient_accumulation_steps == 0:
                tpu_xm.optimizer_step(optimizer)
                scheduler.step()
                optimizer.zero_grad()
                # logging.info(f'  Adjusted scheduler LR to {scheduler.get_last_lr()}')

        # since checkpointing happens each epoch, we only need to save the scheduler state at end of each epoch
        logging.info(f'Scheduler last_epoch {scheduler.last_epoch}')

        # `.item()` requires a trip from TPU to CPU, which is very slow; use it only once per epoch
        return tr_loss.item() / step
Example no. 13
    def tpu_training_loop(model, loader, device, context):
        """ Called by torch_xla_py.data_parallel. This function is executed on each core of the TPU once per epoch"""

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        # one optimizer and scheduler per TPU core. Both objects are saved in `context` to be reused the next epoch
        optimizer = context.getattr_or(
            'optimizer',
            AdamW(optimizer_grouped_parameters,
                  lr=args.learning_rate,
                  eps=args.adam_epsilon,
                  betas=tuple(args.betas)))

        # derive warmup info
        if args.warmup_proportion is not None:
            warmup_steps = int(args.warmup_proportion *
                               num_train_optimization_steps + 0.5)
        elif args.warmup_steps is not None:
            warmup_steps = args.warmup_steps
        else:
            raise Exception(
                'What is the warmup?? Specify either warmup proportion or steps'
            )
        scheduler = context.getattr_or(
            'scheduler',
            WarmupLinearSchedule(optimizer,
                                 warmup_steps=warmup_steps,
                                 t_total=num_train_optimization_steps))

        tr_loss = None
        pbar = None
        # All threads are in sync; use the progress bar only on one of them
        if str(pbar_device) == str(device):
            pbar = tqdm(total=int(pbar_steps),
                        desc=f"device {device}",
                        dynamic_ncols=True)

        tracker = tpu_xm.RateTracker()

        model.train()
        for step, batch in loader:
            input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
            outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tracker.add(args.train_batch_size)

            tr_loss = loss * args.gradient_accumulation_steps if step == 0 else tr_loss + loss * args.gradient_accumulation_steps
            if pbar is not None:
                pbar.update(1)
                # pbar.set_description(desc=f'LR: {scheduler.get_lr()}')
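            # update the weights and LR only every `gradient_accumulation_steps` micro-batches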
            if (step + 1) % args.gradient_accumulation_steps == 0:
                tpu_xm.optimizer_step(optimizer)
                prev_lr = scheduler.get_last_lr()[0]
                scheduler.step()
                curr_lr = scheduler.get_last_lr()[0]
                if args.track_learning_rate:
                    if pbar is not None:
                        pbar.set_description(
                            f"Prev LR: {prev_lr} Curr LR: {curr_lr}")
                optimizer.zero_grad()

        # `.item()` requires a trip from TPU to CPU, which is very slow; use it only once per epoch
        return tr_loss.item() / step
Example no. 14
def train(model, data_loader, device, context):

    step = 0
    train_batch_size = args.train_batch
    lr_ratio = args.cls_lr_ratio
    base_lr = args.base_lr

    optimizer = context.getattr_or(
        'optimizer',
        lambda: optim.SGD([{
            'params': base_params
        }, {
            'params': model.classifier.parameters(),
            'lr': lr_ratio * base_lr
        }, {
            'params': model.classifier_swap.parameters(),
            'lr': lr_ratio * base_lr
        }, {
            'params': model.Convmask.parameters(),
            'lr': lr_ratio * base_lr
        }],
                          lr=base_lr,
                          momentum=0.9))

    tracker = xm.RateTracker()

    model.train(True)

    for batch_cnt, data in enumerate(data_loader):
        step += 1
        loss = 0
        model.train(True)

        inputs, labels, labels_swap, swap_law, img_names = data[1]
        inputs = Variable(inputs)
        labels_1 = Variable(torch.from_numpy(np.array(labels)))
        labels_swap_1 = Variable(torch.from_numpy(np.array(labels_swap)))
        swap_law_1 = Variable(torch.from_numpy(np.array(swap_law)).float())
        # relocate tensor from cpu to tpu
        labels = labels_1.to(device)
        labels_swap = labels_swap_1.to(device)
        swap_law = swap_law_1.to(device)

        optimizer.zero_grad()

        if inputs.size(0) < 2 * train_batch_size:
            outputs = model(inputs, inputs[0:-1:2])
        else:
            outputs = model(inputs, None)

        # calculate loss: alpha*ce_loss + beta*swap_loss + gamma*law_loss
        # ce_loss: classification loss
        # swap_loss: adversarial loss
        # law_loss: loc loss

        alpha_ = 1
        beta_ = 1
        gamma_ = 0.01 if Config.dataset == 'STCAR' or Config.dataset == 'AIR' else 1

        add_loss = nn.L1Loss()
        get_ce_loss = nn.CrossEntropyLoss()

        ce_loss = get_ce_loss(outputs[0], labels) * alpha_
        loss += ce_loss
        swap_loss = get_ce_loss(outputs[1], labels_swap) * beta_
        loss += swap_loss
        law_loss = add_loss(outputs[2], swap_law) * gamma_
        loss += law_loss

        loss.backward()
        xm.optimizer_step(optimizer)
        tracker.add(train_batch_size)

        print(
            '[{}] step: {:-8d} / {:d} loss=ce_loss+swap_loss+law_loss: {:6.4f} = {:6.4f} + {:6.4f} + {:6.4f} '
            .format(device, step, train_epoch_step,
                    loss.detach().item(),
                    ce_loss.detach().item(),
                    swap_loss.detach().item(),
                    law_loss.detach().item()),
            flush=True)