Example #1
 def build_optimizer(self, epochs, trn, gradient_accumulation, **kwargs):
     config = self.config
     model = self.model
     if isinstance(model, nn.DataParallel):
         model = model.module
     transformer = self._get_transformer_builder()
     if transformer and transformer.trainable:
         transformer = self._get_transformer()
         optimizer = Adam(set(model.parameters()) - set(transformer.parameters()),
                          config.lr,
                          (config.mu, config.nu),
                          config.epsilon)
         if self.config.transformer_lr:
             num_training_steps = len(trn) * epochs // gradient_accumulation
             if not self.config.separate_optimizer:
                 optimizer, scheduler = build_optimizer_scheduler_with_transformer(model,
                                                                                   transformer,
                                                                                   config.lr,
                                                                                   config.transformer_lr,
                                                                                   num_training_steps,
                                                                                   config.warmup_steps,
                                                                                   config.weight_decay,
                                                                                   config.epsilon)
                 transformer_optimizer, transformer_scheduler = None, None
             else:
                 transformer_optimizer, transformer_scheduler = \
                     build_optimizer_scheduler_with_transformer(transformer,
                                                                transformer,
                                                                config.lr,
                                                                config.transformer_lr,
                                                                num_training_steps,
                                                                config.warmup_steps,
                                                                config.weight_decay,
                                                                config.epsilon)
         else:
             transformer.requires_grad_(False)
             transformer_optimizer, transformer_scheduler = None, None
     else:
         optimizer = Adam(model.parameters(),
                          config.lr,
                          (config.mu, config.nu),
                          config.epsilon)
         transformer_optimizer, transformer_scheduler = None, None
     if self.config.separate_optimizer:
         scheduler = ExponentialLR(optimizer, config.decay ** (1 / config.decay_steps))
     # noinspection PyUnboundLocalVariable
     return optimizer, scheduler, transformer_optimizer, transformer_scheduler
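
A minimal sketch, not taken from the source, of how a caller might step the four returned objects when `separate_optimizer` is enabled; the model, loss, and hyperparameters below are stand-ins:

 import torch
 from torch import nn
 from torch.optim import Adam
 from torch.optim.lr_scheduler import ExponentialLR

 model = nn.Linear(10, 1)                       # stand-in for self.model
 optimizer = Adam(model.parameters(), lr=1e-3)
 scheduler = ExponentialLR(optimizer, gamma=0.999)
 transformer_optimizer = None                   # non-None only with a trainable transformer
 transformer_scheduler = None

 for step in range(100):
     loss = model(torch.randn(4, 10)).pow(2).mean()  # dummy loss
     loss.backward()
     optimizer.step()
     scheduler.step()
     if transformer_optimizer is not None:           # separate update for the transformer
         transformer_optimizer.step()
         transformer_scheduler.step()
     optimizer.zero_grad()
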
Example #2
 def build_optimizer(self,
                     trn,
                     epochs,
                     lr,
                     adam_epsilon,
                     weight_decay,
                     warmup_steps,
                     transformer_lr,
                     **kwargs):
     # noinspection PyProtectedMember
     if self.use_transformer:
         num_training_steps = len(trn) * epochs // self.config.get('gradient_accumulation', 1)
         optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model,
                                                                           self._get_transformer(),
                                                                           lr, transformer_lr,
                                                                           num_training_steps, warmup_steps,
                                                                           weight_decay, adam_epsilon)
     else:
         optimizer = torch.optim.Adam(self.model.parameters(), self.config.lr)
         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
             optimizer=optimizer,
             mode='max',
             factor=0.5,
             patience=2,
             verbose=True,
         )
     return optimizer, scheduler
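
Note that ReduceLROnPlateau is stepped once per epoch with the monitored metric, whereas the warmup scheduler returned by build_optimizer_scheduler_with_transformer is normally stepped after every optimizer update. A minimal sketch of the plateau case, with a hypothetical dev_score standing in for the real validation metric:

 import torch
 from torch import nn

 model = nn.Linear(8, 1)
 optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
 scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                        factor=0.5, patience=2)

 for epoch in range(5):
     for _ in range(10):                                 # mini-batches
         loss = model(torch.randn(4, 8)).pow(2).mean()   # dummy loss
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()
     dev_score = 0.5                                     # hypothetical validation metric
     scheduler.step(dev_score)                           # plateau scheduler consumes the metric
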
Example #3
 def build_optimizer(self, epochs, trn, gradient_accumulation, **kwargs):
     config = self.config
     model = self.model
     if isinstance(model, nn.DataParallel):
         model = model.module
     if self.config.transformer:
         transformer = model.encoder.transformer
         optimizer = Adam(
             set(model.parameters()) - set(transformer.parameters()),
             config.lr, (config.mu, config.nu), config.epsilon)
         if self.config.transformer_lr:
             num_training_steps = len(trn) * epochs // gradient_accumulation
             if self.config.separate_optimizer:
                 transformer_optimizer, transformer_scheduler = \
                     build_optimizer_scheduler_with_transformer(transformer,
                                                                transformer,
                                                                config.transformer_lr,
                                                                config.transformer_lr,
                                                                num_training_steps,
                                                                config.warmup_steps,
                                                                config.weight_decay,
                                                                adam_epsilon=1e-8)
             else:
                 optimizer, scheduler = build_optimizer_scheduler_with_transformer(
                     model,
                     transformer,
                     config.lr,
                     config.transformer_lr,
                     num_training_steps,
                     config.warmup_steps,
                     config.weight_decay,
                     adam_epsilon=1e-8)
                 transformer_optimizer, transformer_scheduler = None, None
         else:
             transformer.requires_grad_(False)
             transformer_optimizer, transformer_scheduler = None, None
     else:
         optimizer = Adam(model.parameters(), config.lr,
                          (config.mu, config.nu), config.epsilon)
         transformer_optimizer, transformer_scheduler = None, None
     if self.config.separate_optimizer:
         scheduler = ExponentialLR(optimizer,
                                   config.decay**(1 / config.decay_steps))
     # noinspection PyUnboundLocalVariable
     return optimizer, scheduler, transformer_optimizer, transformer_scheduler
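
Examples #1 and #3 both build the base Adam over set(model.parameters()) - set(transformer.parameters()). A minimal sketch with toy modules (hypothetical names) showing that the set difference keeps exactly the non-transformer tensors, since nn.Parameter hashes by identity:

 from torch import nn

 class Encoder(nn.Module):
     def __init__(self):
         super().__init__()
         self.transformer = nn.Linear(16, 16)   # stand-in for a pretrained encoder
         self.head = nn.Linear(16, 4)           # task-specific layer

 model = Encoder()
 rest = set(model.parameters()) - set(model.transformer.parameters())
 # Only head.weight and head.bias survive the difference.
 assert len(rest) == 2
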
Example #4
 def build_optimizer(self, trn, epochs, lr, adam_epsilon, weight_decay,
                     warmup_steps, transformer_lr, gradient_accumulation,
                     **kwargs):
     model = self.model
     num_training_steps = len(trn) * epochs // gradient_accumulation
     optimizer, scheduler = build_optimizer_scheduler_with_transformer(
         model, model.bert_encoder, lr, transformer_lr, num_training_steps,
         warmup_steps, weight_decay, adam_epsilon)
     return optimizer, scheduler
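
All of these examples size the warmup schedule from len(trn) * epochs // gradient_accumulation, i.e. the number of optimizer updates rather than forward passes. A quick sketch of the arithmetic with made-up numbers:

 batches_per_epoch = 1000      # hypothetical len(trn)
 epochs = 30
 gradient_accumulation = 4
 num_training_steps = batches_per_epoch * epochs // gradient_accumulation
 assert num_training_steps == 7500
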
Example #5
 def build_optimizer(self,
                     trn,
                     epochs,
                     lr,
                     adam_epsilon,
                     weight_decay,
                     warmup_steps,
                     transformer_lr=None,
                     teacher=None,
                     **kwargs):
     num_training_steps = len(trn) * epochs // self.config.get(
         'gradient_accumulation', 1)
     if transformer_lr is None:
         transformer_lr = lr
     transformer = self.model.encoder.transformer
     optimizer, scheduler = build_optimizer_scheduler_with_transformer(
         self.model, transformer, lr, transformer_lr, num_training_steps,
         warmup_steps, weight_decay, adam_epsilon)
     if teacher:
         lambda_scheduler = LinearTeacherAnnealingScheduler(
             num_training_steps)
         scheduler = (scheduler, lambda_scheduler)
     return optimizer, scheduler
Example #6
 def build_optimizer(self, trn, epochs, lr, adam_epsilon, weight_decay,
                     warmup_steps, transformer_lr, gradient_accumulation,
                     **kwargs):
     model = self.model
     if self.config.squeeze and False:  # 'and False' short-circuits this branch, so the hand-rolled schedule below is always used
         num_training_steps = len(trn) * epochs // gradient_accumulation
         optimizer, scheduler = build_optimizer_scheduler_with_transformer(
             model, model.bert_encoder, lr, transformer_lr,
             num_training_steps, warmup_steps, weight_decay, adam_epsilon)
     else:
         weight_decay_params = []
         no_weight_decay_params = []
         no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
         for name, param in model.named_parameters():
             if name.endswith('bias') or 'layer_norm' in name or any(
                     nd in name for nd in no_decay):
                 no_weight_decay_params.append(param)
             else:
                 weight_decay_params.append(param)
         grouped_params = [{
             'params': weight_decay_params,
             'weight_decay': weight_decay
         }, {
             'params': no_weight_decay_params,
             'weight_decay': 0.
         }]
         optimizer = AdamWeightDecayOptimizer(grouped_params,
                                              lr,
                                              betas=(0.9, 0.999),
                                              eps=adam_epsilon)
         lr_scale = self.config.lr_scale
         embed_dim = self.config.embed_dim
         scheduler = torch.optim.lr_scheduler.LambdaLR(
             optimizer, lambda steps: lr_scale * embed_dim**-0.5 * min(
                 (steps + 1)**-0.5, (steps + 1) * (warmup_steps**-1.5)))
     return optimizer, scheduler
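
The LambdaLR branch implements an inverse-square-root schedule with linear warmup, scaled by lr_scale * embed_dim ** -0.5 (the Transformer "Noam" schedule). A standalone sketch of the multiplier with made-up values:

 lr_scale, embed_dim, warmup_steps = 1.0, 512, 4000   # hypothetical config values

 def lr_lambda(steps):
     return lr_scale * embed_dim ** -0.5 * min((steps + 1) ** -0.5,
                                               (steps + 1) * warmup_steps ** -1.5)

 # Linear ramp up to warmup_steps, then 1/sqrt(step) decay.
 assert lr_lambda(0) < lr_lambda(warmup_steps - 1)
 assert lr_lambda(warmup_steps - 1) > lr_lambda(100 * warmup_steps)
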