def configure_optimizers(self: Model):
    # One epoch covers the training split, rounded up to whole batches (ceiling division).
    num_epoch_steps = (len(dataset[datasets.Split.TRAIN]) +
                       self.hparams.batch_size - 1) // self.hparams.batch_size
    num_train_steps = num_epoch_steps * self.hparams.max_epochs
    optimizer, scheduler = optimization.create_optimizer(
        self,
        lr=self.hparams.lr,
        num_train_steps=num_train_steps,
        weight_decay=self.hparams.weight_decay,
        warmup_steps=self.hparams.warmup_steps,
        warmup_proportion=self.hparams.warmup_proportion,
        layerwise_lr_decay_power=self.hparams.layerwise_lr_decay_power,
        n_transformer_layers=self.transformer.config.num_hidden_layers,
        lr_scheduler=optimization.get_polynomial_decay_schedule_with_warmup,
        lr_scheduler_kwargs={
            'lr_end': self.hparams.lr_end,
            'power': self.hparams.lr_decay_power
        })
    return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]
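The per-epoch step count above is a ceiling division of the training-split size by the batch size, and the total step count multiplies that by the epoch count. A minimal sketch of the arithmetic, using hypothetical values not taken from the original:

    # Hypothetical values, only to illustrate the step-count arithmetic above.
    train_size = 10000      # stands in for len(dataset[datasets.Split.TRAIN])
    batch_size = 32         # stands in for self.hparams.batch_size
    max_epochs = 10         # stands in for self.hparams.max_epochs

    num_epoch_steps = (train_size + batch_size - 1) // batch_size  # ceil(10000 / 32) = 313
    num_train_steps = num_epoch_steps * max_epochs                 # 313 * 10 = 3130
    print(num_epoch_steps, num_train_steps)                        # 313 3130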
def configure_optimizers(self: Model):
    # One distillation epoch spans all distillation datasets, so the per-epoch
    # step count is the sum of their lengths.
    num_epoch_steps = sum(
        len(dataset) for dataset in distill_datasets.values())
    num_train_steps = num_epoch_steps * self.hparams.max_epochs
    # Store the total step count on the module.
    setattr(self, 'num_train_steps', num_train_steps)
    optimizer, scheduler = optimization.create_optimizer(
        self,
        lr=self.hparams.lr,
        num_train_steps=num_train_steps,
        weight_decay=self.hparams.weight_decay,
        warmup_steps=self.hparams.warmup_steps,
        warmup_proportion=self.hparams.warmup_proportion,
        layerwise_lr_decay_power=self.hparams.layerwise_lr_decay_power,
        n_transformer_layers=self.transformer.config.num_hidden_layers,
        get_layer_lrs=optimization.get_layer_lrs_with_crf,
        get_layer_lrs_kwargs={'crf_preffix': 'rel_crf'},
        lr_scheduler=optimization.get_polynomial_decay_schedule_with_warmup,
        lr_scheduler_kwargs={
            'lr_end': self.hparams.lr_end,
            'power': self.hparams.lr_decay_power
        })
    return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]
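Both variants return the PyTorch Lightning pair of optimizer list and scheduler-config list; the 'interval': 'step' entry tells Lightning to advance the scheduler after every optimizer step rather than once per epoch. The following self-contained sketch shows the kind of schedule the lr_scheduler_kwargs configure, under the assumption that optimization.get_polynomial_decay_schedule_with_warmup behaves like the HuggingFace transformers helper of the same name; the model, step counts, and hyperparameter values are hypothetical.

    # Sketch only: a warmup + polynomial-decay schedule stepped once per training step.
    import torch
    from transformers import get_polynomial_decay_schedule_with_warmup

    model = torch.nn.Linear(8, 2)  # stand-in for the real transformer model
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

    scheduler = get_polynomial_decay_schedule_with_warmup(
        optimizer,
        num_warmup_steps=100,      # cf. warmup_steps / warmup_proportion
        num_training_steps=3130,   # cf. num_train_steps computed above
        lr_end=1e-6,               # cf. self.hparams.lr_end
        power=1.0,                 # cf. self.hparams.lr_decay_power
    )

    # With 'interval': 'step', Lightning advances the scheduler once per optimizer
    # step; a plain training loop would do the equivalent manually:
    for step in range(3130):
        optimizer.step()
        scheduler.step()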