def test_should_stop_early_with_decreasing_metric(self):
    new_trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        patience=5,
        validation_metric="-acc",
    )
    tracker = new_trainer._metric_tracker

    new_tracker = copy.deepcopy(tracker)
    for acc in [0.02, 0.3, 0.2, 0.1, 0.4, 0.4]:
        new_tracker.add_metrics({"acc": acc})
    assert new_tracker.should_stop_early()

    new_tracker = copy.deepcopy(tracker)
    for acc in [0.3, 0.3, 0.2, 0.1, 0.4, 0.5]:
        new_tracker.add_metrics({"acc": acc})
    assert not new_tracker.should_stop_early()

    new_tracker = copy.deepcopy(tracker)
    for acc in [0.1, 0.3, 0.2, 0.1, 0.4, 0.5]:
        new_tracker.add_metrics({"acc": acc})
    assert new_tracker.should_stop_early()
def test_trainer_can_log_histograms(self):
    # enable activation logging
    for module in self.model.modules():
        module.should_log_activations = True

    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        tensorboard_writer=TensorboardWriter(
            serialization_dir=self.TEST_DIR, histogram_interval=2
        ),
    )
    trainer.train()
def ensure_model_can_train(
    self,
    trainer: GradientDescentTrainer,
    gradients_to_ignore: Set[str] = None,
    metric_to_check: str = None,
    metric_terminal_value: float = None,
    metric_tolerance: float = 1e-4,
    disable_dropout: bool = True,
):
    """
    A simple test of model training behavior when you are not using configuration files.
    In this case, we don't have a story around saving and loading models (you need to
    handle that yourself), so we don't test it. We just test that the model can train
    and that it computes gradients for all parameters.

    Because the `Trainer` already has a reference to a model and to a data loader, we
    just take the `Trainer` object itself and grab the `Model` and other necessary
    objects from there.

    # Parameters

    trainer: `GradientDescentTrainer`
        The `Trainer` to use for the test, which already has references to a `Model`
        and a `DataLoader`, which we will use in the test.
    gradients_to_ignore : `Set[str]`, optional (default=`None`)
        This test runs a gradient check to make sure that we're actually computing
        gradients for all of the parameters in the model. If you really want to ignore
        certain parameters when doing that check, you can pass their names here. This
        is not recommended unless you're _really_ sure you don't need non-zero
        gradients for those parameters (e.g., some of the beam search / state machine
        models have infrequently-used parameters that are hard to force the model to
        use in a small test).
    metric_to_check: `str`, optional (default = `None`)
        We may want to automatically check that the model reaches a given metric during
        training (on the validation set, if one is specified). This may be useful in
        CI, for example. You can pass any metric that your model returns.
    metric_terminal_value: `float`, optional (default = `None`)
        When you set `metric_to_check`, you need to set the value this metric must
        converge to.
    metric_tolerance: `float`, optional (default=`1e-4`)
        Tolerance within which your model's metric must match the terminal value. Some
        variance in model metrics is expected when the training process is highly
        stochastic.
    disable_dropout : `bool`, optional (default = `True`)
        If True, we set all dropout to 0 before checking gradients. (Otherwise, with
        small datasets, you may get zero gradients because of unlucky dropout.)
    """
    metrics = trainer.train()
    if metric_to_check is not None:
        metric_value = metrics.get(f"best_validation_{metric_to_check}") or metrics.get(
            f"training_{metric_to_check}"
        )
        assert metric_value is not None, f"Cannot find {metric_to_check} in metrics.json file"
        assert metric_terminal_value is not None, "Please specify metric terminal value"
        assert abs(metric_value - metric_terminal_value) < metric_tolerance

    model_batch = next(iter(trainer.data_loader))
    # Check that gradients are None for non-trainable parameters and that trainable
    # parameters receive some gradient.
    self.check_model_computes_gradients_correctly(
        trainer.model, model_batch, gradients_to_ignore, disable_dropout
    )
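# A minimal usage sketch of `ensure_model_can_train`, assuming a ModelTestCase-style
# subclass whose setup created self.model, self.optimizer, self.data_loader, and
# self.TEST_DIR. The metric name "accuracy" and its terminal value are hypothetical
# placeholders, not taken from the original source.
def test_model_can_train_from_objects(self):
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
    )
    self.ensure_model_can_train(
        trainer,
        metric_to_check="accuracy",  # hypothetical metric returned by the model
        metric_terminal_value=1.0,   # hypothetical convergence target
        metric_tolerance=0.1,
    )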
def test_data_loader_lazy_epoch_size_correct_custom_epoch_size(self):
    self.data_loader_lazy.batches_per_epoch = 3
    num_epochs = 3
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader_lazy,
        validation_data_loader=self.validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=self.TEST_DIR,
    )
    assert trainer._batch_num_total == 0
    metrics = trainer.train()
    epoch = metrics["epoch"]
    assert epoch == num_epochs - 1
    assert trainer._batch_num_total == num_epochs * 3
def test_metric_only_considered_best_so_far_when_strictly_better_than_those_before_it_decreasing_metric(
    self,
):
    new_trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        patience=5,
        validation_metric="-test",
    )
    tracker = new_trainer._metric_tracker

    # when it is the only metric it should be considered the best
    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metric(1)
    assert new_tracker.is_best_so_far()

    # when it is the same as one before it, it is not considered the best
    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.3])
    assert not new_tracker.is_best_so_far()

    # when it is the best it is considered the best
    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.0013])
    assert new_tracker.is_best_so_far()

    # when it is not the best it is not considered the best
    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 13])
    assert not new_tracker.is_best_so_far()
def test_can_optimise_model_with_dense_and_sparse_params(self):
    optimizer_params = Params({"type": "dense_sparse_adam"})
    parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(model_parameters=parameters, params=optimizer_params)
    for instance in self.instances:
        instance.index_fields(self.vocab)
    GradientDescentTrainer(self.model, optimizer, SimpleDataLoader(self.instances, 2)).train()
def test_passing_trainer_multiple_gpus_raises_error(self):
    self.model.cuda()
    with pytest.raises(ConfigurationError):
        GradientDescentTrainer(
            self.model,
            self.optimizer,
            self.data_loader,
            num_epochs=2,
            cuda_device=[0, 1],
        )
def test_regularization(self):
    penalty = self.model.get_regularization_penalty()
    assert penalty is None

    data_loader = PyTorchDataLoader(self.instances, batch_size=32)
    trainer = GradientDescentTrainer(self.model, optimizer=None, data_loader=data_loader)

    # You get a RuntimeError if you call `model.forward` twice on the same inputs.
    # The data and config are such that the whole dataset is one batch.
    training_batch = next(iter(data_loader))
    validation_batch = next(iter(data_loader))

    training_loss = trainer.batch_outputs(training_batch, for_training=True)["loss"].item()
    validation_loss = trainer.batch_outputs(validation_batch, for_training=False)["loss"].item()

    # With no regularization penalty, training loss should equal validation loss.
    numpy.testing.assert_almost_equal(training_loss, validation_loss)
def test_trainer_can_log_learning_rates_tensorboard(self):
    data_loader = SimpleDataLoader(self.instances, 4)
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        callbacks=[
            TensorBoardCallback(
                serialization_dir=self.TEST_DIR,
                summary_interval=2,
                should_log_learning_rate=True,
            )
        ],
    )
    trainer.train()
def test_trainer_can_log_learning_rates_tensorboard(self):
    data_loader = DataLoader(self.instances, batch_size=4, collate_fn=allennlp_collate)
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        tensorboard_writer=TensorboardWriter(
            serialization_dir=self.TEST_DIR,
            should_log_learning_rate=True,
            summary_interval=2,
        ),
    )
    trainer.train()
def test_trainer_respects_num_serialized_models_to_keep(self):
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        num_epochs=5,
        serialization_dir=self.TEST_DIR,
        checkpointer=Checkpointer(
            serialization_dir=self.TEST_DIR, num_serialized_models_to_keep=3
        ),
    )
    trainer.train()

    # Now check the serialized files
    for prefix in ["model_state_epoch_*", "training_state_epoch_*"]:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
        assert sorted(epochs) == [2, 3, 4]
def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
    # To test:
    # Create a fake data loader that sleeps for 2.5 seconds per epoch, so the total
    # training time for one epoch is slightly greater than 2.5 seconds.
    # Run for 6 epochs, keeping the last 2 models and also keeping a model every 5 seconds.
    # Check the resulting checkpoints. We should then have models at epochs
    # 2, 4, plus the last two at 5 and 6.
    class SlowDataLoader:
        data_loader = SimpleDataLoader(self.instances, batch_size=2)

        def __iter__(self):
            time.sleep(2.5)
            return iter(self.data_loader)

        def __len__(self):
            return len(self.data_loader)

        def set_target_device(self, _):
            pass

    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        SlowDataLoader(),
        num_epochs=6,
        serialization_dir=self.TEST_DIR,
        checkpointer=Checkpointer(
            serialization_dir=self.TEST_DIR,
            num_serialized_models_to_keep=2,
            keep_serialized_model_every_num_seconds=5,
        ),
    )
    trainer.train()

    # Now check the serialized files
    for prefix in ["model_state_epoch_*", "training_state_epoch_*"]:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
        # epoch N has N-1 in the file name
        assert sorted(epochs) == [1, 3, 4, 5]
def test_epoch_callback_is_called_at_every_epoch(self):
    class FakeEpochCallback(EpochCallback):
        def __call__(
            self, trainer: "GradientDescentTrainer", metrics: Dict[str, Any], epoch: int
        ) -> None:
            if not hasattr(trainer, "epoch_callback_calls"):
                trainer.epoch_callback_calls = []  # type: ignore
            trainer.epoch_callback_calls.append(epoch)  # type: ignore

    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        num_epochs=4,
        validation_data_loader=self.validation_data_loader,
        epoch_callbacks=[FakeEpochCallback()],
    )
    trainer.train()
    # The callback also fires once before training starts, with epoch == -1.
    expected_calls = [epoch for epoch in range(-1, 4)]
    assert trainer.epoch_callback_calls == expected_calls
def test_trainer_can_log_histograms(self):
    # enable activation logging
    for module in self.model.modules():
        module.should_log_activations = True

    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        callbacks=[
            TensorBoardCallback(
                serialization_dir=self.TEST_DIR,
                distribution_interval=2,
            )
        ],
    )
    trainer.train()
def test_trainer_saves_models_at_specified_interval(self):
    data_loader = DataLoader(self.instances, batch_size=4, collate_fn=allennlp_collate)
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        checkpointer=Checkpointer(
            serialization_dir=self.TEST_DIR,
            model_save_interval=0.0001,
            num_serialized_models_to_keep=10,
        ),
    )
    trainer.train()

    # Now check the serialized files for models saved during the epoch.
    prefix = "model_state_epoch_*"
    file_names = sorted(glob.glob(os.path.join(self.TEST_DIR, prefix)))
    epochs = [re.search(r"_([0-9\.\-]+)\.th", fname).group(1) for fname in file_names]
    # We should have checkpoints at the end of each epoch and during each, e.g.
    # [0.timestamp, 0, 1.timestamp, 1]
    assert len(epochs) == 4
    assert epochs[3] == "1"
    assert "." in epochs[0]

    # Now make certain we can restore from a timestamped checkpoint. To do so,
    # remove the end-of-epoch checkpoints for both epochs (file names 0 and 1),
    # so that we are forced to restore from the timestamped checkpoints.
    for k in range(2):
        os.remove(os.path.join(self.TEST_DIR, "model_state_epoch_{}.th".format(k)))
        os.remove(os.path.join(self.TEST_DIR, "training_state_epoch_{}.th".format(k)))
    os.remove(os.path.join(self.TEST_DIR, "best.th"))

    restore_trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        checkpointer=Checkpointer(
            serialization_dir=self.TEST_DIR, model_save_interval=0.0001
        ),
    )
    epoch = restore_trainer._restore_checkpoint()
    assert epoch == 2
    # One batch per epoch.
    assert restore_trainer._batch_num_total == 2
def test_trainer_saves_metrics_every_epoch(self):
    trainer = GradientDescentTrainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=5,
        serialization_dir=self.TEST_DIR,
        checkpointer=Checkpointer(
            serialization_dir=self.TEST_DIR, num_serialized_models_to_keep=3
        ),
    )
    trainer.train()

    for epoch in range(5):
        epoch_file = self.TEST_DIR / f"metrics_epoch_{epoch}.json"
        assert epoch_file.exists()
        with open(epoch_file) as f:
            metrics = json.load(f)
        assert "validation_loss" in metrics
        assert "best_validation_loss" in metrics
        assert metrics.get("epoch") == epoch
def test_trainer_respects_epoch_size_smaller_than_total(self):
    batches_per_epoch = 1
    num_epochs = 2
    data_loader_smaller_epoch = SimpleDataLoader(
        self.instances,
        2,
        batches_per_epoch=batches_per_epoch,
    )
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader_smaller_epoch,
        validation_data_loader=self.validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=self.TEST_DIR,
    )
    assert trainer._batch_num_total == 0
    metrics = trainer.train()
    epoch = metrics["epoch"]
    assert epoch == num_epochs - 1
    assert trainer._batch_num_total == num_epochs * batches_per_epoch
def test_trainer_can_run_gradient_accumulation(self):
    instances = list(self.instances)
    steps_to_accumulate = 2

    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=2,
        num_gradient_accumulation_steps=steps_to_accumulate,
    )
    assert trainer._num_gradient_accumulation_steps == steps_to_accumulate

    metrics = trainer.train()

    num_batches_trained_per_epoch = trainer._batch_num_total // (metrics["training_epochs"] + 1)
    num_batches_expected = math.ceil(
        math.ceil(len(instances) / self.data_loader.batch_size) / steps_to_accumulate
    )
    assert num_batches_trained_per_epoch == num_batches_expected
def test_trainer_saves_and_loads_best_validation_metrics_correctly_2(self):
    # Use +loss and run 1 epoch of original training, then 1 epoch of restored training.

    # Run 1 epoch of original training.
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        validation_metric="+loss",
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    _ = trainer._restore_checkpoint()
    best_epoch_1 = trainer._metric_tracker.best_epoch
    best_validation_metrics_epoch_1 = trainer._metric_tracker.best_epoch_metrics
    # best_validation_metrics_epoch_1: {'accuracy': 0.75, 'accuracy3': 1.0, 'loss': 0.6243013441562653}
    assert isinstance(best_validation_metrics_epoch_1, dict)
    assert "loss" in best_validation_metrics_epoch_1

    # Run 1 more epoch of restored training.
    restore_trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        validation_metric="+loss",
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
    )
    restore_trainer.train()
    _ = restore_trainer._restore_checkpoint()
    best_epoch_2 = restore_trainer._metric_tracker.best_epoch
    best_validation_metrics_epoch_2 = restore_trainer._metric_tracker.best_epoch_metrics

    # Because we are using +loss, the 2nd epoch won't be better than the 1st,
    # so the best validation metrics should be the same.
    assert best_epoch_1 == best_epoch_2 == 0
    assert best_validation_metrics_epoch_2 == best_validation_metrics_epoch_1
def test_data_loader_lazy_epoch_size_correct_custom_epoch_size(self):
    batches_per_epoch = 3
    num_epochs = 3
    data_loader_custom_epoch_lazy = PyTorchDataLoader(
        self.instances_lazy,
        batch_size=2,
        collate_fn=allennlp_collate,
        batches_per_epoch=batches_per_epoch,
    )
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader_custom_epoch_lazy,
        validation_data_loader=self.validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=self.TEST_DIR,
    )
    assert trainer._batch_num_total == 0
    metrics = trainer.train()
    epoch = metrics["epoch"]
    assert epoch == num_epochs - 1
    assert trainer._batch_num_total == num_epochs * batches_per_epoch
def test_total_loss_is_average_of_batch_loss(self):
    batches_per_epoch = 3

    data_loader_custom_epoch_lazy = PyTorchDataLoader(
        self.instances_lazy,
        batch_size=2,
        collate_fn=allennlp_collate,
        batches_per_epoch=batches_per_epoch,
    )

    class FakeBatchCallback(BatchCallback):
        def __call__(
            self,
            trainer: "GradientDescentTrainer",
            batch_inputs: List[List[TensorDict]],
            batch_outputs: List[Dict[str, Any]],
            batch_metrics: Dict[str, Any],
            epoch: int,
            batch_number: int,
            is_training: bool,
            is_master: bool,
        ) -> None:
            if not hasattr(trainer, "batch_losses"):
                trainer.batch_losses = []  # type: ignore
            trainer.batch_losses.append(batch_outputs[0]["loss"].item())  # type: ignore

    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader_custom_epoch_lazy,
        num_epochs=1,
        batch_callbacks=[FakeBatchCallback()],
    )
    metrics = trainer.train()
    assert metrics["training_loss"] == float(sum(trainer.batch_losses) / batches_per_epoch)
def test_trainer_respects_epoch_size_larger_than_total(self):
    batches_per_epoch = 7
    num_epochs = 3
    data_loader_larger_epoch = AllennlpDataLoader(
        self.instances,
        batch_size=2,
        collate_fn=allennlp_collate,
        batches_per_epoch=batches_per_epoch,
    )
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader_larger_epoch,
        validation_data_loader=self.validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=self.TEST_DIR,
    )
    assert trainer._batch_num_total == 0
    metrics = trainer.train()
    epoch = metrics["epoch"]
    assert epoch == num_epochs - 1
    assert trainer._batch_num_total == num_epochs * batches_per_epoch
def init_trainer(self) -> Trainer:
    parameters = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=self.config.lr)  # type: ignore
    trainer = GradientDescentTrainer(
        model=self.model,
        serialization_dir='./output',
        data_loader=self.train_data_loader,
        validation_data_loader=self.dev_data_loader,
        num_epochs=self.config.epoch,
        optimizer=optimizer,
        cuda_device=self.config.device,
    )
    return trainer
def test_trainer_can_run_and_resume_with_momentum_scheduler(self):
    scheduler = MomentumScheduler.from_params(
        optimizer=self.optimizer,
        params=Params({"type": "inverted_triangular", "cool_down": 2, "warm_up": 2}),
    )
    trainer = GradientDescentTrainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        momentum_scheduler=scheduler,
        validation_metric="-loss",
        validation_data_loader=self.validation_data_loader,
        num_epochs=4,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_scheduler = MomentumScheduler.from_params(
        optimizer=self.optimizer,
        params=Params({"type": "inverted_triangular", "cool_down": 2, "warm_up": 2}),
    )
    new_trainer = GradientDescentTrainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        momentum_scheduler=new_scheduler,
        validation_metric="-loss",
        validation_data_loader=self.validation_data_loader,
        num_epochs=6,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 4
    assert new_trainer._momentum_scheduler.last_epoch == 3
    new_trainer.train()
def objective_fn(
    trial: Trial,
    device: int,
    direction: str,
    target_metric: str,
    base_serialization_dir: str,
):
    embedding_dim = trial.suggest_int("embedding_dim", 128, 256)
    max_filter_size = trial.suggest_int("max_filter_size", 3, 6)
    num_filters = trial.suggest_int("num_filters", 128, 256)
    output_dim = trial.suggest_int("output_dim", 128, 512)
    dropout = trial.suggest_float("dropout", 0, 1.0, log=False)
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)

    train_dataset, valid_dataset, vocab = prepare_data()
    model = create_model(vocab, embedding_dim, max_filter_size, num_filters, output_dim, dropout)

    if device > -1:
        model.to(torch.device("cuda:{}".format(device)))

    optimizer = SGD(model.parameters(), lr=lr)
    data_loader = DataLoader(train_dataset, batch_size=10, collate_fn=allennlp_collate)
    validation_data_loader = DataLoader(valid_dataset, batch_size=64, collate_fn=allennlp_collate)
    serialization_dir = os.path.join(base_serialization_dir, "trial_{}".format(trial.number))
    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=data_loader,
        validation_data_loader=validation_data_loader,
        validation_metric=("+" if direction == "MAXIMIZE" else "-") + target_metric,
        patience=None,  # `patience=None` since it could conflict with AllenNLPPruningCallback
        num_epochs=50,
        cuda_device=device,
        serialization_dir=serialization_dir,
        epoch_callbacks=[AllenNLPPruningCallback(trial, f"validation_{target_metric}")],
    )
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
    return trainer.train()[f"best_validation_{target_metric}"]
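# A minimal sketch of how `objective_fn` might be driven by an Optuna study; the
# device, trial count, metric name, and output directory below are illustrative
# assumptions, not part of the original source.
import functools

import optuna

study = optuna.create_study(direction="maximize")
study.optimize(
    functools.partial(
        objective_fn,
        device=-1,                        # CPU; use a GPU index if available
        direction="MAXIMIZE",
        target_metric="accuracy",         # hypothetical metric name
        base_serialization_dir="result",  # hypothetical output directory
    ),
    n_trials=30,
)
print(study.best_trial)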
def test_sanity_check_default(self):
    model_with_bias = FakeModelForTestingNormalizationBiasVerification(use_bias=True)
    inst = Instance({"x": TensorField(torch.rand(3, 1, 4))})
    data_loader = SimpleDataLoader([inst, inst], 2)
    trainer = GradientDescentTrainer.from_partial_objects(
        model_with_bias,
        serialization_dir=self.TEST_DIR,
        data_loader=data_loader,
        num_epochs=1,
    )
    with pytest.raises(SanityCheckError):
        trainer.train()

    trainer = GradientDescentTrainer.from_partial_objects(
        model_with_bias,
        serialization_dir=self.TEST_DIR,
        data_loader=data_loader,
        num_epochs=1,
        run_sanity_checks=False,
    )
    # Check is not run, so no failure.
    trainer.train()
def test_should_stop_early_with_invalid_patience(self):
    for patience in [0, -1, -2, 1.5, "None"]:
        with pytest.raises(
            ConfigurationError,
            match='.* is an invalid value for "patience": '
            "it must be a positive integer or None "
            "\\(if you want to disable early stopping\\)",
        ):
            GradientDescentTrainer(
                self.model,
                self.optimizer,
                self.data_loader,
                validation_data_loader=self.validation_data_loader,
                num_epochs=100,
                patience=patience,
                validation_metric="+test",
            )
def build_trainer(
    model: Model,
    serialization_dir: str,
    train_loader: DataLoader,
    dev_loader: DataLoader,
) -> Trainer:
    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters)  # type: ignore
    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=5,
        optimizer=optimizer,
    )
    return trainer
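# A minimal usage sketch for `build_trainer`, assuming `model`, `train_loader`, and
# `dev_loader` were constructed elsewhere; the serialization directory is a
# hypothetical placeholder.
trainer = build_trainer(model, "results/run_01", train_loader, dev_loader)
metrics = trainer.train()
print(metrics["best_validation_loss"])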
def trainer_ctor(tagger, corpus_len, train_dataloader, val_dataloader):
    optimizer = AdamW(
        tagger.parameters(),
        lr=self._lr,
        betas=(0.9, 0.999),
        eps=1e-6,
        weight_decay=0.01,
        correct_bias=True,
    )

    # Previous implementation, kept for reference:
    # lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=self._patience)
    #
    # trainer = ModelTrainerBert(model=seq_tagger,
    #                            optimizer=optimizer,
    #                            lr_scheduler=lr_scheduler,
    #                            train_dataset=train_data,
    #                            val_dataset=val_data,
    #                            validation_metrics=[f1_entity_level],
    #                            batch_size=self._bs,
    #                            update_scheduler='ee',
    #                            keep_best_model=True,
    #                            restore_bm_on_lr_change=True,
    #                            max_grad_norm=1.,
    #                            smallest_lr=self._lr / 4)

    lr_scheduler = ReduceOnPlateauLearningRateScheduler(
        optimizer, mode='max', factor=0.5, patience=self._patience
    )
    trainer = GradientDescentTrainer(
        model=tagger,
        validation_metric='-loss',
        optimizer=optimizer,
        data_loader=train_dataloader,
        validation_data_loader=val_dataloader,
        num_epochs=self._n_epochs,
        # cuda_device=cuda_device,
        learning_rate_scheduler=lr_scheduler,
        patience=self._patience,
        # accumulate gradients to emulate the original batch size
        num_gradient_accumulation_steps=self._bs,
    )
    return trainer
def init_trainer(self) -> Trainer:
    parameters = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
    group_parameter_group = [
        (['_text_field_embedder.*'], {'lr': self.config.lr}),
        (['_classification_layer.*'], {'lr': self.config.classifier_lr}),
    ]
    optimizer = AdamOptimizer(
        parameters,
        parameter_groups=group_parameter_group,
        lr=self.config.lr,
    )  # type: ignore
    trainer = GradientDescentTrainer(
        model=self.model,
        serialization_dir='./output',
        data_loader=self.train_data_loader,
        validation_data_loader=self.dev_data_loader,
        num_epochs=self.config.epoch,
        optimizer=optimizer,
        cuda_device=self.config.device,
    )
    return trainer