def test_noam_learning_rate_schedule_does_not_crash(self):
     model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     lrs = LearningRateScheduler.from_params(Optimizer.from_params(model.named_parameters(),
                                                                   Params({"type": "adam"})),
                                             Params({"type": "noam", "model_size": 10, "warmup_steps": 2000}))
     lrs.step(None)
     lrs.step_batch(None)
Example #2
0
    def test_optimizer_parameter_groups(self):
        optimizer_params = Params({
                "type": "sgd",
                "lr": 1,
                "momentum": 5,
                "parameter_groups": [
                        # the repeated "bias_" checks a corner case
                        # NOT_A_VARIABLE_NAME displays a warning but does not raise an exception
                        [["weight_i", "bias_", "bias_", "NOT_A_VARIABLE_NAME"], {'lr': 2}],
                        [["tag_projection_layer"], {'lr': 3}],
                ]
        })
        parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, optimizer_params)
        param_groups = optimizer.param_groups

        assert len(param_groups) == 3
        assert param_groups[0]['lr'] == 2
        assert param_groups[1]['lr'] == 3
        # base case uses default lr
        assert param_groups[2]['lr'] == 1
        for k in range(3):
            assert param_groups[k]['momentum'] == 5

        # all LSTM parameters except recurrent connections (those with weight_h in name)
        assert len(param_groups[0]['params']) == 6
        # just the projection weight and bias
        assert len(param_groups[1]['params']) == 2
        # the embedding + recurrent connections left in the default group
        assert len(param_groups[2]['params']) == 3
Example #3
0
 def test_can_optimise_model_with_dense_and_sparse_params(self):
     optimizer_params = Params({
             "type": "dense_sparse_adam"
     })
     parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
     optimizer = Optimizer.from_params(parameters, optimizer_params)
     iterator = BasicIterator(2)
     iterator.index_with(self.vocab)
     Trainer(self.model, optimizer, iterator, self.instances).train()
    def test_reduce_on_plateau_error_throw_when_no_metrics_exist(self):
        model = torch.nn.Sequential(torch.nn.Linear(10, 10))
        with self.assertRaises(ConfigurationError) as context:
            LearningRateScheduler.from_params(Optimizer.from_params(model.named_parameters(),
                                                                    Params({"type": "adam"})),
                                              Params({"type": "reduce_on_plateau"})).step(None, None)

        self.assertTrue(
                'The reduce_on_plateau learning rate scheduler requires a validation metric'
                in str(context.exception))
Example #5
0
 def test_optimizer_basic(self):
     optimizer_params = Params({
             "type": "sgd",
             "lr": 1
     })
     parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
     optimizer = Optimizer.from_params(parameters, optimizer_params)
     param_groups = optimizer.param_groups
     assert len(param_groups) == 1
     assert param_groups[0]['lr'] == 1
Example #6
0
    def from_params(cls,
                    model: Model,
                    serialization_dir: str,
                    iterator: DataIterator,
                    train_data: Iterable[Instance],
                    validation_data: Optional[Iterable[Instance]],
                    params: Params,
                    validation_iterator: DataIterator = None) -> 'Trainer':

        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = params.pop_int("cuda_device", -1)
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)

        if cuda_device >= 0:
            model = model.cuda(cuda_device)
        parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

        if lr_scheduler_params:
            scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
        else:
            scheduler = None

        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)

        params.assert_empty(cls.__name__)
        return Trainer(model, optimizer, iterator,
                       train_data, validation_data,
                       patience=patience,
                       validation_metric=validation_metric,
                       validation_iterator=validation_iterator,
                       num_epochs=num_epochs,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device,
                       grad_norm=grad_norm,
                       grad_clipping=grad_clipping,
                       learning_rate_scheduler=scheduler,
                       num_serialized_models_to_keep=num_serialized_models_to_keep,
                       keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
                       model_save_interval=model_save_interval,
                       summary_interval=summary_interval,
                       histogram_interval=histogram_interval)
Example #7
0
    def test_parameter_type_inference(self):
        # Should work ok even with lr as a string
        optimizer_params = Params({
                "type": "sgd",
                "lr": "0.1"
        })

        parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, optimizer_params)

        assert optimizer.defaults["lr"] == 0.1

        # But should crash (in the Pytorch code) if we don't do the type inference
        optimizer_params = Params({
                "type": "sgd",
                "lr": "0.1",
                "infer_type_and_cast": False
        })

        parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]

        with pytest.raises(TypeError):
            optimizer = Optimizer.from_params(parameters, optimizer_params)
Example #8
0
    def from_params(cls,
                    params: Params,
                    serialization_dir: str,
                    recover: bool = False,
                    cache_directory: str = None,
                    cache_prefix: str = None) -> 'PtTrainer':
        max_src_len = params.dataset_reader.get('max_src_len', None)
        all_datasets = training_util.datasets_from_params(
            params, cache_directory, cache_prefix)
        datasets_for_vocab_creation = set(
            params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(
                    f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info(
            "From dataset instances, %s will be considered for vocabulary creation.",
            ", ".join(datasets_for_vocab_creation))

        if recover and os.path.exists(
                os.path.join(serialization_dir, "vocabulary")):
            vocab = Vocabulary.from_files(
                os.path.join(serialization_dir, "vocabulary"))
            params.pop("vocabulary", {})
        else:
            vocab = Vocabulary.from_params(params.pop(
                "vocabulary", {}), (instance
                                    for key, dataset in all_datasets.items()
                                    if key in datasets_for_vocab_creation
                                    for instance in dataset))

        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        # If vocab extension is ON for training, embedding extension should also be
        # done. If vocab and embeddings are already in sync, it would be a no-op.
        model.extend_embedder_vocab()

        # Initializing the model can have side effect of expanding the vocabulary
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(model.vocab)
        validation_iterator_params = params.pop("validation_iterator", None)
        if validation_iterator_params:
            validation_iterator = DataIterator.from_params(
                validation_iterator_params)
            validation_iterator.index_with(model.vocab)
        else:
            validation_iterator = None

        train_data = all_datasets['train']
        validation_data = all_datasets.get('validation')
        test_data = all_datasets.get('test')

        trainer_params = params.pop("trainer")
        no_grad_regexes = trainer_params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = \
            get_frozen_and_tunable_parameter_names(model)
        logger.info("Following parameters are Frozen  (without gradient):")
        for name in frozen_parameter_names:
            logger.info(name)
        logger.info("Following parameters are Tunable (with gradient):")
        for name in tunable_parameter_names:
            logger.info(name)

        params = trainer_params

        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters)
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(
                optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if 'checkpointer' in params:
            if 'keep_serialized_model_every_num_seconds' in params or \
                    'num_serialized_models_to_keep' in params:
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods.")
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int(
                "num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None)
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=
                keep_serialized_model_every_num_seconds)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool(
            "should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                                   False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            max_src_len=max_src_len,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
            batch_size=iterator._batch_size)
 def test_reduce_on_plateau_works_when_metrics_exist(self):
     model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     LearningRateScheduler.from_params(Optimizer.from_params(model.named_parameters(),
                                                             Params({"type": "adam"})),
                                       Params({"type": "reduce_on_plateau"})).step(10, None)
Example #10
0
    def __init__(self,
                 model: Model,
                 task_list: List[Task],
                 optimizer_params: Params,
                 lr_scheduler_params: Params,
                 patience: Optional[int] = None,
                 num_epochs: int = 20,
                 serialization_dir: Optional[str] = None,
                 cuda_device: int = -1,
                 gradient_accumulation_steps: int = 1,
                 grad_norm: Optional[float] = None,
                 grad_clipping: Optional[float] = None,
                 min_lr: float = 0.00001,
                 no_tqdm: bool = False,
                 momentum_scheduler: Optional[MomentumScheduler] = None,
                 summary_interval: int = 50,
                 histogram_interval: int = 50,
                 should_log_parameter_statistics: bool = True,
                 should_log_learning_rate: bool = True,
                 sampling_method: str = "proportional",
                 moving_average: Optional[MovingAverage] = None) -> None:

        self._model = model

        self._task_list = task_list
        self._n_tasks = len(self._task_list)

        self._optimizers = {}
        self._all_params = [(n, p) for n, p in self._model.named_parameters()
                            if p.requires_grad]
        self._params_share_encoder = [
            (n, p) for n, p in self._model.named_parameters()
            if p.requires_grad and "_shared_encoder" in n
        ]
        self._params_share_encoder_de = [
            (n, p) for n, p in self._model.named_parameters()
            if p.requires_grad and
            ("_shared_encoder" in n or "_seq_vec" in n or "_domain_embeddings"
             in n or "_de_attention" in n or "_de_feedforward" in n)
        ]
        self._params_share_discriminator = [
            (n, p) for n, p in self._model.named_parameters()
            if p.requires_grad and "_s_domain_discriminator" in n
        ]
        self._params_valid_discriminator = [
            (n, p) for n, p in self._model.named_parameters()
            if p.requires_grad and "_valid_discriminator" in n
        ]

        self._optimizers = dict()
        self._optimizers["all_params"] = Optimizer.from_params(
            model_parameters=self._all_params,
            params=deepcopy(optimizer_params))
        self._optimizers["share_encoder"] = Optimizer.from_params(
            model_parameters=self._params_share_encoder,
            params=deepcopy(optimizer_params))
        self._optimizers["share_encoder_de"] = Optimizer.from_params(
            model_parameters=self._params_share_encoder_de,
            params=deepcopy(optimizer_params))
        self._optimizers["share_discriminator"] = Optimizer.from_params(
            model_parameters=self._params_share_discriminator,
            params=deepcopy(optimizer_params))
        self._optimizers["valid_discriminator"] = Optimizer.from_params(
            model_parameters=self._params_valid_discriminator,
            params=deepcopy(optimizer_params))
        self._schedulers = dict()
        self._schedulers["all_params"] = LearningRateScheduler.from_params(
            optimizer=self._optimizers["all_params"],
            params=deepcopy(lr_scheduler_params))
        self._schedulers["share_encoder"] = LearningRateScheduler.from_params(
            optimizer=self._optimizers["share_encoder"],
            params=deepcopy(lr_scheduler_params))
        self._schedulers[
            "share_encoder_de"] = LearningRateScheduler.from_params(
                optimizer=self._optimizers["share_encoder_de"],
                params=deepcopy(lr_scheduler_params))
        self._schedulers[
            "share_discriminator"] = LearningRateScheduler.from_params(
                optimizer=self._optimizers["share_discriminator"],
                params=deepcopy(lr_scheduler_params))
        self._schedulers[
            "valid_discriminator"] = LearningRateScheduler.from_params(
                optimizer=self._optimizers["valid_discriminator"],
                params=deepcopy(lr_scheduler_params))
        self._serialization_dir = serialization_dir
        self._cuda_device = cuda_device
        if self._cuda_device >= 0:
            check_for_gpu(self._cuda_device)
            self._model = self._model.cuda(self._cuda_device)
        self._patience = patience
        self._num_epochs = num_epochs
        self._epoch_trained = 0

        self._gradient_accumulation_steps = gradient_accumulation_steps
        self._grad_norm = grad_norm
        self._grad_clipping = grad_clipping
        self._min_lr = min_lr
        self._no_tqdm = no_tqdm
        self._sampling_method = sampling_method

        self._task_infos = None
        self._metric_infos = None

        self._tr_generators = None
        self._global_step = 0

        self._batch_num_total = 0

        self._tensorboard = TensorboardWriter(
            get_batch_num_total=lambda: self._batch_num_total,
            serialization_dir=serialization_dir,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate)

        self._last_log = 0.0  # time of last logging

        # Enable activation logging.
        if histogram_interval is not None:
            self._tensorboard.enable_activation_logging(self._model)
Example #11
0
 def test_reduce_on_plateau_works_when_metrics_exist(self):
     LearningRateScheduler.from_params(
         Optimizer.from_params(self.model.named_parameters(),
                               Params({"type": "adam"})),
         Params({"type": "reduce_on_plateau"})).step(10, None)
Example #12
0
 def test_can_optimise_model_with_dense_and_sparse_params(self):
     optimizer_params = Params({"type": "dense_sparse_adam"})
     parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
     optimizer = Optimizer.from_params(model_parameters=parameters, params=optimizer_params)
     self.instances.index_with(self.vocab)
     GradientDescentTrainer(self.model, optimizer, DataLoader(self.instances, 2)).train()
Example #13
0
    def __init__(
        self,
        model: Model,
        task_list: List[Task],
        optimizer_params: Params,
        lr_scheduler_params: Params,
        patience: Optional[int] = None,
        num_epochs: int = 20,
        serialization_dir: str = None,
        cuda_device: int = -1,
        gradient_accumulation_steps: int = 1,
        grad_norm: Optional[float] = None,
        grad_clipping: Optional[float] = None,
        min_lr: float = 0.00001,
        no_tqdm: bool = False,
        summary_interval: int = 10,
        histogram_interval: int = 10,
        log_parameter_statistics: bool = False,
        log_gradient_statistics: bool = False,
    ):
        """ 
        Parameters
        ----------
        model: ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.
        iterator: ``DataIterator``, required.
            A method for iterating over a ``Dataset``, yielding padded indexed batches.
        patience: Optional[int] > 0, optional (default=None)
            Number of epochs to be patient before early stopping: the training is stopped
            after ``patience`` epochs with no improvement. If given, it must be ``> 0``.
            If None, early stopping is disabled.
        num_epochs: int, optional (default = 20)
            Number of training epochs.
        serialization_dir: str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        cuda_device: int, optional (default = -1)
            An integer specifying the CUDA device to use. If -1, the CPU is used.
            Multi-gpu training is not currently supported, but will be once the
            Pytorch DataParallel API stabilises.
        grad_norm: float, optional, (default = None).
            If provided, gradient norms will be rescaled to have a maximum of this value.
        grad_clipping : float, optional (default = None).
            If provided, gradients will be clipped `during the backward pass` to have an (absolute)
            maximum of this value.  If you are getting ``NaNs`` in your gradients during training
            that are not solved by using ``grad_norm``, you may need this.
        no_tqdm : bool, optional (default=False)
            We use ``tqdm`` for logging, which will print a nice progress bar that updates in place
            after every batch.  This is nice if you're running training on a local shell, but can
            cause problems with log files from, e.g., a docker image running on kubernetes.  If
            ``no_tqdm`` is ``True``, we will not use tqdm, and instead log batch statistics using
            ``logger.info``.
        """
        self._model = model
        parameters_to_train = [(n, p)
                               for n, p in self._model.named_parameters()
                               if p.requires_grad]

        self._task_list = task_list
        self._n_tasks = len(self._task_list)

        self._optimizer_params = optimizer_params
        self._optimizers = {}
        self._lr_scheduler_params = lr_scheduler_params
        self._schedulers = {}
        for task in self._task_list:
            task_name = task._name
            self._optimizers[task_name] = Optimizer.from_params(
                model_parameters=parameters_to_train,
                params=deepcopy(optimizer_params))
            self._schedulers[task_name] = LearningRateScheduler.from_params(
                optimizer=self._optimizers[task_name],
                params=deepcopy(lr_scheduler_params))

        self._serialization_dir = serialization_dir

        self._patience = patience
        self._num_epochs = num_epochs
        self._cuda_device = cuda_device
        if self._cuda_device >= 0:
            check_for_gpu(self._cuda_device)
            self._model = self._model.cuda(self._cuda_device)
        self._gradient_accumulation_steps = gradient_accumulation_steps
        self._grad_norm = grad_norm
        self._grad_clipping = grad_clipping
        self._min_lr = min_lr

        self._task_infos = None
        self._metric_infos = None

        self._tr_generators = None
        self._no_tqdm = no_tqdm

        self._summary_interval = summary_interval  # num batches between logging to tensorboard
        # self._log_parameter_statistics = log_parameter_statistics
        # self._log_gradient_statistics = log_gradient_statistics
        self._global_step = 0
        # train_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "train"))
        # validation_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "validation"))
        # self._tensorboard = TensorboardWriter(train_log=train_log, validation_log=validation_log)

        self._batch_num_total = 0
        self._tensorboard = TensorboardWriter(
            get_batch_num_total=lambda: self._batch_num_total,
            serialization_dir=serialization_dir,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_learning_rate=True)

        # Enable activation logging.
        if histogram_interval is not None:
            self._tensorboard.enable_activation_logging(self._model)
Example #14
0
 def _get_optimizer(self, lr: float = 1.0):
     return Optimizer.from_params(self.model.named_parameters(), Params({"type": "sgd", "lr": lr}))
Example #15
0
    def from_params(
            cls,  # type: ignore
            params: Params,
            serialization_dir: str,
            recover: bool = False) -> 'Trainer':

        # modified for second training_data
        all_datasets = datasets_from_params(params)

        # copied from allennlp.training.trainer.TrainingPieces
        # modified for second training_data
        datasets_for_vocab_creation = set(
            params.pop("datasets_for_vocab_creation", all_datasets))

        if recover and os.path.exists(
                os.path.join(serialization_dir, "vocabulary")):
            vocab = Vocabulary.from_files(
                os.path.join(serialization_dir, "vocabulary"))
            params.pop("vocabulary", {})
        else:
            vocab = Vocabulary.from_params(params.pop(
                "vocabulary", {}), (instance
                                    for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
        model = Model.from_params(vocab=vocab, params=params.pop('model'))
        model.extend_embedder_vocab()
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(model.vocab)
        validation_iterator_params = params.pop("validation_iterator", None)
        if validation_iterator_params:
            validation_iterator = DataIterator.from_params(
                validation_iterator_params)
            validation_iterator.index_with(model.vocab)
        else:
            validation_iterator = None

        train_data = all_datasets['train']
        validation_data = all_datasets.get('validation')
        test_data = all_datasets.get('test')
        train_low_data = all_datasets.get('train_low')

        trainer_params = params.pop("trainer")
        no_grad_regexes = trainer_params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = \
                    get_frozen_and_tunable_parameter_names(model)
        logger.info("Following parameters are Frozen  (without gradient):")
        for name in frozen_parameter_names:
            logger.info(name)
        logger.info("Following parameters are Tunable (with gradient):")
        for name in tunable_parameter_names:
            logger.info(name)

        # END OF TrainerPieces code
        params = trainer_params

        # pylint: disable=arguments-differ
        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters)
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(
                optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if 'checkpointer' in params:
            if 'keep_serialized_model_every_num_seconds' in params or \
                    'num_serialized_models_to_keep' in params:
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods.")
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int(
                "num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None)
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=
                keep_serialized_model_every_num_seconds)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool(
            "should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                                   False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        epoch_low_start = params.pop_int("epoch_low_start", None)
        epoch_without_improvement_low_start = params.pop_int(
            "epoch_without_improvement_low_start", None)

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            train_low_dataset=train_low_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
            epoch_low_start=epoch_low_start,
            epoch_without_improvement_low_start=
            epoch_without_improvement_low_start,
        )
Example #16
0
    def test_checkpointing_does_run(self, build_trainer_params_function):
        # Check that checkpointing does run and does sanity checks that at each step
        # it saves the most recent checkpoint as well as the best checkpoint
        # correctly for both pretrain and target_train stages.
        with mock.patch("jiant.models.MultiTaskModel") as MockModel:
            import torch
            import copy
            import time
            from allennlp.common.params import Params

            MockModel.return_value.eval.return_value = None
            MockModel.return_value.state_dict.return_value = {
                "model1": {
                    "requires_grad": True
                }
            }
            pad_dict = self.wic.val_data[0].get_padding_lengths()
            sorting_keys = []
            for field in pad_dict:
                for pad_field in pad_dict[field]:
                    sorting_keys.append((field, pad_field))
            iterator = BucketIterator(
                sorting_keys=sorting_keys,
                max_instances_in_memory=10000,
                batch_size=4,
                biggest_batch_first=True,
            )
            opt_params = Params({"type": "adam", "lr": 1e-05})
            opt_params2 = copy.deepcopy(opt_params)
            scheduler_params = Params({
                "type": "reduce_on_plateau",
                "factor": 0.05,
                "mode": "max",
                "patience": 4,
                "threshold": 0.05,
                "threshold_mode": "abs",
                "verbose": True,
            })
            train_params = [
                (
                    "_text_field_embedder.model.encoder.layer.9.output.dense.bias",
                    torch.Tensor([0.1, 0.3, 0.4, 0.8]),
                ),
                ("sent_encoder.layer.1", torch.Tensor([0.1, 0.3, 0.4, 0.8])),
                ("type", torch.Tensor([0.1])),
            ]
            scheduler = LearningRateScheduler.from_params(
                Optimizer.from_params(train_params, opt_params2),
                copy.deepcopy(scheduler_params))
            optimizer = Optimizer.from_params(train_params,
                                              copy.deepcopy(opt_params))
            _task_infos = {
                "wic": {
                    "iterator": iterator(self.wic.val_data, num_epochs=1),
                    "n_tr_batches": 1,
                    "loss": 0.0,
                    "tr_generator": iterator(self.wic.val_data, num_epochs=1),
                    "total_batches_trained": 400,
                    "n_batches_since_val": 0,
                    "optimizer": optimizer,
                    "scheduler": scheduler,
                    "stopped": False,
                    "last_log": time.time(),
                }
            }
            _metric_infos = {
                metric: {
                    "hist": [],
                    "stopped": False,
                    "best": (-1, {})
                }
                for metric in [self.wic.val_metric]
            }
            MockModel.return_value._setup_training.return_value = _task_infos, _metric_infos

            class MockParams:
                def __init__(self, requires_grad):
                    self.requires_grad = requires_grad

            MockModel.return_value.named_parameters.return_value = [
                ("model1", MockParams(True))
            ]
            MockModel.use_bert = 1
            model = MockModel()
            pt_trainer, _, _, _ = trainer.build_trainer(
                self.args,
                [
                    "wic"
                ],  # here, we use WIC twice to reduce the amount of boiler-plate code
                model,
                self.args.run_dir,
                self.wic.val_metric_decreases,
                phase="pretrain",
            )

            tt_trainer, _, _, _ = trainer.build_trainer(
                self.args,
                ["wic"],
                model,
                self.args.run_dir,
                self.wic.val_metric_decreases,
                phase="target_train",
            )
            os.mkdir(os.path.join(self.temp_dir, "wic"))

            tt_trainer.task_to_metric_mapping = {
                self.wic.val_metric: self.wic.name
            }
            pt_trainer._task_infos = _task_infos
            pt_trainer._metric_infos = _metric_infos
            pt_trainer._optimizer = optimizer
            pt_trainer._scheduler = scheduler
            pt_trainer._save_checkpoint(
                {
                    "step": 10,
                    "validation_pass": 1,
                    "should_stop": 0
                },
                tasks=[self.wic],
                phase="pretrain",
                new_best=True,
            )
            pt_trainer._save_checkpoint(
                {
                    "step": 10,
                    "validation_pass": 2,
                    "should_stop": 0
                },
                tasks=[self.wic],
                phase="pretrain",
                new_best=True,
            )
            tt_trainer._task_infos = _task_infos
            tt_trainer._metric_infos = _metric_infos
            tt_trainer._optimizer = optimizer
            tt_trainer._scheduler = scheduler

            tt_trainer._save_checkpoint(
                {
                    "step": 10,
                    "validation_pass": 1,
                    "should_stop": 0
                },
                tasks=[self.wic],
                phase="target_train",
                new_best=True,
            )
            tt_trainer._save_checkpoint(
                {
                    "step": 10,
                    "validation_pass": 2,
                    "should_stop": 0
                },
                tasks=[self.wic],
                phase="target_train",
                new_best=False,
            )
            assert (os.path.exists(
                os.path.join(self.temp_dir, "wic",
                             "model_state_target_train_val_1.best.th"))
                    and os.path.exists(
                        os.path.join(self.temp_dir, "wic",
                                     "model_state_target_train_val_2.th"))
                    and os.path.exists(
                        os.path.join(self.temp_dir,
                                     "model_state_pretrain_val_2.best.th"))
                    and os.path.exists(
                        os.path.join(self.temp_dir,
                                     "model_state_pretrain_val_1.th")))

            # Assert only one checkpoint is created for pretrain stage.
            pretrain_best_checkpoints = glob.glob(
                os.path.join(self.temp_dir,
                             "model_state_pretrain_val_*.best.th"))
            assert len(pretrain_best_checkpoints) == 1
 def test_no_metric_wrapper_can_support_none_for_metrics(self):
     lrs = LearningRateScheduler.from_params(Optimizer.from_params(self.model.named_parameters(),
                                                                   Params({"type": "adam"})),
                                             Params({"type": "step", "step_size": 1}))
     lrs.step(None, None)
 def test_noam_learning_rate_schedule_does_not_crash(self):
     lrs = LearningRateScheduler.from_params(Optimizer.from_params(self.model.named_parameters(),
                                                                   Params({"type": "adam"})),
                                             Params({"type": "noam", "model_size": 10, "warmup_steps": 2000}))
     lrs.step(None, None)
     lrs.step_batch(None)
 def test_reduce_on_plateau_error_throw_when_mode_not_specified(self):
     with self.assertRaises(ConfigurationError) as context:
         LearningRateScheduler.from_params(Optimizer.from_params(self.model.named_parameters(),
                                                                 Params({"type": "adam"})),
                                           Params({"type": "reduce_on_plateau"})).step(None, None)
         assert "ReduceLROnPlateau requires a mode to be specified" in str(context.exception)
Example #20
0
    def from_params(
            cls,  # type: ignore
            params: Params,
            serialization_dir: str,
            recover: bool = False,
            cache_directory: str = None,
            cache_prefix: str = None) -> 'Trainer':
        # datasets = meta_dataset_from_params(params, cache_directory=cache_directory, cache_prefix=cache_prefix)
        # model = Model.from_params(vocab=vocab, params=params.pop("model"))
        # iterator = DataIterator.from_params(params.pop("iterator"))
        # iterator.index_with(model.vocab)
        pieces = MetaTrainerPieces.from_params(params, serialization_dir,
                                               recover, cache_directory,
                                               cache_prefix)
        model = pieces.model
        iterator = pieces.iterator,
        # params=pieces.params,
        train_data = pieces.train_dataset
        validation_data = pieces.validation_dataset
        validation_iterator = pieces.validation_iterator
        params = pieces.params

        # pylint: disable=arguments-differ
        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", [0, 1]))

        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)
        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters)
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(
                optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if 'checkpointer' in params:
            if 'keep_serialized_model_every_num_seconds' in params or \
                    'num_serialized_models_to_keep' in params:
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods.")
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int(
                "num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None)
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=
                keep_serialized_model_every_num_seconds)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool(
            "should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                                   False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)
        print('[info] cuda_device in metatrainer.from_param is:{}'.format(
            cuda_device))

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
            # distributed=distributed,
            # rank=local_rank,
            # world_size=world_size,
            # num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        )
Example #21
0
 def _get_optimizer(self):
     return Optimizer.from_params(
         model_parameters=self.model.named_parameters(),
         params=Params({"type": "sgd", "lr": 1.0, "momentum": self.base_momentum}),
     )
Example #22
0
model_fp = str(Path('.', 'model_configs', 'standard', 'ds_elmo_t_fine_tune_laptop.json').resolve())
params = Params.from_file(model_fp)
data_fp = str(Path('data', 'splits', 'Laptop Test').resolve())
reader = DatasetReader.from_params(params['dataset_reader'])
instances = list(reader.read(data_fp))

if 'vocabulary' in params:
    vocab_params = params['vocabulary']
    vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
else:
    vocab = Vocabulary.from_instances(instances)
import re
model = Model.from_params(vocab=vocab, params=params['model'])
names = [name for name, param in model.named_parameters()]
optim = Optimizer.from_params(model.named_parameters(),params['trainer']['optimizer'])
values = {}
for i, param_group in enumerate(reversed(optim.param_groups)):
    values[i] = param_group['params']
        
encoder = set()
project = set()
embedder_5 = set()
embedder_4 = set()
embedder_3 = set()
embedder_2 = set()
embedder_1 = set()
embedder_0 = set()
embedder_char = set()
embedding_scalar_mix = set()
other = set()
Example #23
0
 def _get_optimizer(self, lr: float = 1.0):
     optimizer_params = Params({"type": "sgd", "lr": lr})
     optimizer_params["parameter_groups"] = [[[f"^{m}"], {}]
                                             for m in self.model._modules]
     return Optimizer.from_params(self.model.named_parameters(),
                                  optimizer_params)
Example #24
0
train_loader = VCRLoader.from_dataset(train, **loader_params)
val_loader = VCRLoader.from_dataset(val, **loader_params)
test_loader = VCRLoader.from_dataset(test, **loader_params)

ARGS_RESET_EVERY = 100
print("Loading {} for {}".format(params['model'].get('type', 'WTF?'),
                                 'rationales' if args.rationale else 'answer'),
      flush=True)
model = Model.from_params(vocab=train.vocab, params=params['model'])
# for submodule in model.detector.backbone.modules():
#     if isinstance(submodule, BatchNorm2d):
#         submodule.track_running_stats = False

model = DataParallel(model).cuda() if NUM_GPUS > 1 else model.cuda()
optimizer = Optimizer.from_params(
    [x for x in model.named_parameters() if x[1].requires_grad],
    params['trainer']['optimizer'])

lr_scheduler_params = params['trainer'].pop("learning_rate_scheduler", None)
scheduler = LearningRateScheduler.from_params(
    optimizer, lr_scheduler_params) if lr_scheduler_params else None

if os.path.exists(args.folder) and args.restore:
    print('restore is True')
    print("Found folder! restoring", flush=True)
    start_epoch, val_metric_per_epoch = restore_checkpoint(
        model,
        optimizer,
        serialization_dir=args.folder,
        learning_rate_scheduler=scheduler)
else:
Example #25
0
    def from_params(
            cls,  # type: ignore
            params: Params,
            serialization_dir: str,
            recover: bool = False,
            cache_directory: str = None,
            cache_prefix: str = None) -> 'SloppyTrainer':
        # pylint: disable=arguments-differ
        from allennlp.training.trainer_pieces import TrainerPieces

        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        model = pieces.model
        serialization_dir = serialization_dir
        iterator = pieces.iterator
        train_data = pieces.train_dataset
        validation_data = pieces.validation_dataset
        params = pieces.params
        validation_iterator = pieces.validation_iterator

        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters)
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(
                optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if 'checkpointer' in params:
            if 'keep_serialized_model_every_num_seconds' in params or \
                    'num_serialized_models_to_keep' in params:
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods.")
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int(
                "num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None)
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=
                keep_serialized_model_every_num_seconds)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool(
            "should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                                   False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)
        mixed_precision = params.pop_bool("mixed_precision", False)

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
            mixed_precision=mixed_precision)
Example #26
0
    def from_params(
            cls,  # type: ignore
            model: Model,
            serialization_dir: str,
            files_to_archive: Dict[str, str],
            iterator: DataIterator,
            train_data: Iterable[Instance],
            validation_data: Optional[Iterable[Instance]],
            params: Params,
            validation_iterator: DataIterator = None) -> 'TrainerFP16':
        # pylint: disable=arguments-differ
        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)
        fp16 = params.pop_bool("fp16", False)
        dynamic_loss_scale = params.pop_bool("dynamic_loss_scale", True)
        validate_first = params.pop_bool("validate_first", False)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if fp16:
            model.half()
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]

        # If fp16, need to wrap the optimizer
        try:
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if fp16:
            # The FP16_Optimizer we use depends on whether the optimizer is FusedAdam or a regular pytorch optimizer
            if isinstance(optimizer, FusedAdam):
                from apex.optimizers import FP16_Optimizer
            else:
                from apex.fp16_utils import FP16_Optimizer
            optimizer = FP16_Optimizer(optimizer,
                                       dynamic_loss_scale=dynamic_loss_scale)

        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters)
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(
                optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if 'checkpointer' in params:
            if 'keep_serialized_model_every_num_seconds' in params or \
                    'num_serialized_models_to_keep' in params:
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods.")
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int(
                "num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None)
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=
                keep_serialized_model_every_num_seconds)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool(
            "should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                                   False)
        statistics_interval = params.pop_int("statistics_interval", 5000)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            statistics_interval=statistics_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
            fp16=fp16,
            validate_first=validate_first,
            files_to_archive=files_to_archive)
Example #27
0
    def from_params(  # type: ignore
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        local_rank: int = 0,
    ) -> "MetaTrainer":

        from allennlp.training.trainer import Trainer
        from src.training.trainer_pieces import MetaTrainerPieces

        config = dict(as_flat_dict(params.as_dict()))
        pieces = MetaTrainerPieces.from_params(params, serialization_dir, recover)
        model = pieces.model
        serialization_dir = serialization_dir
        iterator = pieces.iterator
        train_datas = pieces.train_datasets
        validation_datas = pieces.validation_datasets
        params = pieces.params
        validation_iterator = pieces.validation_iterator

        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)

        check_for_gpu(cuda_device)
        if cuda_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(cuda_device)

        parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters
            )
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if "checkpointer" in params:
            if (
                "keep_serialized_model_every_num_seconds" in params
                or "num_serialized_models_to_keep" in params
            ):
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods."
                )
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None
            )
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
            )

        log_grad_norm = params.pop("log_grad_norm", "total")
        save_embedder = params.pop_bool("save_embedder", True)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        distributed = params.pop_bool("distributed", False)
        world_size = params.pop_int("world_size", 1)

        num_gradient_accumulation_steps = params.pop("num_gradient_accumulation_steps", 1)
        tasks_per_step = params.pop_int("tasks_per_step", 0)
        wrapper = Wrapper.from_params(
            params.pop("wrapper"),
            model=model,
            meta_optimizer=optimizer,
        )

        task_discriminator_params = params.pop("task_discriminator", None)
        if task_discriminator_params:
            num_tasks = model.vocab.get_vocab_size("lang_labels")
            task_discriminator = TaskDiscriminator.from_params(task_discriminator_params,
                                                               num_tasks=num_tasks)
            if cuda_device >= 0:
                task_discriminator = task_discriminator.cuda(cuda_device)

            discriminator_parameters = \
                [[n, p] for n, p in task_discriminator.named_parameters() if p.requires_grad]
            discriminator_optimizer = Optimizer.from_params(discriminator_parameters,
                                                            params.pop("discriminator_optimizer"))
        else:
            task_discriminator = None
            discriminator_optimizer = None

        writer = None
        wandb_config = params.pop("wandb", None)
        if wandb_config is not None:
            writer = WandBWriter(config, wrapper.container, wandb_config)

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_datas,
            validation_datas,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            save_embedder=save_embedder,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
            distributed=distributed,
            local_rank=local_rank,
            world_size=world_size,
            num_gradient_accumulation_steps=num_gradient_accumulation_steps,
            log_grad_norm=log_grad_norm,
            wrapper=wrapper,
            task_discriminator=task_discriminator,
            discriminator_optimizer=discriminator_optimizer,
            tasks_per_step=tasks_per_step,
            writer=writer,
        )
    def __init__(
        self,
        model: Model,
        optimizer,
        iterator: DataIterator,
        train_dataset: Iterable[Instance],
        validation_dataset: Optional[Iterable[Instance]] = None,
        patience: Optional[int] = None,
        validation_metric: str = "-loss",
        validation_iterator: DataIterator = None,
        shuffle: bool = True,
        num_epochs: int = 20,
        serialization_dir: Optional[str] = None,
        num_serialized_models_to_keep: int = 20,
        keep_serialized_model_every_num_seconds: int = None,
        checkpointer: Checkpointer = None,
        model_save_interval: float = None,
        cuda_device: Union[int, List] = -1,
        grad_norm: Optional[float] = None,
        grad_clipping: Optional[float] = None,
        learning_rate_scheduler=None,
        momentum_scheduler=None,
        summary_interval: int = 100,
        histogram_interval: int = None,
        should_log_parameter_statistics: bool = True,
        should_log_learning_rate: bool = False,
        log_batch_size_period: Optional[int] = None,
        moving_average=None,
    ) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
        and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the weights
        for your model over some fixed number of epochs. You can also pass in a validation
        dataset and enable early stopping. There are many other bells and whistles as well.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use `Trainer.from_params` this will be
            handled for you.)
        optimizer : ``torch.nn.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        iterator : ``DataIterator``, required.
            A method for iterating over a ``Dataset``, yielding padded indexed batches.
        train_dataset : ``Dataset``, required.
            A ``Dataset`` to train on. The dataset should have already been indexed.
        validation_dataset : ``Dataset``, optional, (default = None).
            A ``Dataset`` to evaluate on. The dataset should have already been indexed.
        patience : Optional[int] > 0, optional (default=None)
            Number of epochs to be patient before early stopping: the training is stopped
            after ``patience`` epochs with no improvement. If given, it must be ``> 0``.
            If None, early stopping is disabled.
        validation_metric : str, optional (default="loss")
            Validation metric to measure for whether to stop training using patience
            and whether to serialize an ``is_best`` model each epoch. The metric name
            must be prepended with either "+" or "-", which specifies whether the metric
            is an increasing or decreasing function.
        validation_iterator : ``DataIterator``, optional (default=None)
            An iterator to use for the validation set.  If ``None``, then
            use the training `iterator`.
        shuffle: ``bool``, optional (default=True)
            Whether to shuffle the instances in the iterator or not.
        num_epochs : int, optional (default = 20)
            Number of training epochs.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        num_serialized_models_to_keep : ``int``, optional (default=20)
            Number of previous model checkpoints to retain.  Default is to keep 20 checkpoints.
            A value of None or -1 means all checkpoints will be kept.
        keep_serialized_model_every_num_seconds : ``int``, optional (default=None)
            If num_serialized_models_to_keep is not None, then occasionally it's useful to
            save models at a given interval in addition to the last num_serialized_models_to_keep.
            To do so, specify keep_serialized_model_every_num_seconds as the number of seconds
            between permanently saved checkpoints.  Note that this option is only used if
            num_serialized_models_to_keep is not None, otherwise all checkpoints are kept.
        checkpointer : ``Checkpointer``, optional (default=None)
            An instance of class Checkpointer to use instead of the default. If a checkpointer is specified,
            the arguments num_serialized_models_to_keep and keep_serialized_model_every_num_seconds should
            not be specified. The caller is responsible for initializing the checkpointer so that it is
            consistent with serialization_dir.
        model_save_interval : ``float``, optional (default=None)
            If provided, then serialize models every ``model_save_interval``
            seconds within single epochs.  In all cases, models are also saved
            at the end of every epoch if ``serialization_dir`` is provided.
        cuda_device : ``Union[int, List[int]]``, optional (default = -1)
            An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used.
        grad_norm : ``float``, optional, (default = None).
            If provided, gradient norms will be rescaled to have a maximum of this value.
        grad_clipping : ``float``, optional (default = ``None``).
            If provided, gradients will be clipped `during the backward pass` to have an (absolute)
            maximum of this value.  If you are getting ``NaNs`` in your gradients during training
            that are not solved by using ``grad_norm``, you may need this.
        learning_rate_scheduler : ``LearningRateScheduler``, optional (default = None)
            If specified, the learning rate will be decayed with respect to
            this schedule at the end of each epoch (or batch, if the scheduler implements
            the ``step_batch`` method). If you use :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`,
            this will use the ``validation_metric`` provided to determine if learning has plateaued.
            To support updating the learning rate on every batch, this can optionally implement
            ``step_batch(batch_num_total)`` which updates the learning rate given the batch number.
        momentum_scheduler : ``MomentumScheduler``, optional (default = None)
            If specified, the momentum will be updated at the end of each batch or epoch
            according to the schedule.
        summary_interval: ``int``, optional, (default = 100)
            Number of batches between logging scalars to tensorboard
        histogram_interval : ``int``, optional, (default = ``None``)
            If not None, then log histograms to tensorboard every ``histogram_interval`` batches.
            When this parameter is specified, the following additional logging is enabled:
                * Histograms of model parameters
                * The ratio of parameter update norm to parameter norm
                * Histogram of layer activations
            We log histograms of the parameters returned by
            ``model.get_parameters_for_histogram_tensorboard_logging``.
            The layer activations are logged for any modules in the ``Model`` that have
            the attribute ``should_log_activations`` set to ``True``.  Logging
            histograms requires a number of GPU-CPU copies during training and is typically
            slow, so we recommend logging histograms relatively infrequently.
            Note: only Modules that return tensors, tuples of tensors or dicts
            with tensors as values currently support activation logging.
        should_log_parameter_statistics : ``bool``, optional, (default = True)
            Whether to send parameter statistics (mean and standard deviation
            of parameters and gradients) to tensorboard.
        should_log_learning_rate : ``bool``, optional, (default = False)
            Whether to send parameter specific learning rate to tensorboard.
        log_batch_size_period : ``int``, optional, (default = ``None``)
            If defined, how often to log the average batch size.
        moving_average: ``MovingAverage``, optional, (default = None)
            If provided, we will maintain moving averages for all parameters. During training, we
            employ a shadow variable for each parameter, which maintains the moving average. During
            evaluation, we backup the original parameters and assign the moving averages to corresponding
            parameters. Be careful that when saving the checkpoint, we will save the moving averages of
            parameters. This is necessary because we want the saved model to perform as well as the validated
            model if we load it later. But this may cause problems if you restart the training from checkpoint.
        """
        super().__init__(serialization_dir, cuda_device)

        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model

        self.iterator = iterator
        self._validation_iterator = validation_iterator
        self.shuffle = shuffle

        optimiser_params = optimizer
        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]

        from copy import deepcopy

        self.optimizer = Optimizer.from_params(parameters,
                                               deepcopy(optimiser_params))
        self.optimizer_lang1 = Optimizer.from_params(
            parameters, deepcopy(optimiser_params))
        self.optimizer_lang2 = Optimizer.from_params(
            parameters, deepcopy(optimiser_params))
        self.optimizer_cm = Optimizer.from_params(parameters,
                                                  deepcopy(optimiser_params))

        self.train_data = train_dataset
        self._validation_data = validation_dataset

        if patience is None:  # no early stopping
            if validation_dataset:
                logger.warning(
                    "You provided a validation dataset but patience was set to None, "
                    "meaning that early stopping is disabled")
        elif (not isinstance(patience, int)) or patience <= 0:
            raise ConfigurationError(
                '{} is an invalid value for "patience": it must be a positive integer '
                "or None (if you want to disable early stopping)".format(
                    patience))

        # For tracking is_best_so_far and should_stop_early
        self._metric_tracker = MetricTracker(patience, validation_metric)
        # Get rid of + or -
        self._validation_metric = validation_metric[1:]

        self._num_epochs = num_epochs

        if checkpointer is not None:
            # We can't easily check if these parameters were passed in, so check against their default values.
            # We don't check against serialization_dir since it is also used by the parent class.
            if (num_serialized_models_to_keep != 20
                    or keep_serialized_model_every_num_seconds is not None):
                raise ConfigurationError(
                    "When passing a custom Checkpointer, you may not also pass in separate checkpointer "
                    "args 'num_serialized_models_to_keep' or 'keep_serialized_model_every_num_seconds'."
                )
            self._checkpointer = checkpointer
        else:
            self._checkpointer = Checkpointer(
                serialization_dir,
                keep_serialized_model_every_num_seconds,
                num_serialized_models_to_keep,
            )

        self._model_save_interval = model_save_interval

        self._grad_norm = grad_norm
        self._grad_clipping = grad_clipping

        if learning_rate_scheduler:
            self._learning_rate_scheduler = LearningRateScheduler.from_params(
                self.optimizer, deepcopy(learning_rate_scheduler))
            self._learning_rate_scheduler_lang1 = LearningRateScheduler.from_params(
                self.optimizer_lang1, deepcopy(learning_rate_scheduler))
            self._learning_rate_scheduler_lang2 = LearningRateScheduler.from_params(
                self.optimizer_lang2, deepcopy(learning_rate_scheduler))
            self._learning_rate_scheduler_cm = LearningRateScheduler.from_params(
                self.optimizer_cm, deepcopy(learning_rate_scheduler))
        else:
            self._learning_rate_scheduler, self._learning_rate_scheduler_lang1, self._learning_rate_scheduler_lang2, self._learning_rate_scheduler_cm = None, None, None, None

        if momentum_scheduler:
            self._momentum_scheduler = MomentumScheduler.from_params(
                self.optimizer, deepcopy(momentum_scheduler))
            self._momentum_scheduler_lang1 = MomentumScheduler.from_params(
                self.optimizer_lang1, deepcopy(momentum_scheduler))
            self._momentum_scheduler_lang2 = MomentumScheduler.from_params(
                self.optimizer_lang2, deepcopy(momentum_scheduler))
            self._momentum_scheduler_cm = MomentumScheduler.from_params(
                self.optimizer_lang3, deepcopy(momentum_scheduler))
        else:
            self._momentum_scheduler, self._momentum_scheduler_lang1, self._momentum_scheduler_lang2, self._momentum_scheduler_cm = None, None, None, None

        # WE HAVE NOT USED IT YET
        self._moving_average, self.moving_average_lang1, self.moving_average_lang2, self.moving_average_cm = None, None, None, None

        # We keep the total batch number as an instance variable because it
        # is used inside a closure for the hook which logs activations in
        # ``_enable_activation_logging``.
        self._batch_num_total = 0

        self._tensorboard = TensorboardWriter(
            get_batch_num_total=lambda: self._batch_num_total,
            serialization_dir=serialization_dir,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
        )

        self._log_batch_size_period = log_batch_size_period

        self._last_log = 0.0  # time of last logging

        # Enable activation logging.
        if histogram_interval is not None:
            self._tensorboard.enable_activation_logging(self.model)
Example #29
0
def main(args):
    params = Params.from_file(args.params_file)

    # print('Data seed:{}, Percent data: {}'.format(shuffle_id, train_size))
    settings.cuda = params['cuda_device'] != -1
    common_util.prepare_environment(params)

    serialization_dir = params['serialization_dir']
    training_util.create_serialization_dir(params, serialization_dir,
                                           args.recover, args.force)
    common_util.prepare_global_logging(serialization_dir, True)
    logging.info(
        "torch version: {}, allennlp version: {}, allennlp path: {}".format(
            torch.__version__, allennlp.__version__, allennlp.__path__))
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    semi_supervision = params.get('semi_supervised', False)
    which_mixer = params.get('which_mixer', 'cm')
    dd_warmup_iters = params.pop('dd_warmup_iters', 1)
    dd_semi_warmup_iters = params.pop('dd_semi_warmup_iters', 1)
    dd_update_freq = params.pop('dd_update_freq', 2)
    constraints_wt = params.get('constraints_wt', 0)
    calc_valid_freq = params.get('calc_valid_freq', 1)
    backprop_after_xbatches = params.pop('backprop_after_xbatches', 1)
    min_pct_of_unlabelled = params.pop('min_pct_of_unlabelled', 0.0)
    dd_increase_freq_after = params.pop('dd_increase_freq_after', 0)
    dd_increase_freq_by = params.pop('dd_increase_freq_by', 0)
    dd_decay_lr = params.pop('dd_decay_lr', 0)
    dd_decay_lr_after = params.pop('dd_decay_lr_after', 1.0)
    grad_norm_before_warmup = params.pop('grad_norm_before_warmup', 0)
    if semi_supervision:
        print("Semi Supervision On")

    for key in [
            'warmup_epochs', 'unlabelled_train_data_file', 'test_data_file',
            'data_dir', 'cuda_device', 'serialization_dir', 'train_data_file',
            'validation_data_file', 'constraints_wt', 'train_size',
            'shuffle_id', 'semi_supervised', 'which_mixer',
            'distributed_lambda_update', 'calc_valid_freq'
    ]:
        params.pop(key, None)

    print("Trainer pieces")
    pieces = gan_trainer.TrainerPiecesForSemi.from_params(
        params, serialization_dir, args.recover, semi_supervision)  # pylint: disable=no-member

    #pieces for constrained learning"
    print("Constraint model")
    constraints_model = Model.from_params(vocab=pieces.model.vocab,
                                          params=params.pop('dd_constraints'))
    dd_params = [[n, p] for n, p in constraints_model.named_parameters()
                 if p.requires_grad]
    dd_optimizer = None
    dd_optim_params = params.pop('dd_optimizer', None)
    if len(dd_params) > 0:
        dd_optimizer = Optimizer.from_params(dd_params, dd_optim_params)

    cp = None
    chfile = None
    #Pdb().set_trace()
    if args.weight_dir is not None:
        #Pdb().set_trace()
        flag = True
        if args.weight_file is not None:
            logging.info("Loading  Model weights from :{}".format(
                os.path.join(args.weight_dir, args.weight_file)))
            model_states = torch.load(
                os.path.join(args.weight_dir, args.weight_file))
            pieces.model.load_state_dict(model_states)
            flag = False
        if args.dd_file is not None:
            logging.info("Loading Constraint Model from :{}".format(
                os.path.join(args.weight_dir, args.dd_file)))
            flag = False
            chfile = os.path.join(args.weight_dir, args.dd_file)
            # cp = torch.load(chfile)
            # constraints_model.load_state_dict(cp['constraints_model'])
            # if 'dd_update_freq' in cp:
            #     dd_update_freq  = cp['dd_update_freq']
            #     print("New dd_update_freq:" , dd_update_freq)

        if flag:
            raise (
                "why provide args.weight_dir? when both weight_file and dd_file are None"
            )
    print("Trainer")
    trainer = Trainer.from_params(
        model=pieces.model,
        serialization_dir=serialization_dir,
        iterator=pieces.iterator,
        train_data=pieces.train_dataset,
        validation_data=pieces.validation_dataset,
        params=pieces.params,
        validation_iterator=pieces.validation_iterator)

    if args.weight_dir is not None and args.training_state_file is not None:
        logging.info("Loading Training state from :{}".format(
            os.path.join(args.weight_dir, args.training_state_file)))
        training_state = torch.load(
            os.path.join(args.weight_dir, args.training_state_file))
        trainer.optimizer.load_state_dict(training_state["optimizer"])

    params.assert_empty('base train command')

    try:
        #if backprop_after_xbatches == 1:
        #    print("Training setup")
        #    semi_trainer= gan_trainer.SemiSupervisedTrainer(trainer, constraints_model, dd_optimizer, pieces.validation_iterator,  pieces.unlabelled_dataset, semi_supervision, which_mixer, dd_warmup_iters, dd_update_freq, constraints_wt, calc_valid_freq)
        #    print("Training start")
        #    metrics = semi_trainer.custom_train()
        #else:
        print("Training setup")
        semi_trainer = gan_trainer.SemiSupervisedTrainer(
            trainer,
            constraints_model,
            dd_optimizer,
            pieces.validation_iterator,
            pieces.unlabelled_dataset,
            semi_supervision,
            which_mixer,
            dd_warmup_iters,
            dd_update_freq,
            constraints_wt,
            calc_valid_freq,
            backprop_after_xbatches,
            min_pct_of_unlabelled,
            dd_semi_warmup_iters,
            dd_increase_freq_after,
            dd_increase_freq_by,
            dd_decay_lr,
            args.debug,
            chfile=chfile,
            shuffle=args.shuffle,
            dd_decay_lr_after=dd_decay_lr_after,
            grad_norm_before_warmup=grad_norm_before_warmup)

        print("Training start")
        #print(yatin)
        metrics = semi_trainer.custom_train()

    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    common_util.dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                             metrics,
                             log=True)
 def _get_optimizer(self, lr: float = 1.0):
     return Optimizer.from_params(self.model.named_parameters(), Params({"type": "sgd", "lr": lr}))
Example #31
0
    def train(self,
              tasks,
              validation_interval,
              n_batches_per_pass,
              weighting_method,
              scaling_method,
              train_params,
              optimizer_params,
              scheduler_params,
              shared_optimizer=0,
              load_model=1):

        iterator = self._iterator
        task_infos, metric_infos = self._setup_training(
            tasks, train_params, optimizer_params, scheduler_params, iterator)
        if shared_optimizer:
            g_optimizer = Optimizer.from_params(
                train_params, copy.deepcopy(optimizer_params))
            g_scheduler = LearningRateScheduler.from_params(
                g_optimizer, copy.deepcopy(scheduler_params))
        else:
            g_optimizer, g_scheduler = None, None
        self._g_optimizer = g_optimizer
        self._g_scheduler = g_scheduler

        n_pass, should_stop = 0, False  # define these here b/c they might get overridden on load
        if self._serialization_dir is not None:  # Resume from serialization path
            if load_model \
                    and any(["model_state_epoch_" in x for x in os.listdir(self._serialization_dir)]):
                n_pass, should_stop = self._restore_checkpoint()
                logger.info(
                    "Loaded model from checkpoint. Starting at pass %d",
                    n_pass)

        if self._grad_clipping is not None:  # pylint: disable=invalid-unary-operand-type
            clip_function = lambda grad: grad.clamp(-self._grad_clipping, self.
                                                    _grad_clipping)
            for parameter in self._model.parameters():
                if parameter.requires_grad:
                    parameter.register_hook(clip_function)

        if weighting_method == 'uniform':
            sample_weights = [1] * len(tasks)
        elif weighting_method == 'proportional':
            sample_weights = [
                task_infos[task.name]['n_tr_batches'] for task in tasks
            ]
            max_weight = max(sample_weights)
            min_weight = min(sample_weights)
        samples = random.choices(tasks,
                                 weights=sample_weights,
                                 k=validation_interval)

        logger.info("Beginning training.")
        all_tr_metrics = {}
        while not should_stop:
            self._model.train()

            # randomly select a task
            task = samples[n_pass % (validation_interval)]
            task_info = task_infos[task.name]
            if task_info['stopped']:
                continue
            tr_generator = task_info['tr_generator']
            optimizer = g_optimizer if shared_optimizer else task_info[
                'optimizer']
            total_batches_trained = task_info['total_batches_trained']
            n_batches_since_val = task_info['n_batches_since_val']
            tr_loss = task_info['loss']
            for batch in itertools.islice(tr_generator, n_batches_per_pass):
                n_batches_since_val += 1
                total_batches_trained += 1
                optimizer.zero_grad()
                output_dict = self._forward(batch,
                                            task=task,
                                            for_training=True)
                assert "loss" in output_dict, "Model must return a dict containing a 'loss' key"
                loss = output_dict["loss"]  # optionally scale loss
                if scaling_method == 'unit' and weighting_method == 'proportional':
                    loss /= task_info['n_tr_batches']
                elif scaling_method == 'max' and weighting_method == 'proportional':
                    loss *= (max_weight / task_info['n_tr_batches'])
                elif scaling_method == 'min' and weighting_method == 'proportional':
                    loss *= (min_weight / task_info['n_tr_batches'])
                loss.backward()
                tr_loss += loss.data.cpu().numpy()

                # Gradient regularization and application
                if self._grad_norm:
                    clip_grad_norm(self._model.parameters(), self._grad_norm)
                optimizer.step()

                n_pass += 1  # update per batch

            # Update training progress on that task
            task_info['n_batches_since_val'] = n_batches_since_val
            task_info['total_batches_trained'] = total_batches_trained
            task_info['loss'] = tr_loss

            # Intermediate logging
            if time.time() - task_info['last_log'] > self._log_interval:
                task_metrics = task.get_metrics()
                task_metrics["%s_loss" %
                             task.name] = tr_loss / n_batches_since_val
                description = self._description_from_metrics(task_metrics)
                logger.info("Update %d: task %s, batch %d (%d): %s", n_pass,
                            task.name, n_batches_since_val,
                            total_batches_trained, description)
                task_info['last_log'] = time.time()

            # Validation
            if n_pass % (validation_interval) == 0:
                epoch = int(n_pass / validation_interval)
                logger.info("***** Pass %d / Epoch %d *****", n_pass, epoch)
                # Get metrics for all training progress so far
                for task in tasks:
                    task_info = task_infos[task.name]
                    n_batches_since_val = task_info['n_batches_since_val']
                    if n_batches_since_val > 0:
                        task_metrics = task.get_metrics(reset=True)
                        for name, value in task_metrics.items():
                            all_tr_metrics["%s_%s" % (task.name, name)] = value
                        all_tr_metrics["%s_loss" % task.name] = \
                                float(task_info['loss'] / n_batches_since_val)
                    else:
                        all_tr_metrics["%s_loss" % task.name] = 0.0
                    logger.info(
                        "%s: trained on %d batches, %.3f epochs", task.name,
                        n_batches_since_val,
                        n_batches_since_val / task_info['n_tr_batches'])

                # Validate
                logger.info("Validating...")
                all_val_metrics, should_save, task_infos, metric_infos = \
                        self._validate(epoch, tasks, task_infos, metric_infos, iterator, g_scheduler)

                # Check stopping conditions
                should_stop, task_infos, metric_infos = \
                        self._check_stop(epoch, tasks, task_infos, metric_infos, g_optimizer)

                # Log results
                for name, value in all_val_metrics.items():
                    logger.info("Statistic: %s", name)
                    if name in all_tr_metrics:
                        logger.info("\ttraining: %3f", all_tr_metrics[name])
                    logger.info("\tvalidation: %3f", value)

                self._metric_infos = metric_infos
                self._task_infos = task_infos
                all_tr_metrics = {}
                samples = random.choices(tasks,
                                         weights=sample_weights,
                                         k=validation_interval)

                if should_save:
                    self._save_checkpoint({
                        "epoch": epoch,
                        "should_stop": should_stop
                    })

        logging.info('Stopped training after %d validation checks',
                     n_pass / validation_interval)
        return self._aggregate_results(tasks, task_infos,
                                       metric_infos)  #, validation_interval)
 def _get_optimizer(self, lr: float = 1.0):
     optimizer_params = Params({"type": "sgd", "lr": lr})
     optimizer_params["parameter_groups"] = [[[f"^{m}"], {}] for m in self.model._modules]
     return Optimizer.from_params(self.model.named_parameters(), optimizer_params)
Example #33
0
    def from_params(
            cls,  # type: ignore
            model: Model,
            serialization_dir: str,
            iterator: DataIterator,
            train_data: Iterable[Instance],
            validation_data: Optional[Iterable[Instance]],
            params: Params,
            validation_iterator: DataIterator = None) -> 'Trainer':
        # pylint: disable=arguments-differ
        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

        if lr_scheduler_params:
            scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            scheduler = None

        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool(
            "should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                                   False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=scheduler,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=
            keep_serialized_model_every_num_seconds,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period)
 def test_no_metric_wrapper_can_support_none_for_metrics(self):
     model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     lrs = LearningRateScheduler.from_params(Optimizer.from_params(model.named_parameters(),
                                                                   Params({"type": "adam"})),
                                             Params({"type": "step", "step_size": 1}))
     lrs.step(None, None)
 def test_reduce_on_plateau_works_when_metrics_exist(self):
     model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     LearningRateScheduler.from_params(
         Optimizer.from_params(model.named_parameters(),
                               Params({"type": "adam"})),
         Params({"type": "reduce_on_plateau"})).step(10, None)
Example #36
0
    def from_params(cls,
                    model: Model,
                    serialization_dir: str,
                    iterator: DataIterator,
                    train_data: Iterable[Instance],
                    validation_data: Optional[Iterable[Instance]],
                    params: Params,
                    validation_iterator: DataIterator = None) -> 'Trainer':

        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = params.pop_int("cuda_device", -1)
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)

        if cuda_device >= 0:
            model = model.cuda(cuda_device)
        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

        if lr_scheduler_params:
            scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            scheduler = None

        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)

        params.assert_empty(cls.__name__)
        return Trainer(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=scheduler,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=
            keep_serialized_model_every_num_seconds,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval)
Example #37
0
    def from_params(  # type: ignore
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        cache_directory: str = None,
        cache_prefix: str = None,
    ) -> "CallbackTrainer":
        pieces = TrainerPieces.from_params(params, serialization_dir, recover,
                                           cache_directory, cache_prefix)
        model = pieces.model
        params = pieces.params
        validation_iterator = pieces.validation_iterator or pieces.iterator

        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))

        check_for_gpu(cuda_device)
        if cuda_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(cuda_device)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

        callbacks_params = params.pop("callbacks", [])
        callbacks: List[Callback] = [
            Callback.from_params(
                params=callback_params,
                model=model,
                optimizer=optimizer,
                instances=pieces.train_dataset,
                iterator=pieces.iterator,
                shuffle=shuffle,
                validation_data=pieces.validation_dataset,
                validation_iterator=validation_iterator,
                serialization_dir=serialization_dir,
            ) for callback_params in callbacks_params
        ]

        distributed = params.pop_bool("distributed", False)
        world_size = params.pop_int("world_size", 1)

        if distributed:
            rank = cuda_device
        else:
            rank = 0

        params.assert_empty(cls.__name__)
        return cls(
            model,
            pieces.train_dataset,
            pieces.iterator,
            optimizer,
            num_epochs=num_epochs,
            shuffle=shuffle,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            callbacks=callbacks,
            distributed=distributed,
            rank=rank,
            world_size=world_size,
        )
Example #38
0
def _from_params(
        cls,  # type: ignore
        model: Model,
        serialization_dir: str,
        iterator: DataIterator,
        train_data: Iterable[Instance],
        validation_data: Optional[Iterable[Instance]],
        params: Params,
        validation_iterator: DataIterator = None) -> DecompTrainer:
    # pylint: disable=arguments-differ
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)

    num_epochs = params.pop_int("num_epochs", 20)

    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)

    validation_data_path = params.pop("validation_data_path", None)
    validation_prediction_path = params.pop("validation_prediction_path", None)

    semantics_only = params.pop("semantics_only", False)
    drop_syntax = params.pop("drop_syntax", True)
    include_attribute_scores = params.pop("include_attribute_scores", False)

    warmup_epochs = params.pop("warmup_epochs", 0)

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    bert_optim_params = params.pop("bert_optimizer", None)
    bert_name = "_bert_encoder"

    if bert_optim_params is not None:
        tune_after_layer_num = params.pop("bert_tune_layer", 12)

        frozen_regex_str = [
            "(_bert_encoder\.bert_model\.embeddings.*)",
            "(_bert_encoder\.bert_model\.pooler.*)"
        ]
        tune_regex_str = []
        for i in range(0, 12):
            # match all numbers greater than layer num via disjunction
            tune_regex_one = f"({bert_name}\.bert_model\.encoder\.layer\.{i}\..*)"
            if i >= tune_after_layer_num:
                tune_regex_str.append(tune_regex_one)
            else:
                frozen_regex_str.append(tune_regex_one)
        tune_regex = re.compile("|".join(tune_regex_str))
        frozen_regex = re.compile("|".join(frozen_regex_str))
        # decide which params require grad for which optimizer
        all_names = [n for n, p in model.named_parameters()]
        tune_bert_names = [
            n for n in all_names if tune_regex.match(n) is not None
        ]
        frozen_names = [
            n for n in all_names if frozen_regex.match(n) is not None
        ]
        # assert that they're disjoint
        assert (len(set(frozen_names) & set(tune_bert_names)) == 0)
        # set tunable params to require gradient, frozen ones to not require
        for i, (n, p) in enumerate(model.named_parameters()):
            if n in frozen_names:
                p.requires_grad = False
            else:
                p.requires_grad = True

        # extract BERT
        bert_params = [[n, p] for n, p in model.named_parameters()
                       if p.requires_grad and n in tune_bert_names]
        # make sure this matches the tuneable bert params
        assert ([x[0] for x in bert_params] == tune_bert_names)
        bert_optimizer = Optimizer.from_params(bert_params, bert_optim_params)
    else:
        # freeze all BERT params
        tune_bert_names = []
        bert_optimizer = None
        for i, (n, p) in enumerate(model.named_parameters()):
            if "_bert_encoder" in n:
                p.requires_grad = False

    # model params
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad and n not in tune_bert_names]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(
            optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(
            optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=
            keep_serialized_model_every_num_seconds)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool(
        "should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                               False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    syntactic_method = params.pop("syntactic_method", None)
    accumulate_batches = params.pop("accumulate_batches", 1)

    params.assert_empty(cls.__name__)
    return cls(model=model,
               optimizer=optimizer,
               bert_optimizer=bert_optimizer,
               iterator=iterator,
               train_dataset=train_data,
               validation_dataset=validation_data,
               validation_data_path=validation_data_path,
               validation_prediction_path=validation_prediction_path,
               semantics_only=semantics_only,
               warmup_epochs=warmup_epochs,
               syntactic_method=syntactic_method,
               drop_syntax=drop_syntax,
               include_attribute_scores=include_attribute_scores,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average,
               accumulate_batches=accumulate_batches)
    def from_params(
            cls,  # type: ignore
            params: Params,
            serialization_dir: str,
            recover: bool = False,
            cache_directory: str = None,
            cache_prefix: str = None) -> 'Trainer':
        # pylint: disable=arguments-differ
        # We have to call TrainerPieces.from_params since we are using our own Trainer
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)

        model = pieces.model
        serialization_dir = serialization_dir
        iterator = pieces.iterator
        train_data = pieces.train_dataset
        validation_data = pieces.validation_dataset
        validation_iterator = pieces.validation_iterator
        params = pieces.params

        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        accumulation_steps = params.pop("accumulation_steps", 0)
        opt_level = params.pop("opt_level", "O1")
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)
        half_precision = params.pop("half_precision", False)
        warmup_proportion = params.pop("warmup_proportion", None)
        pretrained_model = params.pop("pretrained_model", None)

        if pretrained_model:
            logger.info('Loading pretrained model from', pretrained_model)
            model = load_archive(pretrained_model).model
            model._discriminative_loss_weight = 1  # TODO: fix this hack

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters)
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(
                optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if 'checkpointer' in params:
            if 'keep_serialized_model_every_num_seconds' in params or \
              'num_serialized_models_to_keep' in params:
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods.")
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int(
                "num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None)
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=
                keep_serialized_model_every_num_seconds)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool(
            "should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                                   False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            accumulation_steps=accumulation_steps,
            opt_level=opt_level,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
            half_precision=half_precision,
            warmup_proportion=warmup_proportion)
Example #40
0
    def from_params(  # type: ignore
        cls,
        model: Model,
        serialization_dir: str,
        iterator: DataIterator,
        train_data: Iterable[Instance],
        validation_data: Optional[Iterable[Instance]],
        params: Params,
        validation_iterator: DataIterator = None,
    ) -> "Trainer":

        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters)
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(
                optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if "checkpointer" in params:
            if "keep_serialized_model_every_num_seconds" in params \
                    or "num_serialized_models_to_keep" in params:
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods.")
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int(
                "num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None)
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=
                keep_serialized_model_every_num_seconds,
            )
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool(
            "should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                                   False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
        )
Example #41
0
 def test_no_metric_wrapper_can_support_none_for_metrics(self):
     model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     lrs = LearningRateScheduler.from_params(Optimizer.from_params(model.named_parameters(),
                                                                   Params({"type": "adam"})),
                                             Params({"type": "step", "step_size": 1}))
     lrs.step(None, None)