Example #1
    def load_checkpoint(self) -> Optional[TrainerCheckpoint]:
        """
        Loads model state from a `serialization_dir` corresponding to the last saved checkpoint.
        This includes a training state, which is serialized separately from model parameters. This function
        should only be used to continue training - if you wish to load a model for inference/load parts
        of a model into a new computation graph, you should use the native PyTorch functions:
        `model.load_state_dict(torch.load("/path/to/model/weights.th"))`

        If `self._serialization_dir` does not exist or does not contain any checkpointed weights,
        this function will do nothing and return `None`.

        # Returns

        checkpoint : `Optional[TrainerCheckpoint]`
            The model state and the training state, or `None` if no checkpoint was found.
        """
        latest_checkpoint = self.find_latest_checkpoint()
        if latest_checkpoint is None:
            return None

        model_path, training_state_path = latest_checkpoint

        # Load the parameters onto CPU, then transfer to GPU.
        # This avoids potential OOM on GPU for large models that
        # load parameters onto GPU then make a new GPU copy into the parameter
        # buffer. The GPU transfer happens implicitly in load_state_dict.
        model_state = torch.load(model_path,
                                 map_location=nn_util.device_mapping(-1))
        training_state = torch.load(training_state_path,
                                    map_location=nn_util.device_mapping(-1))
        return TrainerCheckpoint(model_state, training_state)
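Every example here passes a `device_mapping` helper to `torch.load` as `map_location`. For reference, a minimal sketch of that helper, modeled on AllenNLP's `nn_util.device_mapping`; treat the exact signature as an assumption if your version differs:

import torch

def device_mapping(cuda_device: int):
    """Build a `map_location` for `torch.load`: tensors are moved to the
    given GPU, or stay on CPU when `cuda_device` is -1."""
    def inner_device_mapping(storage, location):
        if cuda_device >= 0:
            return storage.cuda(cuda_device)
        return storage
    return inner_device_mapping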
Example #2
    def restore_checkpoint(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """
        Restores a model from a serialization_dir to the last saved checkpoint.
        This includes a training state (typically consisting of an epoch count and optimizer state),
        which is serialized separately from model parameters. This function should only be used to
        continue training - if you wish to load a model for inference/load parts of a model into a new
        computation graph, you should use the native PyTorch functions:
        ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``
        If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
        this function will do nothing and return empty dicts.
        Returns
        -------
        states: Tuple[Dict[str, Any], Dict[str, Any]]
            The model state and the training state.
        """
        latest_checkpoint = self.find_latest_checkpoint()

        if latest_checkpoint is None:
            # No checkpoint to restore, start at 0
            return {}, {}

        model_path, training_state_path = latest_checkpoint

        # Load the parameters onto CPU, then transfer to GPU.
        # This avoids potential OOM on GPU for large models that
        # load parameters onto GPU then make a new GPU copy into the parameter
        # buffer. The GPU transfer happens implicitly in load_state_dict.
        model_state = torch.load(model_path,
                                 map_location=nn_util.device_mapping(-1))
        training_state = torch.load(training_state_path,
                                    map_location=nn_util.device_mapping(-1))
        return model_state, training_state
Example #3
def restore_checkpoint(model, optimizer, serialization_dir, learning_rate_scheduler=None):
    """
    Restores a model from a serialization_dir to the last saved checkpoint.
    This includes an epoch count and optimizer state, which is serialized separately
    from model parameters. This function should only be used to continue training -
    if you wish to load a model for inference/load parts of a model into a new
    computation graph, you should use the native PyTorch functions:
    ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``
    If ``serialization_dir`` does not exist or does not contain any checkpointed weights,
    this function will do nothing and return ``(0, [])``.
    Returns
    -------
    epoch: int
        The epoch at which to resume training, which should be one after the epoch
        in the saved training state.
    val_metric_per_epoch: List[float]
        The per-epoch validation metric history, or an empty list if the saved
        training state does not include one.
    """
    latest_checkpoint = find_latest_checkpoint(serialization_dir)

    if latest_checkpoint is None:
        # No checkpoint to restore, start at 0
        return 0, []

    model_path, training_state_path = latest_checkpoint

    # Load the parameters onto CPU, then transfer to GPU.
    # This avoids potential OOM on GPU for large models that
    # load parameters onto GPU then make a new GPU copy into the parameter
    # buffer. The GPU transfer happens implicitly in load_state_dict.
    model_state = torch.load(model_path, map_location=device_mapping(-1))
    training_state = torch.load(training_state_path, map_location=device_mapping(-1))
    if isinstance(model, DataParallel):
        model.module.load_state_dict(model_state)
    else:
        model.load_state_dict(model_state)

    # Restore the optimizer state so training resumes with the same
    # momentum/learning-rate statistics.
    optimizer.load_state_dict(training_state["optimizer"])

    if learning_rate_scheduler is not None and "learning_rate_scheduler" in training_state:
        learning_rate_scheduler.lr_scheduler.load_state_dict(
            training_state["learning_rate_scheduler"])
    move_optimizer_to_cuda(optimizer)

    # We didn't used to save `validation_metric_per_epoch`, so we can't assume
    # that it's part of the trainer state. If it's not there, an empty list is all
    # we can do.
    if "val_metric_per_epoch" not in training_state:
        print("trainer state `val_metric_per_epoch` not found, using empty list")
        val_metric_per_epoch = []
    else:
        val_metric_per_epoch = training_state["val_metric_per_epoch"]

    if isinstance(training_state["epoch"], int):
        epoch_to_return = training_state["epoch"] + 1
    else:
        epoch_to_return = int(training_state["epoch"].split('.')[0]) + 1
    return epoch_to_return, val_metric_per_epoch
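A minimal resume sketch built on the function above; `model`, `optimizer`, `num_epochs`, and `train_one_epoch` are hypothetical stand-ins for your own training objects:

# Resume from the latest checkpoint in `checkpoints/` (placeholder path).
start_epoch, val_metric_per_epoch = restore_checkpoint(model, optimizer, "checkpoints/")
for epoch in range(start_epoch, num_epochs):
    train_one_epoch(model, optimizer, epoch)  # hypothetical training step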
Example #4
    def _restore_checkpoint(self) -> Tuple[int, List[float]]:
        """
        Restores a model from a serialization_dir to the last saved checkpoint.
        This includes an epoch count and optimizer state, which is serialized separately
        from model parameters. This function should only be used to continue training -
        if you wish to load a model for inference/load parts of a model into a new
        computation graph, you should use the native PyTorch functions:
        ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

        If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
        this function will do nothing and return ``(0, [])``.

        Returns
        -------
        epoch: int
            The epoch at which to resume training, which should be one after the epoch
            in the saved training state.
        val_metric_per_epoch: List[float]
            The per-epoch validation metric history, or an empty list if the saved
            training state does not include one.
        """
        latest_checkpoint = self.find_latest_checkpoint()

        if latest_checkpoint is None:
            # No checkpoint to restore, start at 0
            return 0, []

        model_path, training_state_path = latest_checkpoint

        # Load the parameters onto CPU, then transfer to GPU.
        # This avoids potential OOM on GPU for large models that
        # load parameters onto GPU then make a new GPU copy into the parameter
        # buffer. The GPU transfer happens implicitly in load_state_dict.
        model_state = torch.load(model_path, map_location=util.device_mapping(-1))
        training_state = torch.load(training_state_path, map_location=util.device_mapping(-1))
        self._model.load_state_dict(model_state)
        self._optimizer.load_state_dict(training_state["optimizer"])
        move_optimizer_to_cuda(self._optimizer)

        # We didn't used to save `validation_metric_per_epoch`, so we can't assume
        # that it's part of the trainer state. If it's not there, an empty list is all
        # we can do.
        if "val_metric_per_epoch" not in training_state:
            logger.warning("trainer state `val_metric_per_epoch` not found, using empty list")
            val_metric_per_epoch: List[float] = []
        else:
            val_metric_per_epoch = training_state["val_metric_per_epoch"]

        if isinstance(training_state["epoch"], int):
            epoch_to_return = training_state["epoch"] + 1
        else:
            epoch_to_return = int(training_state["epoch"].split('.')[0]) + 1

        # For older checkpoints with batch_num_total missing, default to old behavior where
        # it is unchanged.
        batch_num_total = training_state.get('batch_num_total')
        if batch_num_total is not None:
            self._batch_num_total = batch_num_total

        return epoch_to_return, val_metric_per_epoch
Example #5
    def _restore_checkpoint(self) -> Tuple[int, List[float]]:
        """
        Restores a model from a serialization_dir to the last saved checkpoint.
        This includes an epoch count and optimizer state, which is serialized separately
        from model parameters. This function should only be used to continue training -
        if you wish to load a model for inference/load parts of a model into a new
        computation graph, you should use the native PyTorch functions:
        ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

        If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
        this function will do nothing and return ``(0, [])``.

        Returns
        -------
        epoch: int
            The epoch at which to resume training, which should be one after the epoch
            in the saved training state.
        val_metric_per_epoch: List[float]
            The per-epoch validation metric history, or an empty list if the saved
            training state does not include one.
        """
        latest_checkpoint = self.find_latest_checkpoint()

        if latest_checkpoint is None:
            # No checkpoint to restore, start at 0
            return 0, []

        model_path, training_state_path = latest_checkpoint

        # Load the parameters onto CPU, then transfer to GPU.
        # This avoids potential OOM on GPU for large models that
        # load parameters onto GPU then make a new GPU copy into the parameter
        # buffer. The GPU transfer happens implicitly in load_state_dict.
        model_state = torch.load(model_path, map_location=util.device_mapping(-1))
        training_state = torch.load(training_state_path, map_location=util.device_mapping(-1))
        self._model.load_state_dict(model_state)
        self._optimizer.load_state_dict(training_state["optimizer"])
        move_optimizer_to_cuda(self._optimizer)

        # We didn't used to save `validation_metric_per_epoch`, so we can't assume
        # that it's part of the trainer state. If it's not there, an empty list is all
        # we can do.
        if "val_metric_per_epoch" not in training_state:
            logger.warning("trainer state `val_metric_per_epoch` not found, using empty list")
            val_metric_per_epoch: List[float] = []
        else:
            val_metric_per_epoch = training_state["val_metric_per_epoch"]

        if isinstance(training_state["epoch"], int):
            epoch_to_return = training_state["epoch"] + 1
        else:
            epoch_to_return = int(training_state["epoch"].split('.')[0]) + 1

        # For older checkpoints with batch_num_total missing, default to old behavior where
        # it is unchanged.
        batch_num_total = training_state.get('batch_num_total')
        if batch_num_total is not None:
            self._batch_num_total = batch_num_total

        return epoch_to_return, val_metric_per_epoch
Example #6
    def _restore_checkpoint(self) -> Tuple[int, List[float]]:
        """
        Restores a model from a serialization_dir to the last saved checkpoint.
        This includes an epoch count and optimizer state, which is serialized separately
        from model parameters. This function should only be used to continue training -
        if you wish to load a model for inference/load parts of a model into a new
        computation graph, you should use the native PyTorch functions:
        ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

        If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
        this function will do nothing and return ``(0, [])``.

        Returns
        -------
        epoch: int
            The epoch at which to resume training, which should be one after the epoch
            in the saved training state.
        val_metric_per_epoch: List[float]
            The per-epoch validation metric history, or an empty list if the saved
            training state does not include one.
        """
        have_checkpoint = (self._serialization_dir is not None and
                           any("model_state_epoch_" in x for x in os.listdir(self._serialization_dir)))

        if not have_checkpoint:
            # No checkpoint to restore, start at 0
            return 0, []

        serialization_files = os.listdir(self._serialization_dir)
        model_checkpoints = [x for x in serialization_files if "model_state_epoch" in x]
        epoch_to_load = max([int(x.split("model_state_epoch_")[-1].strip(".th")) for x in model_checkpoints])

        model_path = os.path.join(self._serialization_dir,
                                  "model_state_epoch_{}.th".format(epoch_to_load))
        training_state_path = os.path.join(self._serialization_dir,
                                           "training_state_epoch_{}.th".format(epoch_to_load))

        model_state = torch.load(model_path, map_location=util.device_mapping(self._cuda_device))
        training_state = torch.load(training_state_path, map_location=util.device_mapping(self._cuda_device))
        self._model.load_state_dict(model_state)
        self._optimizer.load_state_dict(training_state["optimizer"])

        # We didn't used to save `validation_metric_per_epoch`, so we can't assume
        # that it's part of the trainer state. If it's not there, an empty list is all
        # we can do.
        if "val_metric_per_epoch" not in training_state:
            logger.warning("trainer state `val_metric_per_epoch` not found, using empty list")
            val_metric_per_epoch: List[float] = []
        else:
            val_metric_per_epoch = training_state["val_metric_per_epoch"]

        return training_state["epoch"] + 1, val_metric_per_epoch
Example #7
    def _load(cls,
              config: Params,
              serialization_dir: str,
              weights_file: str = None,
              cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
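A hedged usage sketch for the loader above, assuming the usual AllenNLP layout where the experiment config is saved as `config.json` in the serialization directory and `_load` is exposed as a classmethod:

# Load an archived model for inference on CPU; the path is a placeholder.
serialization_dir = "runs/my_experiment"
config = Params.from_file(os.path.join(serialization_dir, "config.json"))
model = Model._load(config, serialization_dir, cuda_device=-1)
model.eval()  # disable dropout etc. before predicting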
Example #8
    def _restore_checkpoint(self) -> int:
        """
        Restores a model from a serialization_dir to the last saved checkpoint.
        This includes an epoch count and optimizer state, which is serialized separately
        from model parameters. This function should only be used to continue training -
        if you wish to load a model for inference/load parts of a model into a new
        computation graph, you should use the native PyTorch functions:
        ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

        Returns
        -------
        epoch: int
            The epoch at which to resume training.
        """
        if not self._serialization_dir:
            raise ConfigurationError("serialization_dir not specified - cannot "
                                     "restore a model without a directory path.")

        serialization_files = os.listdir(self._serialization_dir)
        model_checkpoints = [x for x in serialization_files if "model_state_epoch" in x]
        epoch_to_load = max([int(x.split("model_state_epoch_")[-1].strip(".th")) for x in model_checkpoints])

        model_path = os.path.join(self._serialization_dir,
                                  "model_state_epoch_{}.th".format(epoch_to_load))
        training_state_path = os.path.join(self._serialization_dir,
                                           "training_state_epoch_{}.th".format(epoch_to_load))

        model_state = torch.load(model_path, map_location=device_mapping(self._cuda_device))
        training_state = torch.load(training_state_path, map_location=device_mapping(self._cuda_device))
        self._model.load_state_dict(model_state)
        self._optimizer.load_state_dict(training_state["optimizer"])
        return training_state["epoch"]
Example #9
    def load(self, path, model: Union[str, int] = 'default', vocab=None):
        model_file = None
        if isinstance(model, int):
            model_file = 'model_state_epoch_{}.th'.format(model)
        elif isinstance(model, str):
            if model.endswith('.th'):
                model_file = model
            elif model == 'best':
                model_file = 'best.th'
            elif model == 'last':
                model_file = sorted(glob(os.path.join(path, "model_state_epoch_*.th")), key=lambda p: os.path.getmtime(p)).pop()
            elif model == 'default':
                model_file = 'model.th'
        if model_file is None:
            raise ValueError(('`model` in `load()` must be one of the following: '
                              'an integer epoch number, '
                              'a string name ending with ".th", '
                              'the string "best" for the best model, '
                              'the string "last" for the last saved model, '
                              'or the string "default" for just "model.th".'))
        # best/default/epoch names are files inside `path`; resolve relative
        # names against it before opening.
        if not os.path.isabs(model_file):
            model_file = os.path.join(path, model_file)
        vocab = vocab or os.path.join(path, 'vocab')

        if torch.cuda.is_available() and not DEBUG_TRAINING:
            device = 0
            self.model.cuda()
        else:
            device = -1

        #print('Loading vocab from {}'.format(vocab))
        self.model.vocab = Vocabulary.from_files(vocab)
        self.model.extend_embedder_vocab()
        #print('Loading model from {}'.format(model_file))
        with open(model_file, 'rb') as fin:
            self.model.load_state_dict(torch.load(fin, map_location=device_mapping(device)))
Example #10
    def _load(cls,
              config: Params,
              serialization_dir: str,
              weights_file: str = None,
              cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #11
def load_teacher_model(teacher_path=None, device=-1):
    models = {}
    if teacher_path is not None:
        for tea_path in teacher_path.split(","):
            # teacher path is something like "Models/HotpotQA,Models/SQuAD"
            tea_name = tea_path.split("/")[-1]
            config = Params.from_file(os.path.join(tea_path, CONFIG_NAME))
            vocab_tea = Vocabulary.from_files(
                os.path.join(tea_path, "vocabulary"))
            model = Model.from_params(vocab=vocab_tea,
                                      params=config.get("model"))

            tea_model = copy.deepcopy(model)
            model_state = torch.load(
                os.path.join(tea_path, "best.th"),
                map_location=nn_util.device_mapping(device))
            tea_model.load_state_dict(model_state)
            logger.info("Loaded teacher model from %s", tea_path)

            # freeze the parameters of teacher model
            for p in tea_model.parameters():
                p.requires_grad = False

            if device >= 0:
                tea_model.to(device=device)
            models[tea_name] = tea_model

    return models
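A hypothetical call site for the helper above, following the comma-separated path convention described in its comment:

teachers = load_teacher_model("Models/HotpotQA,Models/SQuAD", device=0)
squad_teacher = teachers["SQuAD"]  # frozen copy, ready for distillation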
Example #12
def evaluate():
    reader = PWKPReader()
    vocab = Vocabulary.from_files(vocab_dir)
    iterator = BasicIterator(batch_size=opt.batch_size)
    iterator.index_with(vocab)

    model = Seq2Seq(emb_size=opt.emb_size,
                    hidden_size=opt.hidden_size,
                    enc_layers=opt.enc_layers,
                    dec_layers=opt.dec_layers,
                    dropout=opt.dropout,
                    bidirectional=opt.bidirectional,
                    beam_size=opt.beam_size,
                    label_smoothing=opt.label_smoothing,
                    vocab=vocab)

    model = model.cuda(opt.gpu)
    model_state = torch.load(opt.restore, map_location=util.device_mapping(-1))
    model.load_state_dict(model_state)

    predictor = Predictor(iterator=iterator,
                          max_decoding_step=opt.max_step,
                          vocab=vocab,
                          reader=reader,
                          data_path=test_path,
                          log_dir=save_dir,
                          map_path=ner_path,
                          cuda_device=opt.gpu)

    predictor.evaluate(model)
Example #13
def restore_checkpoint_flexible(model, fn):
    assert os.path.exists(fn)
    model_state = torch.load(fn, map_location=device_mapping(-1))
    if isinstance(model, DataParallel):
        load_state_dict_flexible(model.module, model_state)
    else:
        load_state_dict_flexible(model, model_state)
Example #14
def load_model_state(model,
                     state_path,
                     gpu_id,
                     skip_task_models=[],
                     strict=True):
    """ Helper function to load a model state

    Parameters
    ----------
    model: The model object to populate with loaded parameters.
    state_path: The path to a model_state checkpoint.
    gpu_id: The GPU to use. -1 for no GPU.
    skip_task_models: If set, skip task-specific parameters for these tasks.
        This does not necessarily skip loading ELMo scalar weights, but I (Sam) sincerely
        doubt that this matters.
    strict: Whether we should fail if any parameters aren't found in the checkpoint. If false,
        there is a risk of leaving some parameters in their randomly initialized state.
    """
    model_state = torch.load(state_path, map_location=device_mapping(gpu_id))

    assert_for_log(
        not (skip_task_models and strict),
        "Can't skip task models while also strictly loading task models. Something is wrong.",
    )

    for name, param in model.named_parameters():
        # Make sure no trainable params are missing.
        if param.requires_grad:
            if strict:
                assert_for_log(
                    name in model_state,
                    "In strict mode and failed to find at least one parameter: "
                    + name,
                )
            elif (name not in model_state) and ((not skip_task_models) or
                                                ("_mdl" not in name)):
                logging.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                logging.error("Parameter missing from checkpoint: " + name)
                logging.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    if skip_task_models:
        keys_to_skip = []
        for task in skip_task_models:
            new_keys_to_skip = [
                key for key in model_state if "%s_mdl" % task in key
            ]
            if new_keys_to_skip:
                logging.info(
                    "Not loading task-specific parameters for task: %s" % task)
                keys_to_skip += new_keys_to_skip
            else:
                logging.info(
                    "Found no task-specific parameters to skip for task: %s" %
                    task)
        for key in keys_to_skip:
            del model_state[key]

    model.load_state_dict(model_state, strict=False)
    logging.info("Loaded model state from %s", state_path)
Example #15
    def load(cls,
             config: Params,
             serialization_prefix: str = None,
             weights_file: str = None,
             cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.

        Parameters
        ----------
        config: Params
            The configuration that was used to train the model. It should definitely
            have a `model` section, and should probably have a `trainer` section
            as well.
        serialization_prefix: str = None
            By default we look at `config['trainer']['serialization_prefix']` to
            get the path to the serialized model, but you can override that
            value here.
        weights_file: str = None
            By default we load the weights from `best.th` in the serialization
            directory, but you can override that value here.
        cuda_device: int = -1
            By default we load the model on the CPU, but if you want to load it
            for GPU usage you can specify the id of your GPU here.

        Returns
        -------
        model: Model
            The model specified in the configuration, loaded with the serialized
            vocabulary and the trained weights.
        """
        trainer_config = config.get("trainer", {})
        serialization_prefix = (serialization_prefix
                                or trainer_config.get('serialization_prefix'))
        if serialization_prefix is None:
            raise ConfigurationError('serialization_prefix must be specified')

        weights_file = weights_file or os.path.join(serialization_prefix,
                                                    _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_prefix, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model = Model.from_params(vocab, config.get('model'))
        model_state = torch.load(weights_file,
                                 map_location=device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #16
    def load(cls,
             config: Params,
             serialization_dir: str,
             weights_file: str = None,
             cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.

        Parameters
        ----------
        config: Params
            The configuration that was used to train the model. It should definitely
            have a `model` section, and should probably have a `trainer` section
            as well.
        serialization_dir: str = None
            The directory containing the serialized weights, parameters, and vocabulary
            of the model.
        weights_file: str = None
            By default we load the weights from `best.th` in the serialization
            directory, but you can override that value here.
        cuda_device: int = -1
            By default we load the model on the CPU, but if you want to load it
            for GPU usage you can specify the id of your GPU here.

        Returns
        -------
        model: Model
            The model specified in the configuration, loaded with the serialized
            vocabulary and the trained weights.
        """
        weights_file = weights_file or os.path.join(serialization_dir,
                                                    _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        _remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab, model_params)
        model_state = torch.load(weights_file,
                                 map_location=device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #17
def restore_best_checkpoint(model, serialization_dir):
    fn = os.path.join(serialization_dir, 'best.th')
    assert os.path.exists(fn)
    model_state = torch.load(fn, map_location=device_mapping(-1))
    if isinstance(model, DataParallel):
        model.module.load_state_dict(model_state)
    else:
        model.load_state_dict(model_state)
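A companion saving sketch (an assumption about the saving side, not shown above): storing the unwrapped `module` state keeps the checkpoint loadable with or without DataParallel:

to_save = model.module if isinstance(model, DataParallel) else model
torch.save(to_save.state_dict(), os.path.join(serialization_dir, 'best.th'))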
Example #18
    def __init__(self,
                 vocab: Vocabulary,
                 soldered_kgs: Dict[str, Model],
                 soldered_layers: Dict[str, int],
                 bert_model_name: str,
                 mode: str = None,
                 model_archive: str = None,
                 strict_load_archive: bool = True,
                 debug_cuda: bool = False,
                 remap_segment_embeddings: int = None,
                 regularizer: RegularizerApplicator = None):

        super().__init__(vocab, regularizer)

        self.remap_segment_embeddings = remap_segment_embeddings

        # get the LM + NSP parameters from BERT
        pretrained_bert = BertForPreTraining.from_pretrained(bert_model_name)
        self.pretrained_bert = pretrained_bert
        self.pretraining_heads = pretrained_bert.cls
        self.pooler = pretrained_bert.bert.pooler

        # the soldered kgs
        self.soldered_kgs = soldered_kgs
        for key, skg in soldered_kgs.items():
            self.add_module(key + "_soldered_kg", skg)

        # list of (layer_number, soldered key) sorted in ascending order
        self.layer_to_soldered_kg = sorted(
            [(layer, key) for key, layer in soldered_layers.items()]
        )
        # the last layer
        num_bert_layers = len(self.pretrained_bert.bert.encoder.layer)
        # the first element of the list is the index
        self.layer_to_soldered_kg.append([num_bert_layers - 1, None])

        if model_archive is not None:
            with tarfile.open(cached_path(model_archive), 'r:gz') as fin:
                # a file object
                weights_file = fin.extractfile('weights.th')
                state_dict = torch.load(weights_file, map_location=device_mapping(-1))
            self.load_state_dict(state_dict, strict=strict_load_archive)

        if remap_segment_embeddings is not None:
            # will redefine the segment embeddings
            new_embeddings = self._remap_embeddings(self.pretrained_bert.bert.embeddings.token_type_embeddings.weight)
            if new_embeddings is not None:
                del self.pretrained_bert.bert.embeddings.token_type_embeddings
                self.pretrained_bert.bert.embeddings.token_type_embeddings = new_embeddings

        assert mode in (None, 'entity_linking')
        self.mode = mode
        self.unfreeze()

        if debug_cuda:
            for m in self.modules():
                m.register_forward_hook(diagnose_forward_hook)
                m.register_backward_hook(diagnose_backward_hook)
Example #19
def evaluate():
    if opt.part == 'table2pivot':
        corpus = Table2PivotCorpus(vocab_size=opt.vocab_size, 
                                    max_len=opt.src_max_len, 
                                    batch_size=opt.batch_size,
                                    log_dir=opt.dir,
                                    scale=opt.scale,
                                    mode=opt.mode)
    else:
        corpus = Pivot2TextCorpus(vocab_size=opt.vocab_size, 
                                    src_max_len=opt.src_max_len, 
                                    tgt_max_len=opt.tgt_max_len, 
                                    batch_size=opt.batch_size,
                                    share=opt.share,
                                    log_dir=opt.dir,
                                    scale=opt.scale,
                                    append_rate=opt.append_rate,
                                    drop_rate=opt.drop_rate,
                                    blank_rate=opt.blank_rate,
                                    setting=opt.setting,
                                    mode=opt.mode,
                                    use_feature=opt.feature)

    model = Pivot(emb_size=opt.emb_size,
                    key_emb_size=opt.key_emb_size,
                    pos_emb_size=opt.pos_emb_size,
                    hidden_size=opt.hidden_size,
                    n_hidden=opt.n_hidden,
                    n_block=opt.n_block,
                    ff_size=opt.ff_size,
                    n_head=opt.n_head,
                    enc_layers=opt.enc_layers,
                    dec_layers=opt.dec_layers,
                    dropout=opt.dropout,
                    bidirectional=opt.bidirectional,
                    beam_size=opt.beam_size,
                    max_decoding_step=opt.max_step,
                    minimum_length=opt.minimum_length,
                    label_smoothing=opt.label_smoothing,
                    share=opt.share,
                    part=opt.part,
                    vocab=corpus.vocab,
                    use_feature=opt.feature,
                    arch=opt.arch)
    
    if opt.fp16:
        model.half()

    model = model.cuda(opt.gpu)
    model_state = torch.load(opt.restore, map_location=util.device_mapping(-1))
    model.load_state_dict(model_state)

    predictor = Predictor(dataset=corpus.test_dataset,
                          dataloader=corpus.test_loader,
                          corpus=corpus,
                          cuda_device=opt.gpu)
    
    predictor.evaluate(model)
Example #20
    def load(cls,
             config: Params,
             serialization_dir: str,
             weights_file: str = None,
             cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.

        Parameters
        ----------
        config: Params
            The configuration that was used to train the model. It should definitely
            have a `model` section, and should probably have a `trainer` section
            as well.
        serialization_dir: str = None
            The directory containing the serialized weights, parameters, and vocabulary
            of the model.
        weights_file: str = None
            By default we load the weights from `best.th` in the serialization
            directory, but you can override that value here.
        cuda_device: int = -1
            By default we load the model on the CPU, but if you want to load it
            for GPU usage you can specify the id of your GPU here.

        Returns
        -------
        model: Model
            The model specified in the configuration, loaded with the serialized
            vocabulary and the trained weights.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        _remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab, model_params)
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #21
    def _load(
        cls,
        config: Params,
        serialization_dir: Union[str, PathLike],
        weights_file: Optional[Union[str, PathLike]] = None,
        cuda_device: int = -1,
    ) -> "Model":
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir,
                                                    _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        # If the config specifies a vocabulary subclass, we need to use it.
        vocab_params = config.get("vocabulary", Params({}))
        vocab_choice = vocab_params.pop_choice("type",
                                               Vocabulary.list_available(),
                                               True)
        vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
        vocab = vocab_class.from_files(vocab_dir,
                                       vocab_params.get("padding_token"),
                                       vocab_params.get("oov_token"))

        model_params = config.get("model")

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        # If vocab+embedding extension was done, the model initialized by from_params
        # and the one defined by the state dict in weights_file might not have the same
        # embedding shapes. E.g. when a model embedder module was transferred along with
        # vocab extension, the initialized embedding weight shape would be smaller than
        # the one in the state_dict. So calling the model's embedding extension is
        # required before load_state_dict. If vocab and model embeddings are already in
        # sync, the following is just a no-op.
        model.extend_embedder_vocab()

        model_state = torch.load(weights_file,
                                 map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        return model
Example #22
 def best_model_state(self) -> Dict[str, Any]:
     if self._serialization_dir:
         logger.info("loading best weights")
         best_model_state_path = os.path.join(self._serialization_dir, "best.th")
         return torch.load(best_model_state_path, map_location=nn_util.device_mapping(-1))
     else:
         logger.info(
             "cannot load best weights without `serialization_dir`, "
             "so you're just getting the last weights"
         )
         return {}
Example #23
 def _load_weights(self):
     try:
         # TODO(change)
         # device = get_device_of(next(self.parameters()))
         device = -1
         model_state = torch.load(self._weight_file, map_location=device_mapping(device))
     except Exception:
         model_state = None
     self._load_char_embedding(model_state)
     self._load_cnn_weights(model_state)
     self._load_highway(model_state)
     self._load_projection(model_state)
Example #24
def restore_model_state(model, model_path):
    assert os.path.exists(model_path)

    # Load the parameters onto CPU, then transfer to GPU.
    # This avoids potential OOM on GPU for large models that
    # load parameters onto GPU then make a new GPU copy into the parameter
    # buffer. The GPU transfer happens implicitly in load_state_dict.
    model_state = torch.load(model_path, map_location=device_mapping(-1))
    if isinstance(model, (DataParallel, DistributedDataParallel)):
        model.module.load_state_dict(model_state)
    else:
        model.load_state_dict(model_state)
Example #25
def load_encoder(encoder, ckpt_path):
    from src import device, cuda_device

    checkpoint = torch.load(ckpt_path,
                            map_location=device_mapping(cuda_device))
    model_state_dict = checkpoint["model"]
    encoder_state_dict = get_encoder_state_dict(model_state_dict, "encoder")
    encoder.load_state_dict(encoder_state_dict, strict=False)

    print(encoder_state_dict.keys())
    log.info(f"Load pretrained model: {ckpt_path}")
    perplexity = re.findall(".*best(.*).pt", ckpt_path)[0]
    log.info("pretrain perplexity: %.2f", float(perplexity))
Example #26
 def test(self, data):
     """
         Computes the metrics for data
     """
     model_path = os.path.join(self._serialization_dir, "best.th")
     logger.info("Loading best model from {0}".format(model_path))
     model_state = torch.load(model_path,
                              map_location=util.device_mapping(-1))
     self._model.load_state_dict(model_state)
     loss, num_batches = self._inference_loss(data, logger_string="Testing")
     metrics = self._get_metrics(loss, num_batches, reset=True)
     metric_names = metrics.keys()
     message_template = "%s %s : %3f"
     for name in metric_names:
         test_metric = metrics.get(name)
         logger.info(message_template, "Test", name, test_metric)
     return metrics
Example #27
def load_checkpoint(
    model, generator, n_tr_batches, checkpoint_path, metric, optimizers
):
    if not isinstance(optimizers, list):
        optimizers = [optimizers]
    checkpoint = torch.load(checkpoint_path, map_location=device_mapping(cuda_device))
    model.load_state_dict(checkpoint["model"], strict=False)
    step = checkpoint["step"]
    if metric is not None:
        metric.load_state_dict(checkpoint["metric"])

    for i, opt in enumerate(optimizers):
        opt.load_state_dict(checkpoint["optimizer" + str(i)])
    for _ in itertools.islice(generator, step % n_tr_batches):
        pass
    log.info(f"Load checkpoint: {checkpoint_path}")
    # return model, generator, step, metric, optimizers
    return step
Example #28
    def restore_checkpoint_pretrained(self, restore_bin: str):
        # Restore from a given model path
        state_dict = torch.load(restore_bin, map_location=device_mapping(-1))
        if isinstance(self.model, DataParallel):
            model_to_load = self.model.module
        else:
            model_to_load = self.model

        own_state = model_to_load.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                print("Skipped: " + name)
                continue
            if isinstance(param, torch.nn.Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            try:
                own_state[name].copy_(param)
            except Exception:
                print("Part load failed: " + name)
Example #29
    def _load(cls,
              config: Params,
              serialization_dir: str,
              weights_file: str = None,
              cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir,
                                                    _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        # If the config specifies a vocabulary subclass, we need to use it.
        vocab_params = config.get("vocabulary", Params({}))
        vocab_choice = vocab_params.pop_choice("type",
                                               Vocabulary.list_available(),
                                               True)
        vocab = Vocabulary.by_name(vocab_choice).from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)
        model_state = torch.load(weights_file,
                                 map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #30
    def _restore_checkpoint(self) -> Tuple[int, List[float]]:
        latest_checkpoint = self.find_latest_checkpoint()

        if latest_checkpoint is None:
            return 0, []

        model_path, training_state_path = latest_checkpoint
        training_state = torch.load(training_state_path,
                                    map_location=util.device_mapping(-1))

        if isinstance(training_state["epoch"], int):
            epoch_to_return = training_state["epoch"] + 1
        else:
            epoch_to_return = int(training_state["epoch"].split('.')[0]) + 1

        if "val_metric_per_epoch" not in training_state:
            logger.warning(
                "trainer state `val_metric_per_epoch` not found, using empty list"
            )
            val_metric_per_epoch: List[float] = []
        else:
            val_metric_per_epoch = training_state["val_metric_per_epoch"]

        return epoch_to_return, val_metric_per_epoch
Example #31
    def _load(
        cls,
        config: Params,
        serialization_dir: str,
        weights_file: Optional[str] = None,
        cuda_device: int = -1,
        opt_level: Optional[str] = None,
    ) -> "Model":
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        # If the config specifies a vocabulary subclass, we need to use it.
        vocab_params = config.get("vocabulary", Params({}))
        vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
        vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
        vocab = vocab_class.from_files(
            vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
        )

        model_params = config.get("model")

        training_params = config.get("trainer", Params({}))
        opt_level = opt_level or training_params.get("opt_level")

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        # If opt_level is not None (i.e. it exists in the loaded models params or was provided
        # as argument to this method), call amp.initialize on the loaded model.
        # Log a warning if amp is not installed or we are loading onto the cpu so that these
        # cases do not pass silently.
        if opt_level is not None:
            if amp is None:
                logger.warning(
                    (
                        f"Apex must be installed to enable mixed-precision via amp."
                        f" Got opt_level is not None (opt_level={opt_level}) but Apex is not installed."
                        " Any further training or inference will happen at full-precision."
                    )
                )
            if cuda_device == -1:
                logger.warning(
                    (
                        f"A CUDA device must be specified to enable mixed-precision via amp."
                        f" Got cuda_device=={cuda_device} but opt_level is not None (opt_level={opt_level})."
                        " Any further training or inference will happen at full-precision."
                    )
                )
            if amp is not None and cuda_device >= 0:
                model = amp.initialize(model, opt_level=opt_level)

        # If vocab+embedding extension was done, the model initialized from from_params
        # and one defined by state dict in weights_file might not have same embedding shapes.
        # Eg. when model embedder module was transferred along with vocab extension, the
        # initialized embedding weight shape would be smaller than one in the state_dict.
        # So calling model embedding extension is required before load_state_dict.
        # If vocab and model embeddings are in sync, following would be just a no-op.
        model.extend_embedder_vocab()

        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        return model
Example #32
    def _restore_checkpoint(self) -> Tuple[int, List[float]]:
        """
        Restores a model from a serialization_dir to the last saved checkpoint.
        This includes an epoch count and optimizer state, which is serialized separately
        from model parameters. This function should only be used to continue training -
        if you wish to load a model for inference/load parts of a model into a new
        computation graph, you should use the native PyTorch functions:
        ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

        If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
        this function will do nothing and return ``(0, [])``.

        Returns
        -------
        epoch: int
            The epoch at which to resume training, which should be one after the epoch
            in the saved training state.
        val_metric_per_epoch: List[float]
            The per-epoch validation metric history, or an empty list if the saved
            training state does not include one.
        """
        have_checkpoint = (self._serialization_dir is not None and
                           any("model_state_epoch_" in x for x in os.listdir(self._serialization_dir)))

        if not have_checkpoint:
            # No checkpoint to restore, start at 0
            return 0, []

        serialization_files = os.listdir(self._serialization_dir)
        model_checkpoints = [x for x in serialization_files if "model_state_epoch" in x]
        # Get the last checkpoint file.  Epochs are specified as either an
        # int (for end of epoch files) or with epoch and timestamp for
        # within epoch checkpoints, e.g. 5.2018-02-02-15-33-42
        found_epochs = [
                re.search(r"model_state_epoch_([0-9\.\-]+)\.th", x).group(1)
                for x in model_checkpoints
        ]
        int_epochs: Any = []
        for epoch in found_epochs:
            pieces = epoch.split('.')
            if len(pieces) == 1:
                # Just a single epoch without timestamp
                int_epochs.append([int(pieces[0]), 0])
            else:
                # has a timestamp
                int_epochs.append([int(pieces[0]), pieces[1]])
        last_epoch = sorted(int_epochs, reverse=True)[0]
        if last_epoch[1] == 0:
            epoch_to_load = str(last_epoch[0])
        else:
            epoch_to_load = '{0}.{1}'.format(last_epoch[0], last_epoch[1])

        model_path = os.path.join(self._serialization_dir,
                                  "model_state_epoch_{}.th".format(epoch_to_load))
        training_state_path = os.path.join(self._serialization_dir,
                                           "training_state_epoch_{}.th".format(epoch_to_load))

        # Load the parameters onto CPU, then transfer to GPU.
        # This avoids potential OOM on GPU for large models that
        # load parameters onto GPU then make a new GPU copy into the parameter
        # buffer. The GPU transfer happens implicitly in load_state_dict.
        model_state = torch.load(model_path, map_location=util.device_mapping(-1))
        training_state = torch.load(training_state_path, map_location=util.device_mapping(-1))
        self._model.load_state_dict(model_state)
        self._optimizer.load_state_dict(training_state["optimizer"])

        # We didn't used to save `validation_metric_per_epoch`, so we can't assume
        # that it's part of the trainer state. If it's not there, an empty list is all
        # we can do.
        if "val_metric_per_epoch" not in training_state:
            logger.warning("trainer state `val_metric_per_epoch` not found, using empty list")
            val_metric_per_epoch: List[float] = []
        else:
            val_metric_per_epoch = training_state["val_metric_per_epoch"]

        if isinstance(training_state["epoch"], int):
            epoch_to_return = training_state["epoch"] + 1
        else:
            epoch_to_return = int(training_state["epoch"].split('.')[0]) + 1

        # For older checkpoints with batch_num_total missing, default to old behavior where
        # it is unchanged.
        batch_num_total = training_state.get('batch_num_total')
        if batch_num_total is not None:
            self._batch_num_total = batch_num_total

        return epoch_to_return, val_metric_per_epoch
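The checkpoint-selection logic above is easiest to verify in isolation. The following is a minimal, self-contained sketch of the same filename parsing and sorting (the helper name `find_latest_epoch` and the sample filenames are illustrative, not part of the original code):

import re

def find_latest_epoch(checkpoint_names):
    """Return the epoch suffix of the newest checkpoint file.

    End-of-epoch files look like `model_state_epoch_5.th`; within-epoch
    files add a timestamp, e.g. `model_state_epoch_5.2018-02-02-15-33-42.th`.
    """
    found_epochs = [
        re.search(r"model_state_epoch_([0-9\.\-]+)\.th", name).group(1)
        for name in checkpoint_names
    ]
    keyed = []
    for epoch in found_epochs:
        pieces = epoch.split('.')
        # Use "0" as the timestamp placeholder so comparisons stay str-to-str.
        keyed.append((int(pieces[0]), pieces[1] if len(pieces) > 1 else "0"))
    epoch_num, timestamp = sorted(keyed, reverse=True)[0]
    return str(epoch_num) if timestamp == "0" else '{0}.{1}'.format(epoch_num, timestamp)

# find_latest_epoch(["model_state_epoch_4.th",
#                    "model_state_epoch_5.2018-02-02-15-33-42.th"])
# -> "5.2018-02-02-15-33-42"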
Example #33
0
    @classmethod
    def _load(
        cls,
        config: Params,
        serialization_dir: Union[str, PathLike],
        weights_file: Optional[Union[str, PathLike]] = None,
        cuda_device: int = -1,
    ) -> "Model":
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        # If the config specifies a vocabulary subclass, we need to use it.
        vocab_params = config.get("vocabulary", Params({}))
        vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
        vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
        vocab = vocab_class.from_files(
            vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
        )

        model_params = config.get("model")

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings/weights from. We're now _loading_ the model, so those weights will already be
        # stored in our model. We don't need any pretrained weight file or initializers anymore,
        # and we don't want the code to look for it, so we remove it from the parameters here.
        remove_keys_from_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        # If a vocab+embedding extension was done, the model initialized via `from_params`
        # and the one defined by the state dict in `weights_file` might not have the same
        # embedding shapes. E.g. when a model embedder module was transferred along with
        # a vocab extension, the initialized embedding weight shape would be smaller than
        # the one in the state_dict. So calling the model's embedding extension is required
        # before `load_state_dict`. If the vocab and model embeddings are in sync, the
        # following is just a no-op.
        model.extend_embedder_vocab()

        # Load state dict. We pass `strict=False` so PyTorch doesn't raise a RuntimeError
        # if the state dict is missing keys because we handle this case below.
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        missing_keys, unexpected_keys = model.load_state_dict(model_state, strict=False)

        # Modules might define a class variable called `authorized_missing_keys`,
        # a list of regex patterns, that tells us to ignore missing keys that match
        # any of the patterns.
        # We sometimes need this in order to load older models with newer versions of AllenNLP.

        def filter_out_authorized_missing_keys(module, prefix=""):
            nonlocal missing_keys
            for pat in getattr(module.__class__, "authorized_missing_keys", None) or []:
                missing_keys = [
                    k
                    for k in missing_keys
                    if k.startswith(prefix) and re.search(pat[len(prefix) :], k) is None
                ]
            for name, child in module._modules.items():
                if child is not None:
                    filter_out_authorized_missing_keys(child, prefix + name + ".")

        filter_out_authorized_missing_keys(model)

        if unexpected_keys or missing_keys:
            raise RuntimeError(
                f"Error loading state dict for {model.__class__.__name__}\n\t"
                f"Missing keys: {missing_keys}\n\t"
                f"Unexpected keys: {unexpected_keys}"
            )

        return model
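The `authorized_missing_keys` mechanism above is easiest to see on a toy module. A hypothetical sketch (the `OldClassifier` class, its layers, and the regex pattern are invented for illustration):

import torch

class OldClassifier(torch.nn.Module):
    # Keys matching these regexes may legitimately be absent from older
    # checkpoints; the loader should not treat them as errors.
    authorized_missing_keys = [r"^extra_head\."]

    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Linear(4, 4)
        # Added after old checkpoints were saved, so old state dicts lack it.
        self.extra_head = torch.nn.Linear(4, 2)

model = OldClassifier()
old_state = {k: v for k, v in model.state_dict().items()
             if not k.startswith("extra_head.")}
missing_keys, unexpected_keys = model.load_state_dict(old_state, strict=False)
# missing_keys contains "extra_head.weight" and "extra_head.bias"; both match
# an authorized pattern, so `filter_out_authorized_missing_keys` would drop
# them instead of letting `_load` raise a RuntimeError.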
Example #34
0
    @classmethod
    def from_params(cls,
                    params: Params,
                    serialization_dir: str,
                    recover: bool = False,
                    cache_directory: Optional[str] = None,
                    cache_prefix: Optional[str] = None) -> 'PtDistTrainer':
        all_datasets = training_util.datasets_from_params(
            params, cache_directory, cache_prefix)
        vocab = Vocabulary.from_files(params.vocabulary.directory_path)

        model = Model.from_params(vocab=vocab, params=params.pop('model'))
        model.extend_embedder_vocab()
        if is_master_rank():
            vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

        train_data = all_datasets['train']
        validation_data = all_datasets.get('validation')

        batch_size = params.iterator.batch_size

        trainer_params = params.pop("trainer")
        # Drain any remaining top-level params; only the trainer section is used below.
        for key in list(params):
            params.pop(key)
        params = trainer_params
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        pretrain_file = params.pop("pretrain_file", None)

        no_grad_regexes = params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = \
            get_frozen_and_tunable_parameter_names(model)
        logger.info("Following parameters are Frozen  (without gradient):")
        for name in frozen_parameter_names:
            logger.info(name)
        logger.info("Following parameters are Tunable (with gradient):")
        for name in tunable_parameter_names:
            logger.info(name)

        model = model.cuda(dist.get_rank())
        if pretrain_file:
            model_state = torch.load(pretrain_file,
                                     map_location=nn_util.device_mapping(
                                         dist.get_rank()))
            model.load_state_dict(model_state)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None

        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=None)

        return cls(model,
                   optimizer,
                   train_data,
                   validation_data,
                   batch_size=batch_size,
                   validation_metric=validation_metric,
                   shuffle=shuffle,
                   num_epochs=num_epochs,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   grad_clipping=grad_clipping,
                   learning_rate_scheduler=lr_scheduler,
                   checkpointer=checkpointer)
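The `no_grad` regex freezing in `from_params` can be exercised on its own. A minimal sketch (the toy model and the regex are made up for illustration):

import re
import torch

model = torch.nn.Sequential(
    torch.nn.Linear(8, 8),  # parameters named "0.weight" and "0.bias"
    torch.nn.Linear(8, 2),  # parameters named "1.weight" and "1.bias"
)

# Freeze every parameter whose name matches one of these regexes,
# mirroring the `no_grad` handling above.
no_grad_regexes = [r"^0\."]
for name, parameter in model.named_parameters():
    if any(re.search(regex, name) for regex in no_grad_regexes):
        parameter.requires_grad_(False)

frozen = [n for n, p in model.named_parameters() if not p.requires_grad]
tunable = [n for n, p in model.named_parameters() if p.requires_grad]
# frozen  -> ['0.weight', '0.bias']
# tunable -> ['1.weight', '1.bias']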