def load_checkpoint(self) -> Optional[TrainerCheckpoint]:
    """
    Loads model state from a `serialization_dir` corresponding to the last saved checkpoint.
    This includes a training state, which is serialized separately from model parameters.

    This function should only be used to continue training -
    if you wish to load a model for inference/load parts of a model into a new
    computation graph, you should use the native Pytorch functions:
    `model.load_state_dict(torch.load("/path/to/model/weights.th"))`

    If `self._serialization_dir` does not exist or does not contain any checkpointed weights,
    this function will do nothing and return `None`.

    # Returns

    checkpoint : `Optional[TrainerCheckpoint]`
        The model state and the training state, or `None` if no checkpoint was found.
    """
    latest_checkpoint = self.find_latest_checkpoint()

    if latest_checkpoint is None:
        return None

    model_path, training_state_path = latest_checkpoint

    # Load the parameters onto CPU, then transfer to GPU.
    # This avoids potential OOM on GPU for large models that
    # load parameters onto GPU then make a new GPU copy into the parameter
    # buffer. The GPU transfer happens implicitly in load_state_dict.
    model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1))
    training_state = torch.load(training_state_path, map_location=nn_util.device_mapping(-1))
    return TrainerCheckpoint(model_state, training_state)

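# A minimal, self-contained sketch of the CPU-first loading trick described in the
# comments above (the "checkpoint.th" path and the tiny Linear model are illustrative
# only). torch.load(..., map_location="cpu") keeps the checkpoint tensors on CPU;
# load_state_dict then copies them into the model's existing (possibly GPU-resident)
# parameter buffers, so two full GPU copies of the weights never coexist.
import torch

model = torch.nn.Linear(4, 2)
torch.save(model.state_dict(), "checkpoint.th")

restored = torch.nn.Linear(4, 2)
if torch.cuda.is_available():
    restored = restored.cuda()
state = torch.load("checkpoint.th", map_location=torch.device("cpu"))
restored.load_state_dict(state)  # the CPU -> GPU copy happens inside this call
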
def restore_checkpoint(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Restores a model from a serialization_dir to the last saved checkpoint.
    This includes a training state (typically consisting of an epoch count and
    optimizer state), which is serialized separately from model parameters. This function
    should only be used to continue training - if you wish to load a model for inference/load
    parts of a model into a new computation graph, you should use the native Pytorch functions:
    ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

    If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
    this function will do nothing and return empty dicts.

    Returns
    -------
    states : Tuple[Dict[str, Any], Dict[str, Any]]
        The model state and the training state.
    """
    latest_checkpoint = self.find_latest_checkpoint()

    if latest_checkpoint is None:
        # No checkpoint to restore, start at 0
        return {}, {}

    model_path, training_state_path = latest_checkpoint

    # Load the parameters onto CPU, then transfer to GPU.
    # This avoids potential OOM on GPU for large models that
    # load parameters onto GPU then make a new GPU copy into the parameter
    # buffer. The GPU transfer happens implicitly in load_state_dict.
    model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1))
    training_state = torch.load(training_state_path, map_location=nn_util.device_mapping(-1))
    return model_state, training_state

def restore_checkpoint(model, optimizer, serialization_dir, learning_rate_scheduler=None):
    """
    Restores a model from a serialization_dir to the last saved checkpoint.
    This includes an epoch count and optimizer state, which is serialized separately
    from model parameters. This function should only be used to continue training -
    if you wish to load a model for inference/load parts of a model into a new
    computation graph, you should use the native Pytorch functions:
    ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

    If ``serialization_dir`` does not exist or does not contain any checkpointed weights,
    this function will do nothing and return ``(0, [])``.

    Returns
    -------
    epoch : int
        The epoch at which to resume training, which should be one after the epoch
        in the saved training state.
    val_metric_per_epoch : list
        The per-epoch validation metrics saved so far, or an empty list if none were saved.
    """
    latest_checkpoint = find_latest_checkpoint(serialization_dir)

    if latest_checkpoint is None:
        # No checkpoint to restore, start at 0
        return 0, []

    model_path, training_state_path = latest_checkpoint

    # Load the parameters onto CPU, then transfer to GPU.
    # This avoids potential OOM on GPU for large models that
    # load parameters onto GPU then make a new GPU copy into the parameter
    # buffer. The GPU transfer happens implicitly in load_state_dict.
    model_state = torch.load(model_path, map_location=device_mapping(-1))
    training_state = torch.load(training_state_path, map_location=device_mapping(-1))

    if isinstance(model, DataParallel):
        model.module.load_state_dict(model_state)
    else:
        model.load_state_dict(model_state)

    optimizer.load_state_dict(training_state["optimizer"])
    if learning_rate_scheduler is not None and "learning_rate_scheduler" in training_state:
        learning_rate_scheduler.lr_scheduler.load_state_dict(
            training_state["learning_rate_scheduler"])
    move_optimizer_to_cuda(optimizer)

    # We didn't used to save `validation_metric_per_epoch`, so we can't assume
    # that it's part of the trainer state. If it's not there, an empty list is all
    # we can do.
    if "val_metric_per_epoch" not in training_state:
        print("trainer state `val_metric_per_epoch` not found, using empty list")
        val_metric_per_epoch = []
    else:
        val_metric_per_epoch = training_state["val_metric_per_epoch"]

    if isinstance(training_state["epoch"], int):
        epoch_to_return = training_state["epoch"] + 1
    else:
        epoch_to_return = int(training_state["epoch"].split('.')[0]) + 1

    return epoch_to_return, val_metric_per_epoch

def _restore_checkpoint(self) -> Tuple[int, List[float]]:
    """
    Restores a model from a serialization_dir to the last saved checkpoint.
    This includes an epoch count and optimizer state, which is serialized separately
    from model parameters. This function should only be used to continue training -
    if you wish to load a model for inference/load parts of a model into a new
    computation graph, you should use the native Pytorch functions:
    ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

    If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
    this function will do nothing and return ``(0, [])``.

    Returns
    -------
    epoch : int
        The epoch at which to resume training, which should be one after the epoch
        in the saved training state.
    """
    latest_checkpoint = self.find_latest_checkpoint()

    if latest_checkpoint is None:
        # No checkpoint to restore, start at 0
        return 0, []

    model_path, training_state_path = latest_checkpoint

    # Load the parameters onto CPU, then transfer to GPU.
    # This avoids potential OOM on GPU for large models that
    # load parameters onto GPU then make a new GPU copy into the parameter
    # buffer. The GPU transfer happens implicitly in load_state_dict.
    model_state = torch.load(model_path, map_location=util.device_mapping(-1))
    training_state = torch.load(training_state_path, map_location=util.device_mapping(-1))

    self._model.load_state_dict(model_state)
    self._optimizer.load_state_dict(training_state["optimizer"])
    move_optimizer_to_cuda(self._optimizer)

    # We didn't used to save `validation_metric_per_epoch`, so we can't assume
    # that it's part of the trainer state. If it's not there, an empty list is all
    # we can do.
    if "val_metric_per_epoch" not in training_state:
        logger.warning("trainer state `val_metric_per_epoch` not found, using empty list")
        val_metric_per_epoch: List[float] = []
    else:
        val_metric_per_epoch = training_state["val_metric_per_epoch"]

    if isinstance(training_state["epoch"], int):
        epoch_to_return = training_state["epoch"] + 1
    else:
        epoch_to_return = int(training_state["epoch"].split('.')[0]) + 1

    # For older checkpoints with batch_num_total missing, default to old behavior where
    # it is unchanged.
    batch_num_total = training_state.get('batch_num_total')
    if batch_num_total is not None:
        self._batch_num_total = batch_num_total

    return epoch_to_return, val_metric_per_epoch

def _restore_checkpoint(self) -> Tuple[int, List[float]]:
    """
    Restores a model from a serialization_dir to the last saved checkpoint.
    This includes an epoch count and optimizer state, which is serialized separately
    from model parameters. This function should only be used to continue training -
    if you wish to load a model for inference/load parts of a model into a new
    computation graph, you should use the native Pytorch functions:
    ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

    If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
    this function will do nothing and return ``(0, [])``.

    Returns
    -------
    epoch : int
        The epoch at which to resume training, which should be one after the epoch
        in the saved training state.
    """
    have_checkpoint = (self._serialization_dir is not None and
                       any("model_state_epoch_" in x for x in os.listdir(self._serialization_dir)))

    if not have_checkpoint:
        # No checkpoint to restore, start at 0
        return 0, []

    serialization_files = os.listdir(self._serialization_dir)
    model_checkpoints = [x for x in serialization_files if "model_state_epoch" in x]
    epoch_to_load = max([int(x.split("model_state_epoch_")[-1].strip(".th"))
                         for x in model_checkpoints])

    model_path = os.path.join(self._serialization_dir,
                              "model_state_epoch_{}.th".format(epoch_to_load))
    training_state_path = os.path.join(self._serialization_dir,
                                       "training_state_epoch_{}.th".format(epoch_to_load))

    model_state = torch.load(model_path, map_location=util.device_mapping(self._cuda_device))
    training_state = torch.load(training_state_path,
                                map_location=util.device_mapping(self._cuda_device))
    self._model.load_state_dict(model_state)
    self._optimizer.load_state_dict(training_state["optimizer"])

    # We didn't used to save `validation_metric_per_epoch`, so we can't assume
    # that it's part of the trainer state. If it's not there, an empty list is all
    # we can do.
    if "val_metric_per_epoch" not in training_state:
        logger.warning("trainer state `val_metric_per_epoch` not found, using empty list")
        val_metric_per_epoch: List[float] = []
    else:
        val_metric_per_epoch = training_state["val_metric_per_epoch"]

    return training_state["epoch"] + 1, val_metric_per_epoch

def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model

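# A minimal sketch of what `remove_pretrained_embedding_params` accomplishes, using an
# illustrative plain-dict config (the real AllenNLP helper walks nested `Params`
# objects; the helper name and config shape here are assumptions for the example).
# At training time "pretrained_file" points at e.g. GloVe vectors; at load time the
# trained embeddings are already in the checkpoint, so the key is dropped to stop the
# code from re-reading (and possibly failing to find) the original vector file.
def strip_pretrained_file(params: dict) -> None:
    params.pop("pretrained_file", None)
    for value in params.values():
        if isinstance(value, dict):
            strip_pretrained_file(value)

model_config = {
    "text_field_embedder": {
        "tokens": {"type": "embedding", "embedding_dim": 100,
                   "pretrained_file": "glove.6B.100d.txt.gz"},
    },
}
strip_pretrained_file(model_config)
assert "pretrained_file" not in model_config["text_field_embedder"]["tokens"]
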
def _restore_checkpoint(self) -> int:
    """
    Restores a model from a serialization_dir to the last saved checkpoint.
    This includes an epoch count and optimizer state, which is serialized separately
    from model parameters. This function should only be used to continue training -
    if you wish to load a model for inference/load parts of a model into a new
    computation graph, you should use the native Pytorch functions:
    ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

    Returns
    -------
    epoch : int
        The epoch at which to resume training.
    """
    if not self._serialization_dir:
        raise ConfigurationError("serialization_dir not specified - cannot "
                                 "restore a model without a directory path.")

    serialization_files = os.listdir(self._serialization_dir)
    model_checkpoints = [x for x in serialization_files if "model_state_epoch" in x]
    epoch_to_load = max([int(x.split("model_state_epoch_")[-1].strip(".th"))
                         for x in model_checkpoints])

    model_path = os.path.join(self._serialization_dir,
                              "model_state_epoch_{}.th".format(epoch_to_load))
    training_state_path = os.path.join(self._serialization_dir,
                                       "training_state_epoch_{}.th".format(epoch_to_load))

    model_state = torch.load(model_path, map_location=device_mapping(self._cuda_device))
    # Map the training state too, so optimizer tensors saved on another device
    # are deserialized onto the configured one.
    training_state = torch.load(training_state_path,
                                map_location=device_mapping(self._cuda_device))
    self._model.load_state_dict(model_state)
    self._optimizer.load_state_dict(training_state["optimizer"])
    return training_state["epoch"]

def load(self, path, model: Union[str, int] = 'default', vocab=None):
    model_file = None
    if isinstance(model, int):
        model_file = os.path.join(path, 'model_state_epoch_{}.th'.format(model))
    elif isinstance(model, str):
        if model.endswith('.th'):
            model_file = os.path.join(path, model)
        elif model == 'best':
            model_file = os.path.join(path, 'best.th')
        elif model == 'last':
            model_file = sorted(glob(os.path.join(path, "model_state_epoch_*.th")),
                                key=os.path.getmtime).pop()
        elif model == 'default':
            model_file = os.path.join(path, 'model.th')
    if model_file is None:
        raise ValueError(('`model` in `load()` must be one of the following: '
                          'an integer for epoch number, '
                          'a string name ending with ".th", '
                          'the string "best" for the best model, '
                          'the string "last" for the last saved model, '
                          'or the string "default" for just "model.th".'))

    vocab = vocab or os.path.join(path, 'vocab')

    if torch.cuda.is_available() and not DEBUG_TRAINING:
        device = 0
        self.model.cuda()
    else:
        device = -1

    self.model.vocab = Vocabulary.from_files(vocab)
    self.model.extend_embedder_vocab()
    with open(model_file, 'rb') as fin:
        self.model.load_state_dict(torch.load(fin, map_location=device_mapping(device)))

def _load(cls, config, serialization_dir, weights_file=None, cuda_device=-1):
    u"""
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, u'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get(u'model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model

def load_teacher_model(teacher_path=None, device=-1):
    models = {}
    if teacher_path is not None:
        # teacher_path is something like "Models/HotpotQA,Models/SQuAD"
        for tea_path in teacher_path.split(","):
            tea_name = tea_path.split("/")[-1]
            config = Params.from_file(os.path.join(tea_path, CONFIG_NAME))
            vocab_tea = Vocabulary.from_files(os.path.join(tea_path, "vocabulary"))
            model = Model.from_params(vocab=vocab_tea, params=config.get("model"))
            tea_model = copy.deepcopy(model)
            model_state = torch.load(
                os.path.join(tea_path, "best.th"),
                map_location=nn_util.device_mapping(device))
            tea_model.load_state_dict(model_state)
            logger.info("Load teacher model from %s", tea_path)

            # freeze the parameters of the teacher model
            for p in tea_model.parameters():
                p.requires_grad = False

            if device >= 0:
                tea_model.to(device=device)
            models[tea_name] = tea_model
    return models

def evaluate():
    reader = PWKPReader()
    vocab = Vocabulary.from_files(vocab_dir)
    iterator = BasicIterator(batch_size=opt.batch_size)
    iterator.index_with(vocab)
    model = Seq2Seq(emb_size=opt.emb_size,
                    hidden_size=opt.hidden_size,
                    enc_layers=opt.enc_layers,
                    dec_layers=opt.dec_layers,
                    dropout=opt.dropout,
                    bidirectional=opt.bidirectional,
                    beam_size=opt.beam_size,
                    label_smoothing=opt.label_smoothing,
                    vocab=vocab)
    model = model.cuda(opt.gpu)
    model_state = torch.load(opt.restore, map_location=util.device_mapping(-1))
    model.load_state_dict(model_state)
    predictor = Predictor(iterator=iterator,
                          max_decoding_step=opt.max_step,
                          vocab=vocab,
                          reader=reader,
                          data_path=test_path,
                          log_dir=save_dir,
                          map_path=ner_path,
                          cuda_device=opt.gpu)
    predictor.evaluate(model)

def restore_checkpoint_flexible(model, fn):
    assert os.path.exists(fn)
    model_state = torch.load(fn, map_location=device_mapping(-1))
    if isinstance(model, DataParallel):
        load_state_dict_flexible(model.module, model_state)
    else:
        load_state_dict_flexible(model, model_state)

def load_model_state(model, state_path, gpu_id, skip_task_models=(), strict=True):
    """ Helper function to load a model state

    Parameters
    ----------
    model: The model object to populate with loaded parameters.
    state_path: The path to a model_state checkpoint.
    gpu_id: The GPU to use. -1 for no GPU.
    skip_task_models: If set, skip task-specific parameters for these tasks.
        This does not necessarily skip loading ELMo scalar weights, but I (Sam) sincerely
        doubt that this matters.
    strict: Whether we should fail if any parameters aren't found in the checkpoint. If false,
        there is a risk of leaving some parameters in their randomly initialized state.
    """
    model_state = torch.load(state_path, map_location=device_mapping(gpu_id))

    assert_for_log(
        not (skip_task_models and strict),
        "Can't skip task models while also strictly loading task models. Something is wrong.",
    )

    for name, param in model.named_parameters():
        # Make sure no trainable params are missing.
        if param.requires_grad:
            if strict:
                assert_for_log(
                    name in model_state,
                    "In strict mode and failed to find at least one parameter: " + name,
                )
            elif (name not in model_state) and ((not skip_task_models) or ("_mdl" not in name)):
                logging.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                logging.error("Parameter missing from checkpoint: " + name)
                logging.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    if skip_task_models:
        keys_to_skip = []
        for task in skip_task_models:
            new_keys_to_skip = [key for key in model_state if "%s_mdl" % task in key]
            if new_keys_to_skip:
                logging.info("Not loading task-specific parameters for task: %s" % task)
                keys_to_skip += new_keys_to_skip
            else:
                logging.info("Found no task-specific parameters to skip for task: %s" % task)
        for key in keys_to_skip:
            del model_state[key]

    model.load_state_dict(model_state, strict=False)
    logging.info("Loaded model state from %s", state_path)

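# A self-contained sketch of the key-skipping idea above, on a toy model. The
# "<task>_mdl" naming convention is specific to that codebase; here the hypothetical
# convention is that task heads live under "heads.<task>". Deleting their keys and
# loading with strict=False restores the shared encoder while leaving the heads
# randomly initialized.
import torch

class MultiTaskNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Linear(8, 8)
        self.heads = torch.nn.ModuleDict({"qa": torch.nn.Linear(8, 2),
                                          "nli": torch.nn.Linear(8, 3)})

model = MultiTaskNet()
state = MultiTaskNet().state_dict()

for key in [k for k in state if k.startswith("heads.qa")]:
    del state[key]  # skip the "qa" head, analogous to skip_task_models=["qa"]

missing, unexpected = model.load_state_dict(state, strict=False)
assert all(k.startswith("heads.qa") for k in missing) and not unexpected
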
def load(cls,
         config: Params,
         serialization_prefix: str = None,
         weights_file: str = None,
         cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.

    Parameters
    ----------
    config: Params
        The configuration that was used to train the model. It should definitely
        have a `model` section, and should probably have a `trainer` section as well.
    serialization_prefix: str = None
        By default we look at `config['trainer']['serialization_prefix']` to get the path
        to the serialized model, but you can override that value here.
    weights_file: str = None
        By default we load the weights from `best.th` in the serialization directory, but
        you can override that value here.
    cuda_device: int = -1
        By default we load the model on the CPU, but if you want to load it for GPU usage
        you can specify the id of your GPU here

    Returns
    -------
    model: Model
        The model specified in the configuration, loaded with the serialized vocabulary
        and the trained weights.
    """
    trainer_config = config.get("trainer", {})
    serialization_prefix = (serialization_prefix or
                            trainer_config.get('serialization_prefix'))
    if serialization_prefix is None:
        raise ConfigurationError('serialization_prefix must be specified')

    weights_file = weights_file or os.path.join(serialization_prefix, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_prefix, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model = Model.from_params(vocab, config.get('model'))
    model_state = torch.load(weights_file, map_location=device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model

def load(cls,
         config: Params,
         serialization_dir: str,
         weights_file: str = None,
         cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.

    Parameters
    ----------
    config: Params
        The configuration that was used to train the model. It should definitely
        have a `model` section, and should probably have a `trainer` section as well.
    serialization_dir: str
        The directory containing the serialized weights, parameters, and vocabulary
        of the model.
    weights_file: str = None
        By default we load the weights from `best.th` in the serialization directory, but
        you can override that value here.
    cuda_device: int = -1
        By default we load the model on the CPU, but if you want to load it for GPU usage
        you can specify the id of your GPU here

    Returns
    -------
    model: Model
        The model specified in the configuration, loaded with the serialized vocabulary
        and the trained weights.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    _remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab, model_params)
    model_state = torch.load(weights_file, map_location=device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model

def restore_best_checkpoint(model, serialization_dir):
    fn = os.path.join(serialization_dir, 'best.th')
    assert os.path.exists(fn)
    model_state = torch.load(fn, map_location=device_mapping(-1))
    if isinstance(model, DataParallel):
        model.module.load_state_dict(model_state)
    else:
        model.load_state_dict(model_state)

def __init__(self,
             vocab: Vocabulary,
             soldered_kgs: Dict[str, Model],
             soldered_layers: Dict[str, int],
             bert_model_name: str,
             mode: str = None,
             model_archive: str = None,
             strict_load_archive: bool = True,
             debug_cuda: bool = False,
             remap_segment_embeddings: int = None,
             regularizer: RegularizerApplicator = None):
    super().__init__(vocab, regularizer)

    self.remap_segment_embeddings = remap_segment_embeddings

    # get the LM + NSP parameters from BERT
    pretrained_bert = BertForPreTraining.from_pretrained(bert_model_name)
    self.pretrained_bert = pretrained_bert
    self.pretraining_heads = pretrained_bert.cls
    self.pooler = pretrained_bert.bert.pooler

    # the soldered kgs
    self.soldered_kgs = soldered_kgs
    for key, skg in soldered_kgs.items():
        self.add_module(key + "_soldered_kg", skg)

    # list of (layer_number, soldered key) sorted in ascending order
    self.layer_to_soldered_kg = sorted(
        [(layer, key) for key, layer in soldered_layers.items()]
    )

    # the last layer
    num_bert_layers = len(self.pretrained_bert.bert.encoder.layer)
    # the first element of the list is the index
    self.layer_to_soldered_kg.append([num_bert_layers - 1, None])

    if model_archive is not None:
        with tarfile.open(cached_path(model_archive), 'r:gz') as fin:
            # a file object
            weights_file = fin.extractfile('weights.th')
            state_dict = torch.load(weights_file, map_location=device_mapping(-1))
        self.load_state_dict(state_dict, strict=strict_load_archive)

    if remap_segment_embeddings is not None:
        # will redefine the segment embeddings
        new_embeddings = self._remap_embeddings(
            self.pretrained_bert.bert.embeddings.token_type_embeddings.weight)
        if new_embeddings is not None:
            del self.pretrained_bert.bert.embeddings.token_type_embeddings
            self.pretrained_bert.bert.embeddings.token_type_embeddings = new_embeddings

    assert mode in (None, 'entity_linking')
    self.mode = mode
    self.unfreeze()

    if debug_cuda:
        for m in self.modules():
            m.register_forward_hook(diagnose_forward_hook)
            m.register_backward_hook(diagnose_backward_hook)

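# A minimal sketch of loading a state dict straight out of a .tar.gz archive, as the
# __init__ above does. The archive and member names here are written out for the
# example (AllenNLP model archives do store their weights as "weights.th");
# extractfile returns a file-like object, so no extraction to disk is needed.
import tarfile
import torch

model = torch.nn.Linear(4, 2)
torch.save(model.state_dict(), "weights.th")
with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add("weights.th")

with tarfile.open("model.tar.gz", "r:gz") as fin:
    weights_file = fin.extractfile("weights.th")
    state_dict = torch.load(weights_file, map_location=torch.device("cpu"))
model.load_state_dict(state_dict)
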
def evaluate():
    if opt.part == 'table2pivot':
        corpus = Table2PivotCorpus(vocab_size=opt.vocab_size,
                                   max_len=opt.src_max_len,
                                   batch_size=opt.batch_size,
                                   log_dir=opt.dir,
                                   scale=opt.scale,
                                   mode=opt.mode)
    else:
        corpus = Pivot2TextCorpus(vocab_size=opt.vocab_size,
                                  src_max_len=opt.src_max_len,
                                  tgt_max_len=opt.tgt_max_len,
                                  batch_size=opt.batch_size,
                                  share=opt.share,
                                  log_dir=opt.dir,
                                  scale=opt.scale,
                                  append_rate=opt.append_rate,
                                  drop_rate=opt.drop_rate,
                                  blank_rate=opt.blank_rate,
                                  setting=opt.setting,
                                  mode=opt.mode,
                                  use_feature=opt.feature)
    model = Pivot(emb_size=opt.emb_size,
                  key_emb_size=opt.key_emb_size,
                  pos_emb_size=opt.pos_emb_size,
                  hidden_size=opt.hidden_size,
                  n_hidden=opt.n_hidden,
                  n_block=opt.n_block,
                  ff_size=opt.ff_size,
                  n_head=opt.n_head,
                  enc_layers=opt.enc_layers,
                  dec_layers=opt.dec_layers,
                  dropout=opt.dropout,
                  bidirectional=opt.bidirectional,
                  beam_size=opt.beam_size,
                  max_decoding_step=opt.max_step,
                  minimum_length=opt.minimum_length,
                  label_smoothing=opt.label_smoothing,
                  share=opt.share,
                  part=opt.part,
                  vocab=corpus.vocab,
                  use_feature=opt.feature,
                  arch=opt.arch)
    if opt.fp16:
        model.half()
    model = model.cuda(opt.gpu)
    model_state = torch.load(opt.restore, map_location=util.device_mapping(-1))
    model.load_state_dict(model_state)
    predictor = Predictor(dataset=corpus.test_dataset,
                          dataloader=corpus.test_loader,
                          corpus=corpus,
                          cuda_device=opt.gpu)
    predictor.evaluate(model)

def load(cls,
         config: Params,
         serialization_dir: str,
         weights_file: str = None,
         cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.

    Parameters
    ----------
    config: Params
        The configuration that was used to train the model. It should definitely
        have a `model` section, and should probably have a `trainer` section as well.
    serialization_dir: str
        The directory containing the serialized weights, parameters, and vocabulary
        of the model.
    weights_file: str = None
        By default we load the weights from `best.th` in the serialization directory, but
        you can override that value here.
    cuda_device: int = -1
        By default we load the model on the CPU, but if you want to load it for GPU usage
        you can specify the id of your GPU here

    Returns
    -------
    model: Model
        The model specified in the configuration, loaded with the serialized vocabulary
        and the trained weights.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    _remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab, model_params)
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model

def _load(
    cls,
    config: Params,
    serialization_dir: Union[str, PathLike],
    weights_file: Optional[Union[str, PathLike]] = None,
    cuda_device: int = -1,
) -> "Model":
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, "vocabulary")
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
    vocab = vocab_class.from_files(
        vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
    )

    model_params = config.get("model")

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    # If vocab+embedding extension was done, the model initialized from from_params
    # and one defined by state dict in weights_file might not have same embedding shapes.
    # Eg. when model embedder module was transferred along with vocab extension, the
    # initialized embedding weight shape would be smaller than one in the state_dict.
    # So calling model embedding extension is required before load_state_dict.
    # If vocab and model embeddings are in sync, following would be just a no-op.
    model.extend_embedder_vocab()

    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    return model

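# A toy illustration of why the embedder has to be extended *before* load_state_dict,
# as the comment above explains: state dicts are loaded into existing buffers, so the
# shapes must already match. The sizes (10 -> 12 rows) are arbitrary, and rebuilding
# the embedding stands in for what extend_embedder_vocab does in place.
import torch

big = torch.nn.Embedding(12, 4)        # embedder saved after vocab extension
saved_state = big.state_dict()

fresh = torch.nn.Embedding(10, 4)      # freshly constructed from the original config
try:
    fresh.load_state_dict(saved_state)  # shape mismatch: 10x4 buffer vs 12x4 tensor
except RuntimeError as err:
    print("fails without extension:", err)

# Extending the embedding first (here: rebuilding it at the right size) works.
fresh = torch.nn.Embedding(12, 4)
fresh.load_state_dict(saved_state)
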
def best_model_state(self) -> Dict[str, Any]:
    if self._serialization_dir:
        logger.info("loading best weights")
        best_model_state_path = os.path.join(self._serialization_dir, "best.th")
        return torch.load(best_model_state_path, map_location=nn_util.device_mapping(-1))
    else:
        logger.info(
            "cannot load best weights without `serialization_dir`, "
            "so you're just getting the last weights"
        )
        return {}

def _load_weights(self):
    try:
        # TODO(change)
        # device = get_device_of(next(self.parameters()))
        device = -1
        model_state = torch.load(self._weight_file, map_location=device_mapping(device))
    except Exception:
        model_state = None
    self._load_char_embedding(model_state)
    self._load_cnn_weights(model_state)
    self._load_highway(model_state)
    self._load_projection(model_state)

def restore_model_state(model, model_path):
    assert os.path.exists(model_path)

    # Load the parameters onto CPU, then transfer to GPU.
    # This avoids potential OOM on GPU for large models that
    # load parameters onto GPU then make a new GPU copy into the parameter
    # buffer. The GPU transfer happens implicitly in load_state_dict.
    model_state = torch.load(model_path, map_location=device_mapping(-1))
    if isinstance(model, (DataParallel, DistributedDataParallel)):
        model.module.load_state_dict(model_state)
    else:
        model.load_state_dict(model_state)

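# A short, self-contained demonstration of why the snippets above unwrap
# (Distributed)DataParallel before loading: the wrapper registers the real model as a
# submodule named "module", so every state-dict key gains a "module." prefix and a
# plain checkpoint only matches the inner model.
import torch

inner = torch.nn.Linear(3, 3)
wrapped = torch.nn.DataParallel(inner)

print(list(inner.state_dict()))    # ['weight', 'bias']
print(list(wrapped.state_dict()))  # ['module.weight', 'module.bias']

plain_ckpt = torch.nn.Linear(3, 3).state_dict()
wrapped.module.load_state_dict(plain_ckpt)  # works: keys line up with the inner model
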
def load_encoder(encoder, ckpt_path):
    from src import device, cuda_device

    checkpoint = torch.load(ckpt_path, map_location=device_mapping(cuda_device))
    model_state_dict = checkpoint["model"]
    encoder_state_dict = get_encoder_state_dict(model_state_dict, "encoder")
    encoder.load_state_dict(encoder_state_dict, strict=False)
    print(encoder_state_dict.keys())
    log.info(f"Load pretrained model: {ckpt_path}")
    perplexity = re.findall(".*best(.*).pt", ckpt_path)[0]
    log.info("pretrain perplexity: %.2f", float(perplexity))

def test(self, data):
    """
    Computes the metrics for data
    """
    model_path = os.path.join(self._serialization_dir, "best.th")
    logger.info("Loading best model from {0}".format(model_path))
    model_state = torch.load(model_path, map_location=util.device_mapping(-1))
    self._model.load_state_dict(model_state)

    loss, num_batches = self._inference_loss(data, logger_string="Testing")
    metrics = self._get_metrics(loss, num_batches, reset=True)

    metric_names = metrics.keys()
    message_template = "%s %s : %3f"
    for name in metric_names:
        test_metric = metrics.get(name)
        logger.info(message_template, "Test", name, test_metric)

    return metrics

def load_checkpoint(model, generator, n_tr_batches, checkpoint_path, metric, optimizers):
    if not isinstance(optimizers, list):
        optimizers = [optimizers]
    # `cuda_device` is a module-level setting, as in `load_encoder` above.
    from src import cuda_device
    checkpoint = torch.load(checkpoint_path, map_location=device_mapping(cuda_device))
    model.load_state_dict(checkpoint["model"], strict=False)
    step = checkpoint["step"]
    if metric is not None:
        metric.load_state_dict(checkpoint["metric"])
    for i, opt in enumerate(optimizers):
        opt.load_state_dict(checkpoint["optimizer" + str(i)])
    # Fast-forward the data generator to the position within the epoch
    # where the checkpoint was taken.
    for _ in itertools.islice(generator, step % n_tr_batches):
        pass
    log.info(f"Load checkpoint: {checkpoint_path}")
    # return model, generator, step, metric, optimizers
    return step

def restore_checkpoint_pretrained(self, restore_bin: str):
    # Restore from a given model path
    state_dict = torch.load(restore_bin, map_location=device_mapping(-1))
    if isinstance(self.model, DataParallel):
        model_to_load = self.model.module
    else:
        model_to_load = self.model
    own_state = model_to_load.state_dict()
    for name, param in state_dict.items():
        if name not in own_state:
            print("Skipped: " + name)
            continue
        if isinstance(param, torch.nn.Parameter):
            # backwards compatibility for serialized parameters
            param = param.data
        try:
            own_state[name].copy_(param)
        except Exception:
            print("Part load failed: " + name)

def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab = Vocabulary.by_name(vocab_choice).from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model

def _restore_checkpoint(self) -> Tuple[int, List[float]]:
    latest_checkpoint = self.find_latest_checkpoint()

    if latest_checkpoint is None:
        return 0, []

    model_path, training_state_path = latest_checkpoint
    training_state = torch.load(training_state_path, map_location=util.device_mapping(-1))

    if isinstance(training_state["epoch"], int):
        epoch_to_return = training_state["epoch"] + 1
    else:
        epoch_to_return = int(training_state["epoch"].split('.')[0]) + 1

    if "val_metric_per_epoch" not in training_state:
        logger.warning("trainer state `val_metric_per_epoch` not found, using empty list")
        val_metric_per_epoch: List[float] = []
    else:
        val_metric_per_epoch = training_state["val_metric_per_epoch"]

    return epoch_to_return, val_metric_per_epoch

def _load(
    cls,
    config: Params,
    serialization_dir: str,
    weights_file: Optional[str] = None,
    cuda_device: int = -1,
    opt_level: Optional[str] = None,
) -> "Model":
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, "vocabulary")
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
    vocab = vocab_class.from_files(
        vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
    )

    model_params = config.get("model")
    training_params = config.get("trainer", Params({}))
    opt_level = opt_level or training_params.get("opt_level")

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    # If opt_level is not None (i.e. it exists in the loaded models params or was provided
    # as argument to this method), call amp.initialize on the loaded model.
    # Log a warning if amp is not installed or we are loading onto the cpu so that these
    # cases do not pass silently.
    if opt_level is not None:
        if amp is None:
            logger.warning(
                (
                    f"Apex must be installed to enable mixed-precision via amp."
                    f" Got opt_level is not None (opt_level={opt_level}) but Apex is not installed."
                    " Any further training or inference will happen at full-precision."
                )
            )
        if cuda_device == -1:
            logger.warning(
                (
                    f"A CUDA device must be specified to enable mixed-precision via amp."
                    f" Got cuda_device=={cuda_device} but opt_level is not None (opt_level={opt_level})."
                    " Any further training or inference will happen at full-precision."
                )
            )
        if amp is not None and cuda_device >= 0:
            model = amp.initialize(model, opt_level=opt_level)

    # If vocab+embedding extension was done, the model initialized from from_params
    # and one defined by state dict in weights_file might not have same embedding shapes.
    # Eg. when model embedder module was transferred along with vocab extension, the
    # initialized embedding weight shape would be smaller than one in the state_dict.
    # So calling model embedding extension is required before load_state_dict.
    # If vocab and model embeddings are in sync, following would be just a no-op.
    model.extend_embedder_vocab()

    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    return model

def _restore_checkpoint(self) -> Tuple[int, List[float]]:
    """
    Restores a model from a serialization_dir to the last saved checkpoint.
    This includes an epoch count and optimizer state, which is serialized separately
    from model parameters. This function should only be used to continue training -
    if you wish to load a model for inference/load parts of a model into a new
    computation graph, you should use the native Pytorch functions:
    ``model.load_state_dict(torch.load("/path/to/model/weights.th"))``

    If ``self._serialization_dir`` does not exist or does not contain any checkpointed weights,
    this function will do nothing and return ``(0, [])``.

    Returns
    -------
    epoch : int
        The epoch at which to resume training, which should be one after the epoch
        in the saved training state.
    """
    have_checkpoint = (self._serialization_dir is not None and
                       any("model_state_epoch_" in x for x in os.listdir(self._serialization_dir)))

    if not have_checkpoint:
        # No checkpoint to restore, start at 0
        return 0, []

    serialization_files = os.listdir(self._serialization_dir)
    model_checkpoints = [x for x in serialization_files if "model_state_epoch" in x]

    # Get the last checkpoint file. Epochs are specified as either an
    # int (for end of epoch files) or with epoch and timestamp for
    # within epoch checkpoints, e.g. 5.2018-02-02-15-33-42
    found_epochs = [
        re.search(r"model_state_epoch_([0-9\.\-]+)\.th", x).group(1)
        for x in model_checkpoints
    ]
    int_epochs: Any = []
    for epoch in found_epochs:
        pieces = epoch.split('.')
        if len(pieces) == 1:
            # Just a single epoch without timestamp
            int_epochs.append([int(pieces[0]), 0])
        else:
            # has a timestamp
            int_epochs.append([int(pieces[0]), pieces[1]])
    last_epoch = sorted(int_epochs, reverse=True)[0]
    if last_epoch[1] == 0:
        epoch_to_load = str(last_epoch[0])
    else:
        epoch_to_load = '{0}.{1}'.format(last_epoch[0], last_epoch[1])

    model_path = os.path.join(self._serialization_dir,
                              "model_state_epoch_{}.th".format(epoch_to_load))
    training_state_path = os.path.join(self._serialization_dir,
                                       "training_state_epoch_{}.th".format(epoch_to_load))

    # Load the parameters onto CPU, then transfer to GPU.
    # This avoids potential OOM on GPU for large models that
    # load parameters onto GPU then make a new GPU copy into the parameter
    # buffer. The GPU transfer happens implicitly in load_state_dict.
    model_state = torch.load(model_path, map_location=util.device_mapping(-1))
    training_state = torch.load(training_state_path, map_location=util.device_mapping(-1))

    self._model.load_state_dict(model_state)
    self._optimizer.load_state_dict(training_state["optimizer"])

    # We didn't used to save `validation_metric_per_epoch`, so we can't assume
    # that it's part of the trainer state. If it's not there, an empty list is all
    # we can do.
    if "val_metric_per_epoch" not in training_state:
        logger.warning("trainer state `val_metric_per_epoch` not found, using empty list")
        val_metric_per_epoch: List[float] = []
    else:
        val_metric_per_epoch = training_state["val_metric_per_epoch"]

    if isinstance(training_state["epoch"], int):
        epoch_to_return = training_state["epoch"] + 1
    else:
        epoch_to_return = int(training_state["epoch"].split('.')[0]) + 1

    # For older checkpoints with batch_num_total missing, default to old behavior where
    # it is unchanged.
    batch_num_total = training_state.get('batch_num_total')
    if batch_num_total is not None:
        self._batch_num_total = batch_num_total

    return epoch_to_return, val_metric_per_epoch

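# A quick, self-contained check of the filename-parsing logic above, with made-up
# checkpoint names. Mid-epoch checkpoints carry a timestamp after the epoch number,
# and the `[epoch, timestamp]` pairs sort so the newest checkpoint comes first.
import re

names = ["model_state_epoch_3.th",
         "model_state_epoch_5.2018-02-02-15-33-42.th",
         "model_state_epoch_5.2018-02-02-18-01-07.th"]
epochs = [re.search(r"model_state_epoch_([0-9\.\-]+)\.th", n).group(1) for n in names]
keys = []
for epoch in epochs:
    pieces = epoch.split(".")
    # NB: mixing the 0 placeholder with string timestamps only sorts cleanly when the
    # two forms never share an epoch number, which holds in the trainer's usage.
    keys.append([int(pieces[0]), 0] if len(pieces) == 1 else [int(pieces[0]), pieces[1]])
print(sorted(keys, reverse=True)[0])  # [5, '2018-02-02-18-01-07']
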
def _load(
    cls,
    config: Params,
    serialization_dir: Union[str, PathLike],
    weights_file: Optional[Union[str, PathLike]] = None,
    cuda_device: int = -1,
) -> "Model":
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, "vocabulary")
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
    vocab = vocab_class.from_files(
        vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
    )

    model_params = config.get("model")

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings/weights from. We're now _loading_ the model, so those weights will already be
    # stored in our model. We don't need any pretrained weight file or initializers anymore,
    # and we don't want the code to look for it, so we remove it from the parameters here.
    remove_keys_from_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    # If vocab+embedding extension was done, the model initialized from from_params
    # and one defined by state dict in weights_file might not have same embedding shapes.
    # Eg. when model embedder module was transferred along with vocab extension, the
    # initialized embedding weight shape would be smaller than one in the state_dict.
    # So calling model embedding extension is required before load_state_dict.
    # If vocab and model embeddings are in sync, following would be just a no-op.
    model.extend_embedder_vocab()

    # Load state dict. We pass `strict=False` so PyTorch doesn't raise a RuntimeError
    # if the state dict is missing keys because we handle this case below.
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    missing_keys, unexpected_keys = model.load_state_dict(model_state, strict=False)

    # Modules might define a class variable called `authorized_missing_keys`,
    # a list of regex patterns, that tells us to ignore missing keys that match
    # any of the patterns.
    # We sometimes need this in order to load older models with newer versions of AllenNLP.
    def filter_out_authorized_missing_keys(module, prefix=""):
        nonlocal missing_keys
        for pat in getattr(module.__class__, "authorized_missing_keys", None) or []:
            missing_keys = [
                k
                for k in missing_keys
                if k.startswith(prefix) and re.search(pat[len(prefix):], k) is None
            ]
        for name, child in module._modules.items():
            if child is not None:
                filter_out_authorized_missing_keys(child, prefix + name + ".")

    filter_out_authorized_missing_keys(model)

    if unexpected_keys or missing_keys:
        raise RuntimeError(
            f"Error loading state dict for {model.__class__.__name__}\n\t"
            f"Missing keys: {missing_keys}\n\t"
            f"Unexpected keys: {unexpected_keys}"
        )

    return model

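# A toy version of the `authorized_missing_keys` escape hatch described above: the class
# lists regex patterns for parameters that may legitimately be absent from older
# checkpoints, and matching names are filtered out of `missing_keys` before erroring.
# (`NewModel` and the "extra_head" parameter are invented for this illustration.)
import re
import torch

class NewModel(torch.nn.Module):
    authorized_missing_keys = [r"^extra_head\."]  # added after old checkpoints were saved

    def __init__(self):
        super().__init__()
        self.backbone = torch.nn.Linear(4, 4)
        self.extra_head = torch.nn.Linear(4, 2)

old_checkpoint = {k: v for k, v in NewModel().state_dict().items()
                  if not k.startswith("extra_head.")}
model = NewModel()
missing, unexpected = model.load_state_dict(old_checkpoint, strict=False)
missing = [k for k in missing
           if not any(re.search(p, k) for p in NewModel.authorized_missing_keys)]
assert not missing and not unexpected  # the new head stays randomly initialized
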
def from_params(cls,
                params: Params,
                serialization_dir: str,
                recover: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> 'PtDistTrainer':
    all_datasets = training_util.datasets_from_params(params, cache_directory, cache_prefix)
    vocab = Vocabulary.from_files(params.vocabulary.directory_path)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    model.extend_embedder_vocab()
    if is_master_rank():
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')

    batch_size = params.iterator.batch_size

    trainer_params = params.pop("trainer")
    keys = [key for key in params]
    for key in keys:
        params.pop(key)
    params = trainer_params

    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    pretrain_file = params.pop("pretrain_file", None)

    no_grad_regexes = params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)

    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    model = model.cuda(dist.get_rank())
    if pretrain_file:
        model_state = torch.load(pretrain_file,
                                 map_location=nn_util.device_mapping(dist.get_rank()))
        model.load_state_dict(model_state)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    # print([n for n, p in model.named_parameters() if p.requires_grad])
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None

    num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
    checkpointer = Checkpointer(
        serialization_dir=serialization_dir,
        num_serialized_models_to_keep=num_serialized_models_to_keep,
        keep_serialized_model_every_num_seconds=None)

    return cls(model, optimizer, train_data, validation_data,
               batch_size=batch_size,
               validation_metric=validation_metric,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               checkpointer=checkpointer)