def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    extend_vocab: bool = False,
                    file_friendly_logging: bool = False,
                    batch_weight_key: str = "",
                    embedding_sources_mapping: Dict[str, str] = None,
                    in_fold: int = None,
                    num_folds: int = None,
                    ewc_weight: float = None) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    extend_vocab : ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend your vocabulary.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.
    embedding_sources_mapping : ``Dict[str, str]``, optional (default=None)
        Mapping from model paths to the pretrained embedding filepaths used during fine-tuning.
    in_fold : ``int``, optional (default=None)
        If given together with ``num_folds``, only this single fold is trained.
    num_folds : ``int``, optional (default=None)
        If given, the training data is split into this many folds and each fold is trained and
        evaluated in a cross-validation loop.
    ewc_weight : ``float``, optional (default=None)
        If given, an Elastic Weight Consolidation (EWC) penalty scaled by this weight is added to
        the training loss.
    """
    prepare_environment(params)
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f"Serialization directory ({serialization_dir}) "
                                 f"already exists and is not empty.")

    os.makedirs(serialization_dir, exist_ok=True)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop("model", None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop("vocabulary", {})
    if vocabulary_params.get("directory_path", None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored.")

    all_datasets = datasets_from_params(params)
    vocab = model.vocab

    if extend_vocab:
        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info("Extending model vocabulary using %s data.",
                    ", ".join(datasets_for_vocab_creation))
        vocab.extend_from_instances(vocabulary_params,
                                    (instance for key, dataset in all_datasets.items()
                                     for instance in dataset
                                     if key in datasets_for_vocab_creation))

        model.extend_embedder_vocab(embedding_sources_mapping)

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    train_data = all_datasets["train"]
    validation_data = all_datasets.get("validation")
    test_data = all_datasets.get("test")

    dl_params = params.pop("data_loader")

    # Note: the cross-validation and EWC code paths below rely on these test loaders,
    # so a test set is expected whenever num_folds or ewc_weight is used.
    if test_data is not None:
        rand = random.Random(1234)
        test_data.index_with(vocab)
        shuffled_test = copy(test_data.instances)
        rand.shuffle(shuffled_test)
        # A fixed random subsample of at most 2000 test instances, used to estimate the EWC penalty.
        extra_test = shuffled_test[:2000]

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": AllennlpDataset(extra_test, vocab)})
        extra_test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": test_data})
        # "test_data_loader" was already consumed above (if present), so this pop falls
        # back to the default loader configuration.
        test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

    master_model = model
    global_metrics = {}
    training_metrics = []
    final_metrics = {}
    master_trainer = trainer_params.as_dict()

    if num_folds is not None:
        rand = random.Random(1234)
        fold_train = []
        fold_test = []
        fold_train_loader = []
        fold_test_loader = []
        shuffled_instances = copy(train_data.instances)
        rand.shuffle(shuffled_instances)

        kfold = KFold(n_splits=num_folds, random_state=None, shuffle=False)
        computed_folds = list(kfold.split(shuffled_instances))

        for fold in range(num_folds):
            train_indexes, test_indexes = computed_folds[fold]
            new_train = [shuffled_instances[i] for i in train_indexes]
            new_test = [shuffled_instances[i] for i in test_indexes]

            fold_train.append(AllennlpDataset(new_train, vocab=vocab))
            fold_test.append(AllennlpDataset(new_test, vocab=vocab))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_test[-1]})
            fold_test_loader.append(DataLoader.from_params(params.pop("fold_test_data_loader", keys)))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_train[-1]})
            fold_train_loader.append(DataLoader.from_params(params.pop("fold_train_data_loader", keys)))

        for fold in ([in_fold] if in_fold is not None else range(num_folds)):
            fold_model = deepcopy(master_model)
            eval_epoch_callback = EvalEpochCallback(fold, fold_test_loader[fold], test_loader,
                                                    global_metrics)
            callbacks = [eval_epoch_callback]

            if ewc_weight is not None:
                ewc = EWC(extra_test_loader)

                # Wrap forward() so that the EWC penalty, scaled by ewc_weight, is added to the
                # loss while the model is in training mode.
                def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                    ewc_loss = 0
                    if ewc.model.training:
                        ewc_loss = ewc.penalty(ewc.model)
                    ret = ewc.model.old_forward(*args, **kwargs)
                    ret["loss"] += ewc_weight * ewc_loss
                    return ret

                fold_model.old_forward = fold_model.forward
                fold_model.forward = ewc_forward
                callbacks.append(CallLossCallback(ewc))

            trainer = Trainer.from_params(model=fold_model,
                                          serialization_dir=serialization_dir,
                                          data_loader=fold_train_loader[fold],
                                          train_data=train_data,
                                          validation_data=None,
                                          params=Params(deepcopy(master_trainer)),
                                          validation_data_loader=None,
                                          epoch_callbacks=callbacks)

            training_metrics.append(trainer.train())

            del fold_model
            del trainer
            del eval_epoch_callback

            # Remove serialized weights from this fold so they do not clash with the next fold.
            state = glob(serialization_dir + "/*.th")
            for file in state:
                logger.info("Deleting state - %s", file)
                os.unlink(file)
    else:
        callbacks = []

        if ewc_weight is not None:
            ewc = EWC(extra_test_loader)

            # Wrap forward() so that the EWC penalty, scaled by ewc_weight, is added to the
            # loss while the model is in training mode.
            def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                ewc_loss = 0
                if ewc.model.training:
                    ewc_loss = ewc.penalty(ewc.model)
                ret = ewc.model.old_forward(*args, **kwargs)
                ret["loss"] += ewc_weight * ewc_loss
                return ret

            model.old_forward = model.forward
            model.forward = ewc_forward
            callbacks.append(CallLossCallback(ewc))

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": train_data})
        train_data.index_with(vocab)
        train_data_loader = DataLoader.from_params(params.pop("train_loader", keys))

        if validation_data is not None:
            validation_data.index_with(vocab)
            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": validation_data})
            validation_data_loader = DataLoader.from_params(params.pop("validation_loader", keys))
        else:
            validation_data_loader = None

        if "finetune" in dir(model):
            model.finetune()

        logger.info("Fine tuning model")
        trainer = Trainer.from_params(model=model,
                                      serialization_dir=serialization_dir,
                                      data_loader=train_data_loader,
                                      train_data=train_data,
                                      validation_data=None,
                                      params=Params(deepcopy(master_trainer)),
                                      validation_data_loader=validation_data_loader,
                                      epoch_callbacks=callbacks)

        training_metrics = trainer.train()
        archive_model(serialization_dir)

    final_metrics["fine_tune"] = global_metrics
    final_metrics["training"] = training_metrics

    metrics_json = json.dumps(final_metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
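
# Usage sketch for the cross-validation / EWC variant above, not a definitive recipe.
# The archive path, config path, and serialization directory are hypothetical
# placeholders, and the config file is assumed to contain the "dataset_reader",
# data path, "data_loader", and "trainer" sections that fine_tune_model pops.
def _example_fine_tune_with_folds() -> Model:
    from allennlp.models.archival import load_archive  # local import, sketch only

    archive = load_archive("model.tar.gz")                     # hypothetical archive path
    fine_tune_params = Params.from_file("fine_tune.jsonnet")   # hypothetical config path
    return fine_tune_model(model=archive.model,
                           params=fine_tune_params,
                           serialization_dir="fine_tuned_run",  # hypothetical output dir
                           num_folds=5,       # 5-fold cross-validation over the train set
                           ewc_weight=0.1)    # add an EWC penalty scaled by 0.1 to each loss
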
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    extend_vocab: bool = False,
                    file_friendly_logging: bool = False,
                    batch_weight_key: str = "",
                    embedding_sources_mapping: Dict[str, str] = None) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    extend_vocab : ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend your vocabulary.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.
    embedding_sources_mapping : ``Dict[str, str]``, optional (default=None)
        Mapping from model paths to the pretrained embedding filepaths used during fine-tuning.
    """
    prepare_environment(params)
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f"Serialization directory ({serialization_dir}) "
                                 f"already exists and is not empty.")

    os.makedirs(serialization_dir, exist_ok=True)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop("model", None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop("vocabulary", {})
    if vocabulary_params.get("directory_path", None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored.")

    all_datasets = datasets_from_params(params)
    vocab = model.vocab

    if extend_vocab:
        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info("Extending model vocabulary using %s data.",
                    ", ".join(datasets_for_vocab_creation))
        vocab.extend_from_instances(vocabulary_params,
                                    (instance for key, dataset in all_datasets.items()
                                     for instance in dataset
                                     if key in datasets_for_vocab_creation))

        model.extend_embedder_vocab(embedding_sources_mapping)

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets["train"]
    validation_data = all_datasets.get("validation")
    test_data = all_datasets.get("test")

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_type = trainer_params.pop("type", "default")
    if trainer_type == "default":
        trainer = Trainer.from_params(model=model,
                                      serialization_dir=serialization_dir,
                                      iterator=iterator,
                                      train_data=train_data,
                                      validation_data=validation_data,
                                      params=trainer_params,
                                      validation_iterator=validation_iterator)
    else:
        raise ConfigurationError("currently fine-tune only works with the default Trainer")

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty("base train command")

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # If we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Fine-tuning interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(model, test_data, validation_iterator or iterator,
                                cuda_device=trainer.cuda_device,
                                batch_weight_key=batch_weight_key)
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def from_params(
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    model: Model = None,
    embedding_sources_mapping: Dict[str, str] = None,
    extend_vocab: bool = False,
) -> "MetaTrainerPieces":
    all_datasets = training_util.datasets_from_params(params)
    vocabulary_params = params.pop("vocabulary", {})

    if model:
        if params.pop("model", None):
            logger.warning("You passed parameters for the model in your configuration file, but we "
                           "are ignoring them, using instead the loaded model parameters.")

        # TODO(mattg): This should be updated now that directory_path no longer exists.
        if vocabulary_params.get("directory_path", None):
            logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                           "your configuration file, but it will be ignored because we already "
                           "have a model with a vocabulary.")

        vocab = model.vocab
    else:
        vocab = None

    vocabulary_path = os.path.join(serialization_dir, "vocabulary")

    if not vocab or extend_vocab:
        vocab = MetaTrainerPieces.create_or_extend_vocab(
            datasets=all_datasets,
            params=params,
            recover=recover,
            vocab=vocab,
            vocabulary_params=vocabulary_params,
            vocabulary_path=vocabulary_path,
        )

        if not model:
            model = Model.from_params(vocab=vocab, params=params.pop("model"))

        # If vocab extension is ON for training, embedding extension should also be
        # done. If vocab and embeddings are already in sync, it would be a no-op.
        model.extend_embedder_vocab(embedding_sources_mapping)

    # Initializing the model can have the side effect of expanding the vocabulary.
    # Save the vocab only in the master. In the degenerate non-distributed
    # case, we're trivially the master.
    if is_master():
        vocab.save_to_files(vocabulary_path)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    train_datas = all_datasets["train"]
    validation_datas = all_datasets.get("validation")
    test_datas = all_datasets.get("test")

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    log_frozen_and_tunable_parameter_names(model)

    return cls(
        model=model,
        iterator=iterator,
        train_datasets=train_datas,
        validation_datasets=validation_datas,
        test_datasets=test_datas,
        validation_iterator=validation_iterator,
        params=trainer_params,
    )
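
# Usage sketch, assuming from_params above is attached to MetaTrainerPieces as a
# classmethod. The config path and serialization directory are hypothetical
# placeholders; the example shows reusing an already-loaded model and extending its
# vocabulary with the new datasets, versus building a fresh model when none is given.
def _example_build_trainer_pieces(existing_model: Model = None) -> "MetaTrainerPieces":
    experiment_params = Params.from_file("experiment.jsonnet")  # hypothetical path
    return MetaTrainerPieces.from_params(
        params=experiment_params,
        serialization_dir="meta_run",                 # hypothetical output directory
        recover=False,
        model=existing_model,                         # None means a model is built from params
        extend_vocab=existing_model is not None,      # grow the loaded vocab with the new data
    )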