def close(self) -> None:
    """Flush everything to wandb and shut the run down.

    Order matters here: the (optional) model archive must exist on disk
    before the final `wandb.save(..., policy="end")` uploads happen, and
    `wandb.finish()` must come last.
    """
    import wandb

    # A run must already be active; this callback never starts one here.
    assert wandb.run is not None
    # set this here for resuming
    os.environ.update({"WANDB_RUN_ID": str(wandb.run.id)})
    if self.save_model_archive:
        # we will have to create archive prematurely here.
        # the `train_model()` in `allennlp train` will
        # recreate the same model archive later. However,
        # this duplication cannot be avoided at this stage.
        logger.info("Archiving model before closing wandb.")
        archive_model(
            self.serialization_dir,
            include_in_archive=self.include_in_archive,
        )
    if self._files_to_save_at_end:
        for fpath in self._files_to_save_at_end:
            # policy="end" defers the upload until the run finishes.
            self.wandb.save(  # type: ignore
                os.path.join(self.serialization_dir, fpath),
                base_path=self.serialization_dir,
                policy="end",
            )
    # Let the base class do its own teardown before finishing the run.
    LogWriterCallback.close(self)
    if self.finish_on_end:
        wandb.finish()
def save_model():
    """Round-trip a trained model: save, reload, verify predictions, archive.

    NOTE(review): this reads module-level globals (`params`, `vocab`,
    `model`, `make_predictions`, `dataset_reader`, `original_preds`) —
    presumably a tutorial/script context; confirm they are in scope
    before reusing this function elsewhere.
    """
    # Save the model
    serialization_dir = 'model'
    config_file = os.path.join(serialization_dir, 'config.json')
    vocabulary_dir = os.path.join(serialization_dir, 'vocabulary')
    weights_file = os.path.join(serialization_dir, 'weights.th')
    os.makedirs(serialization_dir, exist_ok=True)
    params.to_file(config_file)
    vocab.save_to_files(vocabulary_dir)
    torch.save(model.state_dict(), weights_file)

    # Load the model
    loaded_params = Params.from_file(config_file)
    loaded_model = Model.load(loaded_params, serialization_dir, weights_file)
    loaded_vocab = loaded_model.vocab  # Vocabulary is loaded in Model.load()

    # Make sure the predictions are the same
    loaded_preds = make_predictions(loaded_model, dataset_reader)
    assert original_preds == loaded_preds
    print('predictions matched')

    # Create an archive file
    archive_model(serialization_dir, weights='weights.th')

    # Unarchive from the file
    archive = load_archive(os.path.join(serialization_dir, 'model.tar.gz'))
def test_extra_files(self):
    """Archived extra files get temp paths that survive load_archive()."""
    model_dir = self.TEST_DIR / 'serialization'

    # Need a trained model before anything can be archived.
    train_model(self.params, serialization_dir=model_dir)

    # Bundle the training data into the archive alongside the model.
    train_path = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    archive_model(serialization_dir=model_dir,
                  files_to_archive={"train_data_path": train_path})

    loaded = load_archive(model_dir / 'model.tar.gz')
    loaded_params = loaded.config

    # The archived param now points at an extracted temp copy; we only
    # know what the path ends with, not the temp prefix.
    extracted = loaded_params.get('train_data_path')
    assert extracted.endswith('/fta/train_data_path')
    # The temp copy must remain readable even after load_archive returns.
    assert os.path.exists(extracted)
    # Paths that were not archived are left untouched.
    assert loaded_params.get('validation_data_path') == train_path
def save(self, directory: Union[str, Path]) -> str:
    """Saves the pipeline in the given directory as `model.tar.gz` file.

    Parameters
    ----------
    directory
        Save the 'model.tar.gz' file to this directory.  Intermediate
        directories are created if they do not exist.

    Returns
    -------
    file_path
        Path to the 'model.tar.gz' file.
    """
    # Path(Path(...)) is a no-op, so the explicit isinstance check is
    # unnecessary.
    directory = Path(directory)
    # FIX: also create missing parent directories; the previous
    # `mkdir(exist_ok=True)` raised FileNotFoundError for nested paths.
    directory.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        # Stage vocabulary, weights and config side by side so that
        # archive_model can tar them up in a single pass.
        self.vocab.save_to_files(str(temp_path / "vocabulary"))
        torch.save(self._model.state_dict(), temp_path / "best.th")
        with (temp_path / "config.json").open("w") as file:
            json.dump(
                {
                    "model": {
                        "config": self.config.as_dict(),
                        "type": "PipelineModel",
                    }
                },
                file,
                indent=4,
            )
        archive_model(temp_path, archive_path=directory)

    return str(directory / "model.tar.gz")
def test_external_modules(self):
    """An externally registered model is only found via --include-package.

    Copies a fixture serialization dir, renames the registered model type
    to 'bidaf-duplicate', re-archives, and checks that evaluation fails
    without the extra package and succeeds with it.
    """
    sys.path.insert(0, self.TEST_DIR)

    original_serialization_dir = 'tests/fixtures/bidaf/serialization'
    serialization_dir = os.path.join(self.TEST_DIR, 'serialization')
    shutil.copytree(original_serialization_dir, serialization_dir)

    # Get original model config.
    # FIX: close the tarfile deterministically (the original leaked the
    # open handle).
    with tarfile.open(os.path.join(original_serialization_dir, 'model.tar.gz')) as tf:
        tf.extract('config.json', self.TEST_DIR)

    # Write out modified config file
    params = Params.from_file(os.path.join(self.TEST_DIR, 'config.json'))
    params['model']['type'] = 'bidaf-duplicate'
    config_file = os.path.join(serialization_dir, 'model_params.json')
    with open(config_file, 'w') as f:
        f.write(json.dumps(params.as_dict(quiet=True)))

    # And create an archive
    archive_model(serialization_dir)

    # Write out modified model.py that registers under the duplicate name.
    module_dir = os.path.join(self.TEST_DIR, 'bidaf_duplicate')
    os.makedirs(module_dir)

    from allennlp.models.reading_comprehension import bidaf
    with open(bidaf.__file__) as f:
        code = f.read().replace("""@Model.register("bidaf")""",
                                """@Model.register('bidaf-duplicate')""")
    with open(os.path.join(module_dir, 'model.py'), 'w') as f:
        f.write(code)

    archive_file = os.path.join(serialization_dir, 'model.tar.gz')

    raw_args = ["evaluate", archive_file,
                "--evaluation-data-file", "tests/fixtures/data/squad.json"]
    args = self.parser.parse_args(raw_args)

    # Raise configuration error without extra modules
    with pytest.raises(ConfigurationError):
        metrics = evaluate_from_args(args)

    # Specify the additional module
    raw_args.extend(['--include-package', 'bidaf_duplicate'])
    args = self.parser.parse_args(raw_args)
    metrics = evaluate_from_args(args)
    assert metrics.keys() == {'span_acc', 'end_acc', 'start_acc', 'em', 'f1'}

    sys.path.remove(self.TEST_DIR)
def main(args):
    """Archive `args.model_dir` into `args.output_file`.

    Uses `args.weights_file` for the archived weights when given;
    otherwise archive_model's default weights are used.
    """
    # Make sure the output file's parent directory exists ("" means CWD).
    out_dir = os.path.dirname(args.output_file) or "."
    os.makedirs(out_dir, exist_ok=True)

    kwargs = {"archive_path": args.output_file}
    if args.weights_file is not None:
        kwargs["weights"] = args.weights_file
    archive_model(args.model_dir, **kwargs)
def test_archive_model_uses_archive_path(self):
    """archive_model writes to an explicit archive_path, not model.tar.gz."""
    model_dir = self.TEST_DIR / 'serialization'

    # Train a model so there is something to archive.
    train_model(self.params, serialization_dir=model_dir)

    # Ask for a non-default archive location.
    archive_model(serialization_dir=model_dir,
                  archive_path=model_dir / "new_path.tar.gz")

    # The archive must be loadable from that custom path.
    loaded = load_archive(model_dir / 'new_path.tar.gz')
    assert loaded
def train_model(train_fp: Path, dev_fp: Path, model_fp: Path,
                vocab_data_fps: Optional[List[Path]] = None
                ) -> Tuple[Model, Params]:
    '''
    :param train_fp: The Training dataset file path
    :param dev_fp: The development dataset file path
    :param model_fp: The json file that describes the model
    :param vocab_data_fps: An optional List of additional dataset files that
                           will be used to create the models vocab
    :returns: A tuple containing the Trained model and an object that
              describes the model.
    '''
    set_random_env()
    model_params = Params.from_file(model_fp)
    emotion_dataset_reader = DatasetReader.from_params(
        model_params.pop('dataset_reader'))
    # Data
    train_dataset = emotion_dataset_reader.read(cached_path(str(train_fp)))
    dev_dataset = emotion_dataset_reader.read(cached_path(str(dev_fp)))
    # The vocabulary is built from train + dev plus any extra datasets.
    vocab_datasets = [train_dataset, dev_dataset]
    if vocab_data_fps:
        for vocab_data_fp in vocab_data_fps:
            vocab_datasets.append(
                emotion_dataset_reader.read(cached_path(str(vocab_data_fp))))
    vocab_data = []
    for vocab_dataset in vocab_datasets:
        vocab_data.extend(vocab_dataset)
    vocab = Vocabulary.from_instances(vocab_data)
    emotion_model = Model.from_params(vocab=vocab,
                                      params=model_params.pop('model'))
    data_iter = DataIterator.from_params(model_params.pop('iterator'))
    data_iter.index_with(vocab)
    # Trainer
    with tempfile.TemporaryDirectory() as serial_dir:
        trainer_params = model_params.pop('trainer')
        trainer = Trainer.from_params(model=emotion_model,
                                      serialization_dir=serial_dir,
                                      iterator=data_iter,
                                      train_data=train_dataset,
                                      validation_data=dev_dataset,
                                      params=trainer_params)
        _ = trainer.train()
        # Stage config and vocabulary next to the trained weights so the
        # archive is self-contained, then load it back within the
        # TemporaryDirectory's lifetime.
        temp_config_fp = str(Path(serial_dir, CONFIG_NAME).resolve())
        Params.from_file(model_fp).to_file(temp_config_fp)
        vocab.save_to_files(Path(serial_dir, "vocabulary").resolve())
        archive_model(serial_dir,
                      files_to_archive=model_params.files_to_archive)
        # NOTE(review): cuda_device=0 is hard-coded — presumably a GPU is
        # always available here; confirm before running on CPU-only hosts.
        model_archive = load_archive(serial_dir, cuda_device=0)
        return model_archive.model, model_archive.config
def test_loading_serialization_directory_with_extra_files(self):
    """Loading from a directory (not a tarball) keeps original file paths."""
    model_dir = self.TEST_DIR / 'serialization'

    # Train, then archive together with the training data.
    train_model(self.params, serialization_dir=model_dir)
    data_path = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    archive_model(serialization_dir=model_dir,
                  files_to_archive={"train_data_path": data_path})

    # Load straight from the directory instead of model.tar.gz.
    loaded = load_archive(model_dir)

    # No extraction took place, so the param still holds the original path.
    assert loaded.config.get('train_data_path') == data_path
def _save_checkpoint(self,
                     epoch: int,
                     val_metric_per_epoch: List[float],
                     is_best: Optional[bool] = None) -> None:
    """
    Saves a checkpoint of the model to self._serialization_dir.
    Is a no-op if self._serialization_dir is None.

    Parameters
    ----------
    epoch : int, required.
        The epoch of training.
    is_best: bool, optional (default = None)
        A flag which causes the model weights at the given epoch to
        be copied to a "best.th" file. The value of this flag should
        be based on some validation metric computed by your model.
    """
    if self._serialization_dir is not None:
        model_path = os.path.join(self._serialization_dir,
                                  "model_state_epoch_{}.th".format(epoch))
        model_state = self._model.state_dict()
        torch.save(model_state, model_path)

        # Optimizer state and metric history are saved alongside the
        # weights so training can be resumed from this epoch.
        training_state = {'epoch': epoch,
                          'val_metric_per_epoch': val_metric_per_epoch,
                          'optimizer': self._optimizer.state_dict()}
        torch.save(training_state,
                   os.path.join(self._serialization_dir,
                                "training_state_epoch_{}.th".format(epoch)))
        if is_best:
            logger.info("Best validation performance so far. "
                        "Copying weights to '%s/best.th'.",
                        self._serialization_dir)
            shutil.copyfile(model_path,
                            os.path.join(self._serialization_dir, "best.th"))
            # NOTE(review): archive after copying best.th, since
            # archive_model reads the best weights from the serialization
            # directory.  Re-archiving on every new best is expensive for
            # large models — confirm this is intended.
            archive_model(self._serialization_dir,
                          files_to_archive=self._files_to_archive)
def test_extra_files(self): serialization_dir = self.TEST_DIR / u'serialization' # Train a model train_model(self.params, serialization_dir=serialization_dir) # Archive model, and also archive the training data files_to_archive = {u"train_data_path": unicode(self.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv')} archive_model(serialization_dir=serialization_dir, files_to_archive=files_to_archive) archive = load_archive(serialization_dir / u'model.tar.gz') params = archive.config # The param in the data should have been replaced with a temporary path # (which we don't know, but we know what it ends with). assert params.get(u'train_data_path').endswith(u'/fta/train_data_path') # The validation data path should be the same though. assert params.get(u'validation_data_path') == unicode(self.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv')
def test_extra_files(self):
    """files_to_archive entries are swapped for extracted temp paths."""
    model_dir = self.TEST_DIR / 'serialization'

    # Train something archivable first.
    train_model(self.params, serialization_dir=model_dir)

    # Archive the training data along with the model.
    data_path = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    archive_model(serialization_dir=model_dir,
                  files_to_archive={"train_data_path": data_path})

    cfg = load_archive(model_dir / 'model.tar.gz').config

    # The archived param is rewritten to a temp path we only know the
    # suffix of.
    assert cfg.get('train_data_path').endswith('/fta/train_data_path')
    # Non-archived params keep their original value.
    assert cfg.get('validation_data_path') == data_path
def test_extra_files(self):
    """files_to_archive entries are swapped for extracted temp paths."""
    model_dir = os.path.join(self.TEST_DIR, 'serialization')

    # Train something archivable first.
    train_model(self.params, serialization_dir=model_dir)

    # Archive the training data along with the model.
    data_path = 'tests/fixtures/data/sequence_tagging.tsv'
    archive_model(serialization_dir=model_dir,
                  files_to_archive={"train_data_path": data_path})

    cfg = load_archive(os.path.join(model_dir, 'model.tar.gz')).config

    # The archived param is rewritten to a temp path we only know the
    # suffix of.
    assert cfg.get('train_data_path').endswith('/fta/train_data_path')
    # Non-archived params keep their original value.
    assert cfg.get('validation_data_path') == data_path
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data
    and training parameters also specified in that object, and saves the results
    in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive
        terminal, and we slow down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing
        serialization directory.  This is only intended for use when something
        actually crashed during the middle of a run.  For continuing training a
        model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it
        already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See
        :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See
        :func:`allennlp.training.util.datasets_from_params`.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging)
    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params,  # pylint: disable=no-member
                                           serialization_dir,
                                           recover,
                                           cache_directory,
                                           cache_prefix)
        trainer = Trainer.from_params(
                model=pieces.model,
                serialization_dir=serialization_dir,
                iterator=pieces.iterator,
                train_data=pieces.train_dataset,
                validation_data=pieces.validation_dataset,
                params=pieces.params,
                validation_iterator=pieces.validation_iterator)
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        if evaluate_on_test:
            raise ValueError("--evaluate-on-test only works with the default Trainer. "
                             "If you're using the CallbackTrainer you can use a callback "
                             "to evaluate at Events.TRAINING_END; otherwise you'll have "
                             "to run allennlp evaluate separately.")
        trainer = TrainerBase.from_params(params, serialization_dir, recover,
                                          cache_directory, cache_prefix)
        # evaluation_dataset is None here, so the post-training evaluation
        # branch below (which would need evaluation_iterator) is never taken.
        evaluation_dataset = None

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(trainer.model, evaluation_dataset, evaluation_iterator,
                                cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
                                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                                batch_weight_key="")

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
def train_model(params: Params,
                serialization_dir: str,
                cuda_device: int,
                train_data_path: str,
                validation_data_path: str,
                test_data_path: str,
                file_friendly_logging: bool = False) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    # Mirror stdout/stderr into log files in the serialization directory.
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # NOTE(review): datasets come from explicit path arguments rather than
    # from `params` (the stock `datasets_from_params` call was disabled).
    # all_datasets = datasets_from_params(params)
    all_datasets = datasets_from_args(params, train_data_path,
                                      validation_data_path, test_data_path)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    if cuda_device >= 0:
        model = model.cuda(cuda_device)
    # Separate iterators for training and validation (instead of a single
    # shared one, which was disabled below).
    # iterator = DataIterator.from_params(params.pop("iterator"))
    # iterator.index_with(vocab)
    train_iterator = DataIterator.from_params(params.pop("train_iterator"))
    val_iterator = DataIterator.from_params(params.pop("val_iterator"))
    train_iterator.index_with(vocab)
    val_iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  train_iterator,
                                  val_iterator,
                                  cuda_device,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # NOTE(review): leftover params are not checked — unknown keys in the
    # config will be silently ignored.
    # params.assert_empty('base train command')

    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, val_iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data
    and training parameters also specified in that object, and saves the results
    in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive
        terminal, and we slow down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing
        serialization directory.  This is only intended for use when something
        actually crashed during the middle of a run.  For continuing training a
        model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    # Mirror stdout/stderr into log files in the serialization directory.
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(params: Params,
                serialization_dir: str,
                results_fn: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Tuple[Model, Dict[str, Any]]:
    """
    Trains the model specified in ``params``, saving results and logs to
    ``serialization_dir``, archives the result, and returns the best model
    together with its metrics.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    results_fn : ``str``
        File name (within ``serialization_dir``) for the dumped metrics.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, tqdm output is newline-based and slowed down.
    recover : ``bool``, optional (default=False)
        If ``True``, try to recover a crashed run from ``serialization_dir``.
    force : ``bool``, optional (default=False)
        If ``True``, overwrite ``serialization_dir`` if it already exists.

    Returns
    -------
    (best_model, metrics) : ``Tuple[Model, Dict[str, Any]]``
        The model with the best epoch weights and the training/test metrics.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary,
    # so only save the vocabulary after the model exists.
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None
    held_out_iterator_params = params.pop("held_out_iterator", None)
    if held_out_iterator_params:
        held_out_iterator = DataIterator.from_params(held_out_iterator_params)
        held_out_iterator.index_with(vocab)
    else:
        held_out_iterator = None

    train_data = all_datasets['train']
    held_out_train_data = all_datasets.get('held_out_train')
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Freeze any parameters matching the "no_grad" regexes before training.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
            model=model,
            serialization_dir=serialization_dir,
            iterator=iterator,
            train_data=train_data,
            held_out_train_data=held_out_train_data,
            validation_data=validation_data,
            params=trainer_params,
            validation_iterator=validation_iterator,
            held_out_iterator=held_out_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # FIX: the original referenced an undefined name `results_dir`, which
    # raised NameError at runtime; metrics belong in the serialization dir.
    dump_metrics(os.path.join(serialization_dir, results_fn), metrics, log=True)

    return best_model, metrics
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    include_package: List[str] = None,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned
    as a worker process. In a single GPU experiment, this returns the ``Model`` object and
    in distributed training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we
        slow down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    node_rank : ``int``, optional
        Rank of the node
    master_addr : ``str``, optional (default="127.0.0.1")
        Address of the master node used to initialize the distributed process group.
    master_port : ``int``, optional (default=29500)
        Port on the master node used to initialize the distributed process group.
    world_size : ``int``, optional
        The number of processes involved in distributed training.
    distributed_device_ids : ``List[str]``, optional
        GPU ids for the workers spawned on this node; indexed by ``process_rank``.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    prepare_global_logging(serialization_dir,
                           file_friendly_logging,
                           rank=process_rank,
                           world_size=world_size)
    prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    if distributed:
        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(gpu_id)
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover,
                                           cache_directory, cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        # Custom trainers are built directly from params; test-set evaluation is
        # only wired up for the default trainer.
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover,
                                          cache_directory, cache_prefix)
        evaluation_dataset = None

    params.assert_empty("base train command")

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        # Only the master worker archives, to avoid concurrent writes.
        if master and os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    if master:
        if evaluation_dataset and evaluate_on_test:
            logger.info("The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer.cuda_device,
                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                batch_weight_key="",
            )

            for key, value in test_metrics.items():
                metrics["test_" + key] = value
        elif evaluation_dataset:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
            )
        dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    if not distributed:
        return trainer.model

    return None  # to make mypy happy
def train_model(
    params: Params,
    serialization_dir: Union[str, PathLike],
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    dry_run: bool = False,
    file_friendly_logging: bool = False,
) -> Optional[Model]:
    """
    Trains the model specified in the given [`Params`](../common/params.md#params) object, using the data
    and training parameters also specified in that object, and saves the results in `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    recover : `bool`, optional (default=`False`)
        If `True`, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during
        the middle of a run.  For continuing training a model on new data, see
        `Model.from_archive`.
    force : `bool`, optional (default=`False`)
        If `True`, we will overwrite the serialization directory if it already exists.
    node_rank : `int`, optional
        Rank of the current node in distributed training
    include_package : `List[str]`, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    # Popping here means the key is consumed: workers never see "distributed".
    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            include_package=include_package,
            dry_run=dry_run,
            file_friendly_logging=file_friendly_logging,
        )

        if not dry_run:
            archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data loaders in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        if recover:
            vocab = Vocabulary.from_files(vocab_dir)
        else:
            vocab = training_util.make_vocab_from_params(
                params.duplicate(), serialization_dir, print_statistics=dry_run)
        # Rewrite the config so every worker reads the pre-built vocabulary
        # from disk instead of constructing its own.
        params["vocabulary"] = {
            "type": "from_files",
            "directory": vocab_dir,
            "padding_token": vocab._padding_token,
            "oov_token": vocab._oov_token,
        }

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        # Spawn one worker process per configured device on this node; each
        # worker receives its process_rank implicitly from mp.spawn.
        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                include_package,
                dry_run,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
                file_friendly_logging,
            ),
            nprocs=num_procs,
        )
        if dry_run:
            return None
        else:
            archive_model(serialization_dir)
            model = Model.load(params, serialization_dir)
            return model
def run(_):
    """Run model.

    Entry point driven by module-level ``FLAGS``: in ``train`` mode it trains
    (and optionally finetunes) a model and may run test-set prediction; in any
    other mode it runs prediction over an input file.
    """
    # Imports are required to make Registrable modules visible without passing parameter
    util.import_module_and_submodules("combo.commands")
    util.import_module_and_submodules("combo.models")
    util.import_module_and_submodules("combo.training")

    if FLAGS.mode == "train":
        checks.file_exists(FLAGS.config_path)
        params = common.Params.from_file(FLAGS.config_path, ext_vars=_get_ext_vars())
        # Keep a copy of the model config: it is restored after finetuning below.
        model_params = params.get("model").as_ordered_dict()
        serialization_dir = tempfile.mkdtemp(prefix="allennlp",
                                             dir=FLAGS.serialization_dir)
        model = train.train_model(params,
                                  serialization_dir=serialization_dir,
                                  file_friendly_logging=True)
        logger.info(f"Training model stored in: {serialization_dir}")

        if FLAGS.finetuning_training_data_path:
            for f in FLAGS.finetuning_training_data_path:
                checks.file_exists(f)

            # Loading will be performed from stored model.tar.gz
            del model
            # Free GPU memory held by the discarded model before reloading.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            params = common.Params.from_file(FLAGS.config_path,
                                             ext_vars=_get_ext_vars(finetuning=True))
            # Replace model definition with pretrained archive
            params["model"] = {
                "type": "from_archive",
                "archive_file": serialization_dir + "/model.tar.gz",
            }
            serialization_dir = tempfile.mkdtemp(prefix="allennlp",
                                                 suffix="-finetuning",
                                                 dir=FLAGS.serialization_dir)
            model = train.train_model(params.duplicate(),
                                      serialization_dir=serialization_dir,
                                      file_friendly_logging=True)

            # Make finetuning model serialization independent from training serialization
            # Storing model definition instead of archive
            params["model"] = model_params
            params.to_file(os.path.join(serialization_dir, archival.CONFIG_NAME))
            archival.archive_model(serialization_dir)

            logger.info(f"Finetuned model stored in: {serialization_dir}")

        if FLAGS.test_path and FLAGS.output_file:
            checks.file_exists(FLAGS.test_path)
            # Rebuild only the dataset reader from config; the model just trained
            # above is reused directly for prediction.
            params = common.Params.from_file(FLAGS.config_path,
                                             ext_vars=_get_ext_vars())["dataset_reader"]
            params.pop("type")
            dataset_reader = dataset.UniversalDependenciesDatasetReader.from_params(params)
            predictor = predict.SemanticMultitaskPredictor(model=model,
                                                           dataset_reader=dataset_reader)
            test_trees = dataset_reader.read(FLAGS.test_path)
            with open(FLAGS.output_file, "w") as file:
                for tree in test_trees:
                    file.writelines(
                        api.sentence2conllu(
                            predictor.predict_instance(tree),
                            keep_semrel=dataset_reader.use_sem).serialize())
    else:
        # Prediction mode.
        use_dataset_reader = FLAGS.conllu_format
        predictor = _get_predictor()

        if FLAGS.input_file == "-":
            # Reading from stdin: raw text lines, not CoNLL-U.
            use_dataset_reader = False
            predictor.without_sentence_embedding = True
        if use_dataset_reader:
            predictor.line_to_conllu = True
        if FLAGS.silent:
            logging.getLogger("allennlp.common.params").disabled = True

        manager = allen_predict._PredictManager(
            predictor,
            FLAGS.input_file,
            FLAGS.output_file,
            FLAGS.batch_size,
            not FLAGS.silent,
            use_dataset_reader,
        )
        manager.run()
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    batch_weight_key: str = "",
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we
        slow down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see
        ``Model.from_archive``.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    # Consume the "distributed" section so workers never see it.
    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            include_package=include_package,
            batch_weight_key=batch_weight_key,
        )
        archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        # BUG FIX: the first fragment previously ended at "...configured" with no
        # separator, so the rendered message ran together as "configuredMaster is at".
        logging.info(
            f"Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data iterators in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        if params.get("vocabulary", Params({})).get("type", "") != "from_files":
            vocab = training_util.make_vocab_from_params(params.duplicate(), serialization_dir)
            # Point every worker at the pre-built, on-disk vocabulary.
            params["vocabulary"] = {
                "type": "from_files",
                "directory": os.path.join(serialization_dir, "vocabulary"),
                "padding_token": vocab._padding_token,
                "oov_token": vocab._oov_token,
            }

        # One worker process per configured device; mp.spawn supplies the
        # process_rank as the first argument to _train_worker.
        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                include_package,
                batch_weight_key,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir)
        model = Model.load(params, serialization_dir)
        return model
# NOTE(review): this is a fragment of a larger (distributed) training routine;
# `serialize_config_file`, `serialization_dir`, `recover` and `is_master_rank`
# come from the enclosing scope, which is not visible here — confirm against
# the full function before relying on this documentation.

# Persist the config, then make every rank wait until it exists on disk
# before re-reading it as an immutable Params object.
params.to_file(serialize_config_file)
dist.barrier()
params = ConstParams.from_file(serialize_config_file)

# Each rank logs into its own subdirectory named after its distributed rank.
log_dir = os.path.join(serialization_dir, str(dist.get_rank()))
os.makedirs(log_dir, exist_ok=True)
stdout_handler = prepare_global_logging(log_dir, file_friendly_logging=False)
prepare_environment(params)

cuda_device = params.trainer.get('cuda_device', -1)
check_for_gpu(cuda_device)

trainer_type = params.trainer.type

trainer = TrainerBase.from_params(params, serialization_dir, recover)
# Report total and trainable parameter counts before training starts.
params_cnt, params_trainable_cnt = count_parameters(trainer.model)
print("all params cnt: ", params_cnt)
print("all trainable params cnt: ", params_trainable_cnt)
metrics = trainer.train()

cleanup_global_logging(stdout_handler)

# Only the master rank archives the model and dumps metrics, to avoid
# concurrent writes from multiple workers.
if is_master_rank:
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive
        terminal, and we slow down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during
        the middle of a run.  For continuing training a model on new data, see the
        ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Validate GPU availability up front for each configured device.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    # By default every dataset participates in vocabulary creation unless
    # the config restricts it via "datasets_for_vocab_creation".
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    # An optional separate iterator for validation/test evaluation.
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Freeze any parameters whose names match a "no_grad" regex.
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    # Reload the best checkpoint weights into the (same) model object so the
    # returned model and any test evaluation use the best epoch, not the last.
    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
def train_model(params, serialization_dir, file_friendly_logging=False,
                recover=False,
                model="bidaf"):
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive
        terminal, and we slow down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during
        the middle of a run.  For continuing training a model on new data, see the
        ``fine-tune`` command.
    model : ``str``, optional (default="bidaf")
        Which architecture to build: ``"self"`` selects ``BiDAFSelfAttention``,
        anything else selects ``BidirectionalAttentionFlow``.  The name is
        rebound to the constructed model instance below.
    """
    print("Starting training models...")
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    print("get all of the dataset.")
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")
    print("creatig vocaburary...")
    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    # `model` switches from the architecture-name string to the model instance here.
    if model == "self":
        model = BiDAFSelfAttention.from_params(vocab, params.pop("model"))
    else:
        model = BidirectionalAttentionFlow.from_params(vocab, params.pop("model"))
    print("Initialized a BiDAF model.")
    # This is for debugging.
    print(model)
    print(serialization_dir)
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    print("create iterator")
    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')
    print("initalizing a trainer")
    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive
        terminal, and we slow down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during
        the middle of a run.  For continuing training a model on new data, see the
        ``fine-tune`` command.

    Returns
    -------
    model : ``Model``
        The trained model (last-epoch weights, as loaded in memory).
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Snapshot the full experiment config alongside the results.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: Union[str, PathLike],
    include_package: List[str] = None,
    dry_run: bool = False,
    node_rank: int = 0,
    primary_addr: str = "127.0.0.1",
    primary_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[int] = None,
    file_friendly_logging: bool = False,
    include_in_archive: List[str] = None,
    distributed_params: Optional[Params] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned
    as a worker process. In a single GPU experiment, this returns the `Model` object and
    in distributed training, nothing is returned.

    # Parameters

    process_rank : `int`
        The process index that is initialized using the GPU device id.
    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    include_package : `List[str]`, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    node_rank : `int`, optional
        Rank of the node.
    primary_addr : `str`, optional (default=`"127.0.0.1"`)
        Address of the primary node for distributed training.
    primary_port : `str`, optional (default=`"29500"`)
        Port of the primary node for distributed training.
    world_size : `int`, optional
        The number of processes involved in distributed training.
    distributed_device_ids : `List[str]`, optional
        IDs of the devices used involved in distributed training.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    include_in_archive : `List[str]`, optional
        Paths relative to `serialization_dir` that should be archived in addition to the default
        ones.
    distributed_params : `Optional[Params]`, optional
        Additional distributed params.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in distributed training or in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    common_logging.prepare_global_logging(
        serialization_dir,
        rank=process_rank,
        world_size=world_size,
    )
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # Rank 0 is the "primary" worker: the only one that archives the model
    # and finalizes metrics at the end of training.
    primary = process_rank == 0

    include_package = include_package or []

    ddp_accelerator: Optional[DdpAccelerator] = None

    if distributed:
        assert distributed_device_ids is not None
        assert distributed_params is not None

        # Since the worker is spawned and not forked, the extra imports need to be done again.
        # Both the ones from the plugins and the ones from `include_package`.
        import_plugins()
        for package_name in include_package:
            common_util.import_module_and_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # Number of processes per node is useful to know if a process
        # is a primary in the local node(node in which it is running)
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = int(distributed_device_ids[process_rank])  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["local_rank"] = process_rank
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        if gpu_id >= 0:
            # GPU worker: pin this process to its device, then join the NCCL group.
            torch.cuda.set_device(gpu_id)
            dist.init_process_group(
                backend="nccl",
                init_method=f"tcp://{primary_addr}:{primary_port}",
                world_size=world_size,
                rank=global_rank,
            )
        else:
            # CPU worker: NCCL requires GPUs, so fall back to the gloo backend.
            dist.init_process_group(
                backend="gloo",
                init_method=f"tcp://{primary_addr}:{primary_port}",
                world_size=world_size,
                rank=global_rank,
            )

        if "ddp_accelerator" in distributed_params:
            ddp_accelerator_params = distributed_params.pop("ddp_accelerator")
            ddp_accelerator = DdpAccelerator.from_params(
                ddp_accelerator_params,
                local_rank=process_rank,
                world_size=world_size,
                cuda_device=gpu_id,
            )

        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    train_loop = TrainModel.from_params(
        params=params,
        serialization_dir=serialization_dir,
        local_rank=process_rank,
        ddp_accelerator=ddp_accelerator,
    )

    if dry_run:
        # Dry run only builds the vocabulary / dataset statistics above; no training.
        return None

    try:
        if distributed:
            # let the setup get ready for all the workers
            dist.barrier()

        metrics = train_loop.run()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if primary:
            best_weights_path = train_loop.trainer.get_best_weights_path()
            if best_weights_path is None:
                logging.info(
                    "Training interrupted by the user, and no best model has been saved. "
                    "No model archive created.")
            else:
                logging.info(
                    "Training interrupted by the user. Attempting to create "
                    "a model archive using the current best epoch weights.")
                archive_model(
                    serialization_dir,
                    weights=best_weights_path,
                    include_in_archive=include_in_archive,
                )
        raise

    if primary:
        train_loop.finish(metrics)

    if not distributed:
        return train_loop.model

    # Distributed workers return nothing; the caller reloads the model from disk.
    return None
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    node_rank: int = 0,
    include_package: List[str] = None,
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and
    training parameters also specified in that object, and saves the results in
    ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            recover=recover,
            cache_directory=cache_directory,
            cache_prefix=cache_prefix,
            include_package=include_package,
        )
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        os.environ["MASTER_ADDR"] = master_addr
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["WORLD_SIZE"] = str(world_size)

        # FIX: the two f-strings were concatenated without a separator, producing
        # "...GPUs are configuredMaster is at..." in the log output.  Also use the
        # module ``logger`` for consistency with the rest of this function.
        logger.info(
            f"Switching to distributed training mode since multiple GPUs are configured. "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        # Creating `Vocabulary` objects from workers could be problematic since the data iterators
        # in each worker will yield only `rank` specific instances. Hence it is safe to construct
        # the vocabulary and write it to disk before initializing the distributed context. The
        # workers will load the vocabulary from the path specified.
        make_vocab_from_params(params.duplicate(), serialization_dir)
        params["vocabulary"] = {
            "directory_path": os.path.join(serialization_dir, "vocabulary"),
            "extend": False,  # vocab extension would have been done above
        }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                recover,
                cache_directory,
                cache_prefix,
                include_package,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        # Workers return nothing from mp.spawn; reload the best model from disk.
        model = Model.load(params, serialization_dir)
        return model
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    # NOTE: this redirects process-wide stdout/stderr so that all console
    # output is also mirrored into log files in the serialization directory.
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore
    # Attach a file handler to the root logger so library log records are captured too.
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    # Persist a copy of the experiment configuration alongside the results.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets: List[Dataset] = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets.append(test_data)
        datasets_in_vocab.append("test")
    else:
        test_data = None

    # The vocabulary is built from every dataset that was provided (train,
    # and optionally validation/test).
    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   Dataset([instance for dataset in all_datasets
                                            for instance in dataset.instances]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    # Fail loudly if the config contained keys this command does not understand.
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    if test_data and evaluate_on_test:
        # Test data is indexed lazily, only when evaluation was requested.
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model
def train_model(params: Params,
                serialization_dir: str,
                selector: str,
                num_ensemble_models: Optional[int],
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and
    training parameters also specified in that object, and saves the results in
    ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    selector : ``str``
        Active-learning selector; ``'qbc'`` trains an ensemble (query-by-committee).
    num_ensemble_models : ``Optional[int]``
        Number of committee members; required when ``selector == 'qbc'``.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    NOTE: despite the ``-> Model`` annotation, this function returns a 3-tuple
    ``(best_model, metrics, query_info)`` where ``best_model`` carries the best
    epoch weights and ``query_info`` comes from the active-learning trainer.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation)
    )

    model_params = params.pop('model')
    if selector == 'qbc':
        # Query-by-committee: build an ensemble of identically-configured models.
        assert num_ensemble_models is not None
        models_list = [Model.from_params(vocab=vocab, params=model_params.duplicate())
                       for i in range(num_ensemble_models)]
        ensemble_model = CorefEnsemble(models_list)
        model = ensemble_model.submodels[0]
    else:
        model = Model.from_params(vocab=vocab, params=model_params)
        ensemble_model = None

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None
    held_out_iterator_params = params.pop("held_out_iterator", None)
    if held_out_iterator_params:
        held_out_iterator = DataIterator.from_params(held_out_iterator_params)
        held_out_iterator.index_with(vocab)
    else:
        held_out_iterator = None

    train_data = all_datasets['train']
    held_out_train_data = all_datasets.get('held_out_train')
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop("type")
    trainer = ALCorefTrainer.by_name(trainer_choice).from_params(model=model,
                                                                 serialization_dir=serialization_dir,
                                                                 iterator=iterator,
                                                                 train_data=train_data,
                                                                 held_out_train_data=held_out_train_data,
                                                                 validation_data=validation_data,
                                                                 params=trainer_params,
                                                                 validation_iterator=validation_iterator,
                                                                 held_out_iterator=held_out_iterator,
                                                                 ensemble_model=ensemble_model)
    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics, query_info = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # FIX: use the module ``logger`` (was ``logging.info``) for consistency.
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    # FIX: removed dead ``best_model = None`` that was immediately overwritten.
    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model,
            test_data,
            validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access
            batch_weight_key="",
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    return best_model, metrics, query_info
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and
    training parameters also specified in that object, and saves the results in
    ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.params.get('trainer').get('cuda_device', -1))

    # Persist a copy of the experiment config next to the results.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Parameters matching any "no_grad" regex are frozen before training.
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params,
                                  validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # FIX: use the module ``logger`` (was ``logging.info``) for
            # consistency with every other log call in this function.
            logger.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return best_model
#with open(sentence_output, "w") as f: #with open(sentence_file, "r") as sentences: #for sentence in sentences: #jsonl = '{' + '"' + 'sentence' + '"' + ' : ' + '"' + sentence.replace('\n','') + '"' + '}' #f.write(jsonl) #f.write('\n') #print('sents staged') with open(model_path + 'vocabulary/labels.txt', "r") as vocab: for label in vocab: labels.append(label.rstrip()) #including this redirection in order to load the best model only if os.path.exists(model_path + 'model.tar.gz'): os.remove(model_path + 'model.tar.gz') archive_model(model_path, 'best.th') print('best model archived') archive = load_archive(model_path + 'model.tar.gz') predictor = Predictor.from_archive(archive, 'oie_crf') #iterate through sentences instance_iterator = 0 #sentences = 'tests/fixtures/oie_test.jsonl' print('starting to predict on sents') with open(sentence_output, "r") as sents: with open(output_path, 'a') as f: for sent in sents: inp = json.loads(sent) #run model on sentence
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    include_package: List[str] = None,
    batch_weight_key: str = "",
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in
    distributed training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.
    node_rank : ``int``, optional
        Rank of the node.
    master_addr : ``str``, optional (default="127.0.0.1")
        Address of the master node for distributed training.
    master_port : ``str``, optional (default="29500")
        Port of the master node for distributed training.
    world_size : ``int``, optional
        The number of processes involved in distributed training.
    distributed_device_ids : ``List[str]``, optional
        IDs of the devices used involved in distributed training.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights, or ``None`` in distributed training.
    """
    common_util.prepare_global_logging(serialization_dir,
                                       file_friendly_logging,
                                       rank=process_rank,
                                       world_size=world_size)
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    include_package = include_package or []

    if distributed:
        # Since the worker is spawned and not forked, the extra imports need to be done again.
        import_plugins()
        for package_name in include_package:
            common_util.import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # Number of processes per node is useful to know if a process
        # is a master in the local node(node in which it is running)
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        # Pin this worker to its GPU, then join the NCCL process group.
        torch.cuda.set_device(int(gpu_id))
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    train_loop = TrainModel.from_params(
        params=params,
        serialization_dir=serialization_dir,
        local_rank=process_rank,
        batch_weight_key=batch_weight_key,
    )

    try:
        if distributed:
            # let the setup get ready for all the workers
            dist.barrier()

        metrics = train_loop.run()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir)
        raise

    if master:
        train_loop.finish(metrics)

    if not distributed:
        return train_loop.model

    return None  # to make mypy happy
def modified_train_model(serialization_dir,
                         training_config_filename,
                         cuda_device=-1,
                         file_friendly_logging: bool = False) -> Model:
    """
    Function not currently in use. This is from back when I was trying to keep
    each successive addition to the model's training in the same serialization
    directory.

    Loads a previously trained model and its config from ``serialization_dir``,
    continues training it using the data and training parameters specified in
    ``training_config_filename``, and saves the results (including a model
    archive) back into ``serialization_dir``.

    Parameters
    ----------
    serialization_dir : ``str``
        The directory in which to save results and logs, and from which the
        existing model and optimizer params are loaded.
    training_config_filename : ``str``
        The training-config file used by ``load_model_from_serialization_dir``
        to reload the model and compare old/new optimizer settings.
    cuda_device : ``int``, optional (default=-1)
        Device used when initially loading the model. NOTE: this value is then
        overwritten below by the ``cuda_device`` found in the trainer params.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive
        terminal, and we slow down tqdm's output to only once every 10 seconds.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    model, params, prev_optimizer_params, cur_optimizer_params = \
        load_model_from_serialization_dir(serialization_dir,
                                          training_config_filename,
                                          cuda_device=cuda_device)
    prepare_environment(params)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # The trainer config takes precedence over the function argument.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))

    # The model was already built by load_model_from_serialization_dir;
    # discard the config's "model" section so assert_empty passes later.
    params.pop('model')

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    # Freeze any parameters matching the "no_grad" regexes.
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    # Compare the flattened key sets of the previous and current optimizer
    # configs; any difference means the optimizer settings changed between
    # runs and the per-parameter settings must be rewritten below.
    list_of_cur_optimizer_param_keys = list(
        cur_optimizer_params.as_flat_dict())
    list_of_prev_optimizer_param_keys = list(
        prev_optimizer_params.as_flat_dict())
    optimizer_params_match = (set(list_of_cur_optimizer_param_keys) ==
                              set(list_of_prev_optimizer_param_keys))

    if not optimizer_params_match:
        # a list of each p is what will be passed to the optimizer constructor while constructing Trainer--
        # adjust if necessary (i.e., if we changed optimizers)
        model_params = [[n, p] for n, p in model.named_parameters()
                        if p.requires_grad]
        assert "parameter_groups" not in list_of_cur_optimizer_param_keys, \
            "Current way of dealing with optimizer change doesn't take parameter groups into account"
        assert "parameter_groups" not in list_of_prev_optimizer_param_keys, \
            "Current way of dealing with optimizer change doesn't take parameter groups into account"
        for param_tup in model_params:
            # modify the second element of param_tup in-place (it's a dict) to match the keys specified in
            # cur_optimizer_params
            param_dict = param_tup[1]
            keys_to_del = []
            keys_already_in_dict = []
            try:
                for key in param_dict.keys():
                    if key not in list_of_cur_optimizer_param_keys:
                        keys_to_del.append(key)
                    else:
                        keys_already_in_dict.append(key)
                for key in keys_to_del:
                    del param_dict[key]
                for key_to_have in list_of_cur_optimizer_param_keys:
                    if key_to_have != "type" and key_to_have not in keys_already_in_dict:
                        param_dict[key_to_have] = cur_optimizer_params.get(
                            key_to_have)
            except Exception:
                # Best-effort rewrite: skip entries that don't behave like the
                # dict this logic expects (e.g. a raw torch Parameter), but
                # log instead of silently swallowing everything. Narrowed from
                # a bare `except:` so KeyboardInterrupt/SystemExit propagate.
                logger.debug(
                    "Skipping optimizer-settings rewrite for a parameter entry.",
                    exc_info=True)

    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=validation_data,
                                  params=trainer_params,
                                  validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # Use the module logger (was logging.info, i.e. the root logger).
            logger.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir,
                  files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model,
            test_data,
            validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics, log=True)

    return best_model