def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False):
    # pylint: disable=arguments-differ
    typ3 = params.get("trainer", {}).pop("type", "default")

    if typ3 == "default":
        # Special logic to keep old from_params behavior.
        from allennlp.training.trainer import Trainer
        from allennlp.training.trainer_pieces import TrainerPieces

        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        return Trainer.from_params(model=pieces.model,
                                   serialization_dir=serialization_dir,
                                   iterator=pieces.iterator,
                                   train_data=pieces.train_dataset,
                                   validation_data=pieces.validation_dataset,
                                   params=pieces.params,
                                   validation_iterator=pieces.validation_iterator)
    else:
        klass = TrainerBase.by_name(typ3)
        # Explicit check to prevent recursion.
        is_overriden = klass.from_params.__func__ != TrainerBase.from_params.__func__  # type: ignore
        assert is_overriden, f"Class {klass.__name__} must override `from_params`."
        return klass.from_params(params, serialization_dir, recover)
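# Hedged sketch (not from the original sources): how a registered subclass is
# expected to satisfy the recursion check above. The class `MyTrainer` and the
# registry name "my_trainer" are hypothetical; the point is that anything
# dispatched via `TrainerBase.by_name` must override `from_params` itself.
from allennlp.common import Params
from allennlp.training.trainer_base import TrainerBase


@TrainerBase.register("my_trainer")
class MyTrainer(TrainerBase):
    @classmethod
    def from_params(cls,  # type: ignore
                    params: Params,
                    serialization_dir: str,
                    recover: bool = False) -> "MyTrainer":
        # Build and return the trainer from the "trainer" sub-config here.
        ...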
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False):
    # pylint: disable=arguments-differ
    pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member

    return NoOpTrainer(serialization_dir, pieces.model)
def test_extend_vocab(self):
    vocab = Vocabulary(tokens_to_add={"tokens": ["This", "extend"]})
    params = Params({})
    token_indexer = SingleIdTokenIndexer()
    instance1 = Instance(
        fields={
            "tokens": TextField(
                [Token("This"), Token("is"), Token("a"), Token("test"), Token(".")],
                {"tokens": token_indexer},
            ),
            "label": LabelField("T"),
        })
    instance2 = Instance(
        fields={
            "tokens": TextField(
                [Token("This"), Token("is"), Token("another"), Token("test"), Token(".")],
                {"tokens": token_indexer},
            ),
            "label": LabelField("F"),
        })
    datasets = {"train": [instance1, instance2]}
    vocabulary_params = Params({})
    vocabulary_path = "/tmp/path_should_not_be_used"
    vocab = TrainerPieces.create_or_extend_vocab(
        params=params,
        datasets=datasets,
        vocabulary_params=vocabulary_params,
        vocabulary_path=vocabulary_path,
        vocab=vocab,
        recover=False,
    )

    # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default.
    self.assertEqual(
        vocab.get_vocab_size("tokens"),
        len({"This", "is", "a", "another", "test", "extend", "."}) + 2,
    )
    self.assertEqual(vocab.get_vocab_size("labels"), len({"T", "F"}))
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
) -> "CallbackTrainer":
    pieces = TrainerPieces.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
    model = pieces.model
    params = pieces.params
    validation_iterator = pieces.validation_iterator or pieces.iterator

    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    callbacks_params = params.pop("callbacks", [])
    callbacks: List[Callback] = [
        Callback.from_params(
            params=callback_params,
            model=model,
            optimizer=optimizer,
            instances=pieces.train_dataset,
            iterator=pieces.iterator,
            shuffle=shuffle,
            validation_data=pieces.validation_dataset,
            validation_iterator=validation_iterator,
            serialization_dir=serialization_dir,
        )
        for callback_params in callbacks_params
    ]

    params.assert_empty(cls.__name__)
    return cls(
        model,
        pieces.train_dataset,
        pieces.iterator,
        optimizer,
        num_epochs=num_epochs,
        shuffle=shuffle,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        callbacks=callbacks,
    )
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
):
    pieces = TrainerPieces.from_params(params, serialization_dir, recover)
    return NoOpTrainer(serialization_dir, pieces.model)
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False):
    from allennlp.training.trainer_pieces import TrainerPieces

    pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
    return SplitUncertainModelTrainer.this_from_params(
        model=pieces.model,
        serialization_dir=serialization_dir,
        iterator=pieces.iterator,
        train_data=pieces.train_dataset,
        validation_data=pieces.validation_dataset,
        params=pieces.params,
        validation_iterator=pieces.validation_iterator)
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None):
    print(params)
    pieces = TrainerPieces.from_params(params,  # pylint: disable=no-member
                                       serialization_dir,
                                       recover,
                                       cache_directory,
                                       cache_prefix)

    return _from_params(cls,
                        pieces.model,
                        serialization_dir,
                        pieces.iterator,
                        pieces.train_dataset,
                        pieces.validation_dataset,
                        pieces.params,
                        pieces.validation_iterator)
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
):
    typ3 = params.get("trainer", {}).pop("type", "default")

    if typ3 == "default":
        # Special logic to keep old from_params behavior.
        from allennlp.training.trainer import Trainer
        from allennlp.training.trainer_pieces import TrainerPieces

        pieces = TrainerPieces.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
        return Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
        )
    else:
        klass = TrainerBase.by_name(typ3)
        # Explicit check to prevent recursion.
        is_overriden = (
            klass.from_params.__func__ != TrainerBase.from_params.__func__  # type: ignore
        )
        assert is_overriden, f"Class {klass.__name__} must override `from_params`."
        return klass.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging)
    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params,  # pylint: disable=no-member
                                           serialization_dir,
                                           recover,
                                           cache_directory,
                                           cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator)

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        if evaluate_on_test:
            raise ValueError("--evaluate-on-test only works with the default Trainer. "
                             "If you're using the CallbackTrainer you can use a callback "
                             "to evaluate at Events.TRAINING_END; otherwise you'll have "
                             "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
        evaluation_dataset = None

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key="")

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
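# Hedged usage sketch for the train_model helper above; the config path and
# serialization directory are placeholders.
from allennlp.common.params import Params

if __name__ == "__main__":
    config = Params.from_file("experiment.jsonnet")
    best_model = train_model(config, serialization_dir="/tmp/my_experiment", force=True)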
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    include_package: List[str] = None,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in
    distributed training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.
    cache_directory : ``str``, optional
        For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    node_rank : ``int``, optional
        Rank of the node.
    world_size : ``int``, optional
        The number of processes involved in distributed training.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    prepare_global_logging(serialization_dir,
                           file_friendly_logging,
                           rank=process_rank,
                           world_size=world_size)
    prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    if distributed:
        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(gpu_id)
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        if evaluate_on_test:
            raise ValueError("--evaluate-on-test only works with the default Trainer. "
                             "If you're using the CallbackTrainer you can use a callback "
                             "to evaluate at Events.TRAINING_END; otherwise you'll have "
                             "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
        evaluation_dataset = None

    params.assert_empty("base train command")

    try:
        if distributed:
            # let the setup get ready for all the workers
            dist.barrier()
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    if master:
        if evaluation_dataset and evaluate_on_test:
            logger.info("The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer.cuda_device,
                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                batch_weight_key="",
            )

            for key, value in test_metrics.items():
                metrics["test_" + key] = value

        elif evaluation_dataset:
            logger.info("To evaluate on the test set after training, pass the "
                        "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

        dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    if not distributed:
        return trainer.model

    return None  # to make mypy happy
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    local_rank: int = 0,
) -> "Trainer":
    from allennlp.training.trainer import Trainer
    from allennlp.training.trainer_pieces import TrainerPieces

    config = dict(as_flat_dict(params.as_dict()))
    pieces = TrainerPieces.from_params(params, serialization_dir, recover)
    model = pieces.model
    serialization_dir = serialization_dir
    iterator = pieces.iterator
    train_data = pieces.train_dataset
    validation_data = pieces.validation_dataset
    params = pieces.params
    validation_iterator = pieces.validation_iterator

    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)

    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters
        )
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None

    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if "checkpointer" in params:
        if (
            "keep_serialized_model_every_num_seconds" in params
            or "num_serialized_models_to_keep" in params
        ):
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods."
            )
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None
        )
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
        )

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)

    distributed = params.pop_bool("distributed", False)
    world_size = params.pop_int("world_size", 1)

    num_gradient_accumulation_steps = params.pop("num_gradient_accumulation_steps", 1)

    lang_mean_dir = params.pop("ft_lang_mean_dir", None)
    if lang_mean_dir:
        try:
            assert model._lang_means is not None
            lang_mean = get_lang_mean(lang_mean_dir)
            model.add_ft_lang_mean_to_lang_means(lang_mean)
        except (AttributeError, AssertionError):
            pass

    writer = None
    wandb_config = params.pop("wandb", None)
    if wandb_config is not None:
        writer = WandBWriter(config, model, wandb_config)

    params.assert_empty(cls.__name__)
    return cls(
        model,
        optimizer,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=lr_scheduler,
        momentum_scheduler=momentum_scheduler,
        checkpointer=checkpointer,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        writer=writer,
    )
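# Hedged sketch of a "trainer" block that the from_params above would consume;
# the keys mirror the params.pop* calls, the values are illustrative only.
TRAINER_CONFIG = {
    "optimizer": {"type": "adam", "lr": 1e-3},
    "num_epochs": 40,
    "patience": 5,
    "validation_metric": "+accuracy",
    "cuda_device": 0,
    "grad_norm": 5.0,
    "num_gradient_accumulation_steps": 2,
    "learning_rate_scheduler": {"type": "reduce_on_plateau", "factor": 0.5, "patience": 2},
}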
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool,
                cache_directory: str,
                cache_prefix: str) -> 'MatchingTrainer':
    # pylint: disable=arguments-differ
    pieces = TrainerPieces.from_params(params,  # pylint: disable=no-member
                                       serialization_dir,
                                       recover,
                                       cache_directory,
                                       cache_prefix)
    model = pieces.model
    iterator = pieces.iterator
    train_data = pieces.train_dataset
    validation_data = pieces.validation_dataset
    params = pieces.params
    validation_iterator = pieces.validation_iterator

    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    retrieve_text = params.pop("retrieve_text", True)

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None

    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)

    params.assert_empty(cls.__name__)
    return cls(model,
               optimizer,
               iterator,
               train_data,
               validation_data,
               patience=patience,
               retrieve_text=retrieve_text,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average)
        return self.name


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", type=Action, choices=list(Action), required=True)
    parser.add_argument("--config", required=True)
    parser.add_argument("--serialization-dir", required=True)
    parser.add_argument("--batch-count", type=int, default=0)
    parser.add_argument("--assume-multiprocess-types", action="store_true")
    args = parser.parse_args()

    params = Params.from_file(args.config)
    pieces = TrainerPieces.from_params(params, args.serialization_dir)

    raw_generator = pieces.iterator(pieces.train_dataset,
                                    num_epochs=1,
                                    shuffle=True)

    if args.action is Action.log:
        log_iterable(raw_generator, args.assume_multiprocess_types)
    elif args.action is Action.time:
        time_iterable(raw_generator, args.batch_count)
    elif args.action is Action.first:
        time_to_first(raw_generator)
    else:
        raise Exception(f"Unaccounted for action {args.action}")
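# Hedged usage sketch for the benchmarking entry point above, assuming the file
# is saved as benchmark_iterator.py (hypothetical name) and that the Action enum
# accepts its member names as values; paths are placeholders.
#
#   python benchmark_iterator.py --action time \
#       --config experiment.jsonnet \
#       --serialization-dir /tmp/benchmark \
#       --batch-count 500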
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
) -> "CallbackTrainer":
    pieces = TrainerPieces.from_params(params, serialization_dir, recover)
    model = pieces.model
    params = pieces.params
    validation_iterator = pieces.validation_iterator or pieces.iterator

    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))

    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    callbacks_params = params.pop("callbacks", [])
    callbacks: List[Callback] = [
        Callback.from_params(
            params=callback_params,
            model=model,
            optimizer=optimizer,
            instances=pieces.train_dataset,
            iterator=pieces.iterator,
            shuffle=shuffle,
            validation_data=pieces.validation_dataset,
            validation_iterator=validation_iterator,
            serialization_dir=serialization_dir,
        )
        for callback_params in callbacks_params
    ]

    distributed = params.pop_bool("distributed", False)
    world_size = params.pop_int("world_size", 1)

    if distributed:
        rank = cuda_device
    else:
        rank = 0

    params.assert_empty(cls.__name__)
    return cls(
        model,
        pieces.train_dataset,
        pieces.iterator,
        optimizer,
        num_epochs=num_epochs,
        shuffle=shuffle,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        callbacks=callbacks,
        distributed=distributed,
        rank=rank,
        world_size=world_size,
    )
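# Hedged sketch of the "callbacks" list the comprehension above iterates over;
# the callback names come from the stock AllenNLP callback registry, but the
# exact keys can differ by version, so treat this as illustrative only.
CALLBACKS_CONFIG = [
    "validate",
    {"type": "track_metrics", "patience": 5},
    {"type": "checkpoint", "checkpointer": {"num_serialized_models_to_keep": 1}},
]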
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
) -> Model:
    create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))
    stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging)
    prepare_environment(params)

    cuda_device = params.params.get("trainer").get("cuda_device", -1)
    check_for_gpu(cuda_device)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if True:  # NOTE: this branch is always taken; trainer_type is computed above but never consulted.
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
        logger.info("Using MultiTrainer")
        from lm.trainining.MultiTaskTrainer import MultiTaskTrainer  # MultiTrainer

        trainer = MultiTaskTrainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        if evaluate_on_test:
            raise ValueError("--evaluate-on-test only works with the default Trainer. "
                             "If you're using the CallbackTrainer you can use a callback "
                             "to evaluate at Events.TRAINING_END; otherwise you'll have "
                             "to run allennlp evaluate separately.")

        """ The only main difference """
        print("Using MultiTrainer")
        logger.info("Using MultiTrainer")
        trainer = MultiTrainer.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
        evaluation_dataset = None

    params.assert_empty("base train command")

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key="",
        )

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> 'Trainer':
    # pylint: disable=arguments-differ
    # We have to call TrainerPieces.from_params since we are using our own Trainer
    pieces = TrainerPieces.from_params(params, serialization_dir, recover)
    model = pieces.model
    serialization_dir = serialization_dir
    iterator = pieces.iterator
    train_data = pieces.train_dataset
    validation_data = pieces.validation_dataset
    validation_iterator = pieces.validation_iterator
    params = pieces.params

    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    accumulation_steps = params.pop("accumulation_steps", 0)
    opt_level = params.pop("opt_level", "O1")
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    half_precision = params.pop("half_precision", False)
    warmup_proportion = params.pop("warmup_proportion", None)
    pretrained_model = params.pop("pretrained_model", None)

    if pretrained_model:
        logger.info('Loading pretrained model from %s', pretrained_model)
        model = load_archive(pretrained_model).model
        model._discriminative_loss_weight = 1  # TODO: fix this hack

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None

    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)

    params.assert_empty(cls.__name__)
    return cls(model,
               optimizer,
               iterator,
               train_data,
               validation_data,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               accumulation_steps=accumulation_steps,
               opt_level=opt_level,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average,
               half_precision=half_precision,
               warmup_proportion=warmup_proportion)