def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # `os.listdir` always returns a list, so comparing it to `None` would never
    # trigger; check for a non-empty listing instead.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    instances = [
        instance
        for key, dataset in all_datasets.items()
        for instance in dataset
        if key in datasets_for_vocab_creation
    ]

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    log_frozen_and_tunable_parameter_names(model)
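# The regex-based freezing loop at the end of `dry_run_from_params` is worth
# seeing in isolation. A minimal, self-contained sketch of the same pattern;
# the two-layer model and the "^0\." regex are hypothetical, for illustration
# only:
#
# import re
# import torch
#
# toy_model = torch.nn.Sequential(
#     torch.nn.Embedding(100, 8),  # parameter named "0.weight"
#     torch.nn.Linear(8, 2),       # parameters named "1.weight", "1.bias"
# )
# no_grad_regexes = [r"^0\."]  # freeze everything in the embedding layer
#
# for name, parameter in toy_model.named_parameters():
#     if any(re.search(regex, name) for regex in no_grad_regexes):
#         parameter.requires_grad_(False)
#
# assert not toy_model[0].weight.requires_grad  # frozen
# assert toy_model[1].weight.requires_grad      # still tunable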
def from_partial_objects(
    cls,
    name: str,
    model: Model,
    num_epochs: Optional[int] = None,
    batches_per_epoch: Optional[int] = None,
    cuda_device: Optional[Union[int, torch.device]] = None,
    grad_norm: float = None,
    grad_clipping: float = None,
    optimizer: Lazy[Optimizer] = None,
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    tensorboard_writer: Lazy[TensorboardWriter] = None,
) -> "ComponentOptimizer":
    # NOTE: `tensorboard_writer` is accepted here but not consumed in this body.
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]

    # A missing `optimizer` config yields a falsy result from `construct`; fall
    # back to the default optimizer in that case.
    optimizer_ = optimizer.construct(model_parameters=parameters)
    if not optimizer_:
        optimizer_ = Optimizer.default(parameters)

    common_util.log_frozen_and_tunable_parameter_names(model)

    learning_rate_scheduler_ = learning_rate_scheduler.construct(
        optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
    )
    momentum_scheduler_ = momentum_scheduler.construct(optimizer=optimizer_)

    return cls(
        name=name,
        model=model,
        optimizer=optimizer_,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
    )
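# The `Lazy[...]` arguments above defer construction until their dependencies
# exist. A toy sketch of the idea follows; this is NOT AllenNLP's real `Lazy`,
# which is wired into `FromParams` — the class below is a simplified stand-in:
#
# from typing import Callable, Generic, TypeVar
#
# import torch
#
# T = TypeVar("T")
#
# class Lazy(Generic[T]):
#     """Holds a constructor and defers the remaining kwargs to `construct`."""
#
#     def __init__(self, constructor: Callable[..., T]):
#         self._constructor = constructor
#
#     def construct(self, **kwargs) -> T:
#         return self._constructor(**kwargs)
#
# # The optimizer can only be built once the model's parameters exist:
# lazy_optimizer = Lazy(
#     lambda model_parameters: torch.optim.SGD([p for _, p in model_parameters], lr=0.1)
# )
# toy_model = torch.nn.Linear(4, 2)
# params = [[n, p] for n, p in toy_model.named_parameters() if p.requires_grad]
# optimizer = lazy_optimizer.construct(model_parameters=params)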
def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    data_loader: DataLoader,
    validation_data_loader: DataLoader = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: str = "-loss",
    num_epochs: int = 20,
    cuda_device: int = -1,
    grad_norm: float = None,
    grad_clipping: float = None,
    distributed: bool = None,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    opt_level: Optional[str] = None,
    no_grad: List[str] = None,
    optimizer: Lazy[Optimizer] = None,
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    tensorboard_writer: Lazy[TensorboardWriter] = None,
    moving_average: Lazy[MovingAverage] = None,
    checkpointer: Lazy[Checkpointer] = None,
    batch_callbacks: List[BatchCallback] = None,
    epoch_callbacks: List[EpochCallback] = None,
) -> "Trainer":
    """
    This method exists so that we can have a documented method to construct this class using
    `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
    method.

    The reason we can't just use `__init__` with `FromParams` here is because there are
    sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
    annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
    have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
    have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
    doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

    If you're not using `FromParams`, you can just construct these arguments in the right order
    yourself in your code and call the constructor directly.
    """
    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    if no_grad:
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    common_util.log_frozen_and_tunable_parameter_names(model)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)
    if not optimizer_:
        optimizer_ = Optimizer.default(parameters)

    try:
        batches_per_epoch = len(data_loader)
    except TypeError:
        # If the dataset is lazy, it won't have a length.
        batches_per_epoch = None

    # When constructed via `FromParams`, each `Lazy` wrapper's `construct` returns a falsy
    # value if the corresponding config entry was absent, hence the `or` fallbacks below.
    moving_average_ = moving_average.construct(parameters=parameters)
    learning_rate_scheduler_ = learning_rate_scheduler.construct(
        optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
    )
    momentum_scheduler_ = momentum_scheduler.construct(optimizer=optimizer_)
    checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir)
    tensorboard_writer_ = tensorboard_writer.construct() or TensorboardWriter(serialization_dir)

    return cls(
        model,
        optimizer_,
        data_loader,
        patience=patience,
        validation_metric=validation_metric,
        validation_data_loader=validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
        tensorboard_writer=tensorboard_writer_,
        checkpointer=checkpointer_,
        moving_average=moving_average_,
        batch_callbacks=batch_callbacks,
        epoch_callbacks=epoch_callbacks,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        opt_level=opt_level,
    )
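# The `len(data_loader)` / `except TypeError` fallback above works because
# calling `len()` on an object without `__len__` (e.g. a generator backing a
# lazy dataset) raises `TypeError`. A minimal sketch with a hypothetical
# generator-backed loader:
#
# def lazy_batches():
#     while True:
#         yield {"tokens": ...}
#
# try:
#     batches_per_epoch = len(lazy_batches())  # generators have no __len__
# except TypeError:
#     batches_per_epoch = None
#
# assert batches_per_epoch is None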
def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    data_loader: DataLoader,
    validation_data_loader: DataLoader = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: Union[str, List[str]] = "-loss",
    num_epochs: int = 20,
    cuda_device: Optional[Union[int, torch.device]] = None,
    grad_norm: float = None,
    grad_clipping: float = None,
    distributed: bool = False,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    use_amp: bool = False,
    no_grad: List[str] = None,
    optimizer: Lazy[Optimizer] = Lazy(Optimizer.default),
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    moving_average: Lazy[MovingAverage] = None,
    checkpointer: Lazy[Checkpointer] = Lazy(Checkpointer),
    callbacks: List[Lazy[TrainerCallback]] = None,
    enable_default_callbacks: bool = True,
    run_sanity_checks: bool = True,
) -> "Trainer":
    """
    This method exists so that we can have a documented method to construct this class using
    `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
    method.

    The reason we can't just use `__init__` with `FromParams` here is because there are
    sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
    annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
    have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
    have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
    doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

    If you're not using `FromParams`, you can just construct these arguments in the right order
    yourself in your code and call the constructor directly.
    """
    if cuda_device is None:
        from torch import cuda

        if cuda.device_count() > 0:
            cuda_device = 0
        else:
            cuda_device = -1

    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    if no_grad:
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)

    common_util.log_frozen_and_tunable_parameter_names(model)

    batches_per_epoch: Optional[int]
    try:
        batches_per_epoch = len(data_loader)
        batches_per_epoch = math.ceil(batches_per_epoch / num_gradient_accumulation_steps)
    except TypeError:
        batches_per_epoch = None

    moving_average_ = (
        None if moving_average is None else moving_average.construct(parameters=parameters)
    )
    learning_rate_scheduler_ = (
        None
        if learning_rate_scheduler is None
        else learning_rate_scheduler.construct(
            optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
        )
    )
    momentum_scheduler_ = (
        None if momentum_scheduler is None else momentum_scheduler.construct(optimizer=optimizer_)
    )
    checkpointer_ = checkpointer.construct(serialization_dir=serialization_dir)

    callbacks_: List[TrainerCallback] = []
    for callback_ in callbacks or []:
        callbacks_.append(callback_.construct(serialization_dir=serialization_dir))

    return cls(
        model,
        optimizer_,
        data_loader,
        patience=patience,
        validation_metric=validation_metric,
        validation_data_loader=validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
        checkpointer=checkpointer_,
        moving_average=moving_average_,
        callbacks=callbacks_,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        use_amp=use_amp,
        enable_default_callbacks=enable_default_callbacks,
        run_sanity_checks=run_sanity_checks,
    )
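# In the newer version above, `batches_per_epoch` counts optimizer steps, not
# raw batches: with gradient accumulation, several batches feed a single
# optimizer step, hence the `math.ceil`. A worked example with assumed sizes:
#
# import math
#
# num_batches = 10
# num_gradient_accumulation_steps = 4
# steps_per_epoch = math.ceil(num_batches / num_gradient_accumulation_steps)
# assert steps_per_epoch == 3  # batches grouped as 4 + 4 + 2 -> 3 steps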
def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    iterator: DataIterator,
    train_data: Iterable[Instance],
    validation_iterator: DataIterator = None,
    validation_data: Iterable[Instance] = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: str = "-loss",
    shuffle: bool = True,
    num_epochs: int = 20,
    cuda_device: int = -1,
    grad_norm: float = None,
    grad_clipping: float = None,
    model_save_interval: float = None,
    summary_interval: int = 100,
    histogram_interval: int = None,
    should_log_parameter_statistics: bool = True,
    should_log_learning_rate: bool = False,
    log_batch_size_period: int = None,
    distributed: bool = None,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    no_grad: List[str] = None,
    optimizer: Lazy[Optimizer] = None,
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    moving_average: Lazy[MovingAverage] = None,
    checkpointer: Lazy[Checkpointer] = None,
) -> "Trainer":
    """
    This method exists so that we can have a documented method to construct this class using
    `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
    method.

    The reason we can't just use `__init__` with `FromParams` here is because there are
    sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
    annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
    have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
    have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
    doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

    If you're not using `FromParams`, you can just construct these arguments in the right order
    yourself in your code and call the constructor directly.
    """
    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    if no_grad:
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    common_util.log_frozen_and_tunable_parameter_names(model)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)
    if not optimizer_:
        optimizer_ = Optimizer.default(parameters)

    batches_per_epoch = iterator.get_num_batches(train_data)
    if batches_per_epoch == 1:
        # get_num_batches returns 1 when it can't determine the answer
        batches_per_epoch = None

    moving_average_ = moving_average.construct(parameters=parameters)
    learning_rate_scheduler_ = learning_rate_scheduler.construct(
        optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
    )
    momentum_scheduler_ = momentum_scheduler.construct(optimizer=optimizer_)
    checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir)

    return cls(
        model,
        optimizer_,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
        checkpointer=checkpointer_,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average_,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
    )
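# The docstring above says that non-`FromParams` users should simply build the
# pieces in dependency order and call the constructor directly. A minimal
# sketch of that order with plain PyTorch stand-ins:
#
# import torch
#
# toy_model = torch.nn.Linear(4, 2)                    # 1. model first
# params = [p for p in toy_model.parameters() if p.requires_grad]
# optimizer = torch.optim.SGD(params, lr=0.1)          # 2. optimizer needs the parameters
# scheduler = torch.optim.lr_scheduler.StepLR(
#     optimizer, step_size=10                          # 3. scheduler needs the optimizer
# )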
def from_params(
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    model: Model = None,
    embedding_sources_mapping: Dict[str, str] = None,
    extend_vocab: bool = False,
) -> "MetaTrainerPieces":
    all_datasets = training_util.datasets_from_params(params)
    vocabulary_params = params.pop("vocabulary", {})

    if model:
        if params.pop("model", None):
            logger.warning(
                "You passed parameters for the model in your configuration file, but we "
                "are ignoring them, using instead the loaded model parameters."
            )

        # TODO(mattg): This should be updated now that directory_path no longer exists.
        if vocabulary_params.get("directory_path", None):
            logger.warning(
                "You passed `directory_path` in parameters for the vocabulary in "
                "your configuration file, but it will be ignored because we already "
                "have a model with a vocabulary."
            )

        vocab = model.vocab
    else:
        vocab = None

    vocabulary_path = os.path.join(serialization_dir, "vocabulary")

    if not vocab or extend_vocab:
        vocab = MetaTrainerPieces.create_or_extend_vocab(
            datasets=all_datasets,
            params=params,
            recover=recover,
            vocab=vocab,
            vocabulary_params=vocabulary_params,
            vocabulary_path=vocabulary_path,
        )

    if not model:
        model = Model.from_params(vocab=vocab, params=params.pop("model"))

    # If vocab extension is ON for training, embedding extension should also be
    # done. If vocab and embeddings are already in sync, it would be a no-op.
    model.extend_embedder_vocab(embedding_sources_mapping)

    # Initializing the model can have the side effect of expanding the vocabulary.
    # Save the vocab only in the master. In the degenerate non-distributed
    # case, we're trivially the master.
    if is_master():
        vocab.save_to_files(vocabulary_path)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)

    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    train_datas = all_datasets["train"]
    validation_datas = all_datasets.get("validation")
    test_datas = all_datasets.get("test")

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    log_frozen_and_tunable_parameter_names(model)

    return cls(
        model=model,
        iterator=iterator,
        train_datasets=train_datas,
        validation_datasets=validation_datas,
        test_datasets=test_datas,
        validation_iterator=validation_iterator,
        params=trainer_params,
    )
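# `is_master()` above gates vocabulary saving so that only one process writes
# in a distributed run. One common way such a guard is implemented with raw
# `torch.distributed` (a sketch, not necessarily this repo's definition):
#
# import torch.distributed as dist
#
# def is_master_sketch() -> bool:
#     # Trivially the master when not running distributed at all.
#     return not dist.is_available() or not dist.is_initialized() or dist.get_rank() == 0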
def run(  # type: ignore
    self,
    model: Lazy[Model],
    dataset: DatasetDict,
    data_loader: Lazy[TangoDataLoader],
    optimizer: Lazy[Optimizer],
    validation_data_loader: Optional[Lazy[TangoDataLoader]] = None,
    training_split: str = "train",
    validation_split: Optional[str] = None,
    patience: Optional[int] = None,
    validation_metric: Union[str, List[str]] = "-loss",
    num_epochs: int = 20,
    checkpointer: Optional[Lazy[Checkpointer]] = None,
    grad_norm: Union[float, bool] = False,
    grad_clipping: Optional[float] = None,
    learning_rate_scheduler: Optional[Lazy[LearningRateScheduler]] = None,
    momentum_scheduler: Optional[Lazy[MomentumScheduler]] = None,
    moving_average: Optional[Lazy[MovingAverage]] = None,
    callbacks: List[Lazy[TrainerCallback]] = None,
    num_gradient_accumulation_steps: int = 1,
    use_amp: bool = False,
    enable_default_callbacks: bool = True,
    run_confidence_checks: bool = True,
    no_grad: Optional[List[str]] = None,
    limit_batches_per_epoch: Optional[int] = None,
) -> Model:
    serialization_dir = self.work_dir()

    if validation_data_loader is None:
        validation_data_loader = data_loader
    if validation_split is None:
        validation_loader = None
    else:
        concrete_validation_data_loader = validation_data_loader.construct(
            instances=dataset.splits[validation_split]
        )
        # Each `Lazy` wrapper is `del`eted as soon as it is consumed so later
        # code can't accidentally use the un-constructed version.
        del validation_data_loader
        if limit_batches_per_epoch is not None:
            concrete_validation_data_loader = MaxBatchesDataLoader(
                concrete_validation_data_loader, limit_batches_per_epoch
            )
        validation_loader = DataLoaderAdapter(
            tango_data_loader=concrete_validation_data_loader
        )

    concrete_data_loader = data_loader.construct(instances=dataset.splits[training_split])
    del data_loader
    if limit_batches_per_epoch is not None:
        concrete_data_loader = MaxBatchesDataLoader(
            concrete_data_loader, limit_batches_per_epoch
        )
    loader = DataLoaderAdapter(tango_data_loader=concrete_data_loader)

    if torch.cuda.device_count() > 0:
        cuda_device = torch.device(0)
    else:
        cuda_device = torch.device("cpu")

    check_for_gpu(cuda_device)
    loader.set_target_device(cuda_device)
    if validation_loader is not None:
        validation_loader.set_target_device(cuda_device)

    concrete_model = model.construct(vocab=dataset.vocab).to(cuda_device)
    del model
    if no_grad:
        for name, parameter in concrete_model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    parameters = [[n, p] for n, p in concrete_model.named_parameters() if p.requires_grad]
    concrete_optimizer = optimizer.construct(model_parameters=parameters)
    del optimizer

    log_frozen_and_tunable_parameter_names(concrete_model)

    concrete_moving_average = (
        None if moving_average is None else moving_average.construct(parameters=parameters)
    )
    del moving_average

    concrete_learning_rate_scheduler = (
        None
        if learning_rate_scheduler is None
        else learning_rate_scheduler.construct(
            optimizer=concrete_optimizer,
            num_epochs=num_epochs,
            num_steps_per_epoch=concrete_data_loader.num_batches_per_epoch(),
        )
    )
    del learning_rate_scheduler

    concrete_momentum_scheduler = (
        None
        if momentum_scheduler is None
        else momentum_scheduler.construct(optimizer=concrete_optimizer)
    )
    del momentum_scheduler

    if checkpointer is not None:
        concrete_checkpointer = checkpointer.construct(serialization_dir=serialization_dir)
    else:
        concrete_checkpointer = Checkpointer(serialization_dir)
    del checkpointer

    concrete_callbacks: List[TrainerCallback] = [
        cb.construct(serialization_dir=serialization_dir) for cb in callbacks or []
    ]
    del callbacks

    trainer = GradientDescentTrainer(
        concrete_model,
        optimizer=concrete_optimizer,
        data_loader=loader,
        patience=patience,
        validation_metric=validation_metric,
        validation_data_loader=validation_loader,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        checkpointer=concrete_checkpointer,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=concrete_learning_rate_scheduler,
        momentum_scheduler=concrete_momentum_scheduler,
        moving_average=concrete_moving_average,
        callbacks=concrete_callbacks,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        use_amp=use_amp,
        enable_default_callbacks=enable_default_callbacks,
        run_confidence_checks=run_confidence_checks,
    )

    trainer.train()
    return trainer.model
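# `MaxBatchesDataLoader` above caps each epoch at `limit_batches_per_epoch`
# batches. A minimal sketch of the wrapping idea using `itertools.islice`
# (the real Tango class also forwards `num_batches_per_epoch`, device
# placement, etc.; `MaxBatches` here is a hypothetical stand-in):
#
# import itertools
#
# class MaxBatches:
#     def __init__(self, inner, limit: int):
#         self.inner = inner
#         self.limit = limit
#
#     def __iter__(self):
#         return itertools.islice(iter(self.inner), self.limit)
#
# batches = [{"x": i} for i in range(100)]
# assert sum(1 for _ in MaxBatches(batches, 5)) == 5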