def from_params(cls,
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params,
                validation_iterator: DataIterator = None) -> 'GANTrainer':
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = params.pop_int("cuda_device", -1)
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)

    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if lr_scheduler_params:
        scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        scheduler = None

    num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
    keep_serialized_model_every_num_seconds = params.pop_int(
        "keep_serialized_model_every_num_seconds", None)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)

    params.assert_empty(cls.__name__)
    return cls(model, optimizer, iterator,
               train_data, validation_data,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=scheduler,
               num_serialized_models_to_keep=num_serialized_models_to_keep,
               keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval)

def from_params(cls,
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params,
                validation_iterator: DataIterator = None) -> 'Trainer':
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = params.pop_int("cuda_device", -1)
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)

    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if lr_scheduler_params:
        scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        scheduler = None

    num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
    keep_serialized_model_every_num_seconds = params.pop_int(
        "keep_serialized_model_every_num_seconds", None)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)

    params.assert_empty(cls.__name__)
    return Trainer(model, optimizer, iterator,
                   train_data, validation_data,
                   patience=patience,
                   validation_metric=validation_metric,
                   validation_iterator=validation_iterator,
                   num_epochs=num_epochs,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   grad_norm=grad_norm,
                   grad_clipping=grad_clipping,
                   learning_rate_scheduler=scheduler,
                   num_serialized_models_to_keep=num_serialized_models_to_keep,
                   keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
                   model_save_interval=model_save_interval,
                   summary_interval=summary_interval,
                   histogram_interval=histogram_interval)

def from_params(cls,
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                iterator_aux: DataIterator,
                train_dataset: Dataset,
                train_dataset_aux: Dataset,
                mixing_ratio: float,
                cutoff_epoch: int,
                validation_dataset: Optional[Dataset],
                validation_dataset_aux: Optional[Dataset],
                params: Params,
                files_to_archive: Dict[str, str]) -> 'MultiTaskTrainer':
    patience = params.pop("patience", 2)
    validation_metric = params.pop("validation_metric", "-loss")
    num_epochs = params.pop("num_epochs", 20)
    cuda_device = params.pop("cuda_device", -1)
    grad_norm = params.pop("grad_norm", None)
    grad_clipping = params.pop("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)

    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if lr_scheduler_params:
        scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        scheduler = None

    no_tqdm = params.pop("no_tqdm", False)

    params.assert_empty(cls.__name__)
    return MultiTaskTrainer(model=model,
                            optimizer=optimizer,
                            iterator=iterator,
                            iterator_aux=iterator_aux,
                            train_dataset=train_dataset,
                            train_dataset_aux=train_dataset_aux,
                            mixing_ratio=mixing_ratio,
                            cutoff_epoch=cutoff_epoch,
                            validation_dataset=validation_dataset,
                            validation_dataset_aux=validation_dataset_aux,
                            patience=patience,
                            validation_metric=validation_metric,
                            num_epochs=num_epochs,
                            serialization_dir=serialization_dir,
                            files_to_archive=files_to_archive,
                            cuda_device=cuda_device,
                            grad_norm=grad_norm,
                            grad_clipping=grad_clipping,
                            learning_rate_scheduler=scheduler,
                            no_tqdm=no_tqdm)

def get_args(cls,
             model: Model,
             base_dir: str,
             iterator: DataIterator,
             train_data: Iterable[Instance],
             validation_data: Optional[Iterable[Instance]],
             segmenter: Optional[BasePredictionClass],
             params: Params) -> Dict[str, Any]:
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = params.pop_int("cuda_device", -1)
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    num_serialized_models_to_keep = params.pop("num_serialized_models_to_keep", None)

    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if lr_scheduler_params:
        scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        scheduler = None

    params.assert_empty(cls.__name__)

    kwargs = {
        'model': model,
        'optimizer': optimizer,
        'iterator': iterator,
        'train_dataset': train_data,
        'validation_dataset': validation_data,
        'segmenter': segmenter,
        'patience': patience,
        'validation_metric': validation_metric,
        'num_epochs': num_epochs,
        'base_dir': base_dir,
        'cuda_device': cuda_device,
        'grad_norm': grad_norm,
        'grad_clipping': grad_clipping,
        'learning_rate_scheduler': scheduler,
        'num_serialized_models_to_keep': num_serialized_models_to_keep,
    }
    return kwargs

def from_params(cls,
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                train_dataset: Dataset,
                validation_dataset: Optional[Dataset],
                params: Params) -> 'Trainer':
    patience = params.pop("patience", 2)
    validation_metric = params.pop("validation_metric", "-loss")
    num_epochs = params.pop("num_epochs", 20)
    cuda_device = params.pop("cuda_device", -1)
    grad_norm = params.pop("grad_norm", None)
    grad_clipping = params.pop("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)

    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if lr_scheduler_params:
        scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        scheduler = None

    no_tqdm = params.pop("no_tqdm", False)

    params.assert_empty(cls.__name__)
    return Trainer(model, optimizer, iterator,
                   train_dataset, validation_dataset,
                   patience=patience,
                   validation_metric=validation_metric,
                   num_epochs=num_epochs,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   grad_norm=grad_norm,
                   grad_clipping=grad_clipping,
                   learning_rate_scheduler=scheduler,
                   no_tqdm=no_tqdm)

def from_params(cls,
                model: Model,
                serialization_dir: str,
                train_iterator: DataIterator,
                val_iterator: DataIterator,
                cuda_device: int,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params) -> 'Trainer':
    patience = params.pop_int("patience", 2)
    validation_metric = params.pop("validation_metric", "-loss")
    num_epochs = params.pop_int("num_epochs", 20)
    # cuda_device = params.pop_int("cuda_device", -1)
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)

    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if lr_scheduler_params:
        scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        scheduler = None

    params.assert_empty(cls.__name__)
    return Trainer(model, optimizer,
                   train_iterator, val_iterator,
                   train_data, validation_data,
                   patience=patience,
                   validation_metric=validation_metric,
                   num_epochs=num_epochs,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   grad_norm=grad_norm,
                   grad_clipping=grad_clipping,
                   learning_rate_scheduler=scheduler)

def from_params(cls,  # type: ignore
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params,
                validation_iterator: DataIterator = None) -> 'Trainer':
    # pylint: disable=arguments-differ
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if "moving_average" in params:
        moving_average = MovingAverage.from_params(params.pop("moving_average"),
                                                   parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)

    params.assert_empty(cls.__name__)
    return cls(model, optimizer, iterator,
               train_data, validation_data,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average)

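# --- Hedged usage sketch (not part of the original code above). It shows one way to drive the
# from_params method with a hand-built Params object; the key names mirror the params.pop_*
# calls above, while the concrete optimizer/scheduler settings and the `build_trainer_from_config`
# helper are illustrative assumptions.
from allennlp.common import Params


def build_trainer_from_config(model, iterator, train_data, validation_data,
                              serialization_dir: str) -> 'Trainer':
    trainer_params = Params({
        "num_epochs": 20,
        "patience": 10,
        "cuda_device": -1,
        "grad_norm": 5.0,
        "optimizer": {"type": "adam", "lr": 1e-3},
        "learning_rate_scheduler": {"type": "reduce_on_plateau", "factor": 0.5, "patience": 2},
    })
    # Trainer here is assumed to be the class that owns the from_params shown above.
    return Trainer.from_params(model=model,
                               serialization_dir=serialization_dir,
                               iterator=iterator,
                               train_data=train_data,
                               validation_data=validation_data,
                               params=trainer_params)
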
def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    data_loader: DataLoader,
    validation_data_loader: DataLoader = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: str = "-loss",
    num_epochs: int = 20,
    cuda_device: int = -1,
    grad_norm: float = None,
    grad_clipping: float = None,
    distributed: bool = None,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    opt_level: Optional[str] = None,
    no_grad: List[str] = None,
    optimizer: Lazy[Optimizer] = None,
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    tensorboard_writer: Lazy[TensorboardWriter] = None,
    moving_average: Lazy[MovingAverage] = None,
    checkpointer: Lazy[Checkpointer] = None,
    batch_callbacks: List[BatchCallback] = None,
    epoch_callbacks: List[EpochCallback] = None,
) -> "Trainer":
    """
    This method exists so that we can have a documented method to construct this class using
    `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
    method.

    The reason we can't just use `__init__` with `FromParams` here is because there are
    sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
    annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
    have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
    have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
    doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

    If you're not using `FromParams`, you can just construct these arguments in the right order
    yourself in your code and call the constructor directly.
    """
    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    if no_grad:
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    common_util.log_frozen_and_tunable_parameter_names(model)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)
    if not optimizer_:
        optimizer_ = Optimizer.default(parameters)

    try:
        batches_per_epoch = len(data_loader)
    except TypeError:
        # If the dataset is lazy, it won't have a length.
        batches_per_epoch = None

    moving_average_ = moving_average.construct(parameters=parameters)
    learning_rate_scheduler_ = learning_rate_scheduler.construct(
        optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
    )
    momentum_scheduler_ = momentum_scheduler.construct(optimizer=optimizer_)

    checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir)
    tensorboard_writer_ = tensorboard_writer.construct() or TensorboardWriter(serialization_dir)

    return cls(
        model,
        optimizer_,
        data_loader,
        patience=patience,
        validation_metric=validation_metric,
        validation_data_loader=validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
        tensorboard_writer=tensorboard_writer_,
        checkpointer=checkpointer_,
        moving_average=moving_average_,
        batch_callbacks=batch_callbacks,
        epoch_callbacks=epoch_callbacks,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        opt_level=opt_level,
    )

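# --- Hedged sketch (assumption, not library code): the "construct things in the right order
# yourself" path that the docstring above describes. The torch.optim.Adam choice and the
# `build_trainer_directly` helper are illustrative; `Trainer` refers to the class whose
# from_partial_objects is shown above.
import torch


def build_trainer_directly(model, data_loader, serialization_dir: str,
                           validation_data_loader=None, num_epochs: int = 20) -> 'Trainer':
    # The optimizer needs the model's trainable parameters before it can exist ...
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=1e-3)
    # ... which is exactly the ordering the Lazy[] machinery above automates. With the pieces
    # built by hand, the constructor can be called directly.
    return Trainer(
        model,
        optimizer,
        data_loader,
        validation_data_loader=validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
    )
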
def _move_to_gpu(self, model: Model) -> Model:
    if self.cuda_device != -1:
        return model.cuda(self.cuda_device)
    else:
        return model

def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    data_loader: DataLoader,
    validation_data_loader: DataLoader = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: Union[str, List[str]] = "-loss",
    num_epochs: int = 20,
    cuda_device: Optional[Union[int, torch.device]] = None,
    grad_norm: float = None,
    grad_clipping: float = None,
    distributed: bool = False,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    use_amp: bool = False,
    no_grad: List[str] = None,
    optimizer: Lazy[Optimizer] = Lazy(Optimizer.default),
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    moving_average: Lazy[MovingAverage] = None,
    checkpointer: Lazy[Checkpointer] = Lazy(Checkpointer),
    callbacks: List[Lazy[TrainerCallback]] = None,
    enable_default_callbacks: bool = True,
    run_sanity_checks: bool = True,
) -> "Trainer":
    """
    This method exists so that we can have a documented method to construct this class using
    `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
    method.

    The reason we can't just use `__init__` with `FromParams` here is because there are
    sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
    annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
    have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
    have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
    doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

    If you're not using `FromParams`, you can just construct these arguments in the right order
    yourself in your code and call the constructor directly.
    """
    if cuda_device is None:
        from torch import cuda

        if cuda.device_count() > 0:
            cuda_device = 0
        else:
            cuda_device = -1

    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    if no_grad:
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)

    common_util.log_frozen_and_tunable_parameter_names(model)

    batches_per_epoch: Optional[int]
    try:
        batches_per_epoch = len(data_loader)
        batches_per_epoch = math.ceil(batches_per_epoch / num_gradient_accumulation_steps)
    except TypeError:
        batches_per_epoch = None

    moving_average_ = (
        None if moving_average is None else moving_average.construct(parameters=parameters)
    )
    learning_rate_scheduler_ = (
        None
        if learning_rate_scheduler is None
        else learning_rate_scheduler.construct(
            optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
        )
    )
    momentum_scheduler_ = (
        None if momentum_scheduler is None else momentum_scheduler.construct(optimizer=optimizer_)
    )
    checkpointer_ = checkpointer.construct(serialization_dir=serialization_dir)

    callbacks_: List[TrainerCallback] = []
    for callback_ in callbacks or []:
        callbacks_.append(callback_.construct(serialization_dir=serialization_dir))

    return cls(
        model,
        optimizer_,
        data_loader,
        patience=patience,
        validation_metric=validation_metric,
        validation_data_loader=validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
        checkpointer=checkpointer_,
        moving_average=moving_average_,
        callbacks=callbacks_,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        use_amp=use_amp,
        enable_default_callbacks=enable_default_callbacks,
        run_sanity_checks=run_sanity_checks,
    )

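# --- Hedged illustration (not from the source): what the Lazy defaults in the signature above
# provide. Lazy wraps a constructor whose remaining arguments arrive later, so a config with no
# "optimizer" key still produces a working optimizer once the model's parameters are known.
from allennlp.common.lazy import Lazy
from allennlp.training.optimizers import Optimizer

lazy_optimizer = Lazy(Optimizer.default)  # deferred: no model parameters yet
# Later, once `parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]`
# is available, the trainer calls:
#     optimizer_ = lazy_optimizer.construct(model_parameters=parameters)
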
def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    iterator: DataIterator,
    train_data: Iterable[Instance],
    validation_iterator: DataIterator = None,
    validation_data: Iterable[Instance] = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: str = "-loss",
    shuffle: bool = True,
    num_epochs: int = 20,
    cuda_device: int = -1,
    grad_norm: float = None,
    grad_clipping: float = None,
    model_save_interval: float = None,
    summary_interval: int = 100,
    histogram_interval: int = None,
    should_log_parameter_statistics: bool = True,
    should_log_learning_rate: bool = False,
    log_batch_size_period: int = None,
    distributed: bool = None,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    no_grad: List[str] = None,
    optimizer: Lazy[Optimizer] = None,
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    moving_average: Lazy[MovingAverage] = None,
    checkpointer: Lazy[Checkpointer] = None,
) -> "Trainer":
    """
    This method exists so that we can have a documented method to construct this class using
    `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
    method.

    The reason we can't just use `__init__` with `FromParams` here is because there are
    sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
    annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
    have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
    have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
    doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

    If you're not using `FromParams`, you can just construct these arguments in the right order
    yourself in your code and call the constructor directly.
    """
    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    if no_grad:
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    common_util.log_frozen_and_tunable_parameter_names(model)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)
    if not optimizer_:
        optimizer_ = Optimizer.default(parameters)

    batches_per_epoch = iterator.get_num_batches(train_data)
    if batches_per_epoch == 1:  # get_num_batches returns 1 when it can't determine the answer
        batches_per_epoch = None

    moving_average_ = moving_average.construct(parameters=parameters)
    learning_rate_scheduler_ = learning_rate_scheduler.construct(
        optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
    )
    momentum_scheduler_ = momentum_scheduler.construct(optimizer=optimizer_)

    checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir)

    return cls(
        model,
        optimizer_,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
        checkpointer=checkpointer_,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average_,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
    )

def from_params(cls,  # type: ignore
                model: Model,
                serialization_dir: str,
                files_to_archive: Dict[str, str],
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params,
                validation_iterator: DataIterator = None) -> 'TrainerFP16':
    # pylint: disable=arguments-differ
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    fp16 = params.pop_bool("fp16", False)
    dynamic_loss_scale = params.pop_bool("dynamic_loss_scale", True)
    validate_first = params.pop_bool("validate_first", False)

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device

    if fp16:
        model.half()
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]

    # If fp16, need to wrap the optimizer
    try:
        from apex.optimizers import FusedAdam
    except ImportError:
        raise ImportError(
            "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if fp16:
        # The FP16_Optimizer we use depends on whether the optimizer is FusedAdam or a regular
        # pytorch optimizer.
        if isinstance(optimizer, FusedAdam):
            from apex.optimizers import FP16_Optimizer
        else:
            from apex.fp16_utils import FP16_Optimizer
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=dynamic_loss_scale)

    if "moving_average" in params:
        moving_average = MovingAverage.from_params(params.pop("moving_average"),
                                                   parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    statistics_interval = params.pop_int("statistics_interval", 5000)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)

    params.assert_empty(cls.__name__)
    return cls(
        model,
        optimizer,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=lr_scheduler,
        momentum_scheduler=momentum_scheduler,
        checkpointer=checkpointer,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        statistics_interval=statistics_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average,
        fp16=fp16,
        validate_first=validate_first,
        files_to_archive=files_to_archive)

def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    iterator: DataIterator,
    train_data: Iterable[Instance],
    validation_iterator: DataIterator = None,
    validation_data: Iterable[Instance] = None,
    callbacks: List[Lazy[Callback]] = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: str = "-loss",
    shuffle: bool = True,
    num_epochs: int = 20,
    cuda_device: int = -1,
    distributed: bool = False,
    world_size: int = 1,
    optimizer: Lazy[Optimizer] = None,
) -> "CallbackTrainer":
    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)

    # Initialise the list up front so it is defined even when no callbacks are given.
    constructed_callbacks = []
    for callback in callbacks or []:
        # We only need to pass here the things that weren't already passed to
        # CallbackTrainer.from_partial_objects; FromParams will automatically pass those
        # things through to the callback constructor.
        callback_ = callback.construct(optimizer=optimizer, instances=train_data)
        constructed_callbacks.append(callback_)

    if distributed:
        rank = cuda_device
    else:
        rank = 0

    return cls(
        model,
        train_data,
        iterator,
        optimizer_,
        num_epochs=num_epochs,
        shuffle=shuffle,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        callbacks=constructed_callbacks,
        distributed=distributed,
        rank=rank,
        world_size=world_size,
    )

def from_params(cls,  # type: ignore
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params,
                validation_iterator: DataIterator = None) -> 'Trainer':
    # pylint: disable=arguments-differ
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]

    optimizer_params = params.pop("optimizer")
    wd = params.pop("weight_decay", 0.0)
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    if not isinstance(optimizer_params, str):
        parameter_groups = [
            [[n for n, p in parameters if not any(nd in n for nd in no_decay)],
             {'weight_decay': wd}],
            [[n for n, p in parameters if any(nd in n for nd in no_decay)],
             {'weight_decay': 0.0}],
        ]
        optimizer_params["parameter_groups"] = parameter_groups
    optimizer = Optimizer.from_params(parameters, optimizer_params)

    if "moving_average" in params:
        moving_average = MovingAverage.from_params(params.pop("moving_average"),
                                                   parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        learning_rate_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        learning_rate_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
    keep_serialized_model_every_num_seconds = params.pop_int(
        "keep_serialized_model_every_num_seconds", None)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    should_log_momentum = params.pop_bool("should_log_momentum", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)

    params.assert_empty(cls.__name__)
    return cls(
        model,
        optimizer,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler,
        momentum_scheduler=momentum_scheduler,
        num_serialized_models_to_keep=num_serialized_models_to_keep,
        keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        should_log_momentum=should_log_momentum,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average)

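# --- Hedged illustration (not from the source): the parameter_groups structure built above
# follows AllenNLP's "[[name patterns], {overrides}]" convention, so Optimizer.from_params
# effectively sees something like
#     parameter_groups = [
#         [["encoder.weight", "classifier.weight"], {"weight_decay": wd}],
#         [["encoder.bias", "LayerNorm.weight"], {"weight_decay": 0.0}],
#     ]
# (the concrete parameter names here are made up). Biases and LayerNorm parameters are thereby
# excluded from weight decay, mirroring the usual BERT fine-tuning recipe.
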
def _move_to_gpu(self, model: Model) -> Model:
    return model.cuda(self._cuda_device[0])