def log_parameter_and_gradient_statistics(self, # pylint: disable=invalid-name model: Model, batch_grad_norm: float) -> None: """ Send the mean and std of all parameters and gradients to tensorboard, as well as logging the average gradient norm. """ if self._should_log_parameter_statistics: # Log parameter values to Tensorboard for name, param in model.named_parameters(): self.add_train_scalar("parameter_mean/" + name, param.data.mean()) if param.data.numel() > 1: self.add_train_scalar("parameter_std/" + name, param.data.std()) if param.grad is not None: if param.grad.is_sparse: # pylint: disable=protected-access grad_data = param.grad.data._values() else: grad_data = param.grad.data # skip empty gradients if torch.prod(torch.tensor(grad_data.shape)).item() > 0: # pylint: disable=not-callable self.add_train_scalar("gradient_mean/" + name, grad_data.mean()) if grad_data.numel() > 1: self.add_train_scalar("gradient_std/" + name, grad_data.std()) else: # no gradient for a parameter with sparse gradients logger.info("No gradient for %s, skipping tensorboard logging.", name) # norm of gradients if batch_grad_norm is not None: self.add_train_scalar("gradient_norm", batch_grad_norm)
def log_histograms(self, model: Model, histogram_parameters: Set[str]) -> None: """ Send histograms of parameters to tensorboard. """ for name, param in model.named_parameters(): if name in histogram_parameters: self.add_train_histogram("parameter_histogram/" + name, param)
def from_params(cls, params: Params, model: Model) -> 'UpdateMovingAverage': # type: ignore # pylint: disable=arguments-differ moving_average_params = params.pop("moving_average") model_parameters = [[name, param] for name, param in model.named_parameters() if param.requires_grad] moving_average = MovingAverage.from_params( params=moving_average_params, parameters=model_parameters) return UpdateMovingAverage(moving_average)
def log_learning_rates(self, model: Model, optimizer: torch.optim.Optimizer): """ Send current parameter specific learning rates to tensorboard """ if self._should_log_learning_rate: # optimizer stores lr info keyed by parameter tensor # we want to log with parameter name names = {param: name for name, param in model.named_parameters()} for group in optimizer.param_groups: if 'lr' not in group: continue rate = group['lr'] for param in group['params']: # check whether params has requires grad or not effective_rate = rate * float(param.requires_grad) self.add_train_scalar("learning_rate/" + names[param], effective_rate)
def from_params( cls, # type: ignore model: Model, serialization_dir: str, iterator: DataIterator, train_data: Iterable[Instance], validation_data: Optional[Iterable[Instance]], params: Params, validation_iterator: DataIterator = None) -> 'Trainer': # pylint: disable=arguments-differ patience = params.pop_int("patience", None) validation_metric = params.pop("validation_metric", "-loss") shuffle = params.pop_bool("shuffle", True) num_epochs = params.pop_int("num_epochs", 20) cuda_device = parse_cuda_device(params.pop("cuda_device", -1)) grad_norm = params.pop_float("grad_norm", None) grad_clipping = params.pop_float("grad_clipping", None) lr_scheduler_params = params.pop("learning_rate_scheduler", None) momentum_scheduler_params = params.pop("momentum_scheduler", None) if isinstance(cuda_device, list): model_device = cuda_device[0] else: model_device = cuda_device if model_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. model = model.cuda(model_device) parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] optimizer = Optimizer.from_params(parameters, params.pop("optimizer")) if "moving_average" in params: moving_average = MovingAverage.from_params( params.pop("moving_average"), parameters=parameters) else: moving_average = None if lr_scheduler_params: lr_scheduler = LearningRateScheduler.from_params( optimizer, lr_scheduler_params) else: lr_scheduler = None if momentum_scheduler_params: momentum_scheduler = MomentumScheduler.from_params( optimizer, momentum_scheduler_params) else: momentum_scheduler = None if 'checkpointer' in params: if 'keep_serialized_model_every_num_seconds' in params or \ 'num_serialized_models_to_keep' in params: raise ConfigurationError( "Checkpointer may be initialized either from the 'checkpointer' key or from the " "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'" " but the passed config uses both methods.") checkpointer = Checkpointer.from_params(params.pop("checkpointer")) else: num_serialized_models_to_keep = params.pop_int( "num_serialized_models_to_keep", 20) keep_serialized_model_every_num_seconds = params.pop_int( "keep_serialized_model_every_num_seconds", None) checkpointer = Checkpointer( serialization_dir=serialization_dir, num_serialized_models_to_keep=num_serialized_models_to_keep, keep_serialized_model_every_num_seconds= keep_serialized_model_every_num_seconds) model_save_interval = params.pop_float("model_save_interval", None) summary_interval = params.pop_int("summary_interval", 100) histogram_interval = params.pop_int("histogram_interval", None) should_log_parameter_statistics = params.pop_bool( "should_log_parameter_statistics", True) should_log_learning_rate = params.pop_bool("should_log_learning_rate", False) log_batch_size_period = params.pop_int("log_batch_size_period", None) params.assert_empty(cls.__name__) return cls( model, optimizer, iterator, train_data, validation_data, patience=patience, validation_metric=validation_metric, validation_iterator=validation_iterator, shuffle=shuffle, num_epochs=num_epochs, serialization_dir=serialization_dir, cuda_device=cuda_device, grad_norm=grad_norm, grad_clipping=grad_clipping, learning_rate_scheduler=lr_scheduler, momentum_scheduler=momentum_scheduler, checkpointer=checkpointer, model_save_interval=model_save_interval, summary_interval=summary_interval, histogram_interval=histogram_interval, should_log_parameter_statistics=should_log_parameter_statistics, should_log_learning_rate=should_log_learning_rate, log_batch_size_period=log_batch_size_period, moving_average=moving_average)