def log_parameter_and_gradient_statistics(self, # pylint: disable=invalid-name model: Model, batch_grad_norm: float) -> None: """ Send the mean and std of all parameters and gradients to tensorboard, as well as logging the average gradient norm. """ if self._should_log_parameter_statistics: # Log parameter values to Tensorboard for name, param in model.named_parameters(): self.add_train_scalar("parameter_mean/" + name, param.data.mean()) if param.data.numel() > 1: self.add_train_scalar("parameter_std/" + name, param.data.std()) if param.grad is not None: if param.grad.is_sparse: # pylint: disable=protected-access grad_data = param.grad.data._values() else: grad_data = param.grad.data # skip empty gradients if torch.prod(torch.tensor(grad_data.shape)).item() > 0: # pylint: disable=not-callable self.add_train_scalar("gradient_mean/" + name, grad_data.mean()) if grad_data.numel() > 1: self.add_train_scalar("gradient_std/" + name, grad_data.std()) else: # no gradient for a parameter with sparse gradients logger.info("No gradient for %s, skipping tensorboard logging.", name) # norm of gradients if batch_grad_norm is not None: self.add_train_scalar("gradient_norm", batch_grad_norm)
def log_histograms(self, model: Model, histogram_parameters: Set[str]) -> None: """ Send histograms of parameters to tensorboard. """ for name, param in model.named_parameters(): if name in histogram_parameters: self.add_train_histogram("parameter_histogram/" + name, param)
def enable_gradient_clipping(model: Model, grad_clipping: Optional[float]) -> None: if grad_clipping is not None: for parameter in model.parameters(): if parameter.requires_grad: parameter.register_hook(lambda grad: nn_util.clamp_tensor( grad, minimum=-grad_clipping, maximum=grad_clipping))
def from_params(cls, params: Params, model: Model) -> 'UpdateMovingAverage': # type: ignore # pylint: disable=arguments-differ moving_average_params = params.pop("moving_average") model_parameters = [[name, param] for name, param in model.named_parameters() if param.requires_grad] moving_average = MovingAverage.from_params( params=moving_average_params, parameters=model_parameters) return UpdateMovingAverage(moving_average)
def rescale_gradients(model: Model, grad_norm: Optional[float] = None) -> Optional[float]: """ Performs gradient rescaling. Is a no-op if gradient rescaling is not enabled. """ if grad_norm: parameters_to_clip = [ p for p in model.parameters() if p.grad is not None ] return sparse_clip_norm(parameters_to_clip, grad_norm) return None
def get_metrics(model: Model, total_loss: float, num_batches: int, reset: bool = False) -> Dict[str, float]: """ Gets the metrics but sets ``"loss"`` to the total loss divided by the ``num_batches`` so that the ``"loss"`` metric is "average loss per batch". """ metrics = model.get_metrics(reset=reset) metrics["loss"] = float(total_loss / num_batches) if num_batches > 0 else 0.0 return metrics
def enable_activation_logging(self, model: Model) -> None: if self._histogram_interval is not None: # To log activation histograms to the forward pass, we register # a hook on forward to capture the output tensors. # This uses a closure to determine whether to log the activations, # since we don't want them on every call. for _, module in model.named_modules(): if not getattr(module, 'should_log_activations', False): # skip it continue def hook(module_, inputs, outputs): # pylint: disable=unused-argument,cell-var-from-loop log_prefix = 'activation_histogram/{0}'.format(module_.__class__) if self.should_log_histograms_this_batch(): self.log_activation_histogram(outputs, log_prefix) module.register_forward_hook(hook)
def log_learning_rates(self, model: Model, optimizer: torch.optim.Optimizer): """ Send current parameter specific learning rates to tensorboard """ if self._should_log_learning_rate: # optimizer stores lr info keyed by parameter tensor # we want to log with parameter name names = {param: name for name, param in model.named_parameters()} for group in optimizer.param_groups: if 'lr' not in group: continue rate = group['lr'] for param in group['params']: # check whether params has requires grad or not effective_rate = rate * float(param.requires_grad) self.add_train_scalar("learning_rate/" + names[param], effective_rate)
def from_params(params: Params, serialization_dir: str, recover: bool = False, cache_directory: str = None, cache_prefix: str = None) -> 'TrainerPieces': all_datasets = training_util.datasets_from_params( params, cache_directory, cache_prefix) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) if recover and os.path.exists( os.path.join(serialization_dir, "vocabulary")): vocab = Vocabulary.from_files( os.path.join(serialization_dir, "vocabulary")) params.pop("vocabulary", {}) else: vocab = Vocabulary.from_params(params.pop( "vocabulary", {}), (instance for key, dataset in all_datasets.items() if key in datasets_for_vocab_creation for instance in dataset)) model = Model.from_params(vocab=vocab, params=params.pop('model')) # If vocab extension is ON for training, embedding extension should also be # done. If vocab and embeddings are already in sync, it would be a no-op. model.extend_embedder_vocab() # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(model.vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) return TrainerPieces(model, iterator, train_data, validation_data, test_data, validation_iterator, trainer_params)
def from_params( cls, # type: ignore model: Model, serialization_dir: str, iterator: DataIterator, train_data: Iterable[Instance], validation_data: Optional[Iterable[Instance]], params: Params, validation_iterator: DataIterator = None) -> 'Trainer': # pylint: disable=arguments-differ patience = params.pop_int("patience", None) validation_metric = params.pop("validation_metric", "-loss") shuffle = params.pop_bool("shuffle", True) num_epochs = params.pop_int("num_epochs", 20) cuda_device = parse_cuda_device(params.pop("cuda_device", -1)) grad_norm = params.pop_float("grad_norm", None) grad_clipping = params.pop_float("grad_clipping", None) lr_scheduler_params = params.pop("learning_rate_scheduler", None) momentum_scheduler_params = params.pop("momentum_scheduler", None) if isinstance(cuda_device, list): model_device = cuda_device[0] else: model_device = cuda_device if model_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. model = model.cuda(model_device) parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] optimizer = Optimizer.from_params(parameters, params.pop("optimizer")) if "moving_average" in params: moving_average = MovingAverage.from_params( params.pop("moving_average"), parameters=parameters) else: moving_average = None if lr_scheduler_params: lr_scheduler = LearningRateScheduler.from_params( optimizer, lr_scheduler_params) else: lr_scheduler = None if momentum_scheduler_params: momentum_scheduler = MomentumScheduler.from_params( optimizer, momentum_scheduler_params) else: momentum_scheduler = None if 'checkpointer' in params: if 'keep_serialized_model_every_num_seconds' in params or \ 'num_serialized_models_to_keep' in params: raise ConfigurationError( "Checkpointer may be initialized either from the 'checkpointer' key or from the " "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'" " but the passed config uses both methods.") checkpointer = Checkpointer.from_params(params.pop("checkpointer")) else: num_serialized_models_to_keep = params.pop_int( "num_serialized_models_to_keep", 20) keep_serialized_model_every_num_seconds = params.pop_int( "keep_serialized_model_every_num_seconds", None) checkpointer = Checkpointer( serialization_dir=serialization_dir, num_serialized_models_to_keep=num_serialized_models_to_keep, keep_serialized_model_every_num_seconds= keep_serialized_model_every_num_seconds) model_save_interval = params.pop_float("model_save_interval", None) summary_interval = params.pop_int("summary_interval", 100) histogram_interval = params.pop_int("histogram_interval", None) should_log_parameter_statistics = params.pop_bool( "should_log_parameter_statistics", True) should_log_learning_rate = params.pop_bool("should_log_learning_rate", False) log_batch_size_period = params.pop_int("log_batch_size_period", None) params.assert_empty(cls.__name__) return cls( model, optimizer, iterator, train_data, validation_data, patience=patience, validation_metric=validation_metric, validation_iterator=validation_iterator, shuffle=shuffle, num_epochs=num_epochs, serialization_dir=serialization_dir, cuda_device=cuda_device, grad_norm=grad_norm, grad_clipping=grad_clipping, learning_rate_scheduler=lr_scheduler, momentum_scheduler=momentum_scheduler, checkpointer=checkpointer, model_save_interval=model_save_interval, summary_interval=summary_interval, histogram_interval=histogram_interval, should_log_parameter_statistics=should_log_parameter_statistics, should_log_learning_rate=should_log_learning_rate, log_batch_size_period=log_batch_size_period, moving_average=moving_average)
def load_archive( archive_file: str, cuda_device: int = -1, overrides: str = "", weights_file: str = None ) -> Archive: """ Instantiates an Archive from an archived `tar.gz` file. Parameters ---------- archive_file: ``str`` The archive file to load the model from. weights_file: ``str``, optional (default = None) The weights file to use. If unspecified, weights.th in the archive_file will be used. cuda_device: ``int``, optional (default = -1) If `cuda_device` is >= 0, the model will be loaded onto the corresponding GPU. Otherwise it will be loaded onto the CPU. overrides: ``str``, optional (default = "") JSON overrides to apply to the unarchived ``Params`` object. """ # redirect to the cache, if necessary resolved_archive_file = cached_path(archive_file) if resolved_archive_file == archive_file: logger.info(f"loading archive file {archive_file}") else: logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}") if os.path.isdir(resolved_archive_file): serialization_dir = resolved_archive_file else: # Extract archive to temp dir tempdir = tempfile.mkdtemp() logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}") with tarfile.open(resolved_archive_file, "r:gz") as archive: archive.extractall(tempdir) # Postpone cleanup until exit in case the unarchived contents are needed outside # this function. atexit.register(_cleanup_archive_dir, tempdir) serialization_dir = tempdir # Check for supplemental files in archive fta_filename = os.path.join(serialization_dir, _FTA_NAME) if os.path.exists(fta_filename): with open(fta_filename, "r") as fta_file: files_to_archive = json.loads(fta_file.read()) # Add these replacements to overrides replacements_dict: Dict[str, Any] = {} for key, original_filename in files_to_archive.items(): replacement_filename = os.path.join(serialization_dir, f"fta/{key}") if os.path.exists(replacement_filename): replacements_dict[key] = replacement_filename else: logger.warning( f"Archived file {replacement_filename} not found! At train time " f"this file was located at {original_filename}. This may be " "because you are loading a serialization directory. Attempting to " "load the file from its train-time location." ) overrides_dict = parse_overrides(overrides) combined_dict = with_fallback( preferred=overrides_dict, fallback=unflatten(replacements_dict) ) overrides = json.dumps(combined_dict) # Load config config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides) config.loading_from_archive = True if weights_file: weights_path = weights_file else: weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME) # Fallback for serialization directories. if not os.path.exists(weights_path): weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS) # Instantiate model. Use a duplicate of the config, as it will get consumed. model = Model.load( config.duplicate(), weights_file=weights_path, serialization_dir=serialization_dir, cuda_device=cuda_device, ) return Archive(model=model, config=config)
def _move_to_gpu(self, model: Model) -> Model: if self._cuda_devices[0] != -1: return model.cuda(self._cuda_devices[0]) else: return model
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int, batch_weight_key: str) -> Dict[str, Any]: check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = data_iterator(instances, num_epochs=1, shuffle=False) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm( iterator, total=data_iterator.get_num_batches(instances)) # Number of batches in instances. batch_count = 0 # Number of batches where the model produces a loss. loss_count = 0 # Cumulative weighted loss total_loss = 0.0 # Cumulative weight across all batches. total_weight = 0.0 for batch in generator_tqdm: batch_count += 1 batch = nn_util.move_to_device(batch, cuda_device) output_dict = model(**batch) loss = output_dict.get("loss") metrics = model.get_metrics() if loss is not None: loss_count += 1 if batch_weight_key: weight = output_dict[batch_weight_key].item() else: weight = 1.0 total_weight += weight total_loss += loss.item() * weight # Report the average loss so far. metrics["loss"] = total_loss / total_weight if (not HasBeenWarned.tqdm_ignores_underscores and any( metric_name.startswith("_") for metric_name in metrics)): logger.warning("Metrics with names beginning with \"_\" will " "not be logged to the tqdm progress bar.") HasBeenWarned.tqdm_ignores_underscores = True description = ', '.join([ "%s: %.2f" % (name, value) for name, value in metrics.items() if not name.startswith("_") ]) + " ||" generator_tqdm.set_description(description, refresh=False) final_metrics = model.get_metrics(reset=True) if loss_count > 0: # Sanity check if loss_count != batch_count: raise RuntimeError( "The model you are trying to evaluate only sometimes " + "produced a loss!") final_metrics["loss"] = total_loss / total_weight return final_metrics