def validate(self, trainer: 'CallbackTrainer'):
    """
    Run one validation pass over ``self.instances`` and store the resulting
    metrics on ``trainer.val_metrics``.

    The weights of every object in ``self.moving_averages`` are swapped in
    before the pass and are always restored afterwards, even if validation
    raises.  When ``self.loss_tracker`` is set, every batch loss is also added
    to its "regular" counters if the model's ``_effective_encoder`` is its
    ``_sketch_encoder``, and to its "irregular" counters otherwise.

    Parameters
    ----------
    trainer : ``CallbackTrainer``
        Supplies the model, ``batch_loss`` and the list of CUDA devices.
    """
    # If the trainer has MovingAverage objects, use their weights for validation.
    for moving_average in self.moving_averages:
        moving_average.assign_average_value()

    try:
        with torch.no_grad():
            # We have a validation set, so compute all the metrics on it.
            logger.info("Validating")

            trainer.model.eval()

            num_gpus = len(trainer._cuda_devices)  # pylint: disable=protected-access

            raw_val_generator = self.iterator(self.instances,
                                              num_epochs=1,
                                              shuffle=False)
            val_generator = lazy_groups_of(raw_val_generator, num_gpus)
            num_validation_batches = math.ceil(
                self.iterator.get_num_batches(self.instances) / num_gpus)
            val_generator_tqdm = Tqdm.tqdm(val_generator,
                                           total=num_validation_batches)

            batches_this_epoch = 0
            val_loss = 0
            for batch_group in val_generator_tqdm:
                loss = trainer.batch_loss(batch_group, for_training=False)
                if loss is not None:
                    # A loss is optional during validation, so `loss` may be None.
                    # `batches_this_epoch` is only used as the divisor for the
                    # loss, so only batches that produced a loss are counted.
                    batches_this_epoch += 1
                    loss_value = loss.detach().cpu().numpy()
                    val_loss += loss_value

                    if self.loss_tracker is not None:
                        # Update the validation regular / irregular loss status.
                        # pylint: disable=protected-access
                        if trainer.model._effective_encoder is trainer.model._sketch_encoder:
                            self.loss_tracker.cumulated_regular_loss += loss_value
                            self.loss_tracker.regular_batch_count += 1.
                        else:
                            self.loss_tracker.cumulated_irregular_loss += loss_value
                            self.loss_tracker.irregular_batch_count += 1.

                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    trainer.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

            trainer.val_metrics = training_util.get_metrics(trainer.model,
                                                            val_loss,
                                                            batches_this_epoch,
                                                            reset=True)
    finally:
        # Always put the original parameters back, otherwise a failed
        # validation would leave the averaged weights in the model.
        for moving_average in self.moving_averages:
            moving_average.restore()
def train_one_batch_group(self, batch_group: List[TensorDict]) -> str:
    """
    Run one optimisation step on a single batch group.

    The events ``BATCH_START``, ``FORWARD``, ``BACKWARD`` and ``BATCH_END``
    are fired, in that order, around the corresponding phases of the step.

    Returns
    -------
    The progress-bar description built from the refreshed ``self.train_metrics``.

    Raises
    ------
    ValueError
        If the loss of the batch group is NaN.
    """
    # --- start of batch -------------------------------------------------
    self.handler.fire_event(Events.BATCH_START)
    self.optimizer.zero_grad()
    self.batches_this_epoch += 1
    self.batch_num_total += 1

    # --- forward pass ---------------------------------------------------
    self.handler.fire_event(Events.FORWARD)
    group_loss = self.batch_loss(batch_group, for_training=True)
    if torch.isnan(group_loss):
        raise ValueError("nan loss encountered")

    # --- backward pass and parameter update ------------------------------
    group_loss.backward()
    self.train_loss += group_loss.item()
    self.handler.fire_event(Events.BACKWARD)
    self.optimizer.step()

    # Refresh the running metrics before BATCH_END handlers get to see them.
    refreshed_metrics = training_util.get_metrics(
        self.model,
        self.train_loss,
        self.batches_this_epoch,
    )
    self.train_metrics = refreshed_metrics

    self.handler.fire_event(Events.BATCH_END)
    return training_util.description_from_metrics(self.train_metrics)
def get_desc_from_metrics(self, metrics, epoch=None):
    """
    Build a progress description prefixed with the epoch and distributed rank.

    The metrics part comes from ``training_util.description_from_metrics`` and
    the rank from ``dist.get_rank()``.  ``epoch`` is rendered as ``--`` when it
    is ``None``.
    """
    metrics_text = training_util.description_from_metrics(metrics)
    epoch_label = "--" if epoch is None else epoch
    return f'epoch: {epoch_label} rank: {dist.get_rank()} || {metrics_text}'
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss. Returns it and the number of batches.

    The first batch group of the validation data is consumed by
    ``self.reptile_inner_update`` before the model is switched to eval mode;
    the loss is accumulated over the remaining batch groups only.  If there is
    no first batch group, ``(0, 0)`` is returned.

    Moving-average weights (if configured) are swapped in for the duration of
    the call and are restored on every exit path, including the early return
    and exceptions.

    Returns
    -------
    ``(val_loss, batches_this_epoch)`` where ``batches_this_epoch`` counts only
    the batch groups that produced a loss.
    """
    logger.info("Validating")

    # Replace parameter values with the shadow values from the moving averages.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    try:
        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        raw_val_generator = val_iterator(self._validation_data,
                                         num_epochs=1,
                                         shuffle=False)
        val_generator = lazy_groups_of(raw_val_generator, num_gpus)
        num_validation_batches = math.ceil(
            val_iterator.get_num_batches(self._validation_data) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        print("val gene called")

        batches_this_epoch = 0
        val_loss = 0

        try:
            few_shot = next(val_generator)
        except StopIteration:
            # No validation data at all: nothing to adapt on or to evaluate.
            print("Error could not do few shot validation")
            return val_loss, batches_this_epoch

        # The inner update runs before eval() and outside no_grad().
        self.reptile_inner_update(few_shot)

        self.model.eval()
        with torch.no_grad():
            for batch_group in val_generator_tqdm:
                loss = self.batch_loss(batch_group, for_training=False)
                if loss is not None:
                    # A loss is optional during validation, so `loss` may be None.
                    # `batches_this_epoch` is only used as the divisor for the
                    # loss, so only batches that produced a loss are counted.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    self.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

        return val_loss, batches_this_epoch
    finally:
        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()
def evaluate(model: Model, instances: Iterable[Instance], task_name: str,
             data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular task (usually after training).

    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evaluate the model.
    task_name : ``str``, required
        The name of the task on which to evaluate the model.  Must be an
        element of ``TASKS_NAME``.
    data_iterator : ``DataIterator``
        Iterator that goes through the dataset.
    cuda_device : ``int``
        Cuda device to use; every batch is moved to this device.

    Returns
    -------
    metrics : ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset, plus
        ``"stm_loss"``, the mean loss per batch (``0.0`` if there were no
        batches).

    Raises
    ------
    ValueError
        If ``task_name`` is not in ``TASKS_NAME``.
    """
    check_for_gpu(cuda_device)

    # These values are identical for every batch, so compute them once.
    train_stages = ["stm", "sd", "valid"]
    task_index = TASKS_NAME.index(task_name)
    train_stage = train_stages.index("stm")

    def mean_loss(total: float, count: int) -> float:
        # Guard against an empty dataset instead of dividing by zero.
        return float(total / count) if count > 0 else 0.0

    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        eval_loss = 0
        nb_batches = 0
        for tensor_batch in generator_tqdm:
            nb_batches += 1

            tensor_batch['task_index'] = torch.tensor(task_index)
            tensor_batch["reverse"] = torch.tensor(False)
            tensor_batch['for_training'] = torch.tensor(False)
            tensor_batch['train_stage'] = torch.tensor(train_stage)
            # Use the requested device rather than a hard-coded GPU 0.
            tensor_batch = move_to_device(tensor_batch, cuda_device)

            eval_output_dict = model.forward(**tensor_batch)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()

            metrics = model.get_metrics(task_name=task_name)
            metrics["stm_loss"] = mean_loss(eval_loss, nb_batches)

            description = training_util.description_from_metrics(metrics)
            generator_tqdm.set_description(description, refresh=False)

        metrics = model.get_metrics(task_name=task_name, reset=True)
        metrics["stm_loss"] = mean_loss(eval_loss, nb_batches)
        return metrics
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss. Returns it and the number of batches.

    ``self._pytorch_model`` is switched to eval mode first.  Moving-average
    weights (if configured) are swapped in for the duration of the pass and
    are restored afterwards, even if the pass raises.

    Returns
    -------
    ``(val_loss, batches_this_epoch)`` where ``batches_this_epoch`` counts only
    the batches that produced a loss.
    """
    logger.info("Validating")

    self._pytorch_model.eval()

    # Replace parameter values with the shadow values from the moving averages.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    try:
        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        val_generator = val_iterator(self._validation_data,
                                     num_epochs=1,
                                     shuffle=False)
        num_validation_batches = val_iterator.get_num_batches(
            self._validation_data)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)

        batches_this_epoch = 0
        val_loss = 0
        for batch in val_generator_tqdm:
            loss = self.batch_loss(batch, for_training=False)
            if loss is not None:
                # A loss is optional during validation, so `loss` may be None.
                # `batches_this_epoch` is only used as the divisor for the
                # loss, so only batches that produced a loss are counted.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(
                self.model,
                val_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        return val_loss, batches_this_epoch
    finally:
        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()
def __call__(self, trainer: GradientDescentTrainer, metrics: Dict[str, Any],
             epoch: int, **kwargs):
    """
    Run the model over ``self.test_iterator`` and record the resulting metrics.

    The model's metrics are reset (``get_metrics(True)``) on entry and again on
    exit.  Nothing else happens when ``epoch`` is negative.  Otherwise every
    batch is passed through ``trainer.batch_outputs``; the outputs go to
    ``self.writer`` (if set), the metrics are stored on ``trainer.val_metrics``
    and forwarded to ``self.wandb_logger`` (if set).

    Parameters
    ----------
    trainer : ``GradientDescentTrainer``
        Supplies the model and ``batch_outputs``.
    metrics : ``Dict[str, Any]``
        Not used by this callback.
    epoch : ``int``
        Current epoch; used in the progress description, for the wandb logger
        and to advance the writer to ``epoch + 1``.
    """
    trainer.model.get_metrics(True)
    if epoch < 0:
        return

    with torch.no_grad():
        logger.info("Testing")
        trainer.model.eval()

        batches_this_epoch = 0
        val_loss = 0
        bar = tqdm(self.test_iterator, desc="testing")
        for batch_group in bar:
            outs = trainer.batch_outputs(batch_group, for_training=False)
            # A loss is optional at test time; a missing key is treated like
            # an explicit None instead of raising KeyError.
            loss = outs.get("loss")

            if self.writer is not None:
                self.writer.write(outs)

            if loss is not None:
                # `batches_this_epoch` is only used as the divisor for the
                # loss, so only batches that produced a loss are counted.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = get_metrics(trainer.model, val_loss, val_loss,
                                      batches_this_epoch)
            description = description_from_metrics(val_metrics)
            if self.name is not None:
                description = "epoch: %d, dataset: %s, %s" % (
                    epoch, self.name, description)
            bar.set_description(description, refresh=False)

        trainer.val_metrics = get_metrics(trainer.model,
                                          val_loss,
                                          val_loss,
                                          batches_this_epoch,
                                          reset=False)
        if self.wandb_logger is not None:
            self.wandb_logger(trainer.val_metrics, epoch, prefix=self.name)

    if self.writer is not None:
        self.writer.set_epoch(epoch + 1)
        self.writer.reset()

    trainer.model.get_metrics(True)
def train_one_batch_group(self, batch_group):
    """
    Run one optimisation step for a single-batch group of one GAN stage.

    The batch's ``"stage"`` entries must all be equal; the stage selects which
    loss is computed:

    * ``"discriminator_real"``: discriminator on the batch's array, target ones.
    * ``"discriminator_fake"``: discriminator on generator output, target zeros.
    * ``"generator"``: generator loss against the discriminator; this stage
      also updates the running mean/stdev of the generated data and ``count``.

    Returns
    -------
    The description built from the refreshed ``self.train_metrics``.

    Raises
    ------
    ValueError
        If the batch mixes several stages or the stage is not one of the three
        listed above.
    """
    # Each batch_group should have only one batch
    batch, = batch_group
    array = batch["array"]

    # We should not have mixed batches:
    if len(set(batch["stage"])) != 1:
        raise ValueError("mixed batch")

    stage = batch["stage"][0]
    self.optimizer.stage = stage
    self.optimizer.zero_grad()

    if stage == "discriminator_real":
        # Generate real data and expect the discriminator to predict 1.
        output = self.model.discriminator(array, torch.ones(1))
        loss = output["loss"]
        self.discriminator_real_loss += loss.sum().item()
    elif stage == "discriminator_fake":
        # Generate fake data and expect the discriminator to predict 0.
        fake_data = self.model.generator(array)
        output = self.model.discriminator(fake_data["output"], torch.zeros(1))
        loss = output["loss"]
        self.discriminator_fake_loss += loss.sum().item()
    elif stage == "generator":
        # Generate fake data and try to fool the discriminator.
        generated = self.model.generator(array, self.model.discriminator)
        fake_data = generated["output"]
        loss = generated["loss"]
        self.generator_loss += loss.sum().item()
        # Detach the statistics so the running sums do not keep every
        # batch's autograd graph alive.
        self.fake_mean += fake_data.mean().detach()
        self.fake_stdev += fake_data.std().detach()
        self.count += 1
    else:
        # Without this branch `loss` would be unbound below.
        raise ValueError(f"unknown stage: {stage}")

    self.train_loss += loss.sum().item()
    loss.backward()

    # `count` only advances on generator batches; avoid dividing by zero
    # before the first one.
    count = max(self.count, 1)
    self.train_metrics = {
        "gl": self.generator_loss / count,
        "dfl": self.discriminator_fake_loss / count,
        "drl": self.discriminator_real_loss / count,
        "mean": self.fake_mean / count,
        "stdev": self.fake_stdev / count
    }

    self.optimizer.step()

    return training_util.description_from_metrics(self.train_metrics)
def validate(self, trainer: "CallbackTrainer"):
    """
    Run one validation pass over ``self.instances`` and store the resulting
    metrics on ``trainer.val_metrics``.

    The weights of every object in ``self.moving_averages`` are swapped in
    before the pass and are always restored afterwards, even if validation
    raises.

    Parameters
    ----------
    trainer : ``CallbackTrainer``
        Supplies the model and ``batch_loss``.
    """
    # If the trainer has MovingAverage objects, use their weights for validation.
    for moving_average in self.moving_averages:
        moving_average.assign_average_value()

    try:
        with torch.no_grad():
            # We have a validation set, so compute all the metrics on it.
            logger.info("Validating")

            trainer.model.eval()

            val_generator = self.iterator(self.instances,
                                          num_epochs=1,
                                          shuffle=False)
            num_validation_batches = self.iterator.get_num_batches(
                self.instances)
            val_generator_tqdm = Tqdm.tqdm(val_generator,
                                           total=num_validation_batches)

            batches_this_epoch = 0
            val_loss = 0
            for batch in val_generator_tqdm:
                loss = trainer.batch_loss(batch, for_training=False)
                if loss is not None:
                    # A loss is optional during validation, so `loss` may be None.
                    # `batches_this_epoch` is only used as the divisor for the
                    # loss, so only batches that produced a loss are counted.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    trainer.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

            trainer.val_metrics = training_util.get_metrics(trainer.model,
                                                            val_loss,
                                                            batches_this_epoch,
                                                            reset=True)
    finally:
        # Always put the original parameters back, otherwise a failed
        # validation would leave the averaged weights in the model.
        for moving_average in self.moving_averages:
            moving_average.restore()
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss. Returns it and the number of batches.

    The validation data is wrapped in a ``DMDataSet`` built from
    ``self._validation_data[0]`` on the first call and cached on
    ``self.val_dataset`` for later calls.  Moving-average weights (if
    configured) are swapped in for the duration of the pass and are restored
    afterwards, even if the pass raises.

    Returns
    -------
    ``(val_loss, batches_this_epoch)`` where ``batches_this_epoch`` counts only
    the batch groups that produced a loss.
    """
    logger.info("Validating")

    self.model.eval()

    # Replace parameter values with the shadow values from the moving averages.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    try:
        num_gpus = len(self._cuda_devices)

        # Build the dataset lazily and reuse it across validation runs.
        if getattr(self, "val_dataset", None) is None:
            self.val_dataset = DMDataSet(data=self._validation_data[0],
                                         batch_size=self.batch_size,
                                         num_gpus=num_gpus,
                                         shuffle=False)

        num_validation_batches = math.ceil(
            len(self.val_dataset) / self.batch_size / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(self.val_dataset,
                                       total=num_validation_batches)

        batches_this_epoch = 0
        val_loss = 0
        for batch_group in val_generator_tqdm:
            loss = self.batch_loss(batch_group, for_training=False)
            if loss is not None:
                # Only batches that produced a loss are counted, because the
                # count is the divisor for the mean loss.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(self.model, val_loss,
                                                    batches_this_epoch)
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        return val_loss, batches_this_epoch
    finally:
        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss. Returns it and the number of batches.

    The model is put in eval mode, the validation data is iterated once
    (unshuffled) in groups of ``len(self._cuda_devices)`` batches, and the
    loss of every group that yields one is summed.

    Returns
    -------
    ``(summed loss, number of batch groups that produced a loss)``.
    """
    logger.info("Validating")
    self.model.eval()

    # Prefer the dedicated validation iterator when one is configured.
    iterator = (self.iterator
                if self._validation_iterator is None
                else self._validation_iterator)
    group_size = len(self._cuda_devices)

    batch_stream = iterator(self._validation_data, num_epochs=1, shuffle=False)
    grouped_batches = lazy_groups_of(batch_stream, group_size)
    expected_groups = math.ceil(
        iterator.get_num_batches(self._validation_data) / group_size)
    progress = Tqdm.tqdm(grouped_batches, total=expected_groups)

    running_loss = 0
    counted_groups = 0
    for group in progress:
        group_loss = self.batch_loss(group, for_training=False)

        # Validation may legitimately produce no loss; such groups are not
        # counted because the count is only used as the loss divisor.
        if group_loss is not None:
            counted_groups += 1
            running_loss += group_loss.detach().cpu().numpy()

        # Keep the progress bar in sync with the latest metrics.
        progress.set_description(
            training_util.description_from_metrics(
                training_util.get_metrics(self.model, running_loss, counted_groups)),
            refresh=False)

    return running_loss, counted_groups
def validate(self, trainer: 'CallbackTrainer'):
    """
    Run one validation pass over ``self.instances`` with gradients disabled
    and store the final metrics on ``trainer.val_metrics``.

    Batches are consumed in groups of ``len(trainer._cuda_devices)``.  The
    model's metrics are reset when the final metrics are collected.

    Parameters
    ----------
    trainer : ``CallbackTrainer``
        Supplies the model, ``batch_loss`` and the list of CUDA devices.
    """
    with torch.no_grad():
        # We have a validation set, so compute all the metrics on it.
        logger.info("Validating")

        model = trainer.model
        model.eval()

        group_size = len(trainer._cuda_devices)  # pylint: disable=protected-access

        batch_stream = self.iterator(self.instances, num_epochs=1, shuffle=False)
        grouped_batches = lazy_groups_of(batch_stream, group_size)
        expected_groups = math.ceil(
            self.iterator.get_num_batches(self.instances) / group_size)
        progress = Tqdm.tqdm(grouped_batches, total=expected_groups)

        running_loss = 0
        counted_groups = 0
        for group in progress:
            group_loss = trainer.batch_loss(group, for_training=False)

            # Validation may legitimately produce no loss; such groups are
            # not counted because the count is only used as the loss divisor.
            if group_loss is not None:
                counted_groups += 1
                running_loss += group_loss.detach().cpu().numpy()

            # Keep the progress bar in sync with the latest metrics.
            progress.set_description(
                training_util.description_from_metrics(
                    training_util.get_metrics(trainer.model, running_loss, counted_groups)),
                refresh=False)

        trainer.val_metrics = training_util.get_metrics(
            trainer.model, running_loss, counted_groups, reset=True)
def train_one_batch_group(self, batch_group: List[TensorDict]) -> str:
    """
    Run one optimisation step on a single batch group.

    The events ``BATCH_START``, ``FORWARD``, ``BACKWARD`` and ``BATCH_END``
    are fired, in that order, around the corresponding phases of the step.
    When ``self._use_apex`` is set, the backward pass goes through
    ``amp.scale_loss``.

    Returns
    -------
    The progress-bar description built from the refreshed
    ``self.train_metrics``.  NOTE: when ``batch_loss`` yields ``None`` or a
    NaN loss the step is abandoned after the ``FORWARD`` event and ``None``
    is returned, despite the ``str`` annotation.
    """
    self.handler.fire_event(Events.BATCH_START)
    self.optimizer.zero_grad()
    self.batches_this_epoch += 1
    self.batch_num_total += 1

    self.handler.fire_event(Events.FORWARD)
    group_loss = self.batch_loss(batch_group, for_training=True)

    # Skip the update entirely when there is nothing usable to back-propagate.
    if group_loss is None:
        return None
    if torch.isnan(group_loss):
        logger.warning("NaN loss encountered.")
        return None

    if not self._use_apex:
        group_loss.backward()
    else:
        with amp.scale_loss(group_loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    self.train_loss += group_loss.item()

    self.handler.fire_event(Events.BACKWARD)
    self.optimizer.step()

    # Refresh the running metrics before BATCH_END handlers get to see them.
    refreshed_metrics = training_util.get_metrics(
        self.model,
        self.train_loss,
        self.batches_this_epoch,
    )
    self.train_metrics = refreshed_metrics

    self.handler.fire_event(Events.BATCH_END)
    return training_util.description_from_metrics(self.train_metrics)
def predict(self, dataset: Iterable[Instance]) -> "Dict[str, float]":
    """
    Run the model over ``dataset`` once and update the confusion matrices.

    The confusion matrices are re-initialised first.  For every batch the
    model's ``"forward_logits"`` are compared with the batch's
    ``forward_output_tokens -> tokens`` labels through
    ``self.update_confusion_matrices``, and the batch loss (when not ``None``)
    is accumulated.

    Parameters
    ----------
    dataset : ``Iterable[Instance]``
        The instances to predict on.

    Returns
    -------
    The metrics computed after the last batch, or an empty dict when the
    dataset yields no batches.
    """
    self.init_confusion_matrices()

    pred_generator = self.iterator(dataset, num_epochs=1, shuffle=False)
    pred_generator_tqdm = tqdm(
        pred_generator, total=self.iterator.get_num_batches(dataset))

    self.model.eval()
    # Defined up front so an empty dataset does not leave it unbound.
    pred_metrics = {}
    with torch.no_grad():
        batches_this_epoch = 0
        pred_loss = 0
        for batch in pred_generator_tqdm:
            batch = nn_util.move_to_device(batch, self.cuda_device)
            output_dict = self.model(**batch)

            predictions = output_dict["forward_logits"]
            gold_labels = batch.get("forward_output_tokens").get("tokens")
            self.update_confusion_matrices(predictions, gold_labels)

            loss = output_dict["loss"]
            if loss is not None:
                # `batches_this_epoch` is only used as the divisor for the
                # loss, so only batches that produced a loss are counted.
                batches_this_epoch += 1
                pred_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            pred_metrics = training_util.get_metrics(
                self.model, pred_loss, batches_this_epoch)
            description = training_util.description_from_metrics(
                pred_metrics)
            pred_generator_tqdm.set_description(description, refresh=False)

    return pred_metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Batches from ``self.data_loader`` are processed in groups of
    ``self._num_gradient_accumulation_steps``; gradients are accumulated over
    a group before a single optimizer step.  Every ``self.val_loss_steps``
    total batches the running training loss is logged and, when a validation
    data loader is configured, ``self._validation_loss_n_step`` is run and the
    validation loss is logged as well.

    Parameters
    ----------
    epoch : ``int``
        Index of the epoch, used for logging, checkpointing and callbacks.

    Returns
    -------
    The epoch's training metrics, extended with the per-worker CPU and per-GPU
    memory usage (in MB) sampled at the start of the epoch.

    Raises
    ------
    ValueError
        If a batch produces a NaN loss.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    cpu_memory_usage = []
    for worker, memory in common_util.peak_cpu_memory().items():
        cpu_memory_usage.append((worker, memory))
        logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}")
    gpu_memory_usage = []
    for gpu, memory in common_util.peak_gpu_memory().items():
        gpu_memory_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}")

    regularization_penalty = self.model.get_regularization_penalty()

    train_loss = 0.0
    batch_loss = 0.0
    train_reg_loss = None if regularization_penalty is None else 0.0
    batch_reg_loss = None if regularization_penalty is None else 0.0

    # Set the model to "train" mode.
    self._pytorch_model.train()

    # Get tqdm for the training batches
    batch_generator = iter(self.data_loader)
    batch_group_generator = common_util.lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps
    )

    logger.info("Training")

    num_training_batches: Union[int, float]
    try:
        len_data_loader = len(self.data_loader)
        num_training_batches = math.ceil(
            len_data_loader / self._num_gradient_accumulation_steps
        )
    except TypeError:
        # The data loader has no length.
        num_training_batches = float("inf")

    # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's
    # progress is shown
    if self._primary:
        batch_group_generator_tqdm = Tqdm.tqdm(
            batch_group_generator, total=num_training_batches
        )
    else:
        batch_group_generator_tqdm = batch_group_generator

    self._last_log = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    for batch_group in batch_group_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        # Zero gradients.
        # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()`
        # because it avoids a read op when the gradients are first updated below.
        for param_group in self.optimizer.param_groups:
            for p in param_group["params"]:
                p.grad = None

        batch_loss = 0.0
        batch_group_outputs = []
        for batch in batch_group:
            with amp.autocast(self._use_amp):
                batch_outputs = self.batch_outputs(batch, for_training=True)
                batch_group_outputs.append(batch_outputs)
                loss = batch_outputs["loss"]
                reg_loss = batch_outputs.get("reg_loss")
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                # Scale so the accumulated gradient is the group mean.
                loss = loss / len(batch_group)

                batch_loss += loss.item()
                if reg_loss is not None:
                    reg_loss = reg_loss / len(batch_group)
                    batch_reg_loss = reg_loss.item()
                    train_reg_loss += batch_reg_loss  # type: ignore

            if self._scaler is not None:
                self._scaler.scale(loss).backward()
            else:
                loss.backward()
        if not batch_group_outputs:
            continue

        train_loss += batch_loss

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._scaler is not None:
            self._scaler.step(self.optimizer)
            self._scaler.update()
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss,
            batch_reg_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        # Periodic mid-epoch loss report.
        if batch_num_total % self.val_loss_steps == 0:
            logger.info("%s: %.4f", 'train_loss', train_loss / batches_this_epoch)
            if self._validation_data_loader is not None:
                with torch.no_grad():
                    # We have a validation set, so compute all the metrics on it.
                    val_loss, val_reg_loss, num_batches = self._validation_loss_n_step(batch_num_total)
                    # The result is not used here; the call is kept because
                    # it is made with reset=True.
                    training_util.get_metrics(
                        self.model,
                        val_loss,
                        val_reg_loss,
                        num_batches=num_batches,
                        batch_loss=None,
                        batch_reg_loss=None,
                        reset=True,
                        world_size=self._world_size,
                        cuda_device=self.cuda_device,
                    )
                    # Guard against a validation run that produced no batches.
                    mean_val_loss = val_loss / num_batches if num_batches > 0 else 0.0
                    logger.info("%s: %.4f", 'val_loss', mean_val_loss)
                # Back to training mode after the validation pass.
                self._pytorch_model.train()

        if self._primary:
            # Updating tqdm only for the primary as the trainers wouldn't have one
            description = training_util.description_from_metrics(metrics)
            batch_group_generator_tqdm.set_description(description, refresh=False)

            if self._checkpointer is not None:
                self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch)

        for callback in self._callbacks:
            callback.on_batch(
                self,
                batch_group,
                batch_group_outputs,
                metrics,
                epoch,
                batches_this_epoch,
                is_training=True,
                is_primary=self._primary,
                batch_grad_norm=batch_grad_norm,
            )

    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        train_reg_loss,
        batch_loss=None,
        batch_reg_loss=None,
        num_batches=batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=self.cuda_device,
    )

    for (worker, memory) in cpu_memory_usage:
        metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024)
    for (gpu_num, memory) in gpu_memory_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024)
    return metrics
def _validation_loss(self, epoch: int) -> ChainMap:
    """
    Runs every component model and the meta model over the validation data.

    For each validation batch, every entry of
    ``self._pytorch_model.component_models`` is evaluated through its
    component optimizer and its ``"output"`` is stored under the component's
    name in a deep copy of the batch; that augmented copy is then evaluated by
    the ``"meta"`` component optimizer.

    Moving-average weights (if configured) are swapped in for the duration of
    the pass and are restored afterwards, even if the pass raises.

    Parameters
    ----------
    epoch : ``int``
        Not used by this method.

    Returns
    -------
    A ``ChainMap`` over the metrics of all components and the meta model for
    the last batch, or an empty ``ChainMap`` when there are no batches.

    Raises
    ------
    ConfigurationError
        If no validation data loader is configured.
    """
    logger.info("Validating")

    self.model.eval()

    # Replace parameter values with the shadow values from the moving averages.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    try:
        if self._validation_data_loader is not None:
            validation_data_loader = self._validation_data_loader
        else:
            raise ConfigurationError(
                "Validation results cannot be calculated without a validation_data_loader"
            )

        val_generator_tqdm = Tqdm.tqdm(validation_data_loader)

        for component_optimizer in self.component_optimizers.values():
            component_optimizer.reset_loss('validation')

        batches_this_epoch = 0
        # Defined up front so an empty loader does not leave it unbound.
        all_metrics = ChainMap()

        for batch in val_generator_tqdm:
            batches_this_epoch += 1

            batch_metrics = []
            batch_group = [batch]
            meta_batch = deepcopy(batch_group)

            # Evaluate the sub models first
            for name in self._pytorch_model.component_models:
                component_optimizer = self.component_optimizers[name]
                batch_group_outputs, metrics = component_optimizer.process_batch_group(
                    batch_group,
                    for_training=False,
                    batches_this_epoch=batches_this_epoch)
                batch_metrics.append(metrics)

                # Feed each component's output to the meta model's input.
                for i, batch_outputs in enumerate(batch_group_outputs):
                    meta_batch[i][name] = batch_outputs["output"]

            meta_optimizer = self.component_optimizers["meta"]
            _, meta_metrics = meta_optimizer.process_batch_group(
                meta_batch,
                for_training=False,
                batches_this_epoch=batches_this_epoch)
            batch_metrics.append(meta_metrics)

            all_metrics = ChainMap(*batch_metrics)
            description = training_util.description_from_metrics(all_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        return all_metrics
    finally:
        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Batches from ``self.iterator`` are processed in groups of
    ``self._num_gradient_accumulation_steps``: the gradients of all batches in
    a group are accumulated before a single optimizer step.  Tensorboard
    logging, the progress bar and checkpoint saving are only performed when
    ``self._master`` is true.

    Parameters
    ----------
    epoch : ``int``
        Index of the epoch, used for logging and for naming checkpoints.

    Returns
    -------
    The epoch's training metrics, extended with ``cpu_memory_MB`` and one
    ``gpu_<n>_memory_MB`` entry per GPU, as sampled at the start of the epoch.

    Raises
    ------
    ValueError
        If a batch produces a NaN loss.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._pytorch_model.train()

    # Get tqdm for the training batches
    batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    batch_group_generator = lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps)
    # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
    # progress is shown
    if self._master:
        batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches)
    else:
        batch_group_generator_tqdm = batch_group_generator

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")

    cumulative_batch_group_size = 0
    for batch_group in batch_group_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        # Accumulate gradients over every batch of the group.
        for batch in batch_group:
            loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            # Scale so the accumulated gradient is the group mean.
            loss = loss / len(batch_group)
            loss.backward()
            train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch() and self._master:
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                # After the subtraction `param_updates[name]` holds old - new.
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                # The small constant avoids dividing by a zero parameter norm.
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name, update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )

        # Updating tqdm only for the master as the trainers wouldn't have one
        if self._master:
            description = training_util.description_from_metrics(metrics)
            batch_group_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard (only from the master)
        if self._tensorboard.should_log_this_batch() and self._master:
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch() and self._master:
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        # Optionally report the current and the running mean batch size.
        if self._log_batch_size_period:
            batch_group_size = sum(
                training_util.get_batch_size(batch) for batch in batch_group)
            cumulative_batch_group_size += batch_group_size
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_group_size / batches_this_epoch
                logger.info(
                    f"current batch size: {batch_group_size} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size", batch_group_size)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval) and
                self._master):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    # Let all workers finish their epoch before computing
    # the final statistics for the epoch.
    if self._distributed:
        dist.barrier()

    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=[self.cuda_device],
    )
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Run one training epoch and return the metrics collected over it.

    ``self.batch_loss`` yields four losses per batch group: the combined loss
    and the ``cm``, ``lang1`` and ``lang2`` losses.  The three partial losses
    are back-propagated one after another (``lang1``, ``lang2``, ``cm``), each
    followed by a step of its own optimizer.  The combined loss is only checked
    for NaN and accumulated for reporting.

    Parameters
    ----------
    epoch : int
        Index of the epoch; used in log messages and in the names of
        time-based checkpoints.

    Returns
    -------
    Dict[str, float]
        The model metrics (reset at the end of the epoch) extended with the
        per-batch averages ``loss``, ``cm_loss``, ``lang1_loss`` and
        ``lang2_loss`` and with the CPU / GPU memory readings taken when the
        epoch started.

    Raises
    ------
    ValueError
        If the combined loss of a batch group is NaN.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = list(gpu_memory_mb().items())
    for gpu, memory in gpu_usage:
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    # Running totals of the combined loss and of each partial loss.
    loss_total = 0.0
    lang1_total = 0.0
    lang2_total = 0.0
    cm_total = 0.0

    def _mean(total, num_batches):
        # Average of a running total; 0.0 while no batch has been processed.
        return float(total / num_batches) if num_batches > 0 else 0.0

    def _attach_average_losses(target, num_batches):
        # Add the four averaged losses to a metrics dict and hand it back.
        target["loss"] = _mean(loss_total, num_batches)
        target["cm_loss"] = _mean(cm_total, num_batches)
        target["lang1_loss"] = _mean(lang1_total, num_batches)
        target["lang2_loss"] = _mean(lang2_total, num_batches)
        return target

    def _backward_and_step(partial_loss, optimizer, lr_scheduler, momentum_scheduler, step_index):
        # Back-propagate one partial loss, advance its per-batch schedulers and
        # apply its optimizer.  Returns (loss value, rescaled gradient norm).
        partial_loss.backward()
        loss_value = partial_loss.item()
        grad_norm = self.rescale_gradients()
        if lr_scheduler:
            lr_scheduler.step_batch(step_index)
        if momentum_scheduler:
            momentum_scheduler.step_batch(step_index)
        optimizer.step()
        optimizer.zero_grad()
        return loss_value, grad_norm

    # Set the model to "train" mode.
    self.model.train()

    # Batches are grouped so that there is one batch per configured device.
    device_count = len(self._cuda_devices)
    batch_stream = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    grouped_batches = lazy_groups_of(batch_stream, device_count)
    expected_groups = math.ceil(
        self.iterator.get_num_batches(self.train_data) / device_count)

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    progress = Tqdm.tqdm(grouped_batches, total=expected_groups)
    examples_seen = 0
    for batch_group in progress:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        # Start every group from clean gradients on all optimizers.
        for optimizer in (self.optimizer, self.optimizer_lang1,
                          self.optimizer_lang2, self.optimizer_cm):
            optimizer.zero_grad()

        loss, loss_cm, loss_lang1, loss_lang2 = self.batch_loss(
            batch_group, for_training=True)
        # If any of the partial losses is NaN the combined loss is NaN as well.
        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        # --- lang1 ---
        value, _ = _backward_and_step(loss_lang1,
                                      self.optimizer_lang1,
                                      self._learning_rate_scheduler_lang1,
                                      self._momentum_scheduler_lang1,
                                      batch_num_total)
        lang1_total += value

        # --- lang2 --- (its gradient norm is the one reported to Tensorboard)
        value, batch_grad_norm = _backward_and_step(loss_lang2,
                                                    self.optimizer_lang2,
                                                    self._learning_rate_scheduler_lang2,
                                                    self._momentum_scheduler_lang2,
                                                    batch_num_total)
        lang2_total += value

        # --- cm ---
        value, _ = _backward_and_step(loss_cm,
                                      self.optimizer_cm,
                                      self._learning_rate_scheduler_cm,
                                      self._momentum_scheduler_cm,
                                      batch_num_total)
        cm_total += value

        loss_total += loss.item()

        # Refresh the progress-bar description with the latest metrics.
        metrics = _attach_average_losses(self.model.get_metrics(False),
                                         batches_this_epoch)
        progress.set_description(
            training_util.description_from_metrics(metrics), refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            for optimizer in (self.optimizer_lang1, self.optimizer_lang2,
                              self.optimizer_cm):
                self._tensorboard.log_learning_rates(self.model, optimizer)
            for tag, key in (("loss/loss_train", "loss"),
                             ("loss/cm_loss_train", "cm_loss"),
                             ("loss/lang1_loss_train", "lang1_loss"),
                             ("loss/lang2_loss_train", "lang2_loss")):
                self._tensorboard.add_train_scalar(tag, metrics[key])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum(
                training_util.get_batch_size(batch) for batch in batch_group)
            examples_seen += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = examples_seen / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    # End of epoch: read the model metrics once more, resetting them this time.
    metrics = _attach_average_losses(self.model.get_metrics(reset=True),
                                     batches_this_epoch)
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for gpu_num, memory in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Train the model for one epoch and return the epoch metrics.

    Batches are consumed in groups of ``self._num_gradient_accumulation_steps``;
    the gradients of a group are accumulated before a single optimizer step.
    In distributed mode every worker checks, before each group, whether some
    other worker has already exhausted its data and stops early if so.

    Parameters
    ----------
    epoch : int
        Index of the epoch; used in log messages and in the names of
        time-based checkpoints.

    Returns
    -------
    Dict[str, float]
        Metrics from ``training_util.get_metrics`` (called with ``reset=True``)
        plus the CPU / GPU memory readings taken when the epoch started.

    Raises
    ------
    ValueError
        If the loss of any batch is NaN.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = common_util.peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = list(common_util.gpu_memory_mb().items())
    for gpu, memory in gpu_usage:
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    def _step_optimizer() -> None:
        # Apply the optimizer.  On the master, when histograms are due, also
        # log the relative magnitude of every parameter update.
        if not (self._tensorboard.should_log_histograms_this_batch()
                and self._master):
            self.optimizer.step()
            return
        # Keep a CPU copy of the pre-update parameters so that large models
        # do not run out of GPU memory.
        before_step = {
            name: param.detach().cpu().clone()
            for name, param in self.model.named_parameters()
        }
        self.optimizer.step()
        for name, param in self.model.named_parameters():
            before_step[name].sub_(param.detach().cpu())
            update_norm = torch.norm(before_step[name].view(-1))
            param_norm = torch.norm(param.view(-1)).cpu()
            self._tensorboard.add_train_scalar(
                "gradient_update/" + name,
                update_norm / (param_norm + 1e-7))

    train_loss = 0.0
    # Set the model to "train" mode.
    self._pytorch_model.train()

    # Group the batches for gradient accumulation.
    accumulation_steps = self._num_gradient_accumulation_steps
    batch_stream = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    grouped_batches = common_util.lazy_groups_of(batch_stream, accumulation_steps)
    expected_groups = math.ceil(
        self.iterator.get_num_batches(self.train_data) / accumulation_steps)

    # Several progress bars would be a mess in distributed training, so only
    # the master wraps its generator in tqdm.
    progress = (Tqdm.tqdm(grouped_batches, total=expected_groups)
                if self._master else grouped_batches)

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")

    examples_seen = 0
    done_early = False
    for batch_group in progress:
        if self._distributed:
            # Check whether another worker has stopped already (workers may
            # hold differing amounts of data).  Proceeding would hang at the
            # barrier implicit in Model.forward.  An integer tensor is used
            # because NCCL process groups apparently don't support BoolTensor.
            done = torch.tensor(0, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            if done.item() > 0:
                done_early = True
                logger.warning(
                    f"Worker {torch.distributed.get_rank()} finishing training early! "
                    "This implies that there is an imbalance in your training "
                    "data across the workers and that some amount of it will be "
                    "ignored. A small amount of this is fine, but a major imbalance "
                    "should be avoided. Note: This warning will appear unless your "
                    "data is perfectly balanced.")
                break

        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        # Accumulate gradients over the group; each loss is scaled so that the
        # group as a whole contributes one averaged loss.
        group_size = len(batch_group)
        for batch in batch_group:
            loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            loss = loss / group_size
            loss.backward()
            train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # These do nothing if batch_num_total is None or the scheduler does
        # not update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        _step_optimizer()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )

        # Only the master owns a tqdm bar.
        if self._master:
            progress.set_description(
                training_util.description_from_metrics(metrics), refresh=False)

        # Log parameter values to Tensorboard (only from the master)
        if self._tensorboard.should_log_this_batch() and self._master:
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch() and self._master:
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            batch_group_size = sum(
                training_util.get_batch_size(batch) for batch in batch_group)
            examples_seen += batch_group_size
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = examples_seen / batches_this_epoch
                logger.info(
                    f"current batch size: {batch_group_size} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   batch_group_size)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if (self._model_save_interval is not None
                and (time.time() - last_save_time > self._model_save_interval)
                and self._master):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    if self._distributed and not done_early:
        logger.warning(
            f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
        )
        # Signal completion so that workers with remaining data stop early.
        done = torch.tensor(1, device=self.cuda_device)
        torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
        assert done.item()

    # Let all workers finish their epoch before computing the final
    # statistics for the epoch.
    if self._distributed:
        dist.barrier()

    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=[self.cuda_device],
    )
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for gpu_num, memory in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Batches from ``self.data_loader`` are processed in groups of
    ``self._num_gradient_accumulation_steps``; gradients are accumulated over a
    group and followed by one optimizer step (through ``self._scaler`` when
    one is configured).  In distributed mode, a worker stops as soon as any
    other worker signals that it has run out of data.

    Parameters
    ----------
    epoch : int
        Index of the epoch; used for logging and passed on to the
        checkpointer and the callbacks.

    Returns
    -------
    Dict[str, float]
        Metrics from ``training_util.get_metrics`` (called with ``reset=True``)
        plus per-worker CPU and per-GPU peak memory converted to MB.

    Raises
    ------
    ValueError
        If the loss of any batch is NaN.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    cpu_memory_usage = []
    for worker, memory in common_util.peak_cpu_memory().items():
        cpu_memory_usage.append((worker, memory))
        logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}")
    gpu_memory_usage = []
    for gpu, memory in common_util.peak_gpu_memory().items():
        gpu_memory_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}")

    # Regularization losses are only tracked when the model defines a penalty.
    regularization_penalty = self.model.get_regularization_penalty()

    train_loss = 0.0
    batch_loss = 0.0
    train_reg_loss = None if regularization_penalty is None else 0.0
    batch_reg_loss = None if regularization_penalty is None else 0.0

    # Set the model to "train" mode.
    self._pytorch_model.train()

    # Get tqdm for the training batches
    batch_generator = iter(self.data_loader)
    batch_group_generator = common_util.lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps
    )

    logger.info("Training")

    num_training_batches: Union[int, float]
    try:
        len_data_loader = len(self.data_loader)
        num_training_batches = math.ceil(
            len_data_loader / self._num_gradient_accumulation_steps
        )
    except TypeError:
        # The data loader has no length; the total is unknown.
        num_training_batches = float("inf")

    # Having multiple tqdm bars in case of distributed training will be a mess.
    # Hence only the primary's progress is shown.
    if self._primary:
        batch_group_generator_tqdm = Tqdm.tqdm(
            batch_group_generator, total=num_training_batches
        )
    else:
        batch_group_generator_tqdm = batch_group_generator

    self._last_log = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    done_early = False
    for batch_group in batch_group_generator_tqdm:
        if done_early:
            break

        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        # Zero gradients.
        # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()`
        # because it avoids a read op when the gradients are first updated below.
        for param_group in self.optimizer.param_groups:
            for p in param_group["params"]:
                p.grad = None

        # Both per-group losses restart from zero for every batch group so
        # that they describe the current group only.
        batch_loss = 0.0
        batch_reg_loss = None if regularization_penalty is None else 0.0
        batch_group_outputs = []
        for batch in batch_group:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing
                # amounts of data in each). If so, we can't proceed because we would hang
                # when we hit the barrier implicit in Model.forward. We use a IntTensor
                # instead a BoolTensor here because NCCL process groups apparently don't
                # support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced."
                    )
                    break

            with amp.autocast(self._use_amp):
                batch_outputs = self.batch_outputs(batch, for_training=True)
                batch_group_outputs.append(batch_outputs)
                loss = batch_outputs["loss"]
                reg_loss = batch_outputs.get("reg_loss")
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                # Scale so that the whole group contributes one averaged loss.
                loss = loss / len(batch_group)

                batch_loss += loss.item()
                if reg_loss is not None:
                    reg_loss = reg_loss / len(batch_group)
                    reg_loss_value = reg_loss.item()
                    # Accumulate rather than overwrite: with gradient accumulation
                    # the per-group regularization loss must cover every batch of
                    # the group, exactly like `batch_loss` above.
                    batch_reg_loss += reg_loss_value  # type: ignore
                    train_reg_loss += reg_loss_value  # type: ignore

            if self._scaler is not None:
                self._scaler.scale(loss).backward()
            else:
                loss.backward()

        # Nothing was processed (another worker finished before our first
        # batch of this group), so there is nothing to step on.
        if len(batch_group_outputs) <= 0:
            continue

        train_loss += batch_loss

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._scaler is not None:
            self._scaler.step(self.optimizer)
            self._scaler.update()
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss,
            batch_reg_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        if self._primary:
            # Updating tqdm only for the primary as the trainers wouldn't have one
            description = training_util.description_from_metrics(metrics)
            batch_group_generator_tqdm.set_description(description, refresh=False)

            if self._checkpointer is not None:
                self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch)

        for callback in self._callbacks:
            callback.on_batch(
                self,
                batch_group,
                batch_group_outputs,
                metrics,
                epoch,
                batches_this_epoch,
                is_training=True,
                is_primary=self._primary,
                batch_grad_norm=batch_grad_norm,
            )

    if self._distributed and not done_early:
        logger.warning(
            f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
        )
        # Indicate that we're done so that any workers that have remaining data
        # stop the epoch early.
        done = torch.tensor(1, device=self.cuda_device)
        torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
        assert done.item()

    # Let all workers finish their epoch before computing
    # the final statistics for the epoch.
    if self._distributed:
        dist.barrier()

    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        train_reg_loss,
        batch_loss=None,
        batch_reg_loss=None,
        num_batches=batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=self.cuda_device,
    )

    # Memory readings are converted to MB by dividing by 1024 * 1024.
    for (worker, memory) in cpu_memory_usage:
        metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024)
    for (gpu_num, memory) in gpu_memory_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024)
    return metrics
def _validation_loss(self, epoch: int) -> Tuple[float, float, int]:
    """
    Run the model over the validation data and accumulate its loss.

    While validating, the model parameters are replaced by their moving
    averages (when a moving average is configured) and restored afterwards.

    Parameters
    ----------
    epoch : int
        Index of the current epoch; only forwarded to the batch callbacks.

    Returns
    -------
    Tuple[float, float, int]
        The summed validation loss, the summed regularization loss and the
        number of batches that produced a loss.

    Raises
    ------
    ConfigurationError
        If no validation data loader is configured.
    """
    logger.info("Validating")

    self._pytorch_model.eval()

    # Replace parameter values with the shadow values from the moving averages.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    if self._validation_data_loader is None:
        raise ConfigurationError(
            "Validation results cannot be calculated without a validation_data_loader"
        )
    progress = Tqdm.tqdm(self._validation_data_loader)

    def _another_worker_is_done() -> bool:
        # Sum a zero contribution across workers; a positive total means some
        # worker has already announced the end of its data.  An integer tensor
        # is used because NCCL process groups apparently don't support
        # BoolTensor.
        done = torch.tensor(0, device=self.cuda_device)
        torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
        return done.item() > 0

    batches_this_epoch = 0
    val_loss = 0
    val_reg_loss = 0
    done_early = False
    for batch in progress:
        # Continuing after another worker stopped would hang at the barrier
        # implicit in Model.forward.
        if self._distributed and _another_worker_is_done():
            done_early = True
            logger.warning(
                f"Worker {torch.distributed.get_rank()} finishing validation early! "
                "This implies that there is an imbalance in your validation "
                "data across the workers and that some amount of it will be "
                "ignored. A small amount of this is fine, but a major imbalance "
                "should be avoided. Note: This warning will appear unless your "
                "data is perfectly balanced.")
            break

        batch_outputs = self.batch_outputs(batch, for_training=False)
        loss = batch_outputs.get("loss")
        reg_loss = batch_outputs.get("reg_loss")

        # A validation loss is optional, so `loss` may be None.  Only batches
        # with a loss are counted, because `batches_this_epoch` serves purely
        # as the divisor of the loss; revisit this if the counter is ever
        # used for anything else.
        if loss is not None:
            batches_this_epoch += 1
            val_loss += loss.detach().cpu().numpy()
            if reg_loss is not None:
                val_reg_loss += reg_loss.detach().cpu().numpy()

        # Update the description with the latest metrics
        val_metrics = training_util.get_metrics(
            self.model,
            val_loss,
            val_reg_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        progress.set_description(
            training_util.description_from_metrics(val_metrics), refresh=False)

        if self._master:
            for callback in self._batch_callbacks:
                callback(
                    self,
                    [batch],
                    [batch_outputs],
                    epoch,
                    batches_this_epoch,
                    is_training=False,
                )

    if self._distributed and not done_early:
        logger.warning(
            f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)."
        )
        # Signal completion so that workers with remaining data stop
        # validation early.
        done = torch.tensor(1, device=self.cuda_device)
        torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
        assert done.item()

    # Now restore the original parameter values.
    if self._moving_average is not None:
        self._moving_average.restore()

    return val_loss, val_reg_loss, batches_this_epoch
def _validation(self, n_epoch: int) -> Tuple[Dict[str, Dict[str, float]], float]:
    """
    Validate the model on every task and update the per-task bookkeeping.

    For each task in ``self._task_list`` this runs the task's validation data
    through ``self._forward``, averages the ``stm_loss`` over the batches,
    writes the task's metrics to Tensorboard, appends the task's validation
    metric to its history, records whether this epoch is the best so far or
    the task ran out of patience, and steps the task's scheduler.

    Parameters
    ----------
    n_epoch : int
        Index of the current epoch; stored with the best metrics and passed
        to the schedulers.

    Returns
    -------
    Tuple[Dict[str, Dict[str, float]], float]
        ``all_val_metrics``, mapping each task name to its validation metrics
        (including ``"loss"``), and ``avg_accuracy``.  Despite its name,
        ``avg_accuracy`` is the *sum* of the tasks' ``"sentiment_acc"`` values;
        this method never divides it by the number of tasks.

    Raises
    ------
    ZeroDivisionError
        If a task yields no validation batch.
    KeyError
        If a task's metrics lack ``"sentiment_acc"`` or its ``_val_metric``.
    """
    ### Begin validation of the model ###
    logger.info("Validation - Begin")
    all_val_metrics = {}

    self._model.eval()  # Set the model into evaluation mode

    avg_accuracy = 0.0

    for task_idx, task in enumerate(self._task_list):
        logger.info("Validation - Task {}/{}: {}", task_idx + 1, self._n_tasks, task._name)

        val_loss = 0.0
        n_batches_val_this_epoch_this_task = 0
        n_val_batches = self._task_infos[task._name]["n_val_batches"]
        scheduler = self._schedulers[task._name]

        # Create tqdm generator for current task's validation
        data_iterator = task._data_iterator
        val_generator = data_iterator(task._validation_data, num_epochs=1, shuffle=False)
        val_generator_tqdm = tqdm.tqdm(val_generator, total=n_val_batches)

        # Iterate over each validation batch for this task
        for batch in val_generator_tqdm:
            n_batches_val_this_epoch_this_task += 1

            # Get the loss
            val_output_dict = self._forward(batch, task=task, for_training=False)
            loss = val_output_dict["stm_loss"]
            val_loss += loss.item()
            del loss

            # Get metrics for all progress so far, update tqdm, display description
            task_metrics = self._get_metrics(task=task)
            task_metrics["loss"] = float(val_loss / n_batches_val_this_epoch_this_task)
            description = training_util.description_from_metrics(task_metrics)
            val_generator_tqdm.set_description(description)

        # Get the task's validation metrics (resetting them) and store them
        task_metrics = self._get_metrics(task=task, reset=True)
        stored_metrics = all_val_metrics.setdefault(task._name, {})
        stored_metrics.update(task_metrics)
        stored_metrics["loss"] = float(val_loss / n_batches_val_this_epoch_this_task)

        avg_accuracy += task_metrics["sentiment_acc"]

        # Tensorboard - Validation metrics for this epoch
        for metric_name, value in stored_metrics.items():
            self._tensorboard.add_validation_scalar(
                name="task_" + task._name + "/" + metric_name, value=value
            )

        ### Perform a patience check and update the history of validation metric for this task ###
        this_epoch_val_metric = stored_metrics[task._val_metric]
        metric_history = self._metric_infos[task._name]["hist"]

        metric_history.append(this_epoch_val_metric)
        is_best_so_far, out_of_patience = self._check_history(
            metric_history=metric_history,
            cur_score=this_epoch_val_metric,
            should_decrease=task._val_metric_decreases,
        )

        if is_best_so_far:
            logger.info("Best model found for {}.", task._name)
            # Stores the shared dict itself, so it will also contain the
            # metrics of tasks validated later in this call.
            self._metric_infos[task._name]["best"] = (n_epoch, all_val_metrics)
        if out_of_patience and not self._metric_infos[task._name]["is_out_of_patience"]:
            self._metric_infos[task._name]["is_out_of_patience"] = True
            logger.info("Task {} is out of patience and vote to stop the training.", task._name)

        # The LRScheduler API is agnostic to whether your schedule requires a validation metric -
        # if it doesn't, the validation metric passed here is ignored.
        scheduler.step(this_epoch_val_metric, n_epoch)

    logger.info("Validation - End")
    return all_val_metrics, avg_accuracy
def _train_epoch(
    self, total_n_tr_batches: int, sampling_prob: List, reverse=False, train_D=False
) -> Dict[str, Dict[str, float]]:
    """
    Train for one epoch of ``total_n_tr_batches`` steps over all tasks.

    At every step a task is drawn according to ``sampling_prob``, one of its
    batches is run forward and backward, and an optimizer step is taken each
    time ``self._gradient_accumulation_steps`` steps have been accumulated.

    Parameters
    ----------
    total_n_tr_batches : int
        Number of training steps (batches) in this epoch.
    sampling_prob : List
        Probability of picking each task of ``self._task_list`` at a step;
        passed to ``np.random.multinomial``.
    reverse : bool, optional
        Forwarded to ``self._forward``; also selects the "Generator" log
        message.
    train_D : bool, optional
        When true, the ``"exclude_share_encoder"`` optimizer of the sampled
        task is used instead of its ``"all_params"`` optimizer.

    Returns
    -------
    Dict[str, Dict[str, float]]
        For every task name, the task metrics (reset at the end of the epoch)
        plus the averaged ``loss``, ``stm_loss``, ``p_d_loss``, ``s_d_loss``
        and ``valid_loss``.
    """
    self._model.train()  # Set the model to "train" mode.

    if reverse:
        logger.info("Training Generator- Begin")
    elif not train_D:
        logger.info("Training Init Generator- Begin")
    if train_D:
        logger.info("Training Discriminator- Begin")
    logger.info("reverse is {}, train_D is {}", reverse, train_D)

    # Partial losses tracked per task next to the cumulated training loss.
    partial_loss_keys = ("stm_loss", "p_d_loss", "s_d_loss", "valid_loss")

    ### Reset training and trained batches counter before new training epoch ###
    for task_info in self._task_infos.values():
        task_info["tr_loss_cum"] = 0.0
        for key in partial_loss_keys:
            task_info[key] = 0.0
        task_info["n_batches_trained_this_epoch"] = 0

    # Per-task training metrics of this epoch, filled in after the last step.
    all_tr_metrics = {}

    ### Start training epoch ###
    epoch_tqdm = tqdm.tqdm(range(total_n_tr_batches), total=total_n_tr_batches)
    histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

    # The gradient norm is only computed on steps that end an accumulation
    # window.  Start from None so that Tensorboard logging on an earlier step
    # does not hit an unbound local variable.
    batch_grad_norm = None

    for step in epoch_tqdm:
        # Sample the task to train on at this step.
        task_idx = np.argmax(np.random.multinomial(1, sampling_prob))
        task = self._task_list[task_idx]
        task_info = self._task_infos[task._name]

        ### One forward + backward pass ###

        # Call next batch to train
        batch = next(self._tr_generators[task._name])
        self._batch_num_total += 1
        task_info["n_batches_trained_this_epoch"] += 1

        # Load optimizer
        optimizer_kind = "exclude_share_encoder" if train_D else "all_params"
        optimizer = self._optimizers[task._name][optimizer_kind]

        # Get the loss for this batch
        output_dict = self._forward(
            tensor_batch=batch, task=task, for_training=True, reverse=reverse
        )
        loss = output_dict['loss']
        if self._gradient_accumulation_steps > 1:
            # Scale so that the accumulated gradient is an average.
            loss /= self._gradient_accumulation_steps
        loss.backward()

        task_info["tr_loss_cum"] += loss.item()
        for key in partial_loss_keys:
            task_info[key] += output_dict[key].item()
        del loss

        # Take an optimization step once a full accumulation window is done.
        if (step + 1) % self._gradient_accumulation_steps == 0:
            batch_grad_norm = self._rescale_gradients()
            if self._tensorboard.should_log_histograms_this_batch():
                # Snapshot the parameters on CPU to measure the update size.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self._model.named_parameters()
                }
                optimizer.step()
                for name, param in self._model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name, update_norm / (param_norm + 1e-7)
                    )
            else:
                optimizer.step()
            optimizer.zero_grad()

        ### Get metrics for all progress so far, update tqdm, display description ###
        task_metrics = self._get_metrics(task=task)
        # The small constant guards the division against a zero batch count.
        n_trained = task_info["n_batches_trained_this_epoch"] + 0.000_001
        task_metrics["loss"] = float(task_info["tr_loss_cum"] / n_trained)
        for key in partial_loss_keys:
            task_metrics[key] = float(task_info[key] / n_trained)
        description = training_util.description_from_metrics(task_metrics)
        epoch_tqdm.set_description(task._name + ", " + description)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(self._model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self._model, optimizer)
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + task._name + "/" + k: v for k, v in task_metrics.items()}
            )

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self._model, histogram_parameters)
        self._global_step += 1

    ### Bookkeeping all the training metrics for all the tasks on the training epoch that just finished ###
    for task in self._task_list:
        task_info = self._task_infos[task._name]

        task_info["total_n_batches_trained"] += task_info["n_batches_trained_this_epoch"]
        task_info["last_log"] = time.time()

        task_metrics = self._get_metrics(task=task, reset=True)
        epoch_metrics = all_tr_metrics.setdefault(task._name, {})
        epoch_metrics.update(task_metrics)

        # Tasks that were never sampled this epoch have a zero batch count;
        # the small constant keeps the division defined.
        n_trained = task_info["n_batches_trained_this_epoch"] + 0.000_000_01
        epoch_metrics["loss"] = float(task_info["tr_loss_cum"] / n_trained)
        for key in partial_loss_keys:
            epoch_metrics[key] = float(task_info[key] / n_trained)

        # Tensorboard - Training metrics for this epoch
        for metric_name, value in epoch_metrics.items():
            self._tensorboard.add_train_scalar(
                name="task_" + task._name + "/" + metric_name, value=value
            )

    logger.info("Train - End")
    return all_tr_metrics
def semi_train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch on a mix of labelled and unlabelled batches and returns metrics.

    Batches come from ``get_mixer`` as ``(batch_group, group_id)`` pairs. The loss of
    every batch (supervised loss, optional regularization penalty and weighted
    constraint penalty) is accumulated and back-propagated through ``self.step`` once
    ``self.backprop_after_xbatches`` batches have been seen. Unlabelled batches are
    skipped while ``self.total_supervised_iters < self.dd_semi_warmup_iters``.

    The returned dict contains the model metrics plus ``cpu_memory_MB``, ``lb``
    (labelled batches), ``ub`` (unlabelled batches), ``uloss`` (average constraint
    penalty per processed batch, ``0.0`` when no batch was processed), the lambda
    statistics of the constraints model (if any) and per-GPU memory usage.
    """
    logger.info("Epoch %d/%d", epoch, self.trainer._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.trainer.model.train()

    num_gpus = len(self.trainer._cuda_devices)

    self.trainer._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self.trainer._batch_num_total is None:
        self.trainer._batch_num_total = 0

    histogram_parameters = set(
        self.trainer.model.get_parameters_for_histogram_tensorboard_logging())

    mixed_generator, num_training_batches = get_mixer(
        self.trainer.iterator, self.trainer.train_data,
        self.trainer.iterator, self.unlabelled_dataset, num_gpus,
        self.labelled_id, self.which_mixer, self.min_pct_of_unlabelled)

    # Separate generator that feeds the lambda (dual variable) updates.
    mixed_generator_for_lambda, _ = get_mixer(
        self.trainer.iterator, self.trainer.train_data,
        self.trainer.iterator, self.unlabelled_dataset, num_gpus,
        self.labelled_id, 'cm', 1.0)

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(mixed_generator,
                                     total=num_training_batches)

    cumulative_batch_size = 0
    unlabelled_loss = 0
    unlabelled_batches_this_epoch = 0
    batches_since_last_step = 0
    agg_loss = 0.0
    # True while `agg_loss` holds loss that has not been back-propagated yet.
    pending_step = False
    batch_grad_norm = None

    def average_unlabelled_loss() -> float:
        # Guard against an epoch in which no batch was processed at all.
        processed = batches_this_epoch + unlabelled_batches_this_epoch
        if processed == 0:
            return 0.0
        return float(unlabelled_loss / processed)

    def maybe_update_lambda() -> None:
        # Runs one lambda update once the warm-up is over and at least
        # `dd_update_freq` supervised iterations passed since the last one.
        if not ((self.constraints_model is not None)
                and (self.dd_optimizer is not None)
                and (self.total_supervised_iters >= self.dd_warmup_iters)
                and (self.total_supervised_iters - self.last_lambda_update
                     >= self.dd_update_freq)):
            return
        # Only the next batch group of the lambda generator is consumed.
        for lambda_batch_group, _ in mixed_generator_for_lambda:
            self.lambda_update(lambda_batch_group)
            self.last_lambda_update = self.total_supervised_iters
            break
        self.count_lambda_updates += 1
        if (self.dd_increase_freq_after is not None) and (
                self.count_lambda_updates % self.dd_increase_freq_after == 0):
            self.dd_update_freq += self.dd_increase_freq_by

    for batch_group, group_id in train_generator_tqdm:
        # During the semi-supervised warm-up only labelled batches are used.
        if (self.total_supervised_iters < self.dd_semi_warmup_iters
                and group_id != self.labelled_id):
            continue

        output_dict = self.batch_loss(
            batch_group,
            for_training=True,
            eval_metric=(group_id == self.labelled_id))

        penalties = defaultdict(float)
        if self.constraints_model is not None:
            penalties = self.constraints_model(
                output_dict['task1_tag_logits'],
                output_dict['task2_tag_logits'], output_dict['mask'])

        loss = 0.0
        if 'loss' in output_dict:
            loss = output_dict['loss']
            train_loss += loss.item()
        loss += output_dict.get('regularization_penalty', 0.0)
        loss += self.constraints_wt * penalties['loss']

        unlabelled_loss += penalties['loss'].item() if torch.is_tensor(
            penalties['loss']) else penalties['loss']

        agg_loss += loss
        batches_since_last_step += 1

        if batches_since_last_step == self.backprop_after_xbatches:
            batch_grad_norm = self.step(agg_loss)
            batches_since_last_step = 0
            agg_loss = 0.0
            pending_step = False
        else:
            pending_step = True

        if group_id != self.labelled_id:
            unlabelled_batches_this_epoch += 1
        else:
            self.total_supervised_iters += 1.0
            batches_this_epoch += 1
            self.trainer._batch_num_total += 1
            # `batch_num_total` only exists for labelled batches, so everything
            # that needs it stays inside this branch.
            batch_num_total = self.trainer._batch_num_total

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self.trainer._learning_rate_scheduler:
                self.trainer._learning_rate_scheduler.step_batch(
                    batch_num_total)

            if self.trainer._tensorboard.should_log_histograms_this_batch():
                # NOTE(review): no optimizer step happens between taking this
                # copy and subtracting the parameters again (the step is done
                # in `self.step` above), so the logged update norms look like
                # they are always zero - confirm whether that is intended.
                # The copy lives on the CPU so large models won't go OOM on
                # the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in
                    self.trainer.model.named_parameters()
                }
                for name, param in self.trainer.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self.trainer._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))

            # Update moving averages
            if self.trainer._moving_average is not None:
                self.trainer._moving_average.apply(batch_num_total)

        metrics = training_util.get_metrics(self.trainer.model, train_loss,
                                            batches_this_epoch)
        metrics["uloss"] = average_unlabelled_loss()

        # Update the description with the latest metrics
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self.trainer._tensorboard.should_log_this_batch(
        ) and batch_grad_norm is not None:
            self.trainer._tensorboard.log_parameter_and_gradient_statistics(
                self.trainer.model, batch_grad_norm)
            self.trainer._tensorboard.log_learning_rates(
                self.trainer.model, self.trainer.optimizer)
            self.trainer._tensorboard.add_train_scalar(
                "loss/loss_train", metrics["loss"])
            self.trainer._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self.trainer._tensorboard.should_log_histograms_this_batch():
            self.trainer._tensorboard.log_histograms(
                self.trainer.model, histogram_parameters)

        if self.trainer._log_batch_size_period:
            cur_batch = sum(
                training_util.get_batch_size(batch) for batch in batch_group)
            cumulative_batch_size += cur_batch
            if (batches_this_epoch -
                    1) % self.trainer._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self.trainer._tensorboard.add_train_scalar(
                    "current_batch_size", cur_batch)
                self.trainer._tensorboard.add_train_scalar(
                    "mean_batch_size", average)

        # Save model if needed.
        if self.trainer._model_save_interval is not None and (
                time.time() - last_save_time >
                self.trainer._model_save_interval):
            last_save_time = time.time()
            self.trainer._save_checkpoint('{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time))))

        maybe_update_lambda()

    # Back-propagate whatever loss is still accumulated from the last batches.
    if pending_step:
        self.step(agg_loss)

    maybe_update_lambda()

    metrics = training_util.get_metrics(self.trainer.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    metrics['lb'] = batches_this_epoch
    metrics['ub'] = unlabelled_batches_this_epoch
    metrics["uloss"] = average_unlabelled_loss()
    if self.constraints_model is not None:
        lambda_stats_dict = self.constraints_model.lambda_stats()
        metrics.update(lambda_stats_dict)

    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss. Returns it and the number of batches.

    For every instance of a validation batch a retrieval problem is built from one
    positive candidate and ``self.num_negative_samples * 10`` randomly drawn
    negatives; the index of the positive candidate is stored under ``"labels"``.
    When ``self.retrieve_text`` is set the candidates are token-id / segment-id
    sequences (zero-padded to a common length), otherwise they are images.

    If a moving average is configured, its shadow weights are used for the whole
    validation pass and the original weights are restored afterwards, even when
    validation raises.
    """
    logger.info("Validating")

    self.model.eval()

    # Replace parameter values with the shadow values from the moving averages.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    batches_this_epoch = 0
    val_loss = 0
    try:
        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        raw_val_generator = val_iterator(self._validation_data,
                                         num_epochs=1,
                                         shuffle=False)
        val_generator = lazy_groups_of(raw_val_generator, num_gpus)
        num_validation_batches = math.ceil(
            val_iterator.get_num_batches(self._validation_data) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)

        # Validation uses ten times as many negatives as configured.
        num_negative_samples = self.num_negative_samples * 10
        matching_label_field_name = "labels"

        for batch_group in val_generator_tqdm:
            batch = batch_group[0]
            images = []
            text = []
            segment_ids = []
            labels = []

            for i in range(len(batch['images'])):
                # randint is inclusive on both ends, so the positive can land
                # in any of the `num_negative_samples + 1` candidate slots.
                positive_index = random.randint(0, num_negative_samples)
                labels.append(positive_index)

                for j in range(num_negative_samples + 1):
                    is_positive = j == positive_index
                    if self.retrieve_text:
                        if is_positive:
                            text.append(
                                batch['token_ids']['tokens'][i].tolist())
                            segment_ids.append(
                                batch['segment_ids'][i].tolist())
                        else:
                            negative_sample_index = random.choice(
                                self.val_indices)
                            text_field = TextField(
                                self.val_text_db[negative_sample_index],
                                self.val_token_indexers)
                            text_field.index(self.model.vocab)
                            padding_lengths = text_field.get_padding_lengths()
                            text.append(
                                text_field.as_tensor(
                                    padding_lengths=padding_lengths)
                                ['tokens'].tolist())
                            segment_ids.append(
                                self.val_segment_ids_db[negative_sample_index]
                                .tolist())
                    elif is_positive:
                        images.append(
                            np.expand_dims(batch['images'][i].numpy(), 0))
                    else:
                        images.append(
                            np.expand_dims(random.choice(self.val_image_db),
                                           0))

            if self.retrieve_text:
                # Zero-pad every candidate to the longest token sequence.
                max_text_len = max(len(sequence) for sequence in text)
                text = [
                    sequence + [0] * (max_text_len - len(sequence))
                    for sequence in text
                ]
                batch['token_ids'] = {'tokens': torch.LongTensor(text)}
                segment_ids = [
                    sequence + [0] * (max_text_len - len(sequence))
                    for sequence in segment_ids
                ]
                batch['segment_ids'] = torch.from_numpy(
                    np.array(segment_ids, dtype=np.int64))
            else:
                batch['images'] = torch.from_numpy(np.vstack(images))
            batch[matching_label_field_name] = torch.from_numpy(
                np.array(labels, dtype=np.int64))

            loss = self.batch_loss(batch_group, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(self.model, val_loss,
                                                    batches_this_epoch)
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)
    finally:
        # Now restore the original parameter values. Doing this in `finally`
        # keeps the averaged weights from leaking into training if validation
        # fails part-way through.
        if self._moving_average is not None:
            self._moving_average.restore()

    return val_loss, batches_this_epoch
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Batches are grouped into chunks of ``self._num_gradient_accumulation_steps``;
    gradients of all batches in a chunk are accumulated before a single optimizer
    step. The step loss and the learning rate of the first parameter group are
    sent to ``self._writer``.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = list(gpu_memory_mb().items())
    for gpu, memory in gpu_usage:
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._pytorch_model.train()

    # Get tqdm for the training batches
    accumulation_steps = self._num_gradient_accumulation_steps
    batch_group_generator = lazy_groups_of(
        self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle),
        accumulation_steps,
    )
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / accumulation_steps
    )

    # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
    # progress is shown
    batch_group_generator_tqdm = (
        Tqdm.tqdm(batch_group_generator, total=num_training_batches)
        if self._master
        else batch_group_generator
    )

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    logger.info("Training")

    for batch_group in batch_group_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        # Accumulate the gradients of every batch in the group.
        for batch in batch_group:
            raw_loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(raw_loss):
                raise ValueError("nan loss encountered")
            # Scale so that the accumulated gradient is an average over the group.
            scaled_loss = raw_loss / len(batch_group)
            scaled_loss.backward()
            self._writer.log({"step_loss": scaled_loss.item()},
                             step=self._batch_num_total)
            train_loss += scaled_loss.item()

        # The returned gradient norm is not used in this variant.
        self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )

        # Updating tqdm only for the master as the trainers wouldn't have one
        if self._master:
            batch_group_generator_tqdm.set_description(
                training_util.description_from_metrics(metrics), refresh=False
            )

        self._writer.log({"lr": self.optimizer.param_groups[0]['lr']},
                         step=self._batch_num_total)

        # Save model if needed.
        save_is_due = (
            self._model_save_interval is not None
            and (time.time() - last_save_time > self._model_save_interval)
            and self._master
        )
        if save_is_due:
            last_save_time = time.time()
            checkpoint_name = "{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))
            )
            self._save_checkpoint(checkpoint_name)

    # Let all workers finish their epoch before computing
    # the final statistics for the epoch.
    if self._distributed:
        dist.barrier()

    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=[self.cuda_device],
    )
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for gpu_num, memory in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Supports gradient accumulation, optional apex-style loss scaling (when
    ``self._opt_level`` is set) and distributed training, where a worker that runs
    out of data tells the others to stop early via an all-reduce.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = common_util.peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = list(common_util.gpu_memory_mb().items())
    for gpu, memory in gpu_usage:
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    train_reg_loss = 0.0
    # Set the model to "train" mode.
    self._pytorch_model.train()

    # Get tqdm for the training batches
    batch_group_generator = common_util.lazy_groups_of(
        iter(self.data_loader), self._num_gradient_accumulation_steps)

    logger.info("Training")

    num_training_batches = math.ceil(
        len(self.data_loader) / self._num_gradient_accumulation_steps)

    # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
    # progress is shown
    batch_group_generator_tqdm = batch_group_generator
    if self._master:
        batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                               total=num_training_batches)

    self._last_log = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    done_early = False
    for batch_group in batch_group_generator_tqdm:
        if self._distributed:
            # Check whether the other workers have stopped already (due to differing amounts of
            # data in each). If so, we can't proceed because we would hang when we hit the
            # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor
            # here because NCCL process groups apparently don't support BoolTensor.
            done = torch.tensor(0, device=self.cuda_device)
            torch.distributed.all_reduce(done,
                                         torch.distributed.ReduceOp.SUM)
            if done.item() > 0:
                done_early = True
                logger.warning(
                    f"Worker {torch.distributed.get_rank()} finishing training early! "
                    "This implies that there is an imbalance in your training "
                    "data across the workers and that some amount of it will be "
                    "ignored. A small amount of this is fine, but a major imbalance "
                    "should be avoided. Note: This warning will appear unless your "
                    "data is perfectly balanced.")
                break

        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        batch_group_outputs = []
        for batch in batch_group:
            outputs = self.batch_outputs(batch, for_training=True)
            batch_group_outputs.append(outputs)
            raw_loss = outputs["loss"]
            raw_reg_loss = outputs["reg_loss"]
            if torch.isnan(raw_loss):
                raise ValueError("nan loss encountered")
            # Average over the accumulation group.
            loss = raw_loss / len(batch_group)
            reg_loss = raw_reg_loss / len(batch_group)
            if self._opt_level is None:
                loss.backward()
            else:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            train_loss += loss.item()
            train_reg_loss += reg_loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        # Get the magnitude of parameter updates for logging.  We need to do some
        # computation before and after the optimizer step, and it's expensive because of
        # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
        # we don't do this every batch, only when it's requested.
        track_updates = (
            self._tensorboard.should_log_histograms_this_batch()
            and self._master)
        param_updates = None
        if track_updates:
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
        self.optimizer.step()
        if track_updates:
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )

        # Progress bar, tensorboard and checkpoints are the master's job only,
        # as the other trainers wouldn't have a tqdm bar.
        if self._master:
            batch_group_generator_tqdm.set_description(
                training_util.description_from_metrics(metrics),
                refresh=False)
            self._tensorboard.log_batch(self.model, self.optimizer,
                                        batch_grad_norm, metrics,
                                        batch_group, param_updates)
            self._checkpointer.maybe_save_checkpoint(
                self, epoch, batches_this_epoch)

        for callback in self._batch_callbacks:
            callback(
                self,
                batch_group,
                batch_group_outputs,
                epoch,
                batches_this_epoch,
                is_training=True,
            )

    if self._distributed and not done_early:
        logger.warning(
            f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
        )
        # Indicate that we're done so that any workers that have remaining data stop the epoch early.
        done = torch.tensor(1, device=self.cuda_device)
        torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
        assert done.item()

    # Let all workers finish their epoch before computing
    # the final statistics for the epoch.
    if self._distributed:
        dist.barrier()

    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        train_reg_loss,
        batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=[self.cuda_device],
    )
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for gpu_num, memory in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Every training batch is turned into a retrieval problem first: for each
    instance one positive candidate and ``self.num_negative_samples`` randomly
    drawn negatives are assembled, and the slot of the positive candidate is
    written to the ``"labels"`` field. Candidates are texts when
    ``self.retrieve_text`` is set and images otherwise.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = list(gpu_memory_mb().items())
    for gpu, memory in gpu_usage:
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches
    train_generator = lazy_groups_of(
        self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle),
        num_gpus)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        # ---- build the candidate sets (one positive + negatives) ----
        first_batch = batch_group[0]
        images = []
        text = []
        segment_ids = []
        labels = []
        for i in range(len(first_batch['images'])):
            # randint is inclusive, so any candidate slot can hold the positive.
            positive_index = random.randint(0, self.num_negative_samples)
            labels.append(positive_index)

            if self.retrieve_text:
                for j in range(self.num_negative_samples + 1):
                    if j != positive_index:
                        negative_sample_index = random.choice(
                            self.train_indices)
                        text_field = TextField(
                            self.train_text_db[negative_sample_index],
                            self.train_token_indexers)
                        text_field.index(self.model.vocab)
                        padding_lengths = text_field.get_padding_lengths()
                        negative_tokens = text_field.as_tensor(
                            padding_lengths=padding_lengths)['tokens']
                        text.append(negative_tokens.tolist())
                        segment_ids.append(self.train_segment_ids_db[
                            negative_sample_index].tolist())
                    else:
                        text.append(
                            first_batch['token_ids']['tokens'][i, :].tolist())
                        segment_ids.append(
                            first_batch['segment_ids'][i].tolist())
            else:
                images += [
                    np.expand_dims(first_batch['images'][i].numpy(), 0)
                    if j == positive_index else np.expand_dims(
                        random.choice(self.train_image_db), 0)
                    for j in range(self.num_negative_samples + 1)
                ]

        matching_label_field_name = "labels"
        if not self.retrieve_text:
            first_batch['images'] = torch.from_numpy(np.vstack(images))
        else:
            # Zero-pad all candidates to the longest token sequence.
            max_text_len = max([len(sequence) for sequence in text])
            padded_text = []
            for sequence in text:
                padding = [0 for _ in range(max_text_len - len(sequence))]
                padded_text.append(sequence + padding)
            text = padded_text
            first_batch['token_ids'] = {'tokens': torch.LongTensor(text)}
            padded_segment_ids = []
            for sequence in segment_ids:
                padding = [0 for _ in range(max_text_len - len(sequence))]
                padded_segment_ids.append(sequence + padding)
            segment_ids = padded_segment_ids
            first_batch['segment_ids'] = torch.from_numpy(
                np.array(segment_ids, dtype=np.int64))
        first_batch[matching_label_field_name] = torch.from_numpy(
            np.array(labels, dtype=np.int64))

        # ---- forward / backward ----
        loss = self.batch_loss(batch_group, for_training=True)

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()

        train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if not self._tensorboard.should_log_histograms_this_batch():
            self.optimizer.step()
        else:
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                delta = param_updates[name]
                delta.sub_(param.detach().cpu())
                update_norm = torch.norm(delta.view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        train_generator_tqdm.set_description(
            training_util.description_from_metrics(metrics), refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            epoch_metrics = {}
            for k, v in metrics.items():
                epoch_metrics["epoch_metrics/" + k] = v
            self._tensorboard.log_metrics(epoch_metrics)

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model,
                                             histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = 0
            for batch in batch_group:
                cur_batch += training_util.get_batch_size(batch)
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size",
                                                   average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            checkpoint_name = '{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time)))
            self._save_checkpoint(checkpoint_name)

    metrics = training_util.get_metrics(self.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for gpu_num, memory in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    When ``self._early_stopping_by_batch`` is set, the validation set is evaluated
    every 10 global batches inside the training loop, the metric tracker is
    updated, and a checkpoint named after the global batch number is saved
    whenever the validation metric is the best so far. Registered callbacks get
    ``on_batch_end`` after every batch.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = list(gpu_memory_mb().items())
    for gpu, memory in gpu_usage:
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches
    train_generator = lazy_groups_of(
        self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle),
        num_gpus)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    for batch_group in train_generator_tqdm:
        # Re-enter train mode on every batch: this loop calls
        # `self._validation_loss()` further down.
        # presumably that call leaves the model in eval mode - verify against its definition.
        self.model.train()
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        loss = self.batch_loss(batch_group, for_training=True)

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()

        train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if not self._tensorboard.should_log_histograms_this_batch():
            self.optimizer.step()
        else:
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                delta = param_updates[name]
                delta.sub_(param.detach().cpu())
                update_norm = torch.norm(delta.view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        train_generator_tqdm.set_description(
            training_util.description_from_metrics(metrics), refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            epoch_metrics = {}
            for k, v in metrics.items():
                epoch_metrics["epoch_metrics/" + k] = v
            self._tensorboard.log_metrics(epoch_metrics)

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model,
                                             histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = 0
            for batch in batch_group:
                cur_batch += training_util.get_batch_size(batch)
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size",
                                                   average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            checkpoint_name = '{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time)))
            self._save_checkpoint(checkpoint_name)

        # Periodic in-epoch validation used for early stopping by batch.
        if (self._early_stopping_by_batch
                and self._batch_num_total % 10 == 0
                and self._validation_data is not None):
            with torch.no_grad():
                # We have a validation set, so compute all the metrics on it.
                val_loss, num_batches = self._validation_loss()
                val_metrics = training_util.get_metrics(self.model,
                                                        val_loss,
                                                        num_batches,
                                                        reset=True)

                # Check validation metric for early stopping
                this_epoch_val_metric = val_metrics[self._validation_metric]
                self._metric_tracker.add_metric(this_epoch_val_metric)

                if self._metric_tracker.is_best_so_far():
                    metrics['best_batch'] = self._batch_num_total
                    for key, value in val_metrics.items():
                        metrics["best_validation_" + key] = value

                    self._metric_tracker.best_epoch_metrics = val_metrics
                    self._save_checkpoint(self._batch_num_total)

        if self.callbacks is not None:
            for callback in self.callbacks:
                callback.on_batch_end(self._batch_num_total)

    metrics = training_util.get_metrics(self.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for gpu_num, memory in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Batches from ``self.data_loader`` are consumed in groups of
    ``self._num_gradient_accumulation_steps``. For every group the gradients
    are zeroed, each batch is run through ``self.batch_outputs`` under
    autocast, its loss is divided by the group size and handed to
    ``self.model_engine``, and the optimizer (through ``self._scaler`` when
    one is set) is stepped. Running metrics are recomputed after every group;
    on the master process they also drive the progress bar, tensorboard
    logging and checkpointing.

    Parameters
    ----------
    epoch : int
        Index of the epoch being trained. Used for the log line and passed to
        the checkpointer and to the batch callbacks.

    Returns
    -------
    Dict[str, float]
        The result of ``training_util.get_metrics(..., reset=True)`` for the
        whole epoch, extended with ``worker_<id>_memory_MB`` and
        ``gpu_<id>_memory_MB`` entries sampled at the start of the epoch.

    Raises
    ------
    ValueError
        If the loss of any batch is NaN.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)

    # Sample memory usage once, up front; the values are attached to the
    # returned metrics at the end of the epoch.
    cpu_memory_usage = []
    for worker, memory in common_util.peak_memory_mb().items():
        cpu_memory_usage.append((worker, memory))
        logger.info(f"Worker {worker} memory usage MB: {memory}")
    gpu_memory_usage = []
    for gpu, memory in common_util.gpu_memory_mb().items():
        gpu_memory_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    regularization_penalty = self.model.get_regularization_penalty()

    train_loss = 0.0
    batch_loss = 0.0

    # Regularization losses are only tracked when the model reports a
    # regularization penalty; otherwise they stay None.
    if regularization_penalty is not None:
        train_reg_loss = 0.0
        batch_reg_loss = 0.0
    else:
        train_reg_loss = None
        batch_reg_loss = None

    # Set the model to "train" mode.
    self.model_engine.train()

    # Get tqdm for the training batches
    batch_generator = iter(self.data_loader)
    batch_group_generator = common_util.lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps)

    logger.info("Training")

    # A data loader without a usable ``len`` raises TypeError; in that case
    # the progress bar total is left unbounded.
    num_training_batches: Union[int, float]
    try:
        len_data_loader = len(self.data_loader)
        num_training_batches = math.ceil(
            len_data_loader / self._num_gradient_accumulation_steps)
    except TypeError:
        num_training_batches = float("inf")

    # Having multiple tqdm bars in case of distributed training will be a mess.
    # Hence only the master's progress is shown
    batch_group_generator_tqdm = batch_group_generator
    if self._master:
        batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                               total=num_training_batches)

    self._last_log = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    for batch_group in batch_group_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        batch_group_outputs = []
        for batch in batch_group:
            with amp.autocast(self._use_amp):
                batch_outputs = self.batch_outputs(batch, for_training=True)
                batch_group_outputs.append(batch_outputs)
                loss = batch_outputs.get("loss")
                reg_loss = batch_outputs.get("reg_loss")
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                # Scale by the group size so the accumulated loss is the
                # mean over the batches of this group.
                loss = loss / len(batch_group)

                batch_loss = loss.item()
                train_loss += batch_loss
                if reg_loss is not None:
                    reg_loss = reg_loss / len(batch_group)
                    batch_reg_loss = reg_loss.item()
                    train_reg_loss += batch_reg_loss

            # NOTE(review): the original formatting was flattened, so the
            # nesting of these two calls is reconstructed; they are assumed
            # to run once per batch -- confirm.
            self.model_engine.backward(loss)
            self.model_engine.step()

        param_updates = None
        if self._tensorboard.should_log_histograms_this_batch(
        ) and self._master:
            # Get the magnitude of parameter updates for logging.  We need to do some
            # computation before and after the optimizer step, and it's expensive because of
            # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
            # we don't do this every batch, only when it's requested.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }

        # NOTE(review): the optimizer is stepped here in addition to
        # ``self.model_engine.step()`` above -- confirm both are intended.
        if self._scaler is not None:
            self._scaler.step(self.optimizer)
            self._scaler.update()
        else:
            self.optimizer.step()

        if param_updates is not None:
            # Turn the pre-step snapshot into (old - new) per parameter.
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss,
            batch_reg_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        if self._master:
            # Updating tqdm only for the master as the trainers wouldn't have one
            description = training_util.description_from_metrics(metrics)
            batch_group_generator_tqdm.set_description(description,
                                                       refresh=False)
            self._tensorboard.log_batch(
                self.model,
                self.optimizer,
                0.,  # batch_grad_norm is not computed in this loop
                metrics,
                batch_group,
                param_updates,
            )

            self._checkpointer.maybe_save_checkpoint(
                self, epoch, batches_this_epoch)

        for callback in self._batch_callbacks:
            callback(
                self,
                batch_group,
                batch_group_outputs,
                epoch,
                batches_this_epoch,
                is_training=True,
                is_master=self._master,
            )

    # Final metrics for the whole epoch; ``reset=True`` is forwarded to
    # ``get_metrics`` and no per-batch losses are supplied.
    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        train_reg_loss,
        batch_loss=None,
        batch_reg_loss=None,
        num_batches=batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=self.cuda_device,
    )

    for (worker, memory) in cpu_memory_usage:
        metrics["worker_" + str(worker) + "_memory_MB"] = memory
    for (gpu_num, memory) in gpu_memory_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics