Example #1
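The snippets below are methods of a PyTorch trainer class built around a nested configuration (note the slash-separated `self.get(...)` paths). They lean on a few imports and project helpers that the page does not show; the following is a sketch of a plausible set, where the module paths for `to_device`, `momentum_accumulator`, and `models` are assumptions, and `Dict` is assumed to behave like `addict.Dict` (attribute-style access on nested dicts):

 from collections import defaultdict
 import numpy as np
 import torch
 import torch.nn as nn
 from addict import Dict
 # Project helpers; these module paths are assumptions:
 from utils import to_device, momentum_accumulator
 import models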
 def validate_epoch(self):
     all_losses_and_metrics = defaultdict(list)
     self.metrics.reset()
     self.model.eval()
     for model_input in self.progress(self.validate_loader,
                                      tag="validation"):
         with torch.no_grad():
             model_input = to_device(model_input, self.device)
             model_output = Dict(self.model(model_input))
             losses = self.loss(model_input, model_output)
             self.metrics.update(model_input, model_output)
             all_losses_and_metrics["loss"].append(losses.loss.item())
             for key in losses.unweighted_losses:
                 all_losses_and_metrics[key].append(
                     losses.unweighted_losses[key].item())
     # Compute mean for all losses
     all_losses_and_metrics = Dict(
         {key: np.mean(val)
          for key, val in all_losses_and_metrics.items()})
     all_losses_and_metrics.update(Dict(self.metrics.evaluate()))
     self.log_validation_losses_and_metrics(all_losses_and_metrics)
     # Store the validation loss in cache. This will be used for checkpointing.
     self.write_to_cache("current_validation_metrics",
                         all_losses_and_metrics)
     self.write_to_cache("current_validation_loss",
                         all_losses_and_metrics.loss)
     return all_losses_and_metrics
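`to_device` is one of those project helpers; since `model_input` is a (possibly nested) batch container, a minimal sketch of what such a helper typically does is below. The recursive handling of dicts, lists, and tuples is an assumption beyond what the snippet shows:

 def to_device(obj, device):
     # Recursively move any tensors inside nested dicts/lists/tuples to
     # `device`; return non-tensor leaves unchanged.
     if torch.is_tensor(obj):
         return obj.to(device)
     if isinstance(obj, dict):
         return type(obj)({k: to_device(v, device) for k, v in obj.items()})
     if isinstance(obj, (list, tuple)):
         return type(obj)(to_device(v, device) for v in obj)
     return obj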
Example #2
 def validate_epoch(self):
     all_losses_and_metrics = defaultdict(list)
     self.model.eval()
     for model_input in self.progress(self.validate_loader,
                                      tag="validation"):
         with torch.no_grad():
             model_input = to_device(model_input, self.device)
             model_output = Dict(self.model(model_input))
             losses = self.loss(model_input, model_output)
             all_losses_and_metrics["loss"].append(losses.loss.item())
             for key in losses.unweighted_losses:
                 all_losses_and_metrics[key].append(
                     losses.unweighted_losses[key].item())
     # Compute mean for all losses
     all_losses_and_metrics = Dict(
         {key: np.mean(val)
          for key, val in all_losses_and_metrics.items()})
     self.log_validation_losses_and_metrics(all_losses_and_metrics)
     early_stopping_metric = all_losses_and_metrics[self.get(
         "training/checkpoint/early_stopping_metric", "loss")]
     assert early_stopping_metric != {}, (
         f"Could not fetch metric at key: "
         f"{self.get('training/checkpoint/early_stopping_metric', 'loss')}. "
         f"Possible keys are: {all_losses_and_metrics.keys()}")
     # Store the validation loss in cache. This will be used for checkpointing.
     self.write_to_cache("current_validation_metrics",
                         all_losses_and_metrics)
     self.write_to_cache("current_validation_loss",
                         all_losses_and_metrics.loss)
     self.write_to_cache("current_early_stopping_metric",
                         early_stopping_metric)
     return all_losses_and_metrics
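Example #2 differs from Example #1 mainly in reading the early-stopping metric name from the configuration via a slash-separated path. A minimal sketch of such a path-style `get`, assuming the config is a nested dict stored on the trainer (the `_config` attribute is an assumption; the `ensure_exists` keyword mirrors its use in Example #6):

 def get(self, path, default=None, ensure_exists=False):
     # Walk the nested config along "a/b/c"-style paths.
     node = self._config
     for key in path.split("/"):
         if not isinstance(node, dict) or key not in node:
             if ensure_exists:
                 raise KeyError(f"No config entry at: {path}")
             return default
         node = node[key]
     return node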
Example #3
 def train_epoch(self):
     self.clear_moving_averages()
     self.model.train()
     for model_input in self.progress(self.train_loader, tag="train"):
         # Evaluate model
         model_input = to_device(model_input, self.device)
         model_output = Dict(self.model(model_input))
         # Compute loss
         losses = self.loss(model_input, model_output)
         loss = losses.loss
         self.optim.zero_grad()
         loss.backward()
         self.optim.step()
         # Log to wandb (if required)
         self.log_training_losses(losses)
         self.log_learning_rates()
         # Log to pbar
         self.accumulate_in_cache("moving_loss", loss.item(),
                                  momentum_accumulator(0.9))
         self.log_progress(
             "train",
             loss=self.read_from_cache("moving_loss"),
         )
         self.next_step()
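`momentum_accumulator(0.9)` keeps the progress-bar loss as an exponential moving average rather than the raw per-batch value. A plausible sketch, assuming `accumulate_in_cache` calls the accumulator with the cached value and the new one:

 def momentum_accumulator(momentum):
     # Binary op for accumulate_in_cache:
     # new_cache = momentum * old_cache + (1 - momentum) * new_value
     def _accumulate(old_value, new_value):
         return momentum * old_value + (1 - momentum) * new_value
     return _accumulate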
Example #4
 def _build_model(self):
     self.model: nn.Module = to_device(
         ContactTracingTransformer(**self.get("model/kwargs", {})),
         self.device)
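The `model/kwargs` path implies a nested config file; purely as an illustration (the keys under `kwargs` are hypothetical, not taken from the project):

 config = {
     "model": {
         "name": "ContactTracingTransformer",
         "kwargs": {"capacity": 128, "num_heads": 4},  # hypothetical keys
     }
 }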
Example #5
 def _build_model(self):
     model_cls = getattr(
         models, self.get("model/name", "ContactTracingTransformer"))
     self.model: nn.Module = to_device(
         model_cls(**self.get("model/kwargs", {})), self.device)
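Example #5 generalizes Example #4: instead of hard-coding the class, it resolves it by name from the `models` module with `getattr`, turning the module into a simple registry. The pattern in isolation (a sketch; `build_model` is a hypothetical standalone wrapper):

 def build_model(name, kwargs, device):
     # Resolve e.g. "ContactTracingTransformer" to models.ContactTracingTransformer
     model_cls = getattr(models, name)
     return to_device(model_cls(**kwargs), device)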
Example #6
 def train_epoch(self):
     self.clear_moving_averages()
     self.model.train()
     batch_count = 0
     for model_input in self.progress(self.train_loader, tag="train"):
         if self.echo_data:
             # Push the fresh batch to the echo buffer, then train on it
             self.push_to_echo_buffer(model_input)
             self.optim.zero_grad()
             # First, train with fresh data
             model_input = to_device(model_input, self.device)
             model_output = Dict(self.model(model_input))
             # Compute loss; this is the loss that will be reported to
             # the logger.
             losses = self.loss(model_input, model_output)
             loss = losses.loss
             loss.backward()
             if self.get("training/echo/step_on_echo", False):
                 self.optim.step()
             for echo_idx in range(
                     self.get("training/echo/num_echoes",
                              ensure_exists=True)):
                 echoed_model_input = self.fetch_from_echo_buffer()
                 if echoed_model_input is None:
                     # This can happen when there are not enough samples in
                     # the echo buffer.
                     break
                 if self.get("training/echo/step_on_echo", False):
                     self.optim.zero_grad()
                 echoed_model_input = to_device(echoed_model_input,
                                                self.device)
                 echoed_model_output = Dict(self.model(echoed_model_input))
                 echoed_losses = self.loss(echoed_model_input,
                                           echoed_model_output)
                 echoed_loss = echoed_losses.loss
                 echoed_loss.backward()
                 if self.get("training/echo/step_on_echo", False):
                     self.optim.step()
             # If we haven't stepped already, it's time
             if not self.get("training/echo/step_on_echo", False):
                 self.optim.step()
         else:
             self.optim.zero_grad()
             # Evaluate model
             model_input = to_device(model_input, self.device)
             model_output = Dict(self.model(model_input))
             # Compute loss
             losses = self.loss(model_input, model_output)
             loss = losses.loss
             loss.backward()
             self.optim.step()
         # Log to wandb (if required)
         self.log_training_losses(losses)
         self.log_learning_rates()
         # Log to pbar
         self.accumulate_in_cache("moving_loss", loss.item(),
                                  momentum_accumulator(0.9))
         self.log_progress(
             "train",
             loss=self.read_from_cache("moving_loss"),
         )
         yield_condition = (
             self.get("training/break_epoch_every") is not None
             and batch_count > 0 and
             (batch_count % self.get("training/break_epoch_every") == 0))
         if yield_condition:
             yield
         batch_count += 1
         self.next_step()
     yield
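Because of the `yield` statements, this `train_epoch` is a generator: the caller iterates it, and each yield is a break point at which it can, say, run a validation pass before resuming the same epoch. The echo buffer itself (replaying recent batches) is not shown; a minimal sketch, assuming a bounded buffer with random replay (the method names match the snippet, the implementation is guessed):

 import random
 from collections import deque

 class EchoBufferMixin:
     ECHO_BUFFER_SIZE = 128  # assumed capacity

     def push_to_echo_buffer(self, model_input):
         if not hasattr(self, "_echo_buffer"):
             self._echo_buffer = deque(maxlen=self.ECHO_BUFFER_SIZE)
         self._echo_buffer.append(model_input)

     def fetch_from_echo_buffer(self):
         # Return None when no echo is available, matching the
         # `if echoed_model_input is None: break` check in Example #6.
         if not getattr(self, "_echo_buffer", None):
             return None
         return random.choice(self._echo_buffer)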