# Imports assumed by the snippets below; to_device and models are
# project-level helpers not shown here.
from collections import defaultdict

import numpy as np
import torch
import torch.nn as nn
from addict import Dict  # attribute-access dict used throughout


def validate_epoch(self):
    all_losses_and_metrics = defaultdict(list)
    self.metrics.reset()
    self.model.eval()
    for model_input in self.progress(self.validate_loader, tag="validation"):
        with torch.no_grad():
            model_input = to_device(model_input, self.device)
            model_output = Dict(self.model(model_input))
            losses = self.loss(model_input, model_output)
            self.metrics.update(model_input, model_output)
            all_losses_and_metrics["loss"].append(losses.loss.item())
            for key in losses.unweighted_losses:
                all_losses_and_metrics[key].append(
                    losses.unweighted_losses[key].item())
    # Compute mean for all losses
    all_losses_and_metrics = Dict(
        {key: np.mean(val) for key, val in all_losses_and_metrics.items()})
    all_losses_and_metrics.update(Dict(self.metrics.evaluate()))
    self.log_validation_losses_and_metrics(all_losses_and_metrics)
    # Store the validation loss in cache. This will be used for checkpointing.
    self.write_to_cache("current_validation_metrics", all_losses_and_metrics)
    self.write_to_cache("current_validation_loss", all_losses_and_metrics.loss)
    return all_losses_and_metrics
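# validate_epoch assumes a metrics object exposing reset(), update(), and
# evaluate(). The actual Metrics class is not shown here; the following is a
# minimal hypothetical sketch of that interface with accuracy as a stand-in
# metric, and the logits/targets attribute names are assumptions, not the
# real batch schema.
class AccuracyMetric:
    def __init__(self):
        self.num_correct = 0
        self.num_samples = 0

    def reset(self):
        # Called once at the start of validate_epoch.
        self.num_correct = 0
        self.num_samples = 0

    def update(self, model_input, model_output):
        # Called once per validation batch to accumulate sufficient statistics.
        predictions = model_output.logits.argmax(dim=-1)
        self.num_correct += (predictions == model_input.targets).sum().item()
        self.num_samples += predictions.shape[0]

    def evaluate(self):
        # Called once at the end; the returned dict is merged into
        # all_losses_and_metrics.
        return {"accuracy": self.num_correct / max(self.num_samples, 1)}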
# A variant of validate_epoch that resolves a configurable early-stopping
# metric and caches it for checkpointing (metric tracking via self.metrics
# is not used here).
def validate_epoch(self):
    all_losses_and_metrics = defaultdict(list)
    self.model.eval()
    for model_input in self.progress(self.validate_loader, tag="validation"):
        with torch.no_grad():
            model_input = to_device(model_input, self.device)
            model_output = Dict(self.model(model_input))
            losses = self.loss(model_input, model_output)
            all_losses_and_metrics["loss"].append(losses.loss.item())
            for key in losses.unweighted_losses:
                all_losses_and_metrics[key].append(
                    losses.unweighted_losses[key].item())
    # Compute mean for all losses
    all_losses_and_metrics = Dict(
        {key: np.mean(val) for key, val in all_losses_and_metrics.items()})
    self.log_validation_losses_and_metrics(all_losses_and_metrics)
    early_stopping_metric = all_losses_and_metrics[self.get(
        "training/checkpoint/early_stopping_metric", "loss")]
    # Reading a missing key from an attribute-access Dict yields an empty
    # Dict rather than raising, hence the explicit check.
    assert early_stopping_metric != {}, (
        f"Could not fetch metric at key: "
        f"{self.get('training/checkpoint/early_stopping_metric', 'loss')}. "
        f"Possible keys are: {all_losses_and_metrics.keys()}")
    # Store the validation loss in cache. This will be used for checkpointing.
    self.write_to_cache("current_validation_metrics", all_losses_and_metrics)
    self.write_to_cache("current_validation_loss", all_losses_and_metrics.loss)
    self.write_to_cache("current_early_stopping_metric", early_stopping_metric)
    return all_losses_and_metrics
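# The cached early-stopping metric is presumably consumed by checkpointing
# logic that is not shown. Below is a hypothetical sketch of such a consumer,
# assuming the metric is to be minimized, that read_from_cache accepts a
# default value, and that self.checkpoint_path points at the weights file;
# none of these names are confirmed by the snippets above.
def checkpoint_if_improved(self):
    current = self.read_from_cache("current_early_stopping_metric")
    best = self.read_from_cache("best_early_stopping_metric", float("inf"))
    if current < best:
        # New best value: remember it and persist the weights. A metric that
        # should be maximized would need a reversed comparison.
        self.write_to_cache("best_early_stopping_metric", current)
        torch.save(self.model.state_dict(), self.checkpoint_path)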
def train_epoch(self):
    self.clear_moving_averages()
    self.model.train()
    for model_input in self.progress(self.train_loader, tag="train"):
        # Evaluate model
        model_input = to_device(model_input, self.device)
        model_output = Dict(self.model(model_input))
        # Compute loss
        losses = self.loss(model_input, model_output)
        loss = losses.loss
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        # Log to wandb (if required)
        self.log_training_losses(losses)
        self.log_learning_rates()
        # Log to pbar
        self.accumulate_in_cache("moving_loss", loss.item(),
                                 momentum_accumulator(0.9))
        self.log_progress(
            "train",
            loss=self.read_from_cache("moving_loss"),
        )
        self.next_step()
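# momentum_accumulator is used with accumulate_in_cache to keep an
# exponential moving average of the loss for the progress bar. Its
# implementation is not shown above; this is a minimal sketch consistent
# with that usage, assuming the cache calls the accumulator as
# accumulator(previous_value, new_value).
def momentum_accumulator(momentum):
    def accumulator(previous_value, new_value):
        # Exponential moving average: keep most of the old value and nudge
        # it towards the new one.
        return momentum * previous_value + (1 - momentum) * new_value
    return accumulator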
def _build_model(self):
    self.model: nn.Module = to_device(
        ContactTracingTransformer(**self.get("model/kwargs", {})),
        self.device)
# A variant of _build_model that looks the model class up by name in the
# models module, so the architecture can be chosen from the config.
def _build_model(self):
    model_cls = getattr(
        models, self.get("model/name", "ContactTracingTransformer"))
    self.model: nn.Module = to_device(
        model_cls(**self.get("model/kwargs", {})), self.device)
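# With the getattr-based lookup, swapping architectures becomes a config
# change rather than a code change. A hypothetical config slice, written as
# a Python dict (the slash-separated paths in self.get resolve into nested
# keys; the exact kwargs depend on the chosen model class):
config = {
    "model": {
        "name": "ContactTracingTransformer",
        "kwargs": {
            # Forwarded verbatim as model_cls(**self.get("model/kwargs", {})).
        },
    },
}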
# A variant of train_epoch with data echoing (replaying recent batches from
# a buffer) and optional mid-epoch yields.
def train_epoch(self):
    self.clear_moving_averages()
    self.model.train()
    batch_count = 0
    for model_input in self.progress(self.train_loader, tag="train"):
        # Push to echo buffer
        if self.echo_data:
            self.push_to_echo_buffer(model_input)
        if self.echo_data:
            self.optim.zero_grad()
            # First, train with fresh data
            model_input = to_device(model_input, self.device)
            model_output = Dict(self.model(model_input))
            # Compute loss; this is the loss that will be reported to
            # the logger.
            losses = self.loss(model_input, model_output)
            loss = losses.loss
            loss.backward()
            if self.get("training/echo/step_on_echo", False):
                self.optim.step()
            for echo_idx in range(
                    self.get("training/echo/num_echoes", ensure_exists=True)):
                echoed_model_input = self.fetch_from_echo_buffer()
                if echoed_model_input is None:
                    # This can happen when there are not enough samples in
                    # the echo buffer.
                    break
                if self.get("training/echo/step_on_echo", False):
                    self.optim.zero_grad()
                echoed_model_input = to_device(echoed_model_input, self.device)
                echoed_model_output = Dict(self.model(echoed_model_input))
                echoed_losses = self.loss(echoed_model_input,
                                          echoed_model_output)
                echoed_loss = echoed_losses.loss
                echoed_loss.backward()
                if self.get("training/echo/step_on_echo", False):
                    self.optim.step()
            # If we haven't stepped already, it's time
            if not self.get("training/echo/step_on_echo", False):
                self.optim.step()
        else:
            self.optim.zero_grad()
            # Evaluate model
            model_input = to_device(model_input, self.device)
            model_output = Dict(self.model(model_input))
            # Compute loss
            losses = self.loss(model_input, model_output)
            loss = losses.loss
            loss.backward()
            self.optim.step()
        # Log to wandb (if required)
        self.log_training_losses(losses)
        self.log_learning_rates()
        # Log to pbar
        self.accumulate_in_cache("moving_loss", loss.item(),
                                 momentum_accumulator(0.9))
        self.log_progress(
            "train",
            loss=self.read_from_cache("moving_loss"),
        )
        yield_condition = (
            self.get("training/break_epoch_every") is not None
            and batch_count > 0
            and (batch_count % self.get("training/break_epoch_every") == 0))
        if yield_condition:
            yield
        batch_count += 1
        self.next_step()
    yield
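# The variant above relies on push_to_echo_buffer / fetch_from_echo_buffer,
# which are not shown; replaying recent batches like this resembles data
# echoing. A minimal hypothetical sketch of such a buffer follows; the
# buffer_size and min_buffer_size config keys are assumptions, not the
# actual schema. Note also that train_epoch is now a generator, so the
# caller is expected to drive it, e.g. to interleave validation at the
# yield points.
import random
from collections import deque


class EchoBufferMixin:
    def _ensure_echo_buffer(self):
        if not hasattr(self, "_echo_buffer"):
            # Bounded buffer: old batches fall out as new ones are pushed.
            self._echo_buffer = deque(
                maxlen=self.get("training/echo/buffer_size", 1000))

    def push_to_echo_buffer(self, model_input):
        self._ensure_echo_buffer()
        self._echo_buffer.append(model_input)

    def fetch_from_echo_buffer(self):
        self._ensure_echo_buffer()
        min_fill = self.get("training/echo/min_buffer_size", 1)
        if len(self._echo_buffer) < min_fill:
            # Not enough samples yet; train_epoch breaks out of its echo
            # loop when this returns None.
            return None
        return random.choice(self._echo_buffer)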