def send(self, signal: Signal) -> None:
    """
    AUTHORS:
    --------

    :author: Alix Leroy

    DESCRIPTION:
    ------------

    Send the signal to the appropriate receivers

    PARAMETERS:
    -----------

    :param signal: (Signal): A signal to send

    RETURN:
    -------

    :return: None
    """
    # Get the event and arguments from the signal to send
    event = signal.get_event()
    args = signal.get_arguments()

    # If the event is in the list, broadcast the signal to every connected receiver
    if event.get_index() in self.connections:
        for connection in self.connections[event.get_index()]:
            receiver = connection.get_receiver()
            expected_arguments = connection.get_expected_arguments()

            # If only some specific keys have to be kept, filter the arguments
            # for this connection without altering the original dictionary
            if expected_arguments is not None:
                kept_args = self.keep_arguments(
                    receiver=receiver,
                    expected_arguments=expected_arguments,
                    arguments=args)
            else:
                kept_args = args

            # Double call: the first pair of brackets dereferences the weak method
            # reference, the second calls the receiver itself
            receiver()(**kept_args)

    # Else display an error notification
    else:
        Notification(
            DEEP_NOTIF_ERROR,
            "The following event '%s' is not connected to any receiver."
            % str(event.get_description()))
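# --------------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the double call
# receiver()(**kept_args) above exists because receivers are stored as weak
# method references. The minimal, self-contained example below assumes nothing
# about Thalamus internals; the receiver class and its method are hypothetical
# and only demonstrate the weakref.WeakMethod pattern.
# --------------------------------------------------------------------------------
import weakref


class _ExampleReceiver(object):
    """Hypothetical receiver used only for this demonstration."""

    def on_epoch_end(self, epoch_index):
        print("Epoch %i finished" % epoch_index)


if __name__ == "__main__":
    instance = _ExampleReceiver()
    receiver = weakref.WeakMethod(instance.on_epoch_end)

    # The first call resolves the weak reference to the bound method (or None if
    # the instance has been garbage collected); the second call invokes it.
    if receiver() is not None:
        receiver()(epoch_index=1)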
def send_epoch_end_signal(self, **kwargs):
    # Complete the keyword arguments with the current training state
    kwargs["epoch_index"] = self.epoch
    kwargs["loss"] = self.train_loss
    kwargs["losses"] = self.train_losses
    kwargs["metrics"] = self.train_metrics
    Thalamus().add_signal(
        signal=Signal(event=DEEP_EVENT_EPOCH_END, args=kwargs))
def send_save_params(self, inp=None) -> None:
    """
    AUTHORS:
    --------

    :author: Samuel Westlake
    :author: Alix Leroy

    DESCRIPTION:
    ------------

    Send the saving parameters to the Saver

    PARAMETERS:
    -----------

    :param inp: The input size of the model (required for ONNX models)

    RETURN:
    -------

    :return: None
    """
    Thalamus().add_signal(
        Signal(event=DEEP_EVENT_SEND_SAVE_PARAMS_FROM_TRAINER,
               args={"model": self.model,
                     "optimizer": self.optimizer,
                     "epoch_index": self.epoch,
                     "validation_loss": self.validation_loss,
                     "inp": inp}))
def saving_required(self, saving_required: bool) -> None:
    """
    AUTHORS:
    --------

    :author: Alix Leroy

    DESCRIPTION:
    ------------

    Send a signal so that the model is saved if saving is required

    NB: Contains a signal, cannot be static

    PARAMETERS:
    -----------

    :param saving_required: (bool): Whether saving the model is required or not

    RETURN:
    -------

    :return: None
    """
    if saving_required is True:
        Thalamus().add_signal(
            signal=Signal(event=DEEP_EVENT_SAVE_MODEL, args={}))
def send_training_loss(self):
    Thalamus().add_signal(
        Signal(event=DEEP_EVENT_SEND_TRAINING_LOSS,
               args={DEEP_LOG_VALIDATION.var_name: self._loss_data[DEEP_LOG_VALIDATION.var_name]}))
def save_model(self):
    Thalamus().add_signal(
        signal=Signal(event=DEEP_EVENT_SAVE_MODEL, args={}))
def saving_required(self, saving_required: bool):
    """
    :param saving_required: (bool): Whether the model should be saved or not
    :return: None
    """
    if saving_required is True:
        Thalamus().add_signal(
            signal=Signal(event=DEEP_EVENT_SAVE_MODEL,
                          args={"model": self.model}))
def __compute_overwatch_metric(self,
                               num_minibatches_training,
                               running_total_loss,
                               running_losses,
                               running_metrics,
                               total_validation_loss,
                               result_validation_losses,
                               result_validation_metrics) -> None:
    # If the validation loss is None (no validation), take the metric from the training as overwatch metric
    if total_validation_loss is None:
        data = dict([(TOTAL_LOSS, running_total_loss / num_minibatches_training)]
                    + [(loss_name, value.item() / num_minibatches_training)
                       for (loss_name, value) in running_losses.items()]
                    + [(metric_name, value / num_minibatches_training)
                       for (metric_name, value) in running_metrics.items()])

        for key, value in data.items():
            if key == self.overwatch_metric.get_name():
                self.overwatch_metric.set_value(value)
                break
    else:
        data = dict([(TOTAL_LOSS, total_validation_loss)]
                    + [(loss_name, value.item())
                       for (loss_name, value) in result_validation_losses.items()]
                    + [(metric_name, value / num_minibatches_training)
                       for (metric_name, value) in result_validation_metrics.items()])

        for key, value in data.items():
            if key == self.overwatch_metric.get_name():
                self.overwatch_metric.set_value(value)
                break

    Thalamus().add_signal(
        Signal(event=DEEP_EVENT_OVERWATCH_METRIC_COMPUTED,
               args={"current_overwatch_metric": copy.deepcopy(self.overwatch_metric)}))
def send_validation_end_signal(**kwargs):
    Thalamus().add_signal(
        signal=Signal(event=DEEP_EVENT_VALIDATION_END, args=kwargs))

def send_training_end_signal(**kwargs):
    Thalamus().add_signal(
        signal=Signal(event=DEEP_EVENT_TRAINING_END, args=kwargs))

def send_batch_end_signal(**kwargs):
    Thalamus().add_signal(
        signal=Signal(event=DEEP_EVENT_BATCH_END, args=kwargs))

def send_batch_start_signal(**kwargs):
    Thalamus().add_signal(
        signal=Signal(event=DEEP_EVENT_BATCH_START, args=kwargs))
def __compute_overwatch_metric(self,
                               num_minibatches_training,
                               running_total_loss,
                               running_losses,
                               running_metrics,
                               total_validation_loss,
                               result_validation_losses,
                               result_validation_metrics) -> None:
    """
    AUTHORS:
    --------

    :author: Alix Leroy

    DESCRIPTION:
    ------------

    Compute the overwatch metric and send it to the Saver

    PARAMETERS:
    -----------

    :param num_minibatches_training: Number of mini-batches in one training epoch
    :param running_total_loss: Running sum of the total training loss over the epoch
    :param running_losses: Running sums of the individual training losses
    :param running_metrics: Running sums of the training metrics
    :param total_validation_loss: Total validation loss (None if there is no validation)
    :param result_validation_losses: Validation losses
    :param result_validation_metrics: Validation metrics

    RETURN:
    -------

    :return: None
    """
    # If the validation loss is None (no validation), take the metric from the training as overwatch metric
    if total_validation_loss is None:
        data = dict([(TOTAL_LOSS, running_total_loss / num_minibatches_training)]
                    + [(loss_name, value.item() / num_minibatches_training)
                       for (loss_name, value) in running_losses.items()]
                    + [(metric_name, value / num_minibatches_training)
                       for (metric_name, value) in running_metrics.items()])

        for key, value in data.items():
            if key == self.overwatch_metric.get_name():
                self.overwatch_metric.set_value(value)
                break
    else:
        data = dict([(TOTAL_LOSS, total_validation_loss)]
                    + [(loss_name, value.item())
                       for (loss_name, value) in result_validation_losses.items()]
                    + [(metric_name, value / num_minibatches_training)
                       for (metric_name, value) in result_validation_metrics.items()])

        for key, value in data.items():
            if key == self.overwatch_metric.get_name():
                self.overwatch_metric.set_value(value)
                break

    Thalamus().add_signal(
        Signal(event=DEEP_EVENT_OVERWATCH_METRIC_COMPUTED,
               args={"current_overwatch_metric": copy.deepcopy(self.overwatch_metric)}))
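# --------------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): how the overwatch value
# is picked once the per-epoch averages are assembled. The names below (the
# running sums and the "accuracy" metric) are hypothetical; only the dict
# construction and the name lookup mirror the logic of __compute_overwatch_metric
# above.
# --------------------------------------------------------------------------------
num_minibatches_training = 4
running_total_loss = 2.0
running_losses = {"cross_entropy": 2.0}      # summed over the epoch
running_metrics = {"accuracy": 3.2}          # summed over the epoch

data = dict(
    [("Total Loss", running_total_loss / num_minibatches_training)]
    + [(name, value / num_minibatches_training) for name, value in running_losses.items()]
    + [(name, value / num_minibatches_training) for name, value in running_metrics.items()])

overwatch_name = "accuracy"                  # name of the metric being watched
overwatch_value = data.get(overwatch_name)   # 0.8 with the numbers above
print(overwatch_value)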
def __train(self, first_training: bool = True) -> None:
    """
    AUTHORS:
    --------

    :author: Alix Leroy

    DESCRIPTION:
    ------------

    Loop over the dataset to train the network

    PARAMETERS:
    -----------

    :param first_training: (bool): Whether more epochs have been required after the initial training or not

    RETURN:
    -------

    :return: None
    """
    if first_training is True:
        Thalamus().add_signal(
            signal=Signal(event=DEEP_EVENT_ON_TRAINING_START, args={}))

    for self.epoch in range(self.initial_epoch + 1, self.num_epochs + 1):

        Thalamus().add_signal(
            signal=Signal(event=DEEP_EVENT_ON_EPOCH_START,
                          args={"epoch_index": self.epoch,
                                "num_epochs": self.num_epochs}))

        # Shuffle the data if required
        if self.shuffle_method is not None:
            self.dataset.shuffle(self.shuffle_method)

        # Put the model into train mode for the start of the epoch
        self.model.train()

        for minibatch_index, minibatch in enumerate(self.dataloader, 0):

            # Clean the given data
            inputs, labels, additional_data = self.clean_single_element_list(minibatch)

            # Zero the parameter gradients
            self.optimizer.zero_grad()

            # Move the data to the corresponding device
            inputs = self.to_device(inputs, self.model.device)
            labels = self.to_device(labels, self.model.device)
            additional_data = self.to_device(additional_data, self.model.device)

            # Infer the output of the batch
            try:
                outputs = self.model(*inputs)
            except RuntimeError as e:
                Notification(DEEP_NOTIF_FATAL, "RuntimeError : %s" % str(e))
            except TypeError as e:
                Notification(DEEP_NOTIF_FATAL, "TypeError : %s" % str(e))

            # Compute losses and metrics
            result_losses = self.compute_metrics(self.losses, inputs, outputs, labels, additional_data)
            result_metrics = self.compute_metrics(self.metrics, inputs, outputs, labels, additional_data)

            # Add weights to the losses
            result_losses = dict_utils.apply_weight(result_losses, vars(self.losses))

            # Sum the results of the losses
            total_loss = sum_dict(result_losses)

            # Accumulate the gradient (by addition) for each parameter
            total_loss.backward()

            # Perform a parameter update based on the current gradient (stored in the .grad attribute
            # of each parameter) and the update rule
            self.optimizer.step()

            # Detach the tensors from the network
            outputs, total_loss, result_losses, result_metrics = self.detach(
                outputs=outputs,
                total_loss=total_loss,
                result_losses=result_losses,
                result_metrics=result_metrics)

            # Send batch end signal
            Thalamus().add_signal(
                Signal(event=DEEP_EVENT_ON_BATCH_END,
                       args={"minibatch_index": minibatch_index + 1,
                             "num_minibatches": self.num_minibatches,
                             "epoch_index": self.epoch,
                             "total_loss": total_loss.item(),
                             "result_losses": result_losses,
                             "result_metrics": result_metrics}))

        # Reset the dataset (transforms cache)
        self.dataset.reset()

        # Evaluate the model
        self.validation_loss, result_validation_losses, result_validation_metrics = self.__evaluate_epoch()

        if self.tester is not None:
            num_minibatches_validation = self.tester.get_num_minibatches()
        else:
            num_minibatches_validation = None

        # Send epoch end signal
        Thalamus().add_signal(
            Signal(event=DEEP_EVENT_ON_EPOCH_END,
                   args={"epoch_index": self.epoch,
                         "num_epochs": self.num_epochs,
                         "model": weakref.ref(self.model),
                         "num_minibatches": self.num_minibatches,
                         "total_validation_loss": self.validation_loss,
                         "result_validation_losses": result_validation_losses,
                         "result_validation_metrics": result_validation_metrics,
                         "num_minibatches_validation": num_minibatches_validation}))

    # Send training end signal
    Thalamus().add_signal(
        Signal(event=DEEP_EVENT_ON_TRAINING_END, args={"model": self.model}))
def is_saving_required(self, current_overwatch_metric: OverWatchMetric) -> bool:
    """
    AUTHORS:
    --------

    :author: Alix Leroy

    DESCRIPTION:
    ------------

    Check whether saving the model is required

    PARAMETERS:
    -----------

    :param current_overwatch_metric: (OverWatchMetric): The new value of the metric to overwatch

    RETURN:
    -------

    :return: (bool): Whether the model should be saved or not
    """
    save = False

    # Do not save at the first epoch
    if self.best_overwatch_metric is None:
        self.best_overwatch_metric = current_overwatch_metric
        save = False

    # If the new metric has to be smaller than the best one
    if current_overwatch_metric.get_condition() == DEEP_COMPARE_SMALLER:
        # If the model improved since the last epoch => save
        if self.best_overwatch_metric.get_value() > current_overwatch_metric.get_value():
            self.best_overwatch_metric = current_overwatch_metric
            save = True
        # No improvement => do not save
        else:
            save = False

    # If the new metric has to be bigger than the best one (e.g. the accuracy of a classification)
    elif current_overwatch_metric.get_condition() == DEEP_COMPARE_BIGGER:
        # If the model improved since the last epoch => save
        if self.best_overwatch_metric.get_value() < current_overwatch_metric.get_value():
            self.best_overwatch_metric = current_overwatch_metric
            save = True
        # No improvement => do not save
        else:
            save = False

    else:
        Notification(
            DEEP_NOTIF_FATAL,
            "The following saving condition does not exist : %s"
            % str(current_overwatch_metric.get_condition()))

    Thalamus().add_signal(
        signal=Signal(event=DEEP_EVENT_SAVING_REQUIRED,
                      args={"saving_required": save}))
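# --------------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the smaller/bigger
# comparison that drives is_saving_required, written as plain Python. The
# condition strings and the best_value bookkeeping are hypothetical stand-ins for
# DEEP_COMPARE_SMALLER / DEEP_COMPARE_BIGGER and OverWatchMetric.
# --------------------------------------------------------------------------------
def should_save(best_value, current_value, condition):
    """Return True when the watched metric improved under the given condition."""
    if best_value is None:          # first epoch: remember the value, do not save
        return False
    if condition == "smaller":      # e.g. a loss: lower is better
        return current_value < best_value
    if condition == "bigger":       # e.g. an accuracy: higher is better
        return current_value > best_value
    raise ValueError("Unknown saving condition: %s" % condition)


print(should_save(0.52, 0.47, "smaller"))   # True  -> the loss improved
print(should_save(0.91, 0.88, "bigger"))    # False -> the accuracy got worse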
def on_batch_end(self,
                 minibatch_index: int,
                 num_minibatches: int,
                 epoch_index: int,
                 total_loss: float,
                 result_losses: dict,
                 result_metrics: dict):
    """
    AUTHORS:
    --------

    :author: Alix Leroy

    DESCRIPTION:
    ------------

    Called at the end of every batch

    PARAMETERS:
    -----------

    :param minibatch_index: (int): Index of the current minibatch
    :param num_minibatches: (int): Number of minibatches per epoch
    :param epoch_index: (int): Index of the current epoch
    :param total_loss: (float): The total loss of the batch
    :param result_losses: (dict): Dictionary of the resulting losses
    :param result_metrics: (dict): Dictionary of the resulting metrics

    RETURN:
    -------

    :return: None
    """
    # Accumulate the running metrics
    self.running_total_loss = self.running_total_loss + total_loss
    self.running_losses = merge_sum_dict(self.running_losses, result_losses)
    self.running_metrics = merge_sum_dict(self.running_metrics, result_metrics)

    # If the user wants to print stats for each batch
    if DEEP_VERBOSE_BATCH.corresponds(self.verbose):
        # Print the training loss and metrics on batch end
        Thalamus().add_signal(
            Signal(event=DEEP_EVENT_PRINT_TRAINING_BATCH_END,
                   args={"losses": result_losses,
                         "total_loss": total_loss,
                         "metrics": result_metrics,
                         "num_minibatches": num_minibatches,
                         "minibatch_index": minibatch_index}))

    # Save the data in memory
    if DEEP_MEMORIZE_BATCHES.corresponds(self.memorize):
        # Save the history in memory
        data = [datetime.datetime.now().strftime(TIME_FORMAT),
                self.__time(),
                epoch_index,
                minibatch_index,
                total_loss] \
            + [value.item() for (loss_name, value) in result_losses.items()] \
            + [value for (metric_name, value) in result_metrics.items()]
        self.train_batches_history.put(data)

        # Save the history every 10 batches
        if self.train_batches_history.qsize() > 10:
            self.save(only_batches=True)
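# --------------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the running losses above
# rely on merge_sum_dict to accumulate per-batch values. The helper below is an
# assumed, simplified re-implementation written only to show the expected
# behaviour; it is not the actual Deeplodocus dict_utils helper.
# --------------------------------------------------------------------------------
def merge_sum_dict_sketch(accumulated: dict, batch_result: dict) -> dict:
    """Sum values key by key, keeping keys that appear in only one dict."""
    merged = dict(accumulated)
    for key, value in batch_result.items():
        merged[key] = merged.get(key, 0) + value
    return merged


running = {}
running = merge_sum_dict_sketch(running, {"cross_entropy": 0.9, "l2": 0.1})
running = merge_sum_dict_sketch(running, {"cross_entropy": 0.7, "l2": 0.3})
print(running)   # approximately {'cross_entropy': 1.6, 'l2': 0.4}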
def on_epoch_end(self,
                 epoch_index: int,
                 num_epochs: int,
                 num_minibatches: int,
                 total_validation_loss: float,
                 result_validation_losses: dict,
                 result_validation_metrics: dict,
                 num_minibatches_validation: int):
    """
    AUTHORS:
    --------

    :author: Alix Leroy
    :author: Samuel Westlake

    DESCRIPTION:
    ------------

    Manage the history at the end of each epoch

    PARAMETERS:
    -----------

    :param epoch_index: (int): Current epoch index
    :param num_epochs: (int): Total number of epochs
    :param num_minibatches: (int): Number of minibatches per epoch
    :param total_validation_loss: Total validation loss (None if there is no validation)
    :param result_validation_losses: (dict): Validation losses
    :param result_validation_metrics: (dict): Validation metrics
    :param num_minibatches_validation: (int): Number of minibatches per validation epoch

    RETURN:
    -------

    :return: None
    """
    # MANAGE TRAINING HISTORY
    if DEEP_VERBOSE_EPOCH.corresponds(self.verbose) or DEEP_VERBOSE_BATCH.corresponds(self.verbose):
        # Print the training loss and metrics on epoch end
        Thalamus().add_signal(
            Signal(event=DEEP_EVENT_PRINT_TRAINING_EPOCH_END,
                   args={"losses": {key: value / num_minibatches
                                    for key, value in self.running_losses.items()},
                         "total_loss": self.running_total_loss / num_minibatches,
                         "metrics": {key: value / num_minibatches
                                     for key, value in self.running_metrics.items()}}))

    # If recording on batch or epoch
    if DEEP_MEMORIZE_BATCHES.corresponds(self.memorize) or DEEP_MEMORIZE_EPOCHS.corresponds(self.memorize):
        data = [datetime.datetime.now().strftime(TIME_FORMAT),
                self.__time(),
                epoch_index,
                self.running_total_loss / num_minibatches] \
            + [value.item() / num_minibatches for (loss_name, value) in self.running_losses.items()] \
            + [value / num_minibatches for (metric_name, value) in self.running_metrics.items()]
        self.train_epochs_history.put(data)

    self.running_total_loss = 0
    self.running_losses = {}
    self.running_metrics = {}

    # MANAGE VALIDATION HISTORY
    if total_validation_loss is not None:
        if DEEP_VERBOSE_EPOCH.corresponds(self.verbose) or DEEP_VERBOSE_BATCH.corresponds(self.verbose):
            # Print the validation loss and metrics on epoch end
            Thalamus().add_signal(
                Signal(event=DEEP_EVENT_PRINT_VALIDATION_EPOCH_END,
                       args={"losses": result_validation_losses,
                             "total_loss": total_validation_loss,
                             "metrics": result_validation_metrics}))

        if DEEP_MEMORIZE_BATCHES.corresponds(self.memorize) or DEEP_MEMORIZE_EPOCHS.corresponds(self.memorize):
            data = [datetime.datetime.now().strftime(TIME_FORMAT),
                    self.__time(),
                    epoch_index,
                    total_validation_loss] \
                + [value.item() for (loss_name, value) in result_validation_losses.items()] \
                + [value for (metric_name, value) in result_validation_metrics.items()]
            self.validation_history.put(data)

    if DEEP_SAVE_SIGNAL_AUTO.corresponds(self.save_signal):
        self.__compute_overwatch_metric(
            num_minibatches_training=num_minibatches,
            running_total_loss=self.running_total_loss,
            running_losses=self.running_losses,
            running_metrics=self.running_metrics,
            total_validation_loss=total_validation_loss,
            result_validation_losses=result_validation_losses,
            result_validation_metrics=result_validation_metrics)
    elif DEEP_SAVE_SIGNAL_END_EPOCH.corresponds(self.save_signal):
        Thalamus().add_signal(Signal(event=DEEP_EVENT_SAVE_MODEL, args={}))

    Notification(DEEP_NOTIF_SUCCESS, EPOCH_END % (epoch_index, num_epochs))
    self.save()
def save_model(self) -> None:
    """
    AUTHORS:
    --------

    :author: Alix Leroy
    :author: Samuel Westlake

    DESCRIPTION:
    ------------

    Save the model

    PARAMETERS:
    -----------

    None

    RETURN:
    -------

    :return: None
    """
    # Request the training loss
    Thalamus().add_signal(
        Signal(event=DEEP_EVENT_REQUEST_TRAINING_LOSS, args=[]))

    # Request the model, optimizer, epoch index and validation loss from the trainer
    Thalamus().add_signal(
        Signal(event=DEEP_EVENT_REQUEST_SAVE_PARAMS_FROM_TRAINER, args=[]))

    file_path = self.__get_file_path()

    # If we want to save to the pytorch format
    if DEEP_SAVE_FORMAT_PYTORCH.corresponds(self.method):
        # TODO: Finish the try/except statements here after testing...
        # try:
        torch.save({"model_state_dict": self.model.state_dict(),
                    "epoch": self.epoch_index,
                    "training_loss": self.training_loss,
                    "validation_loss": self.validation_loss,
                    "optimizer_state_dict": self.optimizer.state_dict()},
                   file_path)
        # except:
        #     Notification(DEEP_NOTIF_ERROR, "Error while saving the pytorch model and weights")
        #     self.__handle_error_saving(model)

    # If we want to save to the ONNX format
    elif DEEP_SAVE_FORMAT_ONNX.corresponds(self.method):
        # TODO: and here. Also fix the onnx export function
        Notification(DEEP_NOTIF_FATAL, "Save as onnx format not implemented yet")
        # try:
        #     torch.onnx._export(model, inp, file_path,
        #                        export_params=True,
        #                        verbose=True,
        #                        input_names=input_names,
        #                        output_names=output_names)
        # except:
        #     Notification(DEEP_NOTIF_ERROR, "Error while saving the ONNX model and weights")
        #     self.__handle_error_saving(model)

    Notification(DEEP_NOTIF_SUCCESS, DEEP_MSG_MODEL_SAVED % file_path)
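# --------------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): how a checkpoint written
# by save_model above can be restored with the standard PyTorch API. The model,
# optimizer and file path are hypothetical placeholders; only the dictionary keys
# match the ones saved above.
# --------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)                             # placeholder model
optimizer = optim.SGD(model.parameters(), lr=0.01)   # placeholder optimizer

checkpoint = torch.load("checkpoint.pt", map_location="cpu")
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

epoch_index = checkpoint["epoch"]                    # resume bookkeeping from here
training_loss = checkpoint["training_loss"]
validation_loss = checkpoint["validation_loss"]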
def send_training_loss(self):
    Thalamus().add_signal(
        Signal(event=DEEP_EVENT_SEND_TRAINING_LOSS,
               args={"training_loss": self.running_total_loss}))
def __train(self, first_training=True) -> None:
    """
    AUTHORS:
    --------

    :author: Alix Leroy

    DESCRIPTION:
    ------------

    Loop over the dataset to train the network

    PARAMETERS:
    -----------

    :param first_training: (bool): Whether more epochs have been required after the initial training or not

    RETURN:
    -------

    :return: None
    """
    if first_training is True:
        Thalamus().add_signal(
            signal=Signal(event=DEEP_EVENT_ON_TRAINING_START, args={}))
    else:
        self.callbacks.unpause()

    # Loop over the dataset multiple times
    for epoch in range(self.initial_epoch + 1, self.num_epochs + 1):

        Thalamus().add_signal(
            signal=Signal(event=DEEP_EVENT_ON_EPOCH_START,
                          args={"epoch_index": epoch,
                                "num_epochs": self.num_epochs}))

        for minibatch_index, minibatch in enumerate(self.dataloader, 0):

            # Clean the given data
            inputs, labels, additional_data = self.clean_single_element_list(minibatch)

            # Zero the parameter gradients
            self.optimizer.zero_grad()

            # Infer the output of the batch
            outputs = self.model(*inputs)

            # Compute losses and metrics
            result_losses = self.compute_metrics(self.losses, inputs, outputs, labels, additional_data)
            result_metrics = self.compute_metrics(self.metrics, inputs, outputs, labels, additional_data)

            # Add weights to the losses
            result_losses = apply_weight(result_losses, self.losses)

            # Sum the results of the losses
            total_loss = sum_dict(result_losses)

            # Accumulate the gradient (by addition) for each parameter
            total_loss.backward()

            # Perform a parameter update based on the current gradient (stored in the .grad attribute
            # of each parameter) and the update rule
            self.optimizer.step()

            # Detach the tensors from the network
            outputs, total_loss, result_losses, result_metrics = self.detach(
                outputs=outputs,
                total_loss=total_loss,
                result_losses=result_losses,
                result_metrics=result_metrics)

            # Send batch end signal
            Thalamus().add_signal(
                Signal(event=DEEP_EVENT_ON_BATCH_END,
                       args={"minibatch_index": minibatch_index + 1,
                             "num_minibatches": self.num_minibatches,
                             "epoch_index": epoch,
                             "total_loss": total_loss.item(),
                             "result_losses": result_losses,
                             "result_metrics": result_metrics}))

        # Shuffle the data if required
        if self.shuffle is not None:
            self.dataset.shuffle(self.shuffle)

        # Reset the dataset (transforms cache)
        self.dataset.reset()

        # Evaluate the model
        total_validation_loss, result_validation_losses, result_validation_metrics = self.__evaluate_epoch()

        # Send epoch end signal
        Thalamus().add_signal(
            Signal(event=DEEP_EVENT_ON_EPOCH_END,
                   args={"epoch_index": epoch,
                         "num_epochs": self.num_epochs,
                         "model": self.model,
                         "num_minibatches": self.num_minibatches,
                         "total_validation_loss": total_validation_loss.item(),
                         "result_validation_losses": result_validation_losses,
                         "result_validation_metrics": result_validation_metrics,
                         "num_minibatches_validation": self.tester.get_num_minibatches()}))

    # Send training end signal
    Thalamus().add_signal(
        Signal(event=DEEP_EVENT_ON_TRAINING_END, args={"model": self.model}))

    # Pause the callbacks which compute time
    self.callbacks.pause()