def test_epoch(test_epoch: int, run_recovery: Optional[RunRecovery]) -> Optional[MetricsDict]:
    pipeline = create_inference_pipeline(config, test_epoch, run_recovery)
    if pipeline is None:
        return None
    # for mypy
    assert isinstance(pipeline, ScalarInferencePipelineBase)
    ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing")
    ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(shuffle=False,
                                                                           batch_size=1,
                                                                           num_dataload_workers=0)
    logging.info(f"Starting to evaluate model from epoch {test_epoch} on {data_split.value} set.")
    metrics_dict = create_metrics_dict_from_config(config)
    for sample in ds:
        result = pipeline.predict(sample)
        # Since batch size is 1, we only have 1 item in each of the fields in result
        sample_id, label_gpu, model_output = result.subject_ids[0], result.labels, result.model_outputs
        compute_scalar_metrics(metrics_dict, [sample_id], model_output, label_gpu, config.loss_type)
        logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")
    average = metrics_dict.average(across_hues=False)
    logging.info(average.to_string())
    return metrics_dict

def test_epoch(checkpoint_paths: List[Path]) -> Optional[MetricsDict]:
    pipeline = create_inference_pipeline(config=config, checkpoint_paths=checkpoint_paths)
    if pipeline is None:
        return None
    # for mypy
    assert isinstance(pipeline, ScalarInferencePipelineBase)
    ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing")
    ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(shuffle=False,
                                                                           batch_size=1,
                                                                           num_dataload_workers=0)
    logging.info(f"Starting to evaluate model on {data_split.value} set.")
    metrics_dict = create_metrics_dict_for_scalar_models(config)
    for sample in ds:
        result = pipeline.predict(sample)
        model_output = result.posteriors
        label = result.labels.to(device=model_output.device)
        sample_id = result.subject_ids[0]
        compute_scalar_metrics(metrics_dict,
                               subject_ids=[sample_id],
                               model_output=model_output,
                               labels=label,
                               loss_type=config.loss_type)
        logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")
    average = metrics_dict.average(across_hues=False)
    logging.info(average.to_string())
    return metrics_dict

def update_metrics(self, subject_ids: List[str], model_output: torch.Tensor, labels: torch.Tensor) -> None:
    """
    Handle metrics updates based on the provided model outputs and labels.
    """
    compute_scalar_metrics(self.metrics, subject_ids, model_output, labels, self.model_config.loss_type)

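# Hypothetical usage sketch for update_metrics above: `module` stands in for an
# instance of the class that owns it, and the tensor shapes (one row per subject,
# one column per target) are assumptions for illustration, not taken from the source.
module.update_metrics(subject_ids=["subj_1", "subj_2"],
                      model_output=torch.tensor([[0.8], [0.3]]),
                      labels=torch.tensor([[1.0], [0.0]]))
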
def _compute_scalar_metrics(output_values_list: List[List[float]],
                            labels: List[List[float]],
                            is_classification: bool,
                            hues: Optional[List[str]] = None) -> ScalarMetricsDict:
    model_output = torch.tensor(output_values_list)
    _labels = torch.tensor(labels)
    if machine_has_gpu:
        _labels = _labels.cuda()
        model_output = model_output.cuda()
    metrics_dict = ScalarMetricsDict(hues=hues, is_classification_metrics=is_classification)
    subject_ids = list(range(model_output.shape[0]))
    loss_type = ScalarLoss.BinaryCrossEntropyWithLogits if is_classification else ScalarLoss.MeanSquaredError
    compute_scalar_metrics(metrics_dict, subject_ids, model_output, _labels, loss_type=loss_type)
    return metrics_dict

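# A minimal sketch of driving the _compute_scalar_metrics helper above, assuming
# binary classification posteriors in [0, 1]; the concrete values are illustrative.
# The average/to_string calls mirror how the metrics dict is consumed elsewhere in
# this module.
metrics = _compute_scalar_metrics(output_values_list=[[0.9], [0.2], [0.7]],
                                  labels=[[1.0], [0.0], [1.0]],
                                  is_classification=True)
print(metrics.average(across_hues=False).to_string())
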
def classification_model_test(config: ScalarModelBase,
                              data_split: ModelExecutionMode,
                              checkpoint_paths: List[Path],
                              model_proc: ModelProcessing,
                              cross_val_split_index: int) -> InferenceMetricsForClassification:
    """
    The main testing loop for classification models. It loads the model and datasets, then proceeds to test
    the model on all given checkpoints.
    :param config: The model configuration.
    :param data_split: The dataset split to evaluate on. Also used to name the folder that stores the
        results inside the outputs directory, mainly when evaluating on different dataset splits.
    :param checkpoint_paths: Checkpoint paths to initialize the model from.
    :param model_proc: Whether we are testing an ensemble or a single model.
    :param cross_val_split_index: The cross-validation split index to record alongside each prediction.
    :return: An InferenceMetricsForClassification object that contains the metrics computed for the given
        checkpoints.
    """
    pipeline = create_inference_pipeline(config=config, checkpoint_paths=checkpoint_paths)
    if pipeline is None:
        raise ValueError("Inference pipeline could not be created.")
    # for mypy
    assert isinstance(pipeline, ScalarInferencePipelineBase)
    ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing")
    ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(shuffle=False,
                                                                           batch_size=1,
                                                                           num_dataload_workers=0)
    logging.info(f"Starting to evaluate model on {data_split.value} set.")
    results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc)
    os.makedirs(str(results_folder), exist_ok=True)
    metrics_dict = create_metrics_dict_for_scalar_models(config)
    output_logger: Optional[DataframeLogger] = DataframeLogger(csv_path=results_folder / MODEL_OUTPUT_CSV)
    for sample in ds:
        result = pipeline.predict(sample)
        model_output = result.posteriors
        label = result.labels.to(device=model_output.device)
        sample_id = result.subject_ids[0]
        if output_logger:
            for i in range(len(config.target_names)):
                output_logger.add_record({LoggingColumns.Patient.value: sample_id,
                                          LoggingColumns.Hue.value: config.target_names[i],
                                          LoggingColumns.Label.value: label[0][i].item(),
                                          LoggingColumns.ModelOutput.value: model_output[0][i].item(),
                                          LoggingColumns.CrossValidationSplitIndex.value: cross_val_split_index})
        compute_scalar_metrics(metrics_dict,
                               subject_ids=[sample_id],
                               model_output=model_output,
                               labels=label,
                               loss_type=config.loss_type)
        logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")
    average = metrics_dict.average(across_hues=False)
    logging.info(average.to_string())
    if isinstance(metrics_dict, ScalarMetricsDict):
        csv_file = results_folder / SUBJECT_METRICS_FILE_NAME
        logging.info(f"Writing {data_split.value} metrics to file {str(csv_file)}")
        # If we are running inference after a training run, the validation set metrics may have been written
        # during train time. If this is not the case, or we are running on the test set, create the metrics
        # file.
        if not csv_file.exists():
            df_logger = DataframeLogger(csv_file)
            # For an ensemble, store the default split index; otherwise record which fold produced
            # this prediction.
            cv_index = DEFAULT_CROSS_VALIDATION_SPLIT_INDEX if model_proc == ModelProcessing.ENSEMBLE_CREATION \
                else cross_val_split_index
            metrics_dict.store_metrics_per_subject(df_logger=df_logger,
                                                   mode=data_split,
                                                   cross_validation_split_index=cv_index,
                                                   epoch=BEST_EPOCH_FOLDER_NAME)
            # write to disk
            df_logger.flush()
    if output_logger:
        output_logger.flush()
    return InferenceMetricsForClassification(metrics=metrics_dict)
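
# Hypothetical call site for classification_model_test, assuming a trained
# ScalarModelBase config and an existing checkpoint file; the checkpoint file name
# and the DEFAULT processing mode are assumptions for illustration.
inference_metrics = classification_model_test(config=config,
                                              data_split=ModelExecutionMode.TEST,
                                              checkpoint_paths=[config.checkpoint_folder / "best_checkpoint.ckpt"],
                                              model_proc=ModelProcessing.DEFAULT,
                                              cross_val_split_index=DEFAULT_CROSS_VALIDATION_SPLIT_INDEX)
logging.info(f"Test metrics: {inference_metrics.metrics.to_string()}")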