def test_get_image_filepath_from_subject_id_single(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the image file of a subject can be looked up in a dataset that has no channel column,
    and that looking up a subject that is not in the dataset raises a ValueError.
    """
    config = ScalarModelBase(image_file_column="filePath",
                             label_value_column="label",
                             subject_column="subject")
    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()
    dataset_csv = config.local_dataset / "dataset.csv"
    image_file_name = "image.npy"
    dataset_csv.write_text("subject,filePath,label\n"
                           f"0,0_{image_file_name},0\n"
                           f"1,1_{image_file_name},1\n")
    df = config.read_dataset_if_needed()
    dataset = ScalarDataset(args=config, data_frame=df)
    # Create empty placeholder files for the two images referenced in the CSV.
    for subject in ("0", "1"):
        (config.local_dataset / f"{subject}_{image_file_name}").touch()

    filepath = get_image_filepath_from_subject_id(subject_id="1", dataset=dataset, config=config)
    expected_path = config.local_dataset / f"1_{image_file_name}"

    assert filepath
    assert len(filepath) == 1
    assert expected_path.samefile(filepath[0])

    # Check error is raised if the subject does not exist
    with pytest.raises(ValueError) as ex:
        get_image_filepath_from_subject_id(subject_id="100", dataset=dataset, config=config)
    # The message check must live outside the `with` block (statements after the raising call are
    # never executed), and it must inspect the exception itself rather than the ExceptionInfo wrapper.
    assert "Could not find subject" in str(ex.value)
def test_get_image_filepath_from_subject_id_with_image_channels(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the image file of a subject can be looked up when the dataset stores labels and images
    in separate channel rows.
    """
    config = ScalarModelBase(label_channels=["label"],
                             image_file_column="filePath",
                             label_value_column="label",
                             image_channels=["image"],
                             subject_column="subject")
    dataset_folder = test_output_dirs.root_dir / "dataset"
    config.local_dataset = dataset_folder
    config.local_dataset.mkdir()
    image_file_name = "image.npy"
    csv_rows = [
        "subject,channel,filePath,label\n",
        "0,label,,0\n",
        f"0,image,0_{image_file_name},\n",
        "1,label,,1\n",
        f"1,image,1_{image_file_name},\n",
    ]
    (config.local_dataset / "dataset.csv").write_text("".join(csv_rows))
    dataset = ScalarDataset(args=config, data_frame=config.read_dataset_if_needed())
    # Create empty placeholder files for the two images referenced in the CSV.
    Path(config.local_dataset / f"0_{image_file_name}").touch()
    Path(config.local_dataset / f"1_{image_file_name}").touch()

    found_paths = get_image_filepath_from_subject_id(subject_id="1", dataset=dataset, config=config)
    path_for_subject_1 = Path(config.local_dataset / f"1_{image_file_name}")

    assert found_paths
    assert len(found_paths) == 1
    assert found_paths[0].samefile(path_for_subject_1)
def test_get_labels_for_imbalanced_sampler_multilabel(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the get_labels_for_imbalanced_sampler method raises an error for multilabel scalar datasets.
    """
    dataset_folder = Path(test_output_dirs.make_sub_dir("dataset"))
    # One CSV row per line; the label column holds "|"-separated class indices.
    dataset_contents = ("subject,channel,path,label,CAT1\n"
                        "S1,week0,scan1.npy,,A\n"
                        "S1,week1,scan2.npy,0|1|2,A\n"
                        "S2,week0,scan3.npy,,A\n"
                        "S2,week1,scan4.npy,1|2,A\n"
                        "S3,week0,scan1.npy,,A\n"
                        "S3,week1,scan3.npy,1,A\n")
    config = ScalarModelBase(
        local_dataset=dataset_folder,
        class_names=["class0", "class1", "class2", "class3"],
        label_channels=["week1"],
        label_value_column="label",
        non_image_feature_channels=["week0", "week1"],
        should_validate=False
    )
    config.set_output_to(test_output_dirs.root_dir)
    train_dataset = ScalarDataset(config, pd.read_csv(StringIO(dataset_contents), dtype=str))
    with pytest.raises(NotImplementedError) as ex:
        train_dataset.get_labels_for_imbalanced_sampler()
    # Check the message on the raised exception (ex.value), after the `with` block has exited:
    # an assert placed after the raising call inside the block would never run.
    assert "ImbalancedSampler is not supported for multilabel tasks." in str(ex.value)
def test_get_class_counts_multilabel(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test the get_class_counts method for multilabel scalar datasets.
    """
    csv_rows = [
        "subject,channel,path,label,CAT1",
        "S1,week0,scan1.npy,,A",
        "S1,week1,scan2.npy,0|1|2,A",
        "S2,week0,scan3.npy,,A",
        "S2,week1,scan4.npy,1|2,A",
        "S3,week0,scan1.npy,,A",
        "S3,week1,scan3.npy,1,A",
    ]
    dataset_contents = "\n".join(csv_rows) + "\n"
    dataset_folder = Path(test_output_dirs.make_sub_dir("dataset"))
    config = ScalarModelBase(local_dataset=dataset_folder,
                             class_names=["class0", "class1", "class2", "class3"],
                             label_channels=["week1"],
                             label_value_column="label",
                             non_image_feature_channels=["week0", "week1"],
                             should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    data_frame = pd.read_csv(StringIO(dataset_contents), dtype=str)
    train_dataset = ScalarDataset(config, data_frame)
    # Class 0 appears once, class 1 three times, class 2 twice, class 3 never.
    expected_counts = {0: 1, 1: 3, 2: 2, 3: 0}
    assert train_dataset.get_class_counts() == expected_counts
def test_filter_dataset_by_expected_size() -> None:
    """
    Test that rows whose "DIM" column differs from the expected value are removed by filter_dataframe.
    """
    classification_config = ScalarModelBase(
        image_channels=["image"],
        image_file_column="path",
        label_channels=["label"],
        label_value_column="value",
        non_image_feature_channels={},
        numerical_columns=[],
        traverse_dirs_when_loading=True,
        expected_column_values=[("DIM", "512x49x496")],
        local_dataset=Path("fakepath"),
    )
    # Only subject '1' has a size other than the expected one.
    df = pd.DataFrame({
        'Subject': ['1', '2', '3', '4'],
        'DIM': ["1024x49x496", "512x49x496", "512x49x496", "512x49x496"],
    })
    print(df.head())
    filtered = classification_config.filter_dataframe(df)
    assert filtered.shape == (3, 2)
    remaining_subjects = filtered['Subject'].values
    assert '1' not in remaining_subjects
    for kept_subject in ('2', '3', '4'):
        assert kept_subject in remaining_subjects
def __init__(self, config: ScalarModelBase, *args: Any, **kwargs: Any) -> None:
    """
    Creates the model and the loss function from the given config, and stores the config fields that
    are read here on the instance.

    :param config: The model configuration. Sequence model configs get a wrapped loss function,
        target indices and per-position target names; all other configs use the raw loss and
        the class names.
    :param args: Positional arguments passed through to the base class constructor.
    :param kwargs: Keyword arguments passed through to the base class constructor.
    """
    super().__init__(config, *args, **kwargs)
    self.model = config.create_model()
    raw_loss = model_util.create_scalar_loss_function(config)
    if not isinstance(config, SequenceModelBase):
        self.loss_fn = raw_loss
        self.target_indices = []
        self.target_names = config.class_names
    else:
        def sequence_loss(model_output: Any, loss: Any) -> Any:
            # Route the raw loss through the sequence-specific loss helper.
            return apply_sequence_model_loss(raw_loss, model_output, loss)

        self.loss_fn = sequence_loss
        self.target_indices = config.get_target_indices()
        self.target_names = [SequenceMetricsDict.get_hue_name_from_target_index(position)
                             for position in config.sequence_target_positions]
    self.is_classification_model = config.is_classification_model
    self.use_mean_teacher_model = config.compute_mean_teacher_model
    # True exactly when the config has a single class name.
    self.is_binary_classification_or_regression = len(config.class_names) == 1
    self.logits_to_posterior_fn = config.get_post_loss_logits_normalization_function()
    self.loss_type = config.loss_type
    # Metric computers for the training and validation set are kept separately. These are the
    # PyTorch Lightning Metrics objects, including ones that cannot be computed from a single
    # minibatch (AUC and alike).
    self.train_metric_computers = self.create_metric_computers()
    self.val_metric_computers = self.create_metric_computers()
def test_get_labels_for_imbalanced_sampler_binary(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test the get_labels_for_imbalanced_sampler method for binary scalar datasets.
    """
    csv_rows = [
        "subject,channel,path,label,numerical1,numerical2,CAT1",
        "S1,week0,scan1.npy,,1,10,A",
        "S1,week1,scan2.npy,True,2,20,A",
        "S2,week0,scan3.npy,,3,30,A",
        "S2,week1,scan4.npy,False,4,40,A",
        "S3,week0,scan1.npy,,5,50,A",
        "S3,week1,scan3.npy,True,6,60,A",
    ]
    dataset_contents = "\n".join(csv_rows) + "\n"
    dataset_folder = Path(test_output_dirs.make_sub_dir("dataset"))
    config = ScalarModelBase(local_dataset=dataset_folder,
                             label_channels=["week1"],
                             label_value_column="label",
                             non_image_feature_channels=["week0", "week1"],
                             numerical_columns=["numerical1", "numerical2"],
                             should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    data_frame = pd.read_csv(StringIO(dataset_contents), dtype=str)
    train_dataset = ScalarDataset(config, data_frame)
    # The "week1" rows carry the labels True, False, True for subjects S1, S2, S3.
    sampler_labels = train_dataset.get_labels_for_imbalanced_sampler()
    assert sampler_labels == [1.0, 0.0, 1.0]
def __init__(self, model: Union[DeviceAwareModule, torch.nn.DataParallel], config: ScalarModelBase) -> None:
    """
    :param model: The model to analyse
    :param config: The ScalarModelBase config defining the parameters of this model.
    """
    self.total_num_categorical_features = config.get_total_number_of_categorical_non_imaging_features()
    self.total_number_of_numerical_non_imaging_features = \
        config.get_total_number_of_numerical_non_imaging_features()
    self.is_non_imaging_model = config.is_non_imaging_model
    if not self.is_non_imaging_model:
        # A DataParallel wrapper holds the actual model in `.module`; encoder information is read
        # from that inner model, while the (possibly wrapped) model is what goes to the base class.
        inner_model: DeviceAwareModule = (
            model.module if isinstance(model, torch.nn.DataParallel) else model)  # type: ignore
        target_layer = inner_model.get_last_encoder_layer_names()
        self.conv_in_3d = bool(inner_model.conv_in_3d)
        super().__init__(model=model, config=config, target_layer=target_layer)
    else:
        # Models without imaging input are initialized with no target layer.
        super().__init__(model, config=config, target_layer=None)
    self.gradients: Dict = {}
    self.activations: Dict = {}
def load_data_sources_as_per_config(data_frame: pd.DataFrame, args: ScalarModelBase) -> List[T]:
    """
    Loads dataset items from the given dataframe, where all column and channel configurations are taken from their
    respective model config elements.

    :param data_frame: The dataframe to read dataset items from.
    :param args: The model configuration object.
    :return: A list of all dataset items that could be read from the dataframe.
    :raises ValueError: If categorical columns are configured but the config has no categorical feature encoder.
    """
    # Categorical columns can only be handled if a one-hot encoder is present on the config.
    if args.categorical_columns and not args.categorical_feature_encoder:
        raise ValueError(f"One hot encoder not found to handle categorical_columns={args.categorical_columns}")

    if args.categorical_feature_encoder is not None:
        assert isinstance(args.categorical_feature_encoder, CategoricalToOneHotEncoder)  # mypy

    # Only sequence model configs provide a sequence column.
    sequence_column = args.sequence_column if isinstance(args, SequenceModelBase) else None

    reader = DataSourceReader[T](
        data_frame=data_frame,
        image_channels=args.image_channels,
        image_file_column=args.image_file_column,
        label_channels=args.label_channels,
        label_value_column=args.label_value_column,
        transform_labels=args.get_label_transform(),
        non_image_feature_channels=args.get_non_image_feature_channels_dict(),
        numerical_columns=args.numerical_columns,
        categorical_data_encoder=args.categorical_feature_encoder,
        sequence_column=sequence_column,
        subject_column=args.subject_column,
        channel_column=args.channel_column,
        is_classification_dataset=args.is_classification_model
    )
    return reader.load_data_sources(num_dataset_reader_workers=args.num_dataset_reader_workers)
def test_get_unique_label_combinations_single_label(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that a single-label dataset with both positive and negative subjects yields two label
    combinations: the empty combination and the one holding the first class name.
    """
    config = ScalarModelBase(label_channels=["label"],
                             label_value_column="value",
                             image_channels=["image"],
                             image_file_column="path",
                             subject_column="subjectID")
    class_names = config.class_names
    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()
    csv_rows = [
        "subjectID,channel,path,value\n",
        "S1,label,random,1\n",
        "S1,image,random,\n",
        "S2,label,random,0\n",
        "S2,image,random,\n",
        "S3,label,random,1\n",
        "S3,image,random,\n",
    ]
    (config.local_dataset / "dataset.csv").write_text("".join(csv_rows))

    unique_labels = get_unique_prediction_target_combinations(config)  # type: ignore

    # One combination per list of class indices: no class at all, and class 0 only.
    expected_label_combinations = {
        frozenset(class_names[i] for i in class_indices)  # type: ignore
        for class_indices in ([], [0])
    }
    assert unique_labels == expected_label_combinations
def test_get_class_weights_dataset(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test the get_class_counts method for a binary scalar dataset that is read from a CSV string.
    """
    csv_rows = [
        "subject,channel,path,label,numerical1,numerical2,CAT1",
        "S1,week0,scan1.npy,,1,10,A",
        "S1,week1,scan2.npy,True,2,20,A",
        "S2,week0,scan3.npy,,3,30,A",
        "S2,week1,scan4.npy,False,4,40,A",
        "S3,week0,scan1.npy,,5,50,A",
        "S3,week1,scan3.npy,True,6,60,A",
    ]
    dataset_contents = "\n".join(csv_rows) + "\n"
    dataset_folder = Path(test_output_dirs.make_sub_dir("dataset"))
    config = ScalarModelBase(local_dataset=dataset_folder,
                             label_channels=["week1"],
                             label_value_column="label",
                             non_image_feature_channels=["week0", "week1"],
                             numerical_columns=["numerical1", "numerical2"],
                             should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    data_frame = pd.read_csv(StringIO(dataset_contents), dtype=str)
    train_dataset = ScalarDataset(config, data_frame)
    # One subject is labelled False, two are labelled True.
    expected_counts = {0.0: 1, 1.0: 2}
    assert train_dataset.get_class_counts() == expected_counts
def test_image_labels_from_subject_id_single(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that, for a single-label dataset, a subject with label 0 has no label names and a subject
    with label 1 has exactly the default hue key as its label name.
    """
    config = ScalarModelBase(label_value_column="label", subject_column="subject")
    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()
    csv_rows = [
        "subject,channel,label\n",
        "0,label,0\n",
        "1,label,1\n",
    ]
    (config.local_dataset / "dataset.csv").write_text("".join(csv_rows))
    dataset = ScalarDataset(args=config, data_frame=config.read_dataset_if_needed())

    labels_negative = get_image_labels_from_subject_id(subject_id="0", dataset=dataset, config=config)
    assert not labels_negative

    labels_positive = get_image_labels_from_subject_id(subject_id="1", dataset=dataset, config=config)
    assert labels_positive
    assert len(labels_positive) == 1
    assert labels_positive[0] == MetricsDict.DEFAULT_HUE_KEY
def test_generate_classification_multilabel_report(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the multilabel classification report can be generated from validation and test metrics
    files and a multilabel dataset, producing both the notebook and an HTML file.
    """
    hues = ["Hue1", "Hue2"]

    config = ScalarModelBase(label_value_column="label",
                             image_file_column="filePath",
                             image_channels=["image1", "image2"],
                             label_channels=["image1"])
    config.class_names = hues

    test_metrics_file = test_output_dirs.root_dir / "test_metrics_classification.csv"
    val_metrics_file = test_output_dirs.root_dir / "val_metrics_classification.csv"

    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()
    dataset_csv_path = config.local_dataset / "dataset.csv"
    image_file_name = "image.npy"

    # 6 subjects, with one row per hue for each subject.
    subjects = [s for s in range(6) for _ in range(2)]
    labels = [0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0]

    def write_metrics_csv(model_outputs: list, metrics_file: Path) -> None:
        # Writes a metrics file that differs from its sibling only in the model outputs.
        # All keys use the enum's `.value`, so that every CSV header is the plain column name
        # (the cross validation split index key was previously the enum member itself).
        pd.DataFrame.from_dict({
            LoggingColumns.Hue.value: [hues[0], hues[1]] * 6,
            LoggingColumns.Epoch.value: [0] * 12,
            LoggingColumns.Patient.value: subjects,
            LoggingColumns.ModelOutput.value: model_outputs,
            LoggingColumns.Label.value: labels,
            LoggingColumns.CrossValidationSplitIndex.value: [DEFAULT_CROSS_VALIDATION_SPLIT_INDEX] * 12,
            LoggingColumns.DataSplit.value: [0] * 12,
        }).to_csv(metrics_file, index=False)

    write_metrics_csv([0.1, 0.1, 0.1, 0.9, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.1], test_metrics_file)
    write_metrics_csv([0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.1, 0.9, 0.1], val_metrics_file)

    # Dataset with two channel rows per subject; labels are "|"-separated class indices.
    pd.DataFrame.from_dict({
        config.subject_column: subjects,
        config.channel_column: ["image1", "image2"] * 6,
        config.image_file_column: [f"0_{image_file_name}"] * 6 + [f"1_{image_file_name}"] * 6,
        config.label_value_column: ["", "", "1", "1", "1", "1", "0|1", "0|1", "0|1", "0|1", "0", "0"],
    }).to_csv(dataset_csv_path, index=False)

    # Random image contents for the two files that the dataset refers to.
    np.save(str(config.local_dataset / f"0_{image_file_name}"), np.random.randint(0, 255, [5, 4]))
    np.save(str(config.local_dataset / f"1_{image_file_name}"), np.random.randint(0, 255, [5, 4]))

    result_file = test_output_dirs.root_dir / "report.ipynb"
    result_html = generate_classification_multilabel_notebook(result_notebook=result_file,
                                                              config=config,
                                                              val_metrics=val_metrics_file,
                                                              test_metrics=test_metrics_file)
    assert result_file.is_file()
    assert result_html.is_file()
    assert result_html.suffix == ".html"
def test_get_metrics_table_single_run() -> None:
    """
    Test the metrics table for a single (non cross validation) run: the optimal threshold is set on
    the validation metrics file, and the metrics are computed on the test metrics file.
    """
    reports_folder = Path(__file__).parent
    test_metrics_file = reports_folder / "test_metrics_classification.csv"
    val_metrics_file = reports_folder / "val_metrics_classification.csv"

    config = ScalarModelBase(label_value_column="label",
                             image_file_column="filePath",
                             subject_column="subject")
    rows, header = get_metrics_table_for_prediction_target(
        csv_to_set_optimal_threshold=val_metrics_file,
        data_split_to_set_optimal_threshold=ModelExecutionMode.VAL,
        csv_to_compute_metrics=test_metrics_file,
        data_split_to_compute_metrics=ModelExecutionMode.TEST,
        config=config,
        prediction_target=MetricsDict.DEFAULT_HUE_KEY,
        is_thresholded=False,
        is_crossval_report=False)

    # The expected table is spelled out cell by cell. Building it by splitting a string on tab
    # characters silently breaks as soon as the tabs inside the literal are turned into spaces.
    expected_header = ["Metric", "Value"]
    expected_values = [
        (ReportedScalarMetrics.AUC_PR, "0.5417"),
        (ReportedScalarMetrics.AUC_ROC, "0.5000"),
        (ReportedScalarMetrics.OptimalThreshold, "0.6000"),
        (ReportedScalarMetrics.AccuracyAtOptimalThreshold, "0.5000"),
        (ReportedScalarMetrics.AccuracyAtThreshold05, "0.5000"),
        (ReportedScalarMetrics.Sensitivity, "0.5000"),
        (ReportedScalarMetrics.Specificity, "0.5000"),
    ]
    # First cell of each row is the first element of the metric's enum value.
    expected_rows = [[f"{metric.value[0]}", value] for metric, value in expected_values]
    check_table_equality(header, rows, expected_header, expected_rows)
def get_unique_prediction_target_combinations(config: ScalarModelBase) -> Set[FrozenSet[str]]:
    """
    Get the set of all the combinations of labels that exist in the dataset.

    For multilabel classification tasks, this function will return all unique combinations of labels that
    occur in the dataset csv.
    For example, if there are 6 samples in the dataset with the following ground truth labels
    Sample1: class1, class2
    Sample2: class0
    Sample3: class1
    Sample4: class2, class3
    Sample5: (all label classes are negative in Sample 5)
    Sample6: class1, class2
    This function will return {{"class1", "class2"}, {"class0"}, {"class1"}, {"class2", "class3"}, {}}

    For binary classification tasks (assume class_names has not been changed from ["Default"]):
    This function will return a set with two members - {{"Default"}, {}} if there is at least one positive example
    in the dataset. If there are no positive examples, it returns {{}}.

    Label entries that are NaN are ignored, that is, they do not count as a positive class.

    :param config: The model configuration. The dataset is read via this config, and its class_names
        are used to translate label positions into class names.
    :return: A set with one frozenset of class names for each distinct label combination.
    """
    df = config.read_dataset_if_needed()
    dataset = ScalarDataset(args=config, data_frame=df)
    label_set: Set[FrozenSet[str]] = set()
    for item in dataset.items:
        label = item.label
        # NaN compares unequal to zero, so torch.nonzero alone would report a NaN entry as positive.
        # The NaN check has to be made on the label values: the positions returned by nonzero are
        # always integers, so checking those for NaN can never exclude anything.
        is_positive = (label != 0) & ~torch.isnan(label)
        positive_indices = torch.flatten(torch.nonzero(is_positive)).tolist()
        label_set.add(frozenset(config.class_names[i] for i in positive_indices))
    return label_set
def test_imbalanced_sampler() -> None:
    """
    Test that the imbalanced sampler draws the only negative subject of a highly imbalanced dataset
    in more than 30% of all draws.
    """
    # Simulate a highly imbalanced dataset with only one data point
    # with a negative label.
    csv_rows = [
        "subject,channel,value,scalar1",
        "S1,label,True,1.0",
        "S2,label,True,1.0",
        "S3,label,True,1.0",
        "S4,label,True,1.0",
        "S5,label,True,1.0",
        "S6,label,False,1.0",
    ]
    csv_string = StringIO("\n".join(csv_rows) + "\n")
    torch.manual_seed(0)
    df = pd.read_csv(csv_string, sep=",", dtype=str)
    args = ScalarModelBase(label_value_column="value",
                           numerical_columns=["scalar1"],
                           local_dataset=Path("fakepath"))
    dataset = ScalarDataset(args, data_frame=df)

    # Draw 10 times from a freshly created data loader, collecting the IDs of all drawn subjects.
    drawn_subjects = []
    for _ in range(10):
        data_loader = dataset.as_data_loader(use_imbalanced_sampler=True,
                                             shuffle=True,
                                             batch_size=6,
                                             num_dataload_workers=0)
        for batch in data_loader:
            for metadata in batch["metadata"]:
                drawn_subjects.append(metadata.id.strip())

    counts_per_subjects = Counter(drawn_subjects)
    negative_fraction = counts_per_subjects["S6"] / float(len(drawn_subjects))
    assert negative_fraction > 0.3
def test_get_metrics_table_crossval() -> None:
    """
    Test the metrics table for a cross validation report with 3 splits, where the same crossval
    metrics file is used both to set the optimal threshold and to compute the metrics.
    """
    reports_folder = Path(__file__).parent
    crossval_metrics_file = reports_folder / "crossval_metrics_classification.csv"

    config = ScalarModelBase(label_value_column="label",
                             image_file_column="filePath",
                             subject_column="subject",
                             number_of_cross_validation_splits=3)
    rows, header = get_metrics_table_for_prediction_target(
        csv_to_set_optimal_threshold=crossval_metrics_file,
        data_split_to_set_optimal_threshold=ModelExecutionMode.VAL,
        csv_to_compute_metrics=crossval_metrics_file,
        data_split_to_compute_metrics=ModelExecutionMode.TEST,
        config=config,
        prediction_target=MetricsDict.DEFAULT_HUE_KEY,
        is_thresholded=False,
        is_crossval_report=True)

    # The expected table is spelled out cell by cell. Building it by splitting a string on tab
    # characters silently breaks as soon as the tabs inside the literal are turned into spaces.
    expected_header = ["Metric", "Split 0", "Split 1", "Split 2", "Mean (std)"]
    expected_values = [
        (ReportedScalarMetrics.AUC_PR, ["0.5417", "0.4481", "0.6889", "0.5595 (0.0991)"]),
        (ReportedScalarMetrics.AUC_ROC, ["0.5000", "0.2778", "0.7222", "0.5000 (0.1814)"]),
        (ReportedScalarMetrics.OptimalThreshold, ["0.6000", "0.6000", "0.6000", "0.6000 (0.0000)"]),
        (ReportedScalarMetrics.AccuracyAtOptimalThreshold, ["0.5000", "0.2500", "0.7500", "0.5000 (0.2041)"]),
        (ReportedScalarMetrics.AccuracyAtThreshold05, ["0.5000", "0.1667", "0.8333", "0.5000 (0.2722)"]),
        (ReportedScalarMetrics.Sensitivity, ["0.5000", "0.1667", "0.8333", "0.5000 (0.2722)"]),
        (ReportedScalarMetrics.Specificity, ["0.5000", "0.1667", "0.8333", "0.5000 (0.2722)"]),
    ]
    # First cell of each row is the first element of the metric's enum value.
    expected_rows = [[f"{metric.value[0]}"] + values for metric, values in expected_values]
    check_table_equality(header, rows, expected_header, expected_rows)
def create_scalar_loss_function(config: ScalarModelBase) -> torch.nn.Module:
    """
    Returns a torch module that computes a loss function for classification and regression models.

    :param config: The model configuration. Its loss_type selects the loss that is created.
    :return: The loss module that matches config.loss_type.
    :raises NotImplementedError: If config.loss_type is not one of the loss types handled here.
    """
    loss_type = config.loss_type
    if loss_type == ScalarLoss.BinaryCrossEntropyWithLogits:
        return BinaryCrossEntropyWithLogitsLoss(num_classes=len(config.class_names),
                                                smoothing_eps=config.label_smoothing_eps)
    if loss_type == ScalarLoss.WeightedCrossEntropyWithLogits:
        # Same loss class as above, additionally given class counts and the training set size.
        return BinaryCrossEntropyWithLogitsLoss(
            num_classes=len(config.class_names),
            smoothing_eps=config.label_smoothing_eps,
            class_counts=config.get_training_class_counts(),
            num_train_samples=config.get_total_number_of_training_samples())
    if loss_type == ScalarLoss.MeanSquaredError:
        return MSELoss()
    raise NotImplementedError(f"Loss type {config.loss_type} is not implemented")
def _create_test_dataset(csv_path: Path,
                         scalar_loss: ScalarLoss = ScalarLoss.BinaryCrossEntropyWithLogits,
                         categorical_columns: Optional[List[str]] = None) -> ScalarDataset:
    """
    Creates a ScalarDataset for tests from a config that uses the given path as its local dataset.

    :param csv_path: The value for the local_dataset field of the model config.
    :param scalar_loss: The loss type to set in the model config.
    :param categorical_columns: The categorical columns to set in the model config. None (or an empty
        list) means no categorical columns.
    :return: The dataset that is created from the config.
    """
    # Load items indirectly via a ScalarDataset object, to see if the wiring up of all column names works
    if not categorical_columns:
        categorical_columns = []
    args = ScalarModelBase(
        image_channels=["image"],
        image_file_column="path",
        label_channels=["label"],
        label_value_column="value",
        non_image_feature_channels=["label"],
        numerical_columns=["scalar1", "scalar2"],
        categorical_columns=categorical_columns,
        subject_column="USUBJID",
        channel_column="week",
        local_dataset=csv_path,
        should_validate=False,
        loss_type=scalar_loss,
        num_dataload_workers=0,
    )
    args.read_dataset_into_dataframe_and_pre_process()
    return ScalarDataset(args)
def test_filter_dataset_with_empty_list(expected_column_value: List[Tuple[str, str]]) -> None:
    """
    Test that empty filter has no effect

    :param expected_column_value: The (empty) filter to use as the expected_column_values of the config.
    """
    # The supplied filter is passed on to the config. A hard-coded empty list here would leave
    # the test argument unused, and every supplied value would exercise the very same case.
    classification_config = ScalarModelBase(image_channels=["image"],
                                            image_file_column="path",
                                            label_channels=["label"],
                                            label_value_column="value",
                                            non_image_feature_channels={},
                                            numerical_columns=[],
                                            traverse_dirs_when_loading=True,
                                            expected_column_values=expected_column_value,
                                            local_dataset=Path("fakepath"))
    data = {'Subject': ['1', '2', '3', '4'],
            'DIM': ["1024x49x496", "512x49x496", "512x49x496", "512x49x496"]}
    df = pd.DataFrame(data)
    print(df.head())
    filtered = classification_config.filter_dataframe(df)
    assert_frame_equal(df, filtered)
def test_dataset_reader_workers() -> None:
    """
    Test to make sure the number of dataset reader workers are set correctly
    """
    config = ScalarModelBase(should_validate=False,
                             num_dataset_reader_workers=-1)
    # The requested value of -1 is expected to be kept for offline runs, and replaced by 0 otherwise.
    expected_workers = -1 if config.is_offline_run else 0
    assert config.num_dataset_reader_workers == expected_workers
def test_generate_classification_crossval_report(test_output_dirs: OutputFolderForTests) -> None:
    """
    Runs the cross validation notebook generation for a config with 3 cross validation splits,
    using the test output folder as the temp folder. The test has no explicit assertions.
    """
    crossval_config = ScalarModelBase(
        label_value_column="label",
        image_file_column="filePath",
        subject_column="subject",
        number_of_cross_validation_splits=3,
    )
    metrics_file_name = "crossval_metrics_classification.csv"
    generate_crossval_notebook(crossval_config,
                               metrics_file=metrics_file_name,
                               temp_folder=test_output_dirs.root_dir)
def __init__(self, config: ScalarModelBase, *args: Any, **kwargs: Any) -> None:
    """
    Creates the model and the loss function from the given config, and stores the config fields that
    are read here on the instance.

    :param config: The model configuration.
    :param args: Positional arguments passed through to the base class constructor.
    :param kwargs: Keyword arguments passed through to the base class constructor.
    """
    super().__init__(config, *args, **kwargs)
    self.model = config.create_model()
    self.loss_fn = model_util.create_scalar_loss_function(config)
    self.target_names = config.target_names
    self.is_classification_model = config.is_classification_model
    self.use_mean_teacher_model = config.compute_mean_teacher_model
    # True exactly when the config has a single class name.
    self.is_binary_classification_or_regression = len(config.class_names) == 1
    self.logits_to_posterior_fn = config.get_post_loss_logits_normalization_function()
    self.loss_type = config.loss_type
    # Metric computers for the training and validation set are kept separately. These are the
    # PyTorch Lightning Metrics objects, including ones that cannot be computed from a single
    # minibatch (AUC and alike).
    self.train_metric_computers = config.create_metric_computers()
    self.val_metric_computers = config.create_metric_computers()
    self.compute_and_log_metrics = config.compute_and_log_metrics
def test_generate_classification_report(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the classification report can be generated from the validation and test metrics files
    that sit next to this test file, producing both the notebook and an HTML file.
    """
    reports_folder = Path(__file__).parent
    test_metrics_file = reports_folder / "test_metrics_classification.csv"
    val_metrics_file = reports_folder / "val_metrics_classification.csv"

    config = ScalarModelBase(label_value_column="label",
                             image_file_column="filePath",
                             subject_column="subject")
    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()
    image_file_name = "image.npy"

    # 12 subjects, all with label 0, that alternate between the two image files.
    subject_rows = [f"{subject},{subject % 2}_{image_file_name},0\n" for subject in range(12)]
    (config.local_dataset / "dataset.csv").write_text("subject,filePath,label\n" + "".join(subject_rows))

    # Random image contents for the two files that the dataset refers to.
    for prefix in ("0", "1"):
        np.save(str(Path(config.local_dataset / f"{prefix}_{image_file_name}")),
                np.random.randint(0, 255, [5, 4]))

    result_file = test_output_dirs.root_dir / "report.ipynb"
    result_html = generate_classification_notebook(result_notebook=result_file,
                                                   config=config,
                                                   val_metrics=val_metrics_file,
                                                   test_metrics=test_metrics_file)
    assert result_file.is_file()
    assert result_html.is_file()
    assert result_html.suffix == ".html"
def test_image_labels_from_subject_id_multiple(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that, for a multilabel dataset, a subject with label "1|2" gets the names of the classes
    at positions 1 and 2.
    """
    config = ScalarModelBase(label_channels=["label"],
                             label_value_column="label",
                             subject_column="subject",
                             class_names=["class1", "class2", "class3"])
    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()
    csv_rows = [
        "subject,channel,label\n",
        "0,label,0\n",
        "0,image,\n",
        "1,label,1|2\n",
        "1,image,\n",
    ]
    (config.local_dataset / "dataset.csv").write_text("".join(csv_rows))
    dataset = ScalarDataset(args=config, data_frame=config.read_dataset_if_needed())

    label_names = get_image_labels_from_subject_id(subject_id="1", dataset=dataset, config=config)
    assert label_names
    assert len(label_names) == 2
    assert set(label_names) == {config.class_names[1], config.class_names[2]}
def test_dataset_csv_with_ScalarModelBase(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that a ScalarModelBase config, pointed at a freshly created dataset CSV file, reads that file
    into its dataset data frame, and that the dataset paths validate.
    """
    csv_path = create_dataset_csv(test_output_dirs)
    model_config = ScalarModelBase(should_validate=False)
    # The dataset location is given as the folder plus the name of the CSV file inside it.
    model_config.local_dataset = csv_path.parent
    model_config.dataset_csv = csv_path.name
    model_config.read_dataset_into_dataframe_and_pre_process()
    assert model_config.dataset_data_frame is not None
    validate_dataset_paths(model_config)
def test_dataset_normalize_image(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test dataset loading with window normalization image processing.
    """
    source_folder = str(full_ml_test_data_path() / "classification_data")
    target_folder = str(Path(test_output_dirs.make_sub_dir("foo")) / "bar")
    shutil.copytree(source_folder, target_folder)
    # Image rows only have a path entry, label rows only have a value and a scalar.
    csv_rows = [
        "subject,channel,path,value,scalar1",
        "S1,image,4be9beed-5861-fdd2-72c2-8dd89aadc1ef",
        "S1,label,,True,1.0",
        "S2,image,6ceacaf8-abd2-ffec-2ade-d52afd6dd1be",
        "S2,label,,True,2.0",
        "S3,image,61bc9d73-9fbb-bd7d-c06b-eeffbafabcc4",
        "S3,label,,False,3.0",
        "S4,image,61bc9d73-9fbb-bd7d-c06b-eeffbafabcc4",
        "S4,label,,False,3.0",
    ]
    csv_string = StringIO("\n".join(csv_rows) + "\n")
    df = pd.read_csv(csv_string, sep=",", dtype=str)
    args = ScalarModelBase(image_channels=["image"],
                           image_file_column="path",
                           label_channels=["label"],
                           label_value_column="value",
                           non_image_feature_channels={},
                           numerical_columns=[],
                           traverse_dirs_when_loading=True,
                           local_dataset=test_output_dirs.root_dir)
    raw_dataset = ScalarDataset(args, data_frame=df)
    normalized = ScalarDataset(args, data_frame=df, sample_transforms=WindowNormalizationForScalarItem())
    assert len(raw_dataset) == 4
    expected_image_size = (4, 5, 7)
    for index in range(4):
        raw_item = raw_dataset[index]
        normalized_images = normalized[index]["images"]
        assert isinstance(raw_item, dict)
        # Reference result: apply the window normalization directly to the raw images.
        windowed = mri_window(raw_item["images"].numpy(), mask=None, output_range=(0, 1))
        expected_normalized_images = torch.tensor(windowed[0])
        assert normalized_images is not None
        assert torch.is_tensor(normalized_images)
        assert expected_normalized_images.shape == normalized_images.shape
        assert normalized_images.shape == (1,) + expected_image_size
        assert torch.all(expected_normalized_images == normalized_images)
def test_generate_classification_crossval_report(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the cross validation classification report can be generated from the crossval metrics
    file that sits next to this test file, producing both the notebook and an HTML file.
    """
    crossval_metrics_file = Path(__file__).parent / "crossval_metrics_classification.csv"
    config = ScalarModelBase(
        label_value_column="label",
        image_file_column="filePath",
        subject_column="subject",
        number_of_cross_validation_splits=3,
    )
    notebook_path = test_output_dirs.root_dir / "report.ipynb"
    html_path = generate_classification_crossval_notebook(result_notebook=notebook_path,
                                                          config=config,
                                                          crossval_metrics=crossval_metrics_file)
    assert notebook_path.is_file()
    assert html_path.is_file()
    assert html_path.suffix == ".html"
def test_get_total_number_of_cross_validation_runs(number_of_cross_validation_splits_per_fold: int) -> None:
    """
    Test that the total number of cross validation runs is the number of splits, multiplied by the
    number of splits per fold if sub-fold cross validation is turned on.

    :param number_of_cross_validation_splits_per_fold: The number of splits per fold to set in the
        config. Sub-fold cross validation is expected to be on if this is greater than 0.
    """
    config = ScalarModelBase(should_validate=False)
    config.number_of_cross_validation_splits = 2
    config.number_of_cross_validation_splits_per_fold = number_of_cross_validation_splits_per_fold
    assert config.perform_cross_validation

    if number_of_cross_validation_splits_per_fold <= 0:
        assert not config.perform_sub_fold_cross_validation
        assert config.get_total_number_of_cross_validation_runs() == config.number_of_cross_validation_splits
        return

    assert config.perform_sub_fold_cross_validation
    total_runs = config.get_total_number_of_cross_validation_runs()
    assert total_runs == config.number_of_cross_validation_splits * number_of_cross_validation_splits_per_fold
def get_scalar_model_inputs_and_labels(model_config: ScalarModelBase,
                                       model: torch.nn.Module,
                                       sample: Dict[str, Any]) -> ScalarModelInputsAndLabels:
    """
    For a model that predicts scalars, gets the model input tensors from a sample returned by the data loader.

    :param model_config: The configuration object for the model.
    :param model: The instantiated PyTorch model.
    :param sample: A training sample, as returned by a PyTorch data loader (dictionary mapping from field name to value)
    :return: An instance of ScalarModelInputsAndLabels, containing the list of model input tensors,
        label tensor, subject IDs, and the data item reconstructed from the data loader output
    """
    # Work with the wrapped module if the model is a DataParallelModel.
    if isinstance(model, DataParallelModel):
        model = model.get_module()

    if not isinstance(model_config, SequenceModelBase):
        scalar_model: DeviceAwareModule[ScalarItem, torch.Tensor] = model  # type: ignore
        scalar_item = ScalarItem.from_dict(sample)
        subject_ids = [str(metadata.id) for metadata in scalar_item.metadata]  # type: ignore
        return ScalarModelInputsAndLabels[ScalarItem, torch.Tensor](
            model_inputs=scalar_model.get_input_tensors(scalar_item),
            labels=scalar_item.label,
            subject_ids=subject_ids,
            data_item=scalar_item
        )

    # Sequence models: the minibatch is turned into a list of sequences, and the labels tensor is
    # created for the target indices of the config.
    sequence_model: DeviceAwareModule[List[ClassificationItemSequence], torch.Tensor] = model  # type: ignore
    sequences = ClassificationItemSequence.from_minibatch(sample)
    subject_ids = [sequence.id for sequence in sequences]
    labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
        sequences=sequences,
        target_indices=model_config.get_target_indices()
    )
    model_inputs = sequence_model.get_input_tensors(sequences)
    return ScalarModelInputsAndLabels[List[ClassificationItemSequence], torch.Tensor](
        model_inputs=model_inputs,
        labels=labels,
        subject_ids=subject_ids,
        data_item=sequences
    )