def test_get_image_filepath_from_subject_id_single(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that get_image_filepath_from_subject_id returns exactly one file path for a subject in a dataset
    without a channel column, and that asking for a subject that is not in the dataset raises a ValueError.
    """
    config = ScalarModelBase(image_file_column="filePath",
                             label_value_column="label",
                             subject_column="subject")

    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()
    dataset_csv = config.local_dataset / "dataset.csv"
    image_file_name = "image.npy"
    dataset_csv.write_text(f"subject,filePath,label\n"
                           f"0,0_{image_file_name},0\n"
                           f"1,1_{image_file_name},1\n")

    df = config.read_dataset_if_needed()
    dataset = ScalarDataset(args=config, data_frame=df)

    # Create empty placeholder files: samefile() below needs the paths to exist on disk.
    Path(config.local_dataset / f"0_{image_file_name}").touch()
    Path(config.local_dataset / f"1_{image_file_name}").touch()

    filepath = get_image_filepath_from_subject_id(subject_id="1",
                                                  dataset=dataset,
                                                  config=config)
    expected_path = Path(config.local_dataset / f"1_{image_file_name}")

    assert filepath
    assert len(filepath) == 1
    assert expected_path.samefile(filepath[0])

    # Check error is raised if the subject does not exist
    with pytest.raises(ValueError) as ex:
        get_image_filepath_from_subject_id(subject_id="100",
                                           dataset=dataset,
                                           config=config)
    # Look at the exception itself: str() of the ExceptionInfo wrapper is a length-limited repr,
    # not the error message.
    assert "Could not find subject" in str(ex.value)
def test_get_image_filepath_from_subject_id_with_image_channels(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that get_image_filepath_from_subject_id returns exactly one file path for a subject when the
    dataset stores the label and the image of each subject in separate channel rows.
    """
    model_config = ScalarModelBase(label_channels=["label"],
                                   image_file_column="filePath",
                                   label_value_column="label",
                                   image_channels=["image"],
                                   subject_column="subject")

    model_config.local_dataset = test_output_dirs.root_dir / "dataset"
    model_config.local_dataset.mkdir()
    image_file_name = "image.npy"
    # One "label" row and one "image" row per subject.
    csv_rows = ["subject,channel,filePath,label",
                "0,label,,0",
                f"0,image,0_{image_file_name},",
                "1,label,,1",
                f"1,image,1_{image_file_name},"]
    (model_config.local_dataset / "dataset.csv").write_text("\n".join(csv_rows) + "\n")

    data_frame = model_config.read_dataset_if_needed()
    scalar_dataset = ScalarDataset(args=model_config, data_frame=data_frame)

    # Create empty placeholder files: samefile() below needs the paths to exist on disk.
    for subject in ("0", "1"):
        Path(model_config.local_dataset / f"{subject}_{image_file_name}").touch()

    found_paths = get_image_filepath_from_subject_id(subject_id="1",
                                                     dataset=scalar_dataset,
                                                     config=model_config)
    expected_path = Path(model_config.local_dataset / f"1_{image_file_name}")

    assert found_paths
    assert len(found_paths) == 1
    assert found_paths[0].samefile(expected_path)
Example #3
0
def test_get_labels_for_imbalanced_sampler_multilabel(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the get_labels_for_imbalanced_sampler method raises an error for multilabel scalar datasets.
    """
    dataset_folder = Path(test_output_dirs.make_sub_dir("dataset"))
    # Labels live in the "week1" rows; multiple classes per subject are separated by "|".
    dataset_contents = """subject,channel,path,label,CAT1
    S1,week0,scan1.npy,,A
    S1,week1,scan2.npy,0|1|2,A
    S2,week0,scan3.npy,,A
    S2,week1,scan4.npy,1|2,A
    S3,week0,scan1.npy,,A
    S3,week1,scan3.npy,1,A
    """
    config = ScalarModelBase(
        local_dataset=dataset_folder,
        class_names=["class0", "class1", "class2", "class3"],
        label_channels=["week1"],
        label_value_column="label",
        non_image_feature_channels=["week0", "week1"],
        should_validate=False
    )
    config.set_output_to(test_output_dirs.root_dir)
    train_dataset = ScalarDataset(config, pd.read_csv(StringIO(dataset_contents), dtype=str))
    with pytest.raises(NotImplementedError) as ex:
        train_dataset.get_labels_for_imbalanced_sampler()
    # Look at the exception itself: str() of the ExceptionInfo wrapper is a length-limited repr,
    # not the error message.
    assert "ImbalancedSampler is not supported for multilabel tasks." in str(ex.value)
Example #4
0
def test_get_class_counts_multilabel(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check the per-class counts that get_class_counts reports for a multilabel scalar dataset.
    """
    # Labels live in the "week1" rows; multiple classes per subject are separated by "|".
    csv_text = """subject,channel,path,label,CAT1
   S1,week0,scan1.npy,,A
   S1,week1,scan2.npy,0|1|2,A
   S2,week0,scan3.npy,,A
   S2,week1,scan4.npy,1|2,A
   S3,week0,scan1.npy,,A
   S3,week1,scan3.npy,1,A
   """
    model_config = ScalarModelBase(
        local_dataset=Path(test_output_dirs.make_sub_dir("dataset")),
        class_names=["class0", "class1", "class2", "class3"],
        label_channels=["week1"],
        label_value_column="label",
        non_image_feature_channels=["week0", "week1"],
        should_validate=False
    )
    model_config.set_output_to(test_output_dirs.root_dir)
    data_frame = pd.read_csv(StringIO(csv_text), dtype=str)
    scalar_dataset = ScalarDataset(model_config, data_frame)
    # Class 0 appears once, class 1 three times, class 2 twice, class 3 never.
    expected_counts = {0: 1, 1: 3, 2: 2, 3: 0}
    assert scalar_dataset.get_class_counts() == expected_counts
def test_filter_dataset_by_expected_size() -> None:
    """
    Check that filter_dataframe drops the rows whose "DIM" value differs from the expected column value.
    """
    expected_dim = "512x49x496"
    config = ScalarModelBase(image_channels=["image"],
                             image_file_column="path",
                             label_channels=["label"],
                             label_value_column="value",
                             non_image_feature_channels={},
                             numerical_columns=[],
                             traverse_dirs_when_loading=True,
                             expected_column_values=[
                                 ("DIM", expected_dim)
                             ],
                             local_dataset=Path("fakepath"))
    # Subject 1 is the only one with a different size.
    df = pd.DataFrame({
        'Subject': ['1', '2', '3', '4'],
        'DIM': ["1024x49x496", expected_dim, expected_dim, expected_dim]
    })
    print(df.head())
    filtered = config.filter_dataframe(df)
    assert filtered.shape == (3, 2)
    remaining_subjects = filtered['Subject'].values
    assert '1' not in remaining_subjects
    for kept_subject in ('2', '3', '4'):
        assert kept_subject in remaining_subjects
Example #6
0
 def __init__(self, config: ScalarModelBase, *args: Any,
              **kwargs: Any) -> None:
     """
     Creates the model and the loss function from the given config, and copies the config fields
     that are read later into attributes of this object.

     :param config: The ScalarModelBase config that defines the model and loss to use.
     :param args: Positional arguments that are passed on to the base class constructor.
     :param kwargs: Keyword arguments that are passed on to the base class constructor.
     """
     super().__init__(config, *args, **kwargs)
     self.model = config.create_model()
     scalar_loss = model_util.create_scalar_loss_function(config)
     if not isinstance(config, SequenceModelBase):
         self.loss_fn = scalar_loss
         self.target_indices = []
         self.target_names = config.class_names
     else:
         # Sequence models route the loss through apply_sequence_model_loss.
         self.loss_fn = lambda model_output, loss: apply_sequence_model_loss(
             scalar_loss, model_output, loss)
         self.target_indices = config.get_target_indices()
         self.target_names = [
             SequenceMetricsDict.get_hue_name_from_target_index(position)
             for position in config.sequence_target_positions
         ]
     self.is_classification_model = config.is_classification_model
     self.use_mean_teacher_model = config.compute_mean_teacher_model
     # A single class name means a binary classification or a regression model.
     self.is_binary_classification_or_regression = len(config.class_names) == 1
     self.logits_to_posterior_fn = config.get_post_loss_logits_normalization_function()
     self.loss_type = config.loss_type
     # Separate PyTorch Lightning Metrics objects for the training and the validation set. They cover metrics
     # that cannot be computed from a single minibatch (AUC and alike).
     self.train_metric_computers = self.create_metric_computers()
     self.val_metric_computers = self.create_metric_computers()
Example #7
0
def test_get_labels_for_imbalanced_sampler_binary(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check the labels that get_labels_for_imbalanced_sampler returns for a binary scalar dataset.
    """
    # Labels live in the "week1" rows: True, False, True for subjects S1, S2, S3.
    csv_text = """subject,channel,path,label,numerical1,numerical2,CAT1
    S1,week0,scan1.npy,,1,10,A
    S1,week1,scan2.npy,True,2,20,A
    S2,week0,scan3.npy,,3,30,A
    S2,week1,scan4.npy,False,4,40,A
    S3,week0,scan1.npy,,5,50,A
    S3,week1,scan3.npy,True,6,60,A
    """
    model_config = ScalarModelBase(
        local_dataset=Path(test_output_dirs.make_sub_dir("dataset")),
        label_channels=["week1"],
        label_value_column="label",
        non_image_feature_channels=["week0", "week1"],
        numerical_columns=["numerical1", "numerical2"],
        should_validate=False
    )
    model_config.set_output_to(test_output_dirs.root_dir)
    data_frame = pd.read_csv(StringIO(csv_text), dtype=str)
    scalar_dataset = ScalarDataset(model_config, data_frame)
    sampler_labels = scalar_dataset.get_labels_for_imbalanced_sampler()
    assert sampler_labels == [1.0, 0.0, 1.0]
Example #8
0
    def __init__(self, model: Union[DeviceAwareModule, torch.nn.DataParallel],
                 config: ScalarModelBase) -> None:
        """
        Stores the non-imaging feature counts of the config, and initializes the base class with the
        target layer of the model (None for non-imaging models).

        :param model: The model to analyse
        :param config: The ScalarModelBase config defining the parameters of this model.
        """
        self.total_num_categorical_features = config.get_total_number_of_categorical_non_imaging_features()
        self.total_number_of_numerical_non_imaging_features = \
            config.get_total_number_of_numerical_non_imaging_features()
        self.is_non_imaging_model = config.is_non_imaging_model
        if not self.is_non_imaging_model:
            # The encoder information sits on the wrapped module when the model is a DataParallel object.
            unwrapped: DeviceAwareModule = \
                model.module if isinstance(model, torch.nn.DataParallel) else model  # type: ignore
            target_layer = unwrapped.get_last_encoder_layer_names()
            self.conv_in_3d = bool(unwrapped.conv_in_3d)
            super().__init__(model=model,
                             config=config,
                             target_layer=target_layer)
        else:
            # Non-imaging models have no target layer (and conv_in_3d is not set for them).
            super().__init__(model, config=config, target_layer=None)
        self.gradients: Dict = {}
        self.activations: Dict = {}
Example #9
0
    def load_data_sources_as_per_config(data_frame: pd.DataFrame,
                                        args: ScalarModelBase) -> List[T]:
        """
        Reads dataset items from the given dataframe with a DataSourceReader. All column names, channel
        names and the label transform come from the model configuration.
        :param data_frame: The dataframe to read dataset items from.
        :param args: The model configuration object.
        :return: The dataset items that the DataSourceReader loaded from the dataframe.
        :raises ValueError: If categorical columns are configured but no categorical feature encoder is set.
        """
        # Categorical columns cannot be handled without an encoder, and none is created here.
        if args.categorical_columns and not args.categorical_feature_encoder:
            raise ValueError(f"One hot encoder not found to handle categorical_columns={args.categorical_columns}")

        assert args.categorical_feature_encoder is None \
            or isinstance(args.categorical_feature_encoder, CategoricalToOneHotEncoder)  # mypy

        # Only sequence model configs have a sequence column.
        sequence_column = args.sequence_column if isinstance(args, SequenceModelBase) else None

        reader = DataSourceReader[T](
            data_frame=data_frame,
            image_channels=args.image_channels,
            image_file_column=args.image_file_column,
            label_channels=args.label_channels,
            label_value_column=args.label_value_column,
            transform_labels=args.get_label_transform(),
            non_image_feature_channels=args.get_non_image_feature_channels_dict(),
            numerical_columns=args.numerical_columns,
            categorical_data_encoder=args.categorical_feature_encoder,
            sequence_column=sequence_column,
            subject_column=args.subject_column,
            channel_column=args.channel_column,
            is_classification_dataset=args.is_classification_model
        )
        return reader.load_data_sources(num_dataset_reader_workers=args.num_dataset_reader_workers)
Example #10
0
def test_get_unique_label_combinations_single_label(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that get_unique_prediction_target_combinations returns the empty combination and the combination
    holding only the first class name, for a dataset with both positive and negative subjects.
    """
    model_config = ScalarModelBase(label_channels=["label"],
                                   label_value_column="value",
                                   image_channels=["image"],
                                   image_file_column="path",
                                   subject_column="subjectID")
    class_names = model_config.class_names

    model_config.local_dataset = test_output_dirs.root_dir / "dataset"
    model_config.local_dataset.mkdir()
    # Subjects S1 and S3 have label 1, subject S2 has label 0.
    csv_rows = ["subjectID,channel,path,value",
                "S1,label,random,1",
                "S1,image,random,",
                "S2,label,random,0",
                "S2,image,random,",
                "S3,label,random,1",
                "S3,image,random,"]
    (model_config.local_dataset / "dataset.csv").write_text("\n".join(csv_rows) + "\n")

    actual_combinations = get_unique_prediction_target_combinations(
        model_config)  # type: ignore
    expected_combinations = {
        frozenset(class_names[index] for index in class_indices)  # type: ignore
        for class_indices in [[], [0]]
    }
    assert actual_combinations == expected_combinations
Example #11
0
def test_get_class_weights_dataset(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check the per-class counts that get_class_counts reports for a binary scalar dataset.
    """
    # Labels live in the "week1" rows: True, False, True for subjects S1, S2, S3.
    csv_text = """subject,channel,path,label,numerical1,numerical2,CAT1
   S1,week0,scan1.npy,,1,10,A
   S1,week1,scan2.npy,True,2,20,A
   S2,week0,scan3.npy,,3,30,A
   S2,week1,scan4.npy,False,4,40,A
   S3,week0,scan1.npy,,5,50,A
   S3,week1,scan3.npy,True,6,60,A
   """
    model_config = ScalarModelBase(
        local_dataset=Path(test_output_dirs.make_sub_dir("dataset")),
        label_channels=["week1"],
        label_value_column="label",
        non_image_feature_channels=["week0", "week1"],
        numerical_columns=["numerical1", "numerical2"],
        should_validate=False
    )
    model_config.set_output_to(test_output_dirs.root_dir)
    data_frame = pd.read_csv(StringIO(csv_text), dtype=str)
    scalar_dataset = ScalarDataset(model_config, data_frame)
    # One negative and two positive subjects.
    expected_counts = {0.0: 1, 1.0: 2}
    assert scalar_dataset.get_class_counts() == expected_counts
def test_image_labels_from_subject_id_single(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check get_image_labels_from_subject_id on a binary dataset: no labels are returned for the subject
    with label 0, and only the default hue key is returned for the subject with label 1.
    """
    model_config = ScalarModelBase(label_value_column="label",
                                   subject_column="subject")

    model_config.local_dataset = test_output_dirs.root_dir / "dataset"
    model_config.local_dataset.mkdir()
    csv_rows = ["subject,channel,label",
                "0,label,0",
                "1,label,1"]
    (model_config.local_dataset / "dataset.csv").write_text("\n".join(csv_rows) + "\n")

    data_frame = model_config.read_dataset_if_needed()
    scalar_dataset = ScalarDataset(args=model_config, data_frame=data_frame)

    labels_of_negative = get_image_labels_from_subject_id(subject_id="0",
                                                          dataset=scalar_dataset,
                                                          config=model_config)
    assert not labels_of_negative

    labels_of_positive = get_image_labels_from_subject_id(subject_id="1",
                                                          dataset=scalar_dataset,
                                                          config=model_config)
    assert labels_of_positive
    assert len(labels_of_positive) == 1
    assert labels_of_positive[0] == MetricsDict.DEFAULT_HUE_KEY
Example #13
0
def test_generate_classification_multilabel_report(test_output_dirs: OutputFolderForTests) -> None:
    """
    Generate the multilabel classification report from synthetic validation/test metrics files and a
    synthetic dataset, and check that both the notebook and its HTML version are written.
    """
    hues = ["Hue1", "Hue2"]

    config = ScalarModelBase(label_value_column="label",
                             image_file_column="filePath",
                             image_channels=["image1", "image2"],
                             label_channels=["image1"])
    config.class_names = hues

    test_metrics_file = test_output_dirs.root_dir / "test_metrics_classification.csv"
    val_metrics_file = test_output_dirs.root_dir / "val_metrics_classification.csv"

    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()
    dataset_csv_path = config.local_dataset / "dataset.csv"
    image_file_name = "image.npy"

    # Metrics files: 6 patients, with one row per hue for each patient.
    # All column keys use the enum's .value, so that the CSV header holds the column name and not the
    # string representation of the enum member.
    pd.DataFrame.from_dict({LoggingColumns.Hue.value: [hues[0], hues[1]] * 6,
                            LoggingColumns.Epoch.value: [0] * 12,
                            LoggingColumns.Patient.value: [s for s in range(6) for _ in range(2)],
                            LoggingColumns.ModelOutput.value: [0.1, 0.1, 0.1, 0.9, 0.1, 0.9,
                                                               0.9, 0.9, 0.9, 0.9, 0.9, 0.1],
                            LoggingColumns.Label.value: [0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0],
                            LoggingColumns.CrossValidationSplitIndex.value:
                                [DEFAULT_CROSS_VALIDATION_SPLIT_INDEX] * 12,
                            LoggingColumns.DataSplit.value: [0] * 12,
                            }).to_csv(test_metrics_file, index=False)

    pd.DataFrame.from_dict({LoggingColumns.Hue.value: [hues[0], hues[1]] * 6,
                            LoggingColumns.Epoch.value: [0] * 12,
                            LoggingColumns.Patient.value: [s for s in range(6) for _ in range(2)],
                            LoggingColumns.ModelOutput.value: [0.1, 0.1, 0.1, 0.1, 0.1, 0.9,
                                                               0.9, 0.9, 0.9, 0.1, 0.9, 0.1],
                            LoggingColumns.Label.value: [0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0],
                            LoggingColumns.CrossValidationSplitIndex.value:
                                [DEFAULT_CROSS_VALIDATION_SPLIT_INDEX] * 12,
                            LoggingColumns.DataSplit.value: [0] * 12,
                            }).to_csv(val_metrics_file, index=False)

    # Dataset file: two channel rows per subject, labels separated by "|".
    pd.DataFrame.from_dict({config.subject_column: [s for s in range(6) for _ in range(2)],
                            config.channel_column: ["image1", "image2"] * 6,
                            config.image_file_column: [f for f in [f"0_{image_file_name}", f"1_{image_file_name}"]
                                                       for _ in range(6)],
                            config.label_value_column: ["", "", "1", "1", "1", "1", "0|1", "0|1", "0|1", "0|1", "0",
                                                        "0"]
                            }).to_csv(dataset_csv_path, index=False)

    # Two random images, which are shared by all rows of the dataset.
    np.save(str(Path(config.local_dataset / f"0_{image_file_name}")),
            np.random.randint(0, 255, [5, 4]))
    np.save(str(Path(config.local_dataset / f"1_{image_file_name}")),
            np.random.randint(0, 255, [5, 4]))

    result_file = test_output_dirs.root_dir / "report.ipynb"
    result_html = generate_classification_multilabel_notebook(result_notebook=result_file,
                                                              config=config,
                                                              val_metrics=val_metrics_file,
                                                              test_metrics=test_metrics_file)
    assert result_file.is_file()
    assert result_html.is_file()
    assert result_html.suffix == ".html"
def test_get_metrics_table_single_run() -> None:
    """
    Check header and rows of the metrics table for a single run, computed from the validation and test
    metrics files that sit next to this test file.
    """
    reports_folder = Path(__file__).parent

    model_config = ScalarModelBase(label_value_column="label",
                                   image_file_column="filePath",
                                   subject_column="subject")
    rows, header = get_metrics_table_for_prediction_target(
        csv_to_set_optimal_threshold=reports_folder / "val_metrics_classification.csv",
        data_split_to_set_optimal_threshold=ModelExecutionMode.VAL,
        csv_to_compute_metrics=reports_folder / "test_metrics_classification.csv",
        data_split_to_compute_metrics=ModelExecutionMode.TEST,
        config=model_config,
        prediction_target=MetricsDict.DEFAULT_HUE_KEY,
        is_thresholded=False,
        is_crossval_report=False)
    expected_header = "Metric\tValue".split('\t')
    # Each expected row is the metric name followed by the formatted value.
    expected_values = [
        (ReportedScalarMetrics.AUC_PR, "0.5417"),
        (ReportedScalarMetrics.AUC_ROC, "0.5000"),
        (ReportedScalarMetrics.OptimalThreshold, "0.6000"),
        (ReportedScalarMetrics.AccuracyAtOptimalThreshold, "0.5000"),
        (ReportedScalarMetrics.AccuracyAtThreshold05, "0.5000"),
        (ReportedScalarMetrics.Sensitivity, "0.5000"),
        (ReportedScalarMetrics.Specificity, "0.5000"),
    ]
    expected_rows = [
        f"{metric.value[0]}\t{value}".split('\t')
        for metric, value in expected_values
    ]
    check_table_equality(header, rows, expected_header, expected_rows)
Example #15
0
def get_unique_prediction_target_combinations(
        config: ScalarModelBase) -> Set[FrozenSet[str]]:
    """
    Get the set of all combinations of labels that exist in the dataset.

    The dataset is read via the config. For every dataset item, the class names at the non-zero positions
    of its label form one combination; the result contains each distinct combination once.

    For multilabel classification tasks, the result holds all unique combinations of labels that
    occur in the dataset csv.
    For example, if there are 6 samples in the dataset with the following ground truth labels
    Sample1: class1, class2
    Sample2: class0
    Sample3: class1
    Sample4: class2, class3
    Sample5: (all label classes are negative in Sample 5)
    Sample6: class1, class2
    This function will return {{"class1", "class2"}, {"class0"}, {"class1"},  {"class2", "class3"}, {}}

    For binary classification tasks (assume class_names has not been changed from ["Default"]):
    This function will return a set with two members - {{"Default"}, {}} if there is at least one positive example
    in the dataset. If there are no positive examples, it returns {{}}.
    """
    data_frame = config.read_dataset_if_needed()
    scalar_dataset = ScalarDataset(args=config, data_frame=data_frame)

    # Indices of the non-zero label entries, one list per dataset item.
    positive_indices_per_item = [
        torch.flatten(torch.nonzero(item.label)).tolist()
        for item in scalar_dataset.items
    ]
    return {
        frozenset([config.class_names[index] for index in positive_indices if not math.isnan(index)])
        for positive_indices in positive_indices_per_item
    }
Example #16
0
def test_imbalanced_sampler() -> None:
    """
    Draw batches with the imbalanced sampler from a dataset with five positive subjects and one negative
    subject, and check that the negative subject makes up more than 30% of all drawn samples.
    """
    # Highly imbalanced dataset: S6 is the only data point with a negative label.
    csv_string = StringIO("""subject,channel,value,scalar1
    S1,label,True,1.0
    S2,label,True,1.0
    S3,label,True,1.0
    S4,label,True,1.0
    S5,label,True,1.0
    S6,label,False,1.0
    """)
    torch.manual_seed(0)
    data_frame = pd.read_csv(csv_string, sep=",", dtype=str)
    model_config = ScalarModelBase(label_value_column="value",
                                   numerical_columns=["scalar1"],
                                   local_dataset=Path("fakepath"))
    scalar_dataset = ScalarDataset(model_config, data_frame=data_frame)
    drawn_subjects = []
    num_passes = 10
    for _ in range(num_passes):
        loader = scalar_dataset.as_data_loader(use_imbalanced_sampler=True,
                                               shuffle=True, batch_size=6,
                                               num_dataload_workers=0)
        for batch in loader:
            # The subject IDs read from the CSV carry leading whitespace, hence the strip().
            drawn_subjects += [metadata.id.strip() for metadata in batch["metadata"]]
    negative_fraction = Counter(drawn_subjects)["S6"] / float(len(drawn_subjects))
    assert negative_fraction > 0.3
def test_get_metrics_table_crossval() -> None:
    """
    Check header and rows of the metrics table for a cross validation report with 3 splits, computed from
    the cross validation metrics file that sits next to this test file.
    """
    crossval_metrics_file = Path(__file__).parent / "crossval_metrics_classification.csv"

    model_config = ScalarModelBase(label_value_column="label",
                                   image_file_column="filePath",
                                   subject_column="subject",
                                   number_of_cross_validation_splits=3)
    rows, header = get_metrics_table_for_prediction_target(
        csv_to_set_optimal_threshold=crossval_metrics_file,
        data_split_to_set_optimal_threshold=ModelExecutionMode.VAL,
        csv_to_compute_metrics=crossval_metrics_file,
        data_split_to_compute_metrics=ModelExecutionMode.TEST,
        config=model_config,
        prediction_target=MetricsDict.DEFAULT_HUE_KEY,
        is_thresholded=False,
        is_crossval_report=True)
    expected_header = "Metric\tSplit 0\tSplit 1\tSplit 2\tMean (std)".split('\t')
    # Each expected row is the metric name, followed by the values for the 3 splits and their mean (std).
    expected_values = [
        (ReportedScalarMetrics.AUC_PR, "0.5417\t0.4481\t0.6889\t0.5595 (0.0991)"),
        (ReportedScalarMetrics.AUC_ROC, "0.5000\t0.2778\t0.7222\t0.5000 (0.1814)"),
        (ReportedScalarMetrics.OptimalThreshold, "0.6000\t0.6000\t0.6000\t0.6000 (0.0000)"),
        (ReportedScalarMetrics.AccuracyAtOptimalThreshold, "0.5000\t0.2500\t0.7500\t0.5000 (0.2041)"),
        (ReportedScalarMetrics.AccuracyAtThreshold05, "0.5000\t0.1667\t0.8333\t0.5000 (0.2722)"),
        (ReportedScalarMetrics.Sensitivity, "0.5000\t0.1667\t0.8333\t0.5000 (0.2722)"),
        (ReportedScalarMetrics.Specificity, "0.5000\t0.1667\t0.8333\t0.5000 (0.2722)"),
    ]
    expected_rows = [
        f"{metric.value[0]}\t{values}".split('\t')
        for metric, values in expected_values
    ]
    check_table_equality(header, rows, expected_header, expected_rows)
def create_scalar_loss_function(config: ScalarModelBase) -> torch.nn.Module:
    """
    Creates the torch module that computes the loss for the loss type given in the config.

    :param config: The model configuration; its loss_type field selects the loss function.
    :return: A BinaryCrossEntropyWithLogitsLoss (with class counts in the weighted case) or an MSELoss.
    :raises NotImplementedError: If the loss type is none of the types handled here.
    """
    loss_type = config.loss_type
    if loss_type == ScalarLoss.BinaryCrossEntropyWithLogits:
        return BinaryCrossEntropyWithLogitsLoss(num_classes=len(config.class_names),
                                                smoothing_eps=config.label_smoothing_eps)
    if loss_type == ScalarLoss.WeightedCrossEntropyWithLogits:
        # The weighted variant also gets the class counts and the size of the training set.
        return BinaryCrossEntropyWithLogitsLoss(
            num_classes=len(config.class_names),
            smoothing_eps=config.label_smoothing_eps,
            class_counts=config.get_training_class_counts(),
            num_train_samples=config.get_total_number_of_training_samples())
    if loss_type == ScalarLoss.MeanSquaredError:
        return MSELoss()
    raise NotImplementedError(f"Loss type {loss_type} is not implemented")
Example #19
0
def _create_test_dataset(csv_path: Path, scalar_loss: ScalarLoss = ScalarLoss.BinaryCrossEntropyWithLogits,
                         categorical_columns: Optional[List[str]] = None) -> ScalarDataset:
    """
    Creates a ScalarDataset for tests. The items are loaded indirectly via a ScalarDataset object, to see
    if the wiring up of all column names works.

    :param csv_path: Used as the local_dataset of the model config.
    :param scalar_loss: The loss type to set in the model config.
    :param categorical_columns: The categorical columns to use. None or an empty list means no such columns.
    :return: A ScalarDataset built from the model config, after the dataset has been read and pre-processed.
    """
    model_config = ScalarModelBase(image_channels=["image"],
                                   image_file_column="path",
                                   label_channels=["label"],
                                   label_value_column="value",
                                   non_image_feature_channels=["label"],
                                   numerical_columns=["scalar1", "scalar2"],
                                   categorical_columns=categorical_columns if categorical_columns else [],
                                   subject_column="USUBJID",
                                   channel_column="week",
                                   local_dataset=csv_path,
                                   should_validate=False,
                                   loss_type=scalar_loss,
                                   num_dataload_workers=0)
    model_config.read_dataset_into_dataframe_and_pre_process()
    return ScalarDataset(model_config)
Example #20
0
def test_filter_dataset_with_empty_list(expected_column_value: List[Tuple[str, str]]) -> None:
    """
    Check that filter_dataframe leaves the dataframe unchanged when expected_column_values is empty.

    :param expected_column_value: Not used in the body of this test.
    """
    config = ScalarModelBase(image_channels=["image"],
                             image_file_column="path",
                             label_channels=["label"],
                             label_value_column="value",
                             non_image_feature_channels={},
                             numerical_columns=[],
                             traverse_dirs_when_loading=True,
                             expected_column_values=[],
                             local_dataset=Path("fakepath"))
    df = pd.DataFrame({
        'Subject': ['1', '2', '3', '4'],
        'DIM': ["1024x49x496", "512x49x496", "512x49x496", "512x49x496"],
    })
    print(df.head())
    assert_frame_equal(df, config.filter_dataframe(df))
def test_dataset_reader_workers() -> None:
    """
    Check the number of dataset reader workers of a config that was created with a value of -1:
    the value is kept for offline runs, and is 0 otherwise.
    """
    config = ScalarModelBase(should_validate=False,
                             num_dataset_reader_workers=-1)
    expected_workers = -1 if config.is_offline_run else 0
    assert config.num_dataset_reader_workers == expected_workers
def test_generate_classification_crossval_report(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Run the generation of the cross validation report for a classification config with 3 splits.
    The test has no assertions, it only checks that the generation runs through.
    """
    crossval_config = ScalarModelBase(label_value_column="label",
                                      image_file_column="filePath",
                                      subject_column="subject",
                                      number_of_cross_validation_splits=3)
    generate_crossval_notebook(crossval_config,
                               metrics_file="crossval_metrics_classification.csv",
                               temp_folder=test_output_dirs.root_dir)
Example #23
0
    def __init__(self, config: ScalarModelBase, *args: Any,
                 **kwargs: Any) -> None:
        """
        Creates the model and the loss function from the given config, and copies the config fields
        that are read later into attributes of this object.

        :param config: The ScalarModelBase config that defines the model and loss to use.
        :param args: Positional arguments that are passed on to the base class constructor.
        :param kwargs: Keyword arguments that are passed on to the base class constructor.
        """
        super().__init__(config, *args, **kwargs)
        self.model = config.create_model()
        self.loss_fn = model_util.create_scalar_loss_function(config)

        self.target_names = config.target_names
        self.is_classification_model = config.is_classification_model
        self.use_mean_teacher_model = config.compute_mean_teacher_model
        # A single class name means a binary classification or a regression model.
        self.is_binary_classification_or_regression = len(config.class_names) == 1
        self.logits_to_posterior_fn = config.get_post_loss_logits_normalization_function()
        self.loss_type = config.loss_type
        # Separate PyTorch Lightning Metrics objects for the training and the validation set. They cover metrics
        # that cannot be computed from a single minibatch (AUC and alike).
        self.train_metric_computers = config.create_metric_computers()
        self.val_metric_computers = config.create_metric_computers()
        self.compute_and_log_metrics = config.compute_and_log_metrics
def test_generate_classification_report(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Generate the classification report from the metrics files that sit next to this test file and a
    synthetic dataset, and check that both the notebook and its HTML version are written.
    """
    reports_folder = Path(__file__).parent

    model_config = ScalarModelBase(label_value_column="label",
                                   image_file_column="filePath",
                                   subject_column="subject")
    model_config.local_dataset = test_output_dirs.root_dir / "dataset"
    model_config.local_dataset.mkdir()
    image_file_name = "image.npy"
    # 12 subjects, all with label 0, that alternate between the two image files.
    subject_rows = "".join(f"{subject},{subject % 2}_{image_file_name},0\n" for subject in range(12))
    (model_config.local_dataset / "dataset.csv").write_text("subject,filePath,label\n" + subject_rows)

    # Write one random image for each of the two file names.
    for prefix in ("0", "1"):
        np.save(str(Path(model_config.local_dataset / f"{prefix}_{image_file_name}")),
                np.random.randint(0, 255, [5, 4]))

    notebook_file = test_output_dirs.root_dir / "report.ipynb"
    html_file = generate_classification_notebook(
        result_notebook=notebook_file,
        config=model_config,
        val_metrics=reports_folder / "val_metrics_classification.csv",
        test_metrics=reports_folder / "test_metrics_classification.csv")
    assert notebook_file.is_file()
    assert html_file.is_file()
    assert html_file.suffix == ".html"
Example #25
0
def test_image_labels_from_subject_id_multiple(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check label lookup for a subject whose label entry lists more than one class.

    Subject "1" has the label value "1|2" in the dataset file. The test asserts that the lookup returns
    exactly two labels, equal to the second and third entries of the configured class names.
    """
    config = ScalarModelBase(label_channels=["label"],
                             label_value_column="label",
                             subject_column="subject",
                             class_names=["class1", "class2", "class3"])
    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()

    csv_lines = ("subject,channel,label\n",
                 "0,label,0\n",
                 "0,image,\n",
                 "1,label,1|2\n",
                 "1,image,\n")
    (config.local_dataset / "dataset.csv").write_text("".join(csv_lines))

    dataset = ScalarDataset(args=config, data_frame=config.read_dataset_if_needed())

    subject_labels = get_image_labels_from_subject_id(subject_id="1",
                                                      dataset=dataset,
                                                      config=config)
    assert subject_labels
    assert len(subject_labels) == 2
    assert set(subject_labels) == {config.class_names[index] for index in (1, 2)}
# Example #26
# 0
def test_dataset_csv_with_ScalarModelBase(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that a ScalarModelBase configuration can load a dataset file created by create_dataset_csv.

    The configuration is pointed at the folder and file name of the created CSV. After reading and
    pre-processing, the data frame must be set, and the dataset paths are then validated.
    """
    csv_path = create_dataset_csv(test_output_dirs)
    config = ScalarModelBase(should_validate=False)
    config.local_dataset, config.dataset_csv = csv_path.parent, csv_path.name
    config.read_dataset_into_dataframe_and_pre_process()
    loaded_frame = config.dataset_data_frame
    assert loaded_frame is not None
    validate_dataset_paths(config)
def test_dataset_normalize_image(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test dataset loading with window normalization image processing.

    Two datasets are built over the same data frame: one without transforms, and one that applies
    WindowNormalizationForScalarItem. For each of the 4 items, the images of the transformed dataset must
    match the result of calling mri_window (no mask, output range (0, 1)) on the untransformed images.
    """
    # Copy the test images into a sub-folder of the output folder.
    shutil.copytree(str(full_ml_test_data_path() / "classification_data"),
                    str(Path(test_output_dirs.make_sub_dir("foo")) / "bar"))
    dataset_csv = """subject,channel,path,value,scalar1
S1,image,4be9beed-5861-fdd2-72c2-8dd89aadc1ef
S1,label,,True,1.0
S2,image,6ceacaf8-abd2-ffec-2ade-d52afd6dd1be
S2,label,,True,2.0
S3,image,61bc9d73-9fbb-bd7d-c06b-eeffbafabcc4
S3,label,,False,3.0
S4,image,61bc9d73-9fbb-bd7d-c06b-eeffbafabcc4
S4,label,,False,3.0
"""
    df = pd.read_csv(StringIO(dataset_csv), sep=",", dtype=str)
    config = ScalarModelBase(image_channels=["image"],
                             image_file_column="path",
                             label_channels=["label"],
                             label_value_column="value",
                             non_image_feature_channels={},
                             numerical_columns=[],
                             traverse_dirs_when_loading=True,
                             local_dataset=test_output_dirs.root_dir)
    plain_dataset = ScalarDataset(config, data_frame=df)
    windowed_dataset = ScalarDataset(config,
                                     data_frame=df,
                                     sample_transforms=WindowNormalizationForScalarItem())

    num_items = 4
    expected_image_size = (4, 5, 7)
    assert len(plain_dataset) == num_items
    for index in range(num_items):
        plain_item = plain_dataset[index]
        windowed_images = windowed_dataset[index]["images"]
        assert isinstance(plain_item, dict)
        # Reference result: window normalization applied directly to the untransformed images.
        reference_images = torch.tensor(
            mri_window(plain_item["images"].numpy(), mask=None, output_range=(0, 1))[0])
        assert windowed_images is not None
        assert torch.is_tensor(windowed_images)
        assert reference_images.shape == windowed_images.shape
        assert windowed_images.shape == (1, ) + expected_image_size
        assert torch.all(reference_images == windowed_images)
# Example #28
# 0
def test_generate_classification_crossval_report(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that a cross validation classification report can be generated from a metrics file.

    The report generator is run with a configuration that uses 3 cross validation splits, on the metrics
    file stored next to this test file. The test asserts that the notebook and an HTML file exist afterwards.
    """
    metrics_path = Path(__file__).parent / "crossval_metrics_classification.csv"
    config = ScalarModelBase(label_value_column="label",
                             image_file_column="filePath",
                             subject_column="subject",
                             number_of_cross_validation_splits=3)

    notebook_path = test_output_dirs.root_dir / "report.ipynb"
    html_path = generate_classification_crossval_notebook(result_notebook=notebook_path,
                                                          config=config,
                                                          crossval_metrics=metrics_path)
    assert notebook_path.is_file()
    assert html_path.is_file()
    assert html_path.suffix == ".html"
# Example #29
# 0
def test_get_total_number_of_cross_validation_runs(number_of_cross_validation_splits_per_fold: int) -> None:
    """
    Check the total number of cross validation runs, with and without sub-fold cross validation.

    With 2 cross validation splits, the total number of runs must be 2 times the number of splits per fold
    when that number is positive, and exactly 2 otherwise.

    NOTE(review): nothing in the visible part of this file supplies the argument
    `number_of_cross_validation_splits_per_fold`; presumably a pytest parametrization or fixture
    provides it - confirm.

    :param number_of_cross_validation_splits_per_fold: The number of sub-fold splits to set on the config.
    """
    splits_per_fold = number_of_cross_validation_splits_per_fold
    config = ScalarModelBase(should_validate=False)
    config.number_of_cross_validation_splits = 2
    config.number_of_cross_validation_splits_per_fold = splits_per_fold
    assert config.perform_cross_validation

    if splits_per_fold <= 0:
        # No sub-folds: one run per cross validation split.
        assert not config.perform_sub_fold_cross_validation
        assert config.get_total_number_of_cross_validation_runs() == config.number_of_cross_validation_splits
        return

    # Sub-folds enabled: every split is run once per sub-fold.
    assert config.perform_sub_fold_cross_validation
    total_runs = config.get_total_number_of_cross_validation_runs()
    assert total_runs == config.number_of_cross_validation_splits * splits_per_fold
# Example #30
# 0
def get_scalar_model_inputs_and_labels(
        model_config: ScalarModelBase, model: torch.nn.Module,
        sample: Dict[str, Any]) -> ScalarModelInputsAndLabels:
    """
    Converts a sample returned by the data loader into the inputs and labels for a model that predicts scalars.
    If the model is a DataParallelModel, the module it wraps is used to compute the input tensors. Sequence
    model configurations read the sample as a minibatch of ClassificationItemSequence, all other configurations
    read it as a single ScalarItem.
    :param model_config: The configuration object for the model.
    :param model: The instantiated PyTorch model.
    :param sample: A training sample, as returned by a PyTorch data loader (dictionary mapping from field name to value)
    :return: An instance of ScalarModelInputsAndLabels, containing the list of model input tensors,
    label tensor, subject IDs, and the data item reconstructed from the data loader output
    """
    unwrapped_model = model.get_module() if isinstance(model, DataParallelModel) else model

    if not isinstance(model_config, SequenceModelBase):
        # Non-sequence models: the sample is one ScalarItem, which carries its own label tensor.
        scalar_model: DeviceAwareModule[ScalarItem, torch.Tensor] = unwrapped_model  # type: ignore
        item = ScalarItem.from_dict(sample)
        item_subject_ids = [str(meta.id) for meta in item.metadata]  # type: ignore
        item_inputs = scalar_model.get_input_tensors(item)
        return ScalarModelInputsAndLabels[ScalarItem, torch.Tensor](
            model_inputs=item_inputs,
            labels=item.label,
            subject_ids=item_subject_ids,
            data_item=item)

    # Sequence models: the sample is a minibatch of sequences, labels are built for the target indices.
    sequence_model: DeviceAwareModule[List[ClassificationItemSequence], torch.Tensor] = \
        unwrapped_model  # type: ignore
    sequences = ClassificationItemSequence.from_minibatch(sample)
    sequence_subject_ids = [sequence.id for sequence in sequences]
    sequence_labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
        sequences=sequences,
        target_indices=model_config.get_target_indices())
    sequence_inputs = sequence_model.get_input_tensors(sequences)
    return ScalarModelInputsAndLabels[List[ClassificationItemSequence], torch.Tensor](
        model_inputs=sequence_inputs,
        labels=sequence_labels,
        subject_ids=sequence_subject_ids,
        data_item=sequences)