Example #1
0
def test_standardize_features_when_singleton(is_sequence: bool) -> None:
    """
    Test how feature standardize copes with datasets that only have 1 entry.
    """
    numericals = torch.ones((1, 3))
    categoricals = torch.tensor([[0, 1, 1], [1, 0, 0]])
    item: Union[SequenceDataSource, ScalarDataSource]
    sources: Union[ListOfSequences, List[ScalarDataSource]]
    if is_sequence:
        item = SequenceDataSource(
            metadata=GeneralSampleMetadata(id="foo"),
            numerical_non_image_features=numericals,
            categorical_non_image_features=categoricals,
            label=torch.tensor([]),
            channel_files=[])
        sources = [ClassificationItemSequence(id="foo", items=[item])]
    else:
        item = ScalarDataSource(
            metadata=GeneralSampleMetadata(id="foo"),
            numerical_non_image_features=numericals,
            categorical_non_image_features=categoricals,
            label=torch.tensor([]),
            channel_files=[])
        sources = [item]
    # Statistics are computed the same way for both source flavours.
    mean_std = FeatureStatistics.from_data_sources(sources)
    assert_tensors_equal(mean_std.mean, numericals)
    # With a single element the standard deviation cannot be computed, hence becomes NaN.
    assert torch.all(torch.isnan(mean_std.std))
    # Applying such a standardization must leave the data unchanged (similar to features
    # that are constant).
    standardized = mean_std.standardize(sources)
    first = standardized[0].items[0] if is_sequence else standardized[0]
    assert_tensors_equal(first.numerical_non_image_features, numericals)
    assert_tensors_equal(first.categorical_non_image_features, categoricals)
Example #2
0
def test_standardize_features() -> None:
    """
    Test if the non-image features can be normalized to mean 0, std 1.
    """
    set_random_seed(1234)
    expected_mean = torch.tensor([[123, 2, 3], [4, 5, 6]])
    expected_std = torch.tensor([[0, 2, 3], [3, 4, 4]])
    feature_size = (2, 3)
    sequences: List[ClassificationItemSequence] = []
    for s in range(1000):
        sequence_items = []
        seq_length = torch.randint(low=3, high=6, size=(1, )).item()
        for _ in range(seq_length):  # type: ignore
            # Draw Gaussian features with the requested mean/std. Feature (0, 0) is forced
            # to a constant below: normalization must cope with a zero standard deviation.
            features = torch.randn(size=feature_size,
                                   dtype=torch.float32) * expected_std + expected_mean
            # With probability ~0.1, overwrite one entry with infinity, to check that the
            # statistics computation is robust to non-finite values.
            # (torch.rand is drawn unconditionally so the RNG stream matches either way.)
            if torch.rand(1) > 0.9:
                features[s % 2, s % 3] = np.inf
            features[0, 0] = expected_mean[0, 0]
            sequence_items.append(
                ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                           numerical_non_image_features=features,
                           categorical_non_image_features=features,
                           label=torch.tensor([]),
                           images=torch.tensor([]),
                           segmentations=torch.tensor([])))
        sequences.append(ClassificationItemSequence(id="foo",
                                                    items=sequence_items))
    mean_std = FeatureStatistics.from_data_sources(sequences)
    assert mean_std.mean.shape == feature_size
    assert mean_std.std.shape == feature_size
    # With 1000 sequences, the empirical statistics should be close to the target values.
    assert_tensors_equal(mean_std.mean, expected_mean, 0.07)
    assert_tensors_equal(mean_std.std, expected_std, 0.07)

    # After standardization, mean should be 0 and std 1 everywhere, except for the constant
    # feature (0, 0): that one is left untouched, so its mean stays at the original value
    # and its std at 0.
    standardized = mean_std.standardize(sequences)
    stats_after = FeatureStatistics.from_data_sources(standardized)
    mean_after = torch.zeros(feature_size)
    mean_after[0, 0] = expected_mean[0, 0]
    std_after = torch.ones(feature_size)
    std_after[0, 0] = 0.0
    assert_tensors_equal(stats_after.mean, mean_after, abs=1e-5)
    assert_tensors_equal(stats_after.std, std_after, abs=1e-5)
Example #3
0
def test_load_items_seq() -> None:
    """
    Test loading file paths and labels from a dataframe that has a sequence column.
    """
    csv_string = StringIO("""subject,seq,path,value,scalar1,scalar2,META
S1,0,foo.nii,,0,0,M1
S1,1,,True,1.1,1.2,M2
S2,1,bar.nii,False,2.1,2.2,
""")
    df = pd.read_csv(csv_string, sep=",", dtype=str)
    items: List[SequenceDataSource] = DataSourceReader[SequenceDataSource](
        data_frame=df,
        image_channels=None,
        image_file_column="path",
        label_channels=None,
        label_value_column="value",
        numerical_columns=["scalar1", "scalar2"],
        sequence_column="seq").load_data_sources()

    assert len(items) == 3
    first, second, third = items
    # Row 1: subject S1 at sequence position 0, with a missing label value.
    assert isinstance(first.metadata, GeneralSampleMetadata)
    assert first.metadata.id == "S1"
    assert first.metadata.props == {"META": "M1"}
    assert first.metadata.sequence_position == 0
    assert len(first.label.tolist()) == 1
    # The empty label cell is loaded as NaN.
    assert math.isnan(first.label.item())
    assert first.channel_files == ["foo.nii"]
    assert_tensors_equal(first.numerical_non_image_features, [0.0, 0.0])
    # Row 2: subject S1 at sequence position 1, with a missing image path.
    assert isinstance(second.metadata, GeneralSampleMetadata)
    assert second.metadata.id == "S1"
    assert second.metadata.props == {"META": "M2"}
    assert second.metadata.sequence_position == 1
    assert_tensors_equal(second.label, [1.0])
    assert second.channel_files == ['']
    assert_tensors_equal(second.numerical_non_image_features, [1.1, 1.2])
    # Row 3: subject S2 at sequence position 1, with an empty META column.
    assert isinstance(third.metadata, GeneralSampleMetadata)
    assert third.metadata.id == "S2"
    assert third.metadata.props == {"META": ''}
    assert third.metadata.sequence_position == 1
    assert_tensors_equal(third.label, [0.0])
    assert third.channel_files == ["bar.nii"]
    assert_tensors_equal(third.numerical_non_image_features, [2.1, 2.2])
Example #4
0
def test_load_items_seq_from_dataset() -> None:
    """
    Test loading a sequence dataset with numerical, categorical features and images.
    """
    dataset_file = (full_ml_test_data_path() /
                    "sequence_data_for_classification" / "dataset.csv")
    df = pd.read_csv(dataset_file, sep=",", dtype=str)
    items: List[SequenceDataSource] = DataSourceReader[SequenceDataSource](
        data_frame=df,
        image_channels=None,
        image_file_column="IMG",
        label_channels=None,
        label_value_column="Label",
        numerical_columns=["NUM1", "NUM2", "NUM3", "NUM4"],
        sequence_column="Position").load_data_sources()
    # 3 subjects, 9 visits each, no visit missing.
    assert len(items) == 3 * 9

    def numerical_repr(item: SequenceDataSource) -> str:
        # NaN compares unequal to itself, so compare the string form of the feature list.
        return str(item.numerical_non_image_features.tolist())

    assert items[0].metadata.id == "2137.00005"
    assert items[0].metadata.sequence_position == 0
    assert items[0].metadata.props["CAT2"] == "category_A"
    # One of the labels is missing; missing labels should be encoded as NaN.
    assert math.isnan(items[0].label[0])
    assert items[0].channel_files == ["img_1"]
    assert numerical_repr(items[0]) == str([362.0, np.nan, np.nan, 71.0])
    assert items[8].metadata.id == "2137.00005"
    assert items[8].metadata.sequence_position == 8
    assert items[8].label.tolist() == [0.0]
    assert items[8].channel_files == ['']
    assert numerical_repr(items[8]) == str([350.0, np.nan, np.nan, 8.0])
    assert items[16].metadata.id == "2627.00001"
    assert items[16].metadata.sequence_position == 7
    assert items[16].label.tolist() == [0.0]
    assert items[16].channel_files == ["img_2"]
    assert_tensors_equal(items[16].numerical_non_image_features,
                         [217.0, 0.0, 0.01, 153.0])
    assert items[26].metadata.id == "3250.00005"
    assert items[26].metadata.sequence_position == 8
    assert_tensors_equal(items[26].label, [0.0])
    assert items[26].channel_files == ["img_11"]
    assert_tensors_equal(items[26].numerical_non_image_features,
                         [238.0, 0.0, 0.02, 84.0])

    grouped = group_samples_into_sequences(
        filter_valid_classification_data_sources_items(
            items, file_to_path_mapping=None,
            max_sequence_position_value=None))
    # 3 patients in total, but one of them has missing measurements for all visits.
    assert len(grouped) == 2
    assert grouped[0].id == "2627.00001"
    assert grouped[1].id == "3250.00005"
    # 2627.00001 has full information for weeks 0, 4, and 8.
    assert len(grouped[0].items) == 3
    assert grouped[0].items[0].metadata["VISIT"] == "V1"
    assert grouped[0].items[2].metadata["VISIT"] == "VST 3"
    assert len(grouped[1].items) == 9
Example #5
0
def test_add_difference_features() -> None:
    """
    Test if we can add difference features for sequence data sets (differences from position i compared to position 0
    in the sequence)
    """
    def _make_source(features: List) -> SequenceDataSource:
        # Only the numerical features matter for the difference computation.
        return SequenceDataSource(
            metadata=GeneralSampleMetadata(id="foo"),
            channel_files=[],
            label=torch.tensor([]),
            categorical_non_image_features=torch.tensor([]),
            numerical_non_image_features=torch.tensor(features).float())

    first = _make_source([[1, 2, 3], [4, 5, 6]])
    second = _make_source([[11, 22, 33], [44, 55, 66]])
    sequence = ClassificationItemSequence[SequenceDataSource](
        id="bar", items=[first, second])
    updated = add_difference_features([sequence], [0, 2])
    features_0 = updated[0].items[0].numerical_non_image_features
    features_1 = updated[0].items[1].numerical_non_image_features
    # Differences for the two requested features are appended along dimension 1.
    assert features_0.shape == (2, 5)
    # The first item of the sequence is the reference, hence its differences are all 0;
    # the original features are kept in the leading columns.
    assert_tensors_equal(features_0[:, 0:3], first.numerical_non_image_features)
    assert_tensors_equal(features_0[:, 3:5], [[0, 0], [0, 0]])
    # The second item has non-zero differences, and keeps the original features too.
    assert_tensors_equal(features_1[:, 0:3],
                         second.numerical_non_image_features)
    assert_tensors_equal(features_1[:, 3:5], [[10, 30], [40, 60]])
Example #6
0
def test_sequence_dataset_all(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that the sequence dataset works end-to-end, including applying the right standardization.

    Subjects S1 and S2 form the training set, S3 the validation set, S4 the test set.
    Feature statistics are computed on the training data only and then applied to all splits.
    """
    csv_string = """subject,seq,value,scalar1,scalar2,META,BETA
S1,0,False,0,0,M1,B1
S1,1,True,1,10,M2,B2
S2,0,False,2,20,M2,B1
S3,0,True,3,30,M1,B1
S4,0,True,4,40,M2,B1
"""
    csv_path = create_dataset_csv_file(csv_string, test_output_dirs.root_dir)
    # shuffle=False so that the order of minibatches (and hence the subject ids
    # asserted below) is deterministic.
    config = SequenceModelBase(local_dataset=csv_path,
                               image_file_column=None,
                               label_value_column="value",
                               numerical_columns=["scalar1", "scalar2"],
                               sequence_target_positions=[0],
                               categorical_columns=["META", "BETA"],
                               sequence_column="seq",
                               num_dataload_workers=0,
                               train_batch_size=2,
                               should_validate=False,
                               shuffle=False)
    config.read_dataset_if_needed()
    df = config.dataset_data_frame
    assert df is not None
    # Manually build the splits: train on S1/S2, validate on S3, test on S4.
    df1 = df[df.subject.isin(["S1", "S2"])]
    df2 = df[df.subject == "S3"]
    df3 = df[df.subject == "S4"]
    splits = DatasetSplits(train=df1, val=df2, test=df3)
    # Patch the split computation so the config uses exactly the splits built above.
    with mock.patch.object(SequenceModelBase,
                           'get_model_train_test_dataset_splits',
                           return_value=splits):
        train_val_loaders = config.create_data_loaders()
        # Expected feature mean: Mean of the training data (0, 0), (1, 10), (2, 20) = (1, 10)
        # Expected (biased corrected) std estimate: Std of (0, 0), (1, 10), (2, 20) = (1, 10)
        feature_stats = config.get_torch_dataset_for_inference(
            ModelExecutionMode.TRAIN).feature_statistics
        assert feature_stats is not None
        assert_tensors_equal(feature_stats.mean, [1, 10])
        assert_tensors_equal(feature_stats.std, [1, 10])

        train_items = list(
            ClassificationItemSequence.from_minibatch(b)
            for b in train_val_loaders[ModelExecutionMode.TRAIN])
        assert len(
            train_items
        ) == 1, "2 items in training set with batch size of 2 should return 1 minibatch"
        assert len(train_items[0]) == 2
        assert train_items[0][0].id == "S1"
        # NOTE(review): the expected vectors below look like [standardized scalar1,
        # standardized scalar2, one-hot META, one-hot BETA] -- e.g. (0, 0) standardized
        # with mean (1, 10) / std (1, 10) gives (-1, -1). Confirm the layout against
        # get_all_non_imaging_features.
        assert_tensors_equal(
            train_items[0][0].items[0].get_all_non_imaging_features(),
            [-1., -1., 1., 0., 1., 0.])
        assert_tensors_equal(
            train_items[0][0].items[1].get_all_non_imaging_features(),
            [0., 0., 0., 1., 0., 1.])
        assert train_items[0][1].id == "S2"
        assert_tensors_equal(
            train_items[0][1].items[0].get_all_non_imaging_features(),
            [1., 1., 0., 1., 1., 0.])
        val_items = list(
            ClassificationItemSequence.from_minibatch(b)
            for b in train_val_loaders[ModelExecutionMode.VAL])
        assert len(val_items) == 1
        assert len(val_items[0]) == 1
        assert val_items[0][0].id == "S3"
        # Items in the validation set should be normalized using the mean and std on the training data.
        # Hence, the non-image features (3, 30) should turn into (2, 2)
        assert_tensors_equal(
            val_items[0][0].items[0].get_all_non_imaging_features(),
            [2., 2., 1., 0., 1., 0.])

        # Check that the test set is also normalized correctly using the training mean and std.
        test_items = list(
            ClassificationItemSequence(**b) for b in
            config.get_torch_dataset_for_inference(ModelExecutionMode.TEST))
        assert test_items[0].id == "S4"
        # Check Non-image features of (4, 40)
        assert_tensors_equal(
            test_items[0].items[0].get_all_non_imaging_features(),
            [3., 3., 0., 1., 1., 0.])