def test_standardize_features_when_singleton(is_sequence: bool) -> None:
    """
    Test how feature standardize copes with datasets that only have 1 entry.
    """
    numericals = torch.ones((1, 3))
    categoricals = torch.tensor([[0, 1, 1], [1, 0, 0]])
    item: Union[SequenceDataSource, ScalarDataSource]
    sources: Union[ListOfSequences, List[ScalarDataSource]]
    if is_sequence:
        item = SequenceDataSource(metadata=GeneralSampleMetadata(id="foo"),
                                  numerical_non_image_features=numericals,
                                  categorical_non_image_features=categoricals,
                                  label=torch.tensor([]),
                                  channel_files=[])
        sources = [ClassificationItemSequence(id="foo", items=[item])]
    else:
        item = ScalarDataSource(metadata=GeneralSampleMetadata(id="foo"),
                                numerical_non_image_features=numericals,
                                categorical_non_image_features=categoricals,
                                label=torch.tensor([]),
                                channel_files=[])
        sources = [item]
    mean_std = FeatureStatistics.from_data_sources(sources)
    # With a single data point, the mean is exactly that point's feature values.
    assert_tensors_equal(mean_std.mean, numericals)
    # Standard deviation can't be computed because there is only one element, hence becomes nan.
    assert torch.all(torch.isnan(mean_std.std))
    # When applying such a standardization to the sequences, they should not be changed (similar to features that
    # are constant)
    standardized = mean_std.standardize(sources)
    checked = standardized[0].items[0] if is_sequence else standardized[0]
    assert_tensors_equal(checked.numerical_non_image_features, numericals)
    assert_tensors_equal(checked.categorical_non_image_features, categoricals)
def test_standardize_features() -> None:
    """
    Test if the non-image feature can be normalized to mean 0, std 1.
    """
    # Fixed seed so that the sample mean/std of the generated data stay within the tolerances below.
    set_random_seed(1234)
    expected_mean = torch.tensor([[123, 2, 3], [4, 5, 6]])
    expected_std = torch.tensor([[0, 2, 3], [3, 4, 4]])
    feature_size = (2, 3)
    sequences: List[ClassificationItemSequence] = []
    for s in range(1000):
        items = []
        # Sequences have random length between 3 and 5 inclusive.
        seq_length = torch.randint(low=3, high=6, size=(1, )).item()
        for i in range(seq_length):  # type: ignore
            # All features are random Gaussian, apart from feature 0 which is constant.
            # Normalization must be able to deal with constant features when dividing by standard deviation.
            features = torch.randn(size=feature_size, dtype=torch.float32) * expected_std + expected_mean
            # Randomly put some infinite values in the vector
            features[s % 2, s % 3] = np.inf if torch.rand(1) > 0.9 else features[s % 2, s % 3]
            # Feature at [0, 0] is constant across all items (its expected_std entry is 0).
            features[0, 0] = expected_mean[0, 0]
            item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                              numerical_non_image_features=features,
                              categorical_non_image_features=features,
                              label=torch.tensor([]),
                              images=torch.tensor([]),
                              segmentations=torch.tensor([]))
            items.append(item)
        sequences.append(ClassificationItemSequence(id="foo", items=items))
    mean_std = FeatureStatistics.from_data_sources(sequences)
    assert mean_std.mean.shape == feature_size
    assert mean_std.std.shape == feature_size
    # Sample statistics should match the generating distribution, up to sampling noise.
    assert_tensors_equal(mean_std.mean, expected_mean, 0.07)
    assert_tensors_equal(mean_std.std, expected_std, 0.07)
    # After normalization, mean should be 0, and std should be 1.
    standardized_seq = mean_std.standardize(sequences)
    mean_std_from_standardized = FeatureStatistics.from_data_sources(standardized_seq)
    # After normalization, the mean should be 0, apart from the constant feature, which should be left untouched,
    # hence its mean is the original feature value.
    expected_mean_from_standardized = torch.zeros(feature_size)
    expected_mean_from_standardized[0, 0] = expected_mean[0, 0]
    expected_std_from_standardized = torch.ones(feature_size)
    expected_std_from_standardized[0, 0] = 0.0
    assert_tensors_equal(mean_std_from_standardized.mean, expected_mean_from_standardized, abs=1e-5)
    assert_tensors_equal(mean_std_from_standardized.std, expected_std_from_standardized, abs=1e-5)
def test_load_items_seq() -> None:
    """
    Test loading file paths, labels, and numerical features from a sequence dataframe,
    including rows with missing values.
    """
    csv_string = StringIO("""subject,seq,path,value,scalar1,scalar2,META
S1,0,foo.nii,,0,0,M1
S1,1,,True,1.1,1.2,M2
S2,1,bar.nii,False,2.1,2.2,
""")
    df = pd.read_csv(csv_string, sep=",", dtype=str)
    items: List[SequenceDataSource] = DataSourceReader[SequenceDataSource](
        data_frame=df,
        image_channels=None,
        image_file_column="path",
        label_channels=None,
        label_value_column="value",
        numerical_columns=["scalar1", "scalar2"],
        sequence_column="seq").load_data_sources()
    assert len(items) == 3
    # Row 1: subject S1, position 0. Has an image path but the label value is missing.
    assert isinstance(items[0].metadata, GeneralSampleMetadata)
    assert items[0].metadata.id == "S1"
    assert items[0].metadata.props == {"META": "M1"}
    assert items[0].metadata.sequence_position == 0
    # A missing label should be loaded as a single NaN entry.
    assert len(items[0].label.tolist()) == 1
    assert math.isnan(items[0].label.item())
    assert items[0].channel_files == ["foo.nii"]
    assert_tensors_equal(items[0].numerical_non_image_features, [0.0, 0.0])
    # Row 2: subject S1, position 1. Has a label but the image path is missing (empty string).
    assert isinstance(items[1].metadata, GeneralSampleMetadata)
    assert items[1].metadata.id == "S1"
    assert items[1].metadata.props == {"META": "M2"}
    assert items[1].metadata.sequence_position == 1
    assert_tensors_equal(items[1].label, [1.0])
    assert items[1].channel_files == ['']
    assert_tensors_equal(items[1].numerical_non_image_features, [1.1, 1.2])
    # Row 3: subject S2, position 1. The META column is missing (empty string).
    assert isinstance(items[2].metadata, GeneralSampleMetadata)
    assert items[2].metadata.id == "S2"
    assert items[2].metadata.props == {"META": ''}
    assert items[2].metadata.sequence_position == 1
    assert_tensors_equal(items[2].label, [0.0])
    assert items[2].channel_files == ["bar.nii"]
    assert_tensors_equal(items[2].numerical_non_image_features, [2.1, 2.2])
def test_load_items_seq_from_dataset() -> None:
    """
    Test loading a sequence dataset with numerical, categorical features and images.
    """
    dummy_dataset = full_ml_test_data_path() / "sequence_data_for_classification" / "dataset.csv"
    df = pd.read_csv(dummy_dataset, sep=",", dtype=str)
    items: List[SequenceDataSource] = DataSourceReader[SequenceDataSource](
        data_frame=df,
        image_channels=None,
        image_file_column="IMG",
        label_channels=None,
        label_value_column="Label",
        numerical_columns=["NUM1", "NUM2", "NUM3", "NUM4"],
        sequence_column="Position").load_data_sources()
    assert len(items) == 3 * 9  # 3 subjects, 9 visits each, no missing
    assert items[0].metadata.id == "2137.00005"
    assert items[0].metadata.sequence_position == 0
    assert items[0].metadata.props["CAT2"] == "category_A"
    # One of the labels is missing, missing labels should be encoded as NaN
    assert math.isnan(items[0].label[0])
    assert items[0].channel_files == ["img_1"]
    # Compare via string representation because NaN != NaN under tensor equality.
    assert str(items[0].numerical_non_image_features.tolist()) == str([362.0, np.nan, np.nan, 71.0])
    assert items[8].metadata.id == "2137.00005"
    assert items[8].metadata.sequence_position == 8
    assert items[8].label.tolist() == [0.0]
    assert items[8].channel_files == ['']
    assert str(items[8].numerical_non_image_features.tolist()) == str([350.0, np.nan, np.nan, 8.0])
    assert items[16].metadata.id == "2627.00001"
    assert items[16].label.tolist() == [0.0]
    assert items[16].channel_files == ["img_2"]
    assert_tensors_equal(items[16].numerical_non_image_features, [217.0, 0.0, 0.01, 153.0])
    assert items[26].metadata.id == "3250.00005"
    assert items[26].metadata.sequence_position == 8
    assert_tensors_equal(items[26].label, [0.0])
    assert items[26].channel_files == ["img_11"]
    assert_tensors_equal(items[26].numerical_non_image_features, [238.0, 0.0, 0.02, 84.0])
    grouped = group_samples_into_sequences(
        filter_valid_classification_data_sources_items(items,
                                                       file_to_path_mapping=None,
                                                       max_sequence_position_value=None))
    # There are 3 patients total, but one of them has missing measurements for all visits
    assert len(grouped) == 2
    assert grouped[0].id == "2627.00001"
    assert grouped[1].id == "3250.00005"
    # 2627.00001 has full information for weeks 0, 4, and 8
    assert len(grouped[0].items) == 3
    assert grouped[0].items[0].metadata["VISIT"] == "V1"
    assert grouped[0].items[2].metadata["VISIT"] == "VST 3"
    assert len(grouped[1].items) == 9
    # NOTE(review): presumably verifies the raw item list is left untouched by the grouping call
    # above (item 16 keeps its original sequence position) — confirm intent.
    assert items[16].metadata.sequence_position == 7
def test_add_difference_features() -> None:
    """
    Test if we can add difference features for sequence data sets (differences from position i compared
    to position 0 in the sequence)
    """
    def _source_with(values: List) -> SequenceDataSource:
        # Minimal data source: only the numerical features matter for this test.
        return SequenceDataSource(metadata=GeneralSampleMetadata(id="foo"),
                                  channel_files=[],
                                  label=torch.tensor([]),
                                  categorical_non_image_features=torch.tensor([]),
                                  numerical_non_image_features=torch.tensor(values).float())

    first = _source_with([[1, 2, 3], [4, 5, 6]])
    second = _source_with([[11, 22, 33], [44, 55, 66]])
    sequences = [
        ClassificationItemSequence[SequenceDataSource](id="bar", items=[first, second])
    ]
    updated = add_difference_features(sequences, [0, 2])
    features_at_0 = updated[0].items[0].numerical_non_image_features
    features_at_1 = updated[0].items[1].numerical_non_image_features
    # The two difference features (for feature indices 0 and 2) are appended along dimension 1.
    assert features_at_0.shape == (2, 5)
    # At sequence position 0, differences are zero and the original features are preserved.
    assert_tensors_equal(features_at_0[:, 0:3], first.numerical_non_image_features)
    assert_tensors_equal(features_at_0[:, 3:5], [[0, 0], [0, 0]])
    # At position 1, differences are non-zero, with the original features kept in the leading columns.
    assert_tensors_equal(features_at_1[:, 0:3], second.numerical_non_image_features)
    assert_tensors_equal(features_at_1[:, 3:5], [[10, 30], [40, 60]])
def test_sequence_dataset_all(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that the sequence dataset works end-to-end, including applying the right standardization.
    """
    csv_string = """subject,seq,value,scalar1,scalar2,META,BETA
S1,0,False,0,0,M1,B1
S1,1,True,1,10,M2,B2
S2,0,False,2,20,M2,B1
S3,0,True,3,30,M1,B1
S4,0,True,4,40,M2,B1
"""
    csv_path = create_dataset_csv_file(csv_string, test_output_dirs.root_dir)
    config = SequenceModelBase(local_dataset=csv_path,
                               image_file_column=None,
                               label_value_column="value",
                               numerical_columns=["scalar1", "scalar2"],
                               sequence_target_positions=[0],
                               categorical_columns=["META", "BETA"],
                               sequence_column="seq",
                               num_dataload_workers=0,
                               train_batch_size=2,
                               should_validate=False,
                               shuffle=False)
    config.read_dataset_if_needed()
    df = config.dataset_data_frame
    assert df is not None
    # Fixed split: S1 + S2 for training, S3 for validation, S4 for test.
    df1 = df[df.subject.isin(["S1", "S2"])]
    df2 = df[df.subject == "S3"]
    df3 = df[df.subject == "S4"]
    splits = DatasetSplits(train=df1, val=df2, test=df3)
    # Patch the split logic so the config uses exactly the split defined above.
    with mock.patch.object(SequenceModelBase,
                           'get_model_train_test_dataset_splits',
                           return_value=splits):
        train_val_loaders = config.create_data_loaders()
        # Expected feature mean: Mean of the training data (0, 0), (1, 10), (2, 20) = (1, 10)
        # Expected (biased corrected) std estimate: Std of (0, 0), (1, 10), (2, 20) = (1, 10)
        feature_stats = config.get_torch_dataset_for_inference(ModelExecutionMode.TRAIN).feature_statistics
        assert feature_stats is not None
        assert_tensors_equal(feature_stats.mean, [1, 10])
        assert_tensors_equal(feature_stats.std, [1, 10])
        train_items = list(ClassificationItemSequence.from_minibatch(b)
                           for b in train_val_loaders[ModelExecutionMode.TRAIN])
        assert len(train_items) == 1, "2 items in training set with batch size of 2 should return 1 minibatch"
        assert len(train_items[0]) == 2
        assert train_items[0][0].id == "S1"
        # Expected features: standardized (scalar1, scalar2) followed by one-hot encodings of META and BETA
        # (presumably in that order — confirm against get_all_non_imaging_features).
        assert_tensors_equal(train_items[0][0].items[0].get_all_non_imaging_features(),
                             [-1., -1., 1., 0., 1., 0.])
        assert_tensors_equal(train_items[0][0].items[1].get_all_non_imaging_features(),
                             [0., 0., 0., 1., 0., 1.])
        assert train_items[0][1].id == "S2"
        assert_tensors_equal(train_items[0][1].items[0].get_all_non_imaging_features(),
                             [1., 1., 0., 1., 1., 0.])
        val_items = list(ClassificationItemSequence.from_minibatch(b)
                         for b in train_val_loaders[ModelExecutionMode.VAL])
        assert len(val_items) == 1
        assert len(val_items[0]) == 1
        assert val_items[0][0].id == "S3"
        # Items in the validation set should be normalized using the mean and std on the training data.
        # Hence, the non-image features (3, 30) should turn into (2, 2)
        assert_tensors_equal(val_items[0][0].items[0].get_all_non_imaging_features(),
                             [2., 2., 1., 0., 1., 0.])
        # Check that the test set is also normalized correctly using the training mean and std.
        test_items = list(ClassificationItemSequence(**b)
                          for b in config.get_torch_dataset_for_inference(ModelExecutionMode.TEST))
        assert test_items[0].id == "S4"
        # Check Non-image features of (4, 40)
        assert_tensors_equal(test_items[0].items[0].get_all_non_imaging_features(),
                             [3., 3., 0., 1., 1., 0.])