Example #1
def test_create_labels_tensor_for_minibatch() -> None:
    """
    Test to make sure labels tensor is created as expected for minibatch
    """

    sequences = [
        ClassificationItemSequence(id=x,
                                   items=_create_scalar_items(length=i + 1))
        for i, x in enumerate(["A", "B"])
    ]

    labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
        sequences, target_indices=[0, 1, 2])
    assert torch.allclose(labels,
                          torch.tensor([[[1.0], [np.nan], [np.nan]],
                                        [[1.0], [1.0], [np.nan]]]),
                          equal_nan=True)

    labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
        sequences, target_indices=[0, 1])
    assert torch.allclose(labels,
                          torch.tensor([[[1.0], [np.nan]], [[1.0], [1.0]]]),
                          equal_nan=True)

    labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
        sequences, target_indices=[0])
    assert torch.equal(labels, torch.tensor([[[1.0]], [[1.0]]]))
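
The `_create_scalar_items` helper is not shown on this page. A plausible sketch, reconstructed from the assertions in Examples #1 and #4 (each item carries a label of 1.0 at consecutive sequence positions), could look like this; the actual definition lives in the original test module and may differ:

def _create_scalar_items(length: int) -> List[ScalarItem]:
    # Hypothetical reconstruction: one item per sequence position, all labelled 1.0.
    return [ScalarItem(metadata=GeneralSampleMetadata(id="foo", sequence_position=i),
                       numerical_non_image_features=torch.tensor([]),
                       categorical_non_image_features=torch.tensor([]),
                       label=torch.tensor([1.0]),
                       images=torch.tensor([]),
                       segmentations=torch.tensor([]))
            for i in range(length)]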
Example #2
def apply_sequence(
        seq: ClassificationItemSequence) -> ClassificationItemSequence:
    # noinspection PyTypeChecker
    return ClassificationItemSequence(id=seq.id,
                                      items=list(map(apply_source, seq.items)))
Example #3
def test_sequence_dataloader() -> None:
    """
    Test if we can create a data loader from the dataset, and recover the items as expected in batched
    form, including instances where not all elements of the sequence have labels.
    """
    csv_string = StringIO("""subject,seq,path,value,scalar1,scalar2,META
S1,0,foo.nii,,0,0,M1
S1,1,,True,1.1,1.2,M2
S2,0,bar.nii,False,2.1,2.2,M3
S2,1,,False,2.0,2.0,M4
""")
    df = pd.read_csv(csv_string, sep=",", dtype=str)
    config = SequenceModelBase(image_file_column=None,
                               label_value_column="value",
                               numerical_columns=["scalar1"],
                               sequence_target_positions=[1],
                               sequence_column="seq",
                               local_dataset=Path.cwd(),
                               should_validate=False)
    dataset = SequenceDataset(config, data_frame=df)
    assert len(dataset) == 2
    data_loader = dataset.as_data_loader(shuffle=False,
                                         batch_size=2,
                                         num_dataload_workers=0)
    # We have 2 subjects; with a batch size of 2, those should be turned into 1 batch
    data_loader_output = list(i for i in data_loader)
    assert len(data_loader_output) == 1
    loaded = list(ClassificationItemSequence(**i) for i in data_loader_output)
    assert loaded[0].id == ["S1", "S2"]
    assert isinstance(loaded[0].items[0][0], ScalarItem)
    assert loaded[0].items[0][0].metadata.id == "S1"
    assert loaded[0].items[0][1].metadata.id == "S1"
    assert loaded[0].items[1][0].metadata.id == "S2"
    assert loaded[0].items[1][1].metadata.id == "S2"

    # The batched sequence data are awkward to work with. Check if we can un-roll them correctly via
    # from_minibatch
    un_batched = ClassificationItemSequence.from_minibatch(
        data_loader_output[0])
    assert len(un_batched) == 2
    for i in range(2):
        assert un_batched[i].id == dataset.items[i].id
        assert len(un_batched[i].items) == len(dataset.items[i].items)
        for j in range(len(un_batched[i].items)):
            assert un_batched[i].items[j].metadata.id == dataset.items[
                i].items[j].metadata.id
Example #4
def test_get_labels_at_target_indices() -> None:
    """
    Test to ensure label selection based on target indices works as expected.
    """
    sequence_items = _create_scalar_items(length=3)

    sequence = ClassificationItemSequence(id="A", items=sequence_items)

    # Since the label at sequence position 3 does not exist, we expect the result tensor to be padded with NaN
    labels = sequence.get_labels_at_target_indices(target_indices=[0, 1, 2, 3])
    assert torch.allclose(labels,
                          torch.tensor([[1.0], [1.0], [1.0], [np.nan]]),
                          equal_nan=True)

    # test we can extract all of the labels in the sequence
    labels = sequence.get_labels_at_target_indices(target_indices=[0, 1, 2])
    assert torch.equal(labels, torch.tensor([[1.0], [1.0], [1.0]]))

    # test we can extract only a subset of the labels in the sequence
    labels = sequence.get_labels_at_target_indices(target_indices=[0, 1])
    assert torch.equal(labels, torch.tensor([[1.0], [1.0]]))

    # test we raise an exception for invalid target indices
    with pytest.raises(Exception):
        sequence.get_labels_at_target_indices(target_indices=[-1])
Example #5
def group_samples_into_sequences(
        items: Iterable[SequenceDataSource],
        min_sequence_position_value: int = 0,
        max_sequence_position_value: Optional[int] = None) -> ListOfSequences:
    """
    Turns a flat list of classification items into a list of per-subject classification items. The resulting list
    has one entry per unique sample ID in the input. With a single sample ID, the items
    are sorted by metadata.sequence_position in ascending order.
    Also, all subject data is restricted to the largest contiguous sequence starting at 0
    (e.g., if sequence positions are [0, 1, 4], only [0, 1] are retained,
    if sequence positions are [1, 2, 3] nothing is retained)
    :param items: The items that should be grouped.
    :param max_sequence_position_value: If provided then this is the maximum sequence position the sequence can
    end with. Longer sequences will be truncated. None is default.
    up to and including this value. Entries beyond that sequence_position will be dropped.
    :param min_sequence_position_value: All sequences must have a entries with sequence_position starting
    from and including this value, 0 is default.
    :return:
    """
    if min_sequence_position_value < 0:
        raise ValueError("Argument min_sequence_position_value must be >= 0")

    # Compare against None explicitly: a max_sequence_position_value of 0 is falsy but still a valid limit.
    if max_sequence_position_value is not None:
        if max_sequence_position_value < min_sequence_position_value:
            raise ValueError(
                f"Argument max_sequence_position_value: {max_sequence_position_value} must "
                f"be >= min_sequence_position_value: {min_sequence_position_value}"
            )

    grouped: DefaultDict[str, List[SequenceDataSource]] = defaultdict(list)
    for item in items:
        grouped[item.id].append(item)
    result: List[ClassificationItemSequence[SequenceDataSource]] = []
    for sample_id, group in grouped.items():
        unique_positions = set(x.metadata.sequence_position for x in group)
        if len(unique_positions) != len(group):
            raise ValueError(
                f"The set of sequence positions for subject {sample_id} contains duplicates."
            )

        group_sorted = get_longest_contiguous_sequence(
            items=group,
            min_sequence_position_value=min_sequence_position_value,
            max_sequence_position_value=max_sequence_position_value)

        if len(group_sorted) > 0:
            result.append(
                ClassificationItemSequence(id=sample_id, items=group_sorted))
        else:
            # No contiguous sequence at all
            logging.warning(
                f"Skipped sequence for subject {sample_id} as it was not contiguous"
            )

    return result
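
A minimal usage sketch of the grouping behaviour, under the assumption that GeneralSampleMetadata accepts a sequence_position argument (as the other examples on this page suggest):

def _make_item(subject_id: str, position: int) -> SequenceDataSource:
    # Hypothetical factory, for illustration only.
    return SequenceDataSource(
        metadata=GeneralSampleMetadata(id=subject_id, sequence_position=position),
        numerical_non_image_features=torch.ones((1, 1)),
        categorical_non_image_features=torch.ones((1, 1)),
        label=torch.tensor([]),
        channel_files=[])

# Subject "A" has positions [0, 1, 4]: only the contiguous prefix [0, 1] survives.
# Subject "B" has positions [1, 2]: there is no entry at position 0, so "B" is dropped entirely.
grouped = group_samples_into_sequences(
    [_make_item("A", 0), _make_item("A", 1), _make_item("A", 4),
     _make_item("B", 1), _make_item("B", 2)])
assert len(grouped) == 1
assert [x.metadata.sequence_position for x in grouped[0].items] == [0, 1]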
Example #6
def get_scalar_model_inputs_and_labels(
        model_config: ScalarModelBase, model: torch.nn.Module,
        sample: Dict[str, Any]) -> ScalarModelInputsAndLabels:
    """
    For a model that predicts scalars, gets the model input tensors from a sample returned by the data loader.
    :param model_config: The configuration object for the model.
    :param model: The instantiated PyTorch model.
    :param sample: A training sample, as returned by a PyTorch data loader (dictionary mapping from field name to value).
    :return: An instance of ScalarModelInputsAndLabels, containing the list of model input tensors,
    label tensor, subject IDs, and the data item reconstructed from the data loader output
    """
    if isinstance(model, DataParallelModel):
        model = model.get_module()

    if isinstance(model_config, SequenceModelBase):
        sequence_model: DeviceAwareModule[List[ClassificationItemSequence],
                                          torch.Tensor] = model  # type: ignore
        sequences = ClassificationItemSequence.from_minibatch(sample)
        subject_ids = [x.id for x in sequences]
        labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
            sequences=sequences,
            target_indices=model_config.get_target_indices())
        model_inputs = sequence_model.get_input_tensors(sequences)

        return ScalarModelInputsAndLabels[List[ClassificationItemSequence],
                                          torch.Tensor](
                                              model_inputs=model_inputs,
                                              labels=labels,
                                              subject_ids=subject_ids,
                                              data_item=sequences)
    else:
        scalar_model: DeviceAwareModule[ScalarItem,
                                        torch.Tensor] = model  # type: ignore
        scalar_item = ScalarItem.from_dict(sample)
        subject_ids = [str(x.id) for x in scalar_item.metadata]  # type: ignore
        model_inputs = scalar_model.get_input_tensors(scalar_item)

        return ScalarModelInputsAndLabels[ScalarItem, torch.Tensor](
            model_inputs=model_inputs,
            labels=scalar_item.label,
            subject_ids=subject_ids,
            data_item=scalar_item)
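
A hedged sketch of how this function is typically consumed in a training step; train_loader and loss_fn are illustrative placeholders, not part of the API shown above:

for sample in train_loader:
    inputs_and_labels = get_scalar_model_inputs_and_labels(model_config, model, sample)
    # The unpacked input tensors feed the forward pass; the labels tensor feeds the loss.
    outputs = model(*inputs_and_labels.model_inputs)
    loss = loss_fn(outputs, inputs_and_labels.labels)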
Example #7
def test_standardize_features() -> None:
    """
    Test if the non-image feature can be normalized to mean 0, std 1.
    :return:
    """
    set_random_seed(1234)
    expected_mean = torch.tensor([[123, 2, 3], [4, 5, 6]])
    expected_std = torch.tensor([[0, 2, 3], [3, 4, 4]])
    feature_size = (2, 3)
    sequences: List[ClassificationItemSequence] = []
    for s in range(1000):
        items = []
        seq_length = torch.randint(low=3, high=6, size=(1, )).item()
        for i in range(seq_length):  # type: ignore
            # All features are random Gaussian, apart from feature 0 which is constant.
            # Normalization must be able to deal with constant features when dividing by standard deviation.
            features = torch.randn(size=feature_size,
                                   dtype=torch.float32) * expected_std + expected_mean
            # Randomly put some infinite values in the vector
            if torch.rand(1) > 0.9:
                features[s % 2, s % 3] = np.inf
            features[0, 0] = expected_mean[0, 0]
            item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                              numerical_non_image_features=features,
                              categorical_non_image_features=features,
                              label=torch.tensor([]),
                              images=torch.tensor([]),
                              segmentations=torch.tensor([]))
            items.append(item)
        sequences.append(ClassificationItemSequence(id="foo", items=items))
    mean_std = FeatureStatistics.from_data_sources(sequences)
    assert mean_std.mean.shape == feature_size
    assert mean_std.std.shape == feature_size

    assert_tensors_equal(mean_std.mean, expected_mean, 0.07)
    assert_tensors_equal(mean_std.std, expected_std, 0.07)

    # After normalization, mean should be 0, and std should be 1.
    standardized_seq = mean_std.standardize(sequences)
    mean_std_from_standardized = FeatureStatistics.from_data_sources(
        standardized_seq)
    # After normalization, the mean should be 0, apart from the constant feature, which should be left untouched,
    # hence its mean is the original feature value.
    expected_mean_from_standardized = torch.zeros(feature_size)
    expected_mean_from_standardized[0, 0] = expected_mean[0, 0]
    expected_std_from_standardized = torch.ones(feature_size)
    expected_std_from_standardized[0, 0] = 0.0
    assert_tensors_equal(mean_std_from_standardized.mean,
                         expected_mean_from_standardized,
                         abs=1e-5)
    assert_tensors_equal(mean_std_from_standardized.std,
                         expected_std_from_standardized,
                         abs=1e-5)
Example #8
def get_scalar_model_inputs_and_labels(model: torch.nn.Module,
                                       target_indices: List[int],
                                       sample: Dict[str, Any]) -> ScalarModelInputsAndLabels:
    """
    For a model that predicts scalars, gets the model input tensors from a sample returned by the data loader.
    :param model: The instantiated PyTorch model.
    :param target_indices: If this list is non-empty, assume that the model is a sequence model, and build the
    model inputs and labels for a model that predicts those specific positions in the sequence. If the list is empty,
    assume that the model is a normal (non-sequence) model.
    :param sample: A training sample, as returned by a PyTorch data loader (dictionary mapping from field name to value).
    :return: An instance of ScalarModelInputsAndLabels, containing the list of model input tensors,
    label tensor, subject IDs, and the data item reconstructed from the data loader output
    """
    if target_indices:
        sequence_model: DeviceAwareModule[List[ClassificationItemSequence], torch.Tensor] = model  # type: ignore
        sequences = ClassificationItemSequence.from_minibatch(sample)
        subject_ids = [x.id for x in sequences]
        labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
            sequences=sequences,
            target_indices=target_indices
        )
        model_inputs = sequence_model.get_input_tensors(sequences)

        return ScalarModelInputsAndLabels[List[ClassificationItemSequence]](
            model_inputs=model_inputs,
            labels=labels,
            subject_ids=subject_ids,
            data_item=sequences
        )
    else:
        scalar_model: DeviceAwareModule[ScalarItem, torch.Tensor] = model  # type: ignore
        scalar_item = ScalarItem.from_dict(sample)
        subject_ids = [str(x.id) for x in scalar_item.metadata]  # type: ignore
        model_inputs = scalar_model.get_input_tensors(scalar_item)

        return ScalarModelInputsAndLabels[ScalarItem](
            model_inputs=model_inputs,
            labels=scalar_item.label,
            subject_ids=subject_ids,
            data_item=scalar_item
        )
Example #9
def add_features(seq: ClassificationItemSequence) -> ClassificationItemSequence:
    items_mapped: List[SequenceDataSource] = []
    feature_baseline = None
    for item_index, item in enumerate(seq.items):
        if item_index == 0:
            feature_baseline = torch.stack([item.numerical_non_image_features[:, i] for i in feature_indices],
                                           dim=0)
        features_for_diff = torch.stack([item.numerical_non_image_features[:, i] for i in feature_indices], dim=0)
        diff = features_for_diff - feature_baseline
        new_features = torch.cat([item.numerical_non_image_features, diff.t()], dim=1)
        items_mapped.append(item.clone_with_overrides(numerical_non_image_features=new_features))
    return ClassificationItemSequence(id=seq.id, items=items_mapped)
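
To make the diff-to-baseline computation above concrete, here is the tensor arithmetic in isolation with made-up numbers (feature_indices comes from the enclosing scope in the original code):

feature_indices = [0, 2]                      # columns that get a diff-to-baseline feature
baseline = torch.tensor([[1.0, 9.0, 10.0]])   # numerical features of the first item
later = torch.tensor([[4.0, 9.0, 25.0]])      # numerical features of a later item

feature_baseline = torch.stack([baseline[:, i] for i in feature_indices], dim=0)
features_for_diff = torch.stack([later[:, i] for i in feature_indices], dim=0)
diff = features_for_diff - feature_baseline   # tensor([[3.], [15.]])
# Each item keeps its original columns and gains the change relative to baseline:
new_features = torch.cat([later, diff.t()], dim=1)  # tensor([[4., 9., 25., 3., 15.]])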
Example #10
def test_seq_dataset_loader() -> None:
    dummy_dataset = full_ml_test_data_path() / "sequence_data_for_classification" / "dataset.csv"
    df = pd.read_csv(dummy_dataset, sep=",", dtype=str)
    config = SequenceModelBase(image_file_column="IMG",
                               label_value_column="Label",
                               numerical_columns=["NUM1", "NUM2", "NUM3", "NUM4"],
                               sequence_target_positions=[8],
                               sequence_column="Position",
                               local_dataset=Path(),
                               should_validate=False)
    dataset = SequenceDataset(args=config, data_frame=df)
    assert len(dataset) == 2
    # Patch the load_images function that will be called once we access a dataset item
    with mock.patch('InnerEye.ML.dataset.scalar_sample.load_images_and_stack',
                    return_value=ImageAndSegmentations[torch.Tensor](
                        images=torch.ones(1), segmentations=torch.empty(0))):
        item0 = ClassificationItemSequence(**dataset[0])
        item1 = ClassificationItemSequence(**dataset[1])
        assert item0.id == "2627.00001"
        len_2627 = 3
        assert len(item0.items) == len_2627
        assert item1.id == "3250.00005"
        len_3250 = 9
        assert len(item1.items) == len_3250

        # Data loaders use a customized collate function, that must work with the sequences too.
        collated = collate_with_metadata([dataset[0], dataset[1]])
        assert collated["id"] == ["2627.00001", "3250.00005"]
        # All subject sequences should be turned into lists of lists.
        assert isinstance(collated["items"], list)
        assert len(collated["items"]) == 2
        assert isinstance(collated["items"][0], list)
        assert isinstance(collated["items"][1], list)
        assert len(collated["items"][0]) == len_2627
        assert len(collated["items"][1]) == len_3250
        back_to_items = ClassificationItemSequence(**collated)
        assert back_to_items.id == ["2627.00001", "3250.00005"]
Example #11
def test_standardize_features_when_singleton(is_sequence: bool) -> None:
    """
    Test how feature standardize copes with datasets that only have 1 entry.
    """
    numerical_features = torch.ones((1, 3))
    categorical_features = torch.tensor([[0, 1, 1], [1, 0, 0]])
    item: Union[SequenceDataSource, ScalarDataSource]
    sources: Union[ListOfSequences, List[ScalarDataSource]]
    if is_sequence:
        item = SequenceDataSource(
            metadata=GeneralSampleMetadata(id="foo"),
            numerical_non_image_features=numerical_features,
            categorical_non_image_features=categorical_features,
            label=torch.tensor([]),
            channel_files=[])
        sources = [ClassificationItemSequence(id="foo", items=[item])]
        mean_std = FeatureStatistics.from_data_sources(sources)
    else:
        item = ScalarDataSource(
            metadata=GeneralSampleMetadata(id="foo"),
            numerical_non_image_features=numerical_features,
            categorical_non_image_features=categorical_features,
            label=torch.tensor([]),
            channel_files=[])

        sources = [item]
        mean_std = FeatureStatistics.from_data_sources(sources)

    assert_tensors_equal(mean_std.mean, numerical_features)
    # Standard deviation can't be computed because there is only one element, hence becomes nan.
    assert torch.all(torch.isnan(mean_std.std))
    # When applying such a standardization to the sequences, they should not be changed (similar to
    # features that are constant).
    standardized_sources = mean_std.standardize(sources)
    if is_sequence:
        assert_tensors_equal(
            standardized_sources[0].items[0].numerical_non_image_features,
            numerical_features)
        assert_tensors_equal(
            standardized_sources[0].items[0].categorical_non_image_features,
            categorical_features)
    else:
        assert_tensors_equal(
            standardized_sources[0].numerical_non_image_features,
            numerical_features)
        assert_tensors_equal(
            standardized_sources[0].categorical_non_image_features,
            categorical_features)
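
The NaN standard deviation asserted above mirrors plain PyTorch behaviour: the unbiased estimator divides by (n - 1), which is zero for a single sample. A one-line check:

import torch
print(torch.std(torch.ones((1, 3)), dim=0))  # tensor([nan, nan, nan])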
Example #12
def __getitem__(self, i: int) -> Dict[str, Any]:
    loaded = list(map(self.load_item, self.items[i].items))
    return vars(
        ClassificationItemSequence(id=self.items[i].id, items=loaded))
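
Because __getitem__ returns the dataclass fields via vars(), the resulting dictionary round-trips straight back into a ClassificationItemSequence, exactly as the tests above do with ClassificationItemSequence(**dataset[0]):

item_dict = dataset[0]                             # {"id": ..., "items": [...]}
restored = ClassificationItemSequence(**item_dict)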
Example #13
def test_sequence_dataset_all(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that the sequence dataset works end-to-end, including applying the right standardization.
    """
    csv_string = """subject,seq,value,scalar1,scalar2,META,BETA
S1,0,False,0,0,M1,B1
S1,1,True,1,10,M2,B2
S2,0,False,2,20,M2,B1
S3,0,True,3,30,M1,B1
S4,0,True,4,40,M2,B1
"""
    csv_path = create_dataset_csv_file(csv_string, test_output_dirs.root_dir)
    config = SequenceModelBase(local_dataset=csv_path,
                               image_file_column=None,
                               label_value_column="value",
                               numerical_columns=["scalar1", "scalar2"],
                               sequence_target_positions=[0],
                               categorical_columns=["META", "BETA"],
                               sequence_column="seq",
                               num_dataload_workers=0,
                               train_batch_size=2,
                               should_validate=False,
                               shuffle=False)
    config.read_dataset_if_needed()
    df = config.dataset_data_frame
    assert df is not None
    df1 = df[df.subject.isin(["S1", "S2"])]
    df2 = df[df.subject == "S3"]
    df3 = df[df.subject == "S4"]
    splits = DatasetSplits(train=df1, val=df2, test=df3)
    with mock.patch.object(SequenceModelBase,
                           'get_model_train_test_dataset_splits',
                           return_value=splits):
        train_val_loaders = config.create_data_loaders()
        # Expected feature mean: mean of the training data (0, 0), (1, 10), (2, 20) = (1, 10)
        # Expected (bias-corrected) std estimate: std of (0, 0), (1, 10), (2, 20) = (1, 10)
        feature_stats = config.get_torch_dataset_for_inference(
            ModelExecutionMode.TRAIN).feature_statistics
        assert feature_stats is not None
        assert_tensors_equal(feature_stats.mean, [1, 10])
        assert_tensors_equal(feature_stats.std, [1, 10])

        train_items = list(
            ClassificationItemSequence.from_minibatch(b)
            for b in train_val_loaders[ModelExecutionMode.TRAIN])
        assert len(train_items) == 1, \
            "2 items in training set with batch size of 2 should return 1 minibatch"
        assert len(train_items[0]) == 2
        assert train_items[0][0].id == "S1"
        assert_tensors_equal(
            train_items[0][0].items[0].get_all_non_imaging_features(),
            [-1., -1., 1., 0., 1., 0.])
        assert_tensors_equal(
            train_items[0][0].items[1].get_all_non_imaging_features(),
            [0., 0., 0., 1., 0., 1.])
        assert train_items[0][1].id == "S2"
        assert_tensors_equal(
            train_items[0][1].items[0].get_all_non_imaging_features(),
            [1., 1., 0., 1., 1., 0.])
        val_items = list(
            ClassificationItemSequence.from_minibatch(b)
            for b in train_val_loaders[ModelExecutionMode.VAL])
        assert len(val_items) == 1
        assert len(val_items[0]) == 1
        assert val_items[0][0].id == "S3"
        # Items in the validation set should be normalized using the mean and std on the training data.
        # Hence, the non-image features (3, 30) should turn into (2, 2)
        assert_tensors_equal(
            val_items[0][0].items[0].get_all_non_imaging_features(),
            [2., 2., 1., 0., 1., 0.])

        # Check that the test set is also normalized correctly using the training mean and std.
        test_items = list(
            ClassificationItemSequence(**b) for b in
            config.get_torch_dataset_for_inference(ModelExecutionMode.TEST))
        assert test_items[0].id == "S4"
        # Check Non-image features of (4, 40)
        assert_tensors_equal(
            test_items[0].items[0].get_all_non_imaging_features(),
            [3., 3., 0., 1., 1., 0.])