Beispiel #1
0
    def create_torch_datasets(
            self,
            dataset_splits: DatasetSplits) -> Dict[ModelExecutionMode, Any]:
        """Build a SequenceDataset for each of the train/val/test splits.

        The training dataset is created first; its feature statistics are
        shared with the validation and test datasets so that all three use
        the same normalization.
        """
        from InnerEye.ML.dataset.sequence_dataset import SequenceDataset
        transforms = self.get_scalar_item_transform()
        # mypy: the per-split transforms are guaranteed to be set here.
        assert transforms.train is not None
        assert transforms.val is not None
        assert transforms.test is not None

        train_dataset = SequenceDataset(self,
                                        dataset_splits.train,
                                        name="training",
                                        sample_transform=transforms.train)
        shared_stats = train_dataset.feature_statistics
        val_dataset = SequenceDataset(self,
                                      dataset_splits.val,
                                      feature_statistics=shared_stats,
                                      name="validation",
                                      sample_transform=transforms.val)
        test_dataset = SequenceDataset(self,
                                       dataset_splits.test,
                                       feature_statistics=shared_stats,
                                       name="test",
                                       sample_transform=transforms.test)
        return {ModelExecutionMode.TRAIN: train_dataset,
                ModelExecutionMode.VAL: val_dataset,
                ModelExecutionMode.TEST: test_dataset}
    def create_torch_datasets(
            self,
            dataset_splits: DatasetSplits) -> Dict[ModelExecutionMode, Any]:
        """Create image sequence datasets for each execution mode.

        The training split is constructed first so that its feature
        statistics can be re-used by the validation and test datasets.
        """
        from InnerEye.ML.dataset.sequence_dataset import SequenceDataset
        transforms = self.get_image_sample_transforms()
        datasets: Dict[ModelExecutionMode, Any] = {}
        datasets[ModelExecutionMode.TRAIN] = SequenceDataset(
            self,
            dataset_splits.train,
            name="training",
            sample_transforms=transforms.train)  # type: ignore
        shared_stats = datasets[ModelExecutionMode.TRAIN].feature_statistics
        datasets[ModelExecutionMode.VAL] = SequenceDataset(
            self,
            dataset_splits.val,
            feature_statistics=shared_stats,
            name="validation",
            sample_transforms=transforms.val)  # type: ignore
        datasets[ModelExecutionMode.TEST] = SequenceDataset(
            self,
            dataset_splits.test,
            feature_statistics=shared_stats,
            name="test",
            sample_transforms=transforms.test)  # type: ignore
        return datasets
Beispiel #3
0
def test_visualization_for_different_target_weeks(test_output_dirs: TestOutputDirectories) -> None:
    """
    Check that the generated visualizations differ depending on which
    target week is being explained.
    """
    config = ToyMultiLabelSequenceModel(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.dataset_data_frame = _get_multi_label_sequence_dataframe()
    config.pre_process_dataset_dataframe()
    model = create_model_with_temperature_scaling(config)
    loader = SequenceDataset(
        config,
        data_frame=config.dataset_data_frame).as_data_loader(shuffle=False, batch_size=2)
    first_batch = next(iter(loader))
    model_inputs_and_labels = get_scalar_model_inputs_and_labels(config, model, first_batch)  # type: ignore

    visualizer = VisualizationMaps(model, config)
    # Pseudo-grad cam explaining the prediction at target sequence 2
    _, _, cam_pos2, probas_pos2 = visualizer.generate(
        model_inputs_and_labels.model_inputs,
        target_position=2,
        target_label_index=2)
    # Pseudo-grad cam explaining the prediction at target sequence 0
    _, _, cam_pos0, probas_pos0 = visualizer.generate(
        model_inputs_and_labels.model_inputs,
        target_position=0,
        target_label_index=0)
    assert cam_pos0.shape[1] == 1
    assert cam_pos2.shape[1] == 3
    # The two visualizations must not be identical
    assert np.any(cam_pos0 != cam_pos2)
    assert np.any(probas_pos2 != probas_pos0)
def test_visualization_with_sequence_model(
        use_combined_model: bool, imaging_feature_type: ImagingFeatureType,
        test_output_dirs: OutputFolderForTests) -> None:
    """Smoke-test VisualizationMaps.generate on a toy sequence model: check
    the shapes of the produced maps and the non-imaging plot labels."""
    config = ToySequenceModel(use_combined_model,
                              imaging_feature_type,
                              should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.dataset_data_frame = _get_mock_sequence_dataset()
    config.num_epochs = 1
    model = config.create_model()
    if config.use_gpu:
        model = model.cuda()
    loader = SequenceDataset(
        config,
        data_frame=config.dataset_data_frame).as_data_loader(shuffle=False,
                                                             batch_size=2)
    # Patch the image loading that happens when a dataset item is accessed.
    fake_image = ImageAndSegmentations[np.ndarray](
        images=np.random.uniform(0, 1, SCAN_SIZE),
        segmentations=np.random.randint(0, 2, SCAN_SIZE))
    with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats',
                    return_value=fake_image):
        batch = next(iter(loader))
        if config.use_gpu:
            batch = transfer_batch_to_device(batch, torch.device(0))
        model_inputs_and_labels = get_scalar_model_inputs_and_labels(
            model, target_indices=config.get_target_indices(),
            sample=batch)  # type: ignore
    n_sequences = model_inputs_and_labels.model_inputs[0].shape[1]
    n_subjects = len(model_inputs_and_labels.subject_ids)
    visualizer = VisualizationMaps(model, config)
    guided_grad_cams, grad_cams, pseudo_cam_non_img, probas = visualizer.generate(
        model_inputs_and_labels.model_inputs)
    if use_combined_model:
        # When segmentations are included, every sequence position
        # contributes two channels (image + segmentation).
        if imaging_feature_type == ImagingFeatureType.ImageAndSegmentation:
            expected_channels = n_sequences * 2
        else:
            expected_channels = n_sequences
        assert guided_grad_cams.shape[:2] == (n_subjects, expected_channels)
        assert grad_cams.shape[:2] == (n_subjects, expected_channels)
    else:
        assert guided_grad_cams is None
        assert grad_cams is None
        assert pseudo_cam_non_img.shape[:2] == (n_subjects, n_sequences)
        assert probas.shape[0] == n_subjects
    non_image_features = config.numerical_columns + config.categorical_columns
    non_imaging_plot_labels = visualizer._get_non_imaging_plot_labels(
        model_inputs_and_labels.data_item,
        non_image_features,
        index=0,
        target_position=3)
    # One label per (position, feature) pair for positions 0..3.
    expected_labels = [f"{feature}_{position}"
                       for position in range(4)
                       for feature in ("numerical1", "numerical2", "cat1")]
    assert non_imaging_plot_labels == expected_labels
Beispiel #5
0
def test_sequence_dataloader() -> None:
    """
    Test if we can create a data loader from the dataset, and recover the items as expected in batched form.
    Including instances where not all elements of the sequence have labels.
    """
    csv_string = StringIO("""subject,seq,path,value,scalar1,scalar2,META
S1,0,foo.nii,,0,0,M1
S1,1,,True,1.1,1.2,M2
S2,0,bar.nii,False,2.1,2.2,M3
S2,1,,False,2.0,2.0,M4
""")
    df = pd.read_csv(csv_string, sep=",", dtype=str)
    config = SequenceModelBase(image_file_column=None,
                               label_value_column="value",
                               numerical_columns=["scalar1"],
                               sequence_target_positions=[1],
                               sequence_column="seq",
                               local_dataset=Path.cwd(),
                               should_validate=False)
    dataset = SequenceDataset(config, data_frame=df)
    assert len(dataset) == 2
    data_loader = dataset.as_data_loader(shuffle=False,
                                         batch_size=2,
                                         num_dataload_workers=0)
    # We have 2 subjects, with a batch size of 2 those should be turned into 1 batch
    data_loader_output = list(data_loader)
    assert len(data_loader_output) == 1
    loaded = [ClassificationItemSequence(**i) for i in data_loader_output]
    assert loaded[0].id == ["S1", "S2"]
    assert isinstance(loaded[0].items[0][0], ScalarItem)
    assert loaded[0].items[0][0].metadata.id == "S1"
    assert loaded[0].items[0][1].metadata.id == "S1"
    assert loaded[0].items[1][0].metadata.id == "S2"
    assert loaded[0].items[1][1].metadata.id == "S2"

    # The batched sequence data are awkward to work with. Check if we can un-roll them correctly via
    # from_minibatch
    un_batched = ClassificationItemSequence.from_minibatch(
        data_loader_output[0])
    assert len(un_batched) == 2
    for i in range(2):
        assert un_batched[i].id == dataset.items[i].id
        assert len(un_batched[i].items) == len(dataset.items[i].items)
        for j in range(len(un_batched[i].items)):
            assert un_batched[i].items[j].metadata.id == dataset.items[
                i].items[j].metadata.id
Beispiel #6
0
def test_seq_dataset_loader() -> None:
    """Load the sequence classification test dataset, check how rows are
    grouped into subject sequences, and exercise the custom collate
    function on those sequences."""
    dummy_dataset = full_ml_test_data_path(
    ) / "sequence_data_for_classification" / "dataset.csv"
    df = pd.read_csv(dummy_dataset, sep=",", dtype=str)
    model_config = SequenceModelBase(
        image_file_column="IMG",
        label_value_column="Label",
        numerical_columns=["NUM1", "NUM2", "NUM3", "NUM4"],
        sequence_target_positions=[8],
        sequence_column="Position",
        local_dataset=Path(),
        should_validate=False)
    dataset = SequenceDataset(args=model_config, data_frame=df)
    assert len(dataset) == 2
    # Patch the image loading that will be called once we access a dataset item
    stub_image = ImageAndSegmentations[torch.Tensor](
        images=torch.ones(1), segmentations=torch.empty(0))
    with mock.patch('InnerEye.ML.dataset.scalar_sample.load_images_and_stack',
                    return_value=stub_image):
        first = ClassificationItemSequence(**dataset[0])
        second = ClassificationItemSequence(**dataset[1])
        len_2627 = 3
        len_3250 = 9
        assert first.id == "2627.00001"
        assert len(first.items) == len_2627
        assert second.id == "3250.00005"
        assert len(second.items) == len_3250

        # Data loaders use a customized collate function, that must work with the sequences too.
        collated = collate_with_metadata([dataset[0], dataset[1]])
        assert collated["id"] == ["2627.00001", "3250.00005"]
        # All subject sequences should be turned into lists of lists.
        assert isinstance(collated["items"], list)
        assert len(collated["items"]) == 2
        for subject_index, expected_len in ((0, len_2627), (1, len_3250)):
            assert isinstance(collated["items"][subject_index], list)
            assert len(collated["items"][subject_index]) == expected_len
        back_to_items = ClassificationItemSequence(**collated)
        assert back_to_items.id == ["2627.00001", "3250.00005"]