def __call__(self, item: ScalarItem) -> ScalarItem:
    """
    Applies the wrapped transform to either the segmentations or the images of an item.

    When `self.transform.for_segmentation_input_maps` is truthy, the transform is applied to
    `item.segmentations`; otherwise it is applied to `item.images`.
    :param item: The item whose images or segmentations should be transformed.
    :return: The result of `item.clone_with_overrides`, with the transformed field replaced.
    :raises ValueError: If the transform targets segmentations but `item.segmentations` is None.
    """
    if not self.transform.for_segmentation_input_maps:
        return item.clone_with_overrides(images=self.transform(item.images))

    if item.segmentations is None:
        # Adjacent string literals are joined verbatim, so the first literal needs an explicit
        # trailing space to avoid the words running together in the message.
        raise ValueError(
            "A segmentation data augmentation transform has been "
            "specified but no segmentations has been loaded.")

    return item.clone_with_overrides(
        segmentations=self.transform(item.segmentations))
Beispiel #2
0
    def __call__(self, item: ScalarItem) -> ScalarItem:
        """
        Applies the configured image and/or segmentation transforms to an item.

        `self.image_transform` (if not None) is applied to `item.images`, and
        `self.segmentation_transform` (if not None) is applied to `item.segmentations`.
        :param item: The item to transform.
        :return: The result of `item.clone_with_overrides` with every transformed field replaced,
        or the unmodified `item` when neither transform is set.
        """
        overrides = {}
        if self.image_transform is not None:
            overrides["images"] = self.image_transform(item.images)
        if self.segmentation_transform is not None:
            # The result of the segmentation transform must end up in the returned item even
            # when no image transform is configured.
            overrides["segmentations"] = self.segmentation_transform(
                item.segmentations)

        if not overrides:
            return item
        return item.clone_with_overrides(**overrides)
    def get_input_tensors(self, item: ScalarItem) -> List[torch.Tensor]:
        """
        Builds the list of tensors that the forward pass consumes from a scalar item.

        Depending on `self.imaging_feature_type`, the first tensor is the one-hot encoded
        segmentation, the one-hot segmentation concatenated with the images, or the images alone.
        The non-imaging features are appended as a second tensor when
        `self.image_and_non_image_features_aggregator` is truthy.
        :param item: The item to convert.
        :return: A list with one or two tensors.
        :raises ValueError: If segmentations are required but `item.segmentations` is None.
        """
        on_gpu = self.is_model_on_gpu()
        if self.use_mixed_precision and on_gpu:
            target_dtype = torch.float16
        else:
            target_dtype = torch.float32

        feature_type = self.imaging_feature_type
        needs_segmentation = feature_type == ImagingFeatureType.Segmentation \
            or feature_type == ImagingFeatureType.ImageAndSegmentation

        if not needs_segmentation:
            tensors = [item.images.to(dtype=target_dtype, copy=True)]
        else:
            if item.segmentations is None:
                raise ValueError("Expected item.segmentations to not be None")
            # When individual sequence positions are loaded, the segmentation arrives with 4 dimensions,
            # while segmentation_to_one_hot is fed a 5-dimensional tensor: add a leading dimension for
            # the conversion and strip it again afterwards.
            labels = item.segmentations
            add_leading_dim = labels.ndimension() == 4
            if add_leading_dim:
                labels = labels.unsqueeze(dim=0)
            one_hot = segmentation_to_one_hot(labels,
                                              use_gpu=on_gpu,
                                              result_dtype=target_dtype)
            if add_leading_dim:
                one_hot = one_hot.squeeze(dim=0)
            tensors = [one_hot]

            if self.imaging_feature_type == ImagingFeatureType.ImageAndSegmentation:
                tensors.append(item.images.to(dtype=target_dtype, copy=True))
                # Concatenate along the first dimension for 4-dimensional images, otherwise along the second.
                cat_dim = 0 if item.images.ndimension() == 4 else 1
                tensors = [torch.cat(tensors, dim=cat_dim)]

        if self.image_and_non_image_features_aggregator:
            tensors.append(item.get_all_non_imaging_features())
        return tensors
Beispiel #4
0
 def _create(features: List) -> torch.Tensor:
     """
     Builds a ScalarItem that uses `features` for both its categorical and its numerical
     non-image features, and returns the result of its `get_all_non_imaging_features()`.
     :param features: Values that are converted to a float tensor for both feature fields.
     :return: The value returned by `ScalarItem.get_all_non_imaging_features`.
     """
     # Two separate tensors are built on purpose, so that the two fields do not share storage.
     categorical = torch.tensor(features).float()
     numerical = torch.tensor(features).float()
     scalar_item = ScalarItem(
         segmentations=torch.empty(0),
         metadata=GeneralSampleMetadata(id="foo"),
         images=torch.tensor([]),
         label=torch.tensor([]),
         categorical_non_image_features=categorical,
         numerical_non_image_features=numerical)
     return scalar_item.get_all_non_imaging_features()
 def __call__(self, item: ScalarItem) -> ScalarItem:
     """
     Runs `mri_window` on the images of the item and returns a clone holding the result.

     The images are converted to a numpy array, passed to `mri_window` together with
     `self.output_range`, `self.sharpen` and `self.tail` (no mask), and the first element of
     the returned value is converted back to a tensor with the dtype and device of the input.
     :param item: The item whose images should be processed.
     :return: The result of `item.clone_with_overrides` with the new images.
     """
     windowing_result = mri_window(image_in=item.images.numpy(),
                                   output_range=self.output_range,
                                   mask=None,
                                   sharpen=self.sharpen,
                                   tail=self.tail)
     # Only the first element of the mri_window result is used as the new image.
     new_images = torch.tensor(windowing_result[0],
                               dtype=item.images.dtype,
                               device=item.images.device)
     return item.clone_with_overrides(images=new_images)
 def get_input_tensors(self, item: ScalarItem) -> List[torch.Tensor]:
     """
     Picks the tensor that the forward pass consumes from a scalar item.
     :param item: The item to convert.
     :return: A single-element list: the images if they contain at least one element,
     otherwise the item's non-imaging features.
     """
     has_image_data = item.images.numel() > 0
     model_input = item.images if has_image_data else item.get_all_non_imaging_features()
     return [model_input]
Beispiel #7
0
def _create_scalar_items(length: int,
                         label_value: float = 1.0) -> List[ScalarItem]:
    """
    Creates a list of ScalarItem objects with empty features, images and segmentations.
    :param length: The number of items to create. Item number `i` gets `sequence_position=i`.
    :param label_value: The value stored in the single-element label tensor of every item.
    :return: A list with `length` items, all with metadata id "foo".
    """
    scalar_items: List[ScalarItem] = []
    for position in range(length):
        metadata = GeneralSampleMetadata(id="foo", sequence_position=position)
        scalar_items.append(
            ScalarItem(metadata=metadata,
                       numerical_non_image_features=torch.tensor([]),
                       categorical_non_image_features=torch.tensor([]),
                       label=torch.tensor([label_value]),
                       images=torch.tensor([]),
                       segmentations=torch.tensor([])))
    return scalar_items
Beispiel #8
0
def test_standardize_features() -> None:
    """
    Test if the non-image feature can be normalized to mean 0, std 1.
    :return:
    """
    set_random_seed(1234)
    expected_mean = torch.tensor([[123, 2, 3], [4, 5, 6]])
    expected_std = torch.tensor([[0, 2, 3], [3, 4, 4]])
    feature_size = (2, 3)
    sequences: List[ClassificationItemSequence] = []
    for s in range(1000):
        items = []
        seq_length = torch.randint(low=3, high=6, size=(1, )).item()
        for i in range(seq_length):  # type: ignore
            # All features are random Gaussian, apart from feature 0 which is constant.
            # Normalization must be able to deal with constant features when dividing by standard deviation.
            features = torch.randn(size=feature_size, dtype=torch.float32
                                   ) * expected_std + expected_mean
            # Randomly put some infinite values in the vector
            features[s % 2, s %
                     3] = np.inf if torch.rand(1) > 0.9 else features[s % 2,
                                                                      s % 3]
            features[0, 0] = expected_mean[0, 0]
            item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                              numerical_non_image_features=features,
                              categorical_non_image_features=features,
                              label=torch.tensor([]),
                              images=torch.tensor([]),
                              segmentations=torch.tensor([]))
            items.append(item)
        sequences.append(ClassificationItemSequence(id="foo", items=items))
    mean_std = FeatureStatistics.from_data_sources(sequences)
    assert mean_std.mean.shape == feature_size
    assert mean_std.std.shape == feature_size

    assert_tensors_equal(mean_std.mean, expected_mean, 0.07)
    assert_tensors_equal(mean_std.std, expected_std, 0.07)

    # After normalization, mean should be 0, and std should be 1.
    standardized_seq = mean_std.standardize(sequences)
    mean_std_from_standardized = FeatureStatistics.from_data_sources(
        standardized_seq)
    # After normalization, the mean should be 0, apart from the constant feature, which should be left untouched,
    # hence its mean is the original feature value.
    expected_mean_from_standardized = torch.zeros(feature_size)
    expected_mean_from_standardized[0, 0] = expected_mean[0, 0]
    expected_std_from_standardized = torch.ones(feature_size)
    expected_std_from_standardized[0, 0] = 0.0
    assert_tensors_equal(mean_std_from_standardized.mean,
                         expected_mean_from_standardized,
                         abs=1e-5)
    assert_tensors_equal(mean_std_from_standardized.std,
                         expected_std_from_standardized,
                         abs=1e-5)
Beispiel #9
0
def get_scalar_model_inputs_and_labels(
        model_config: ScalarModelBase, model: torch.nn.Module,
        sample: Dict[str, Any]) -> ScalarModelInputsAndLabels:
    """
    Extracts the model input tensors, labels and subject IDs from a data loader sample, for models that
    predict scalars.
    :param model_config: The configuration object for the model. If it is a SequenceModelBase, the sample is
    interpreted as a minibatch of sequences, otherwise as a ScalarItem.
    :param model: The instantiated PyTorch model. A DataParallelModel is unwrapped via `get_module()` first.
    :param sample: A training sample, as returned by a PyTorch data loader (dictionary mapping from field name to value)
    :return: An instance of ScalarModelInputsAndLabels, containing the list of model input tensors,
    label tensor, subject IDs, and the data item reconstructed from the data loader output
    """
    if isinstance(model, DataParallelModel):
        model = model.get_module()

    if not isinstance(model_config, SequenceModelBase):
        # Plain scalar model: the sample is a single (batched) ScalarItem.
        plain_model: DeviceAwareModule[ScalarItem,
                                       torch.Tensor] = model  # type: ignore
        batch_item = ScalarItem.from_dict(sample)
        item_ids = [str(meta.id) for meta in batch_item.metadata]  # type: ignore
        item_inputs = plain_model.get_input_tensors(batch_item)
        return ScalarModelInputsAndLabels[ScalarItem, torch.Tensor](
            model_inputs=item_inputs,
            labels=batch_item.label,
            subject_ids=item_ids,
            data_item=batch_item)

    # Sequence model: the sample is a minibatch of item sequences.
    seq_model: DeviceAwareModule[List[ClassificationItemSequence],
                                 torch.Tensor] = model  # type: ignore
    batch_sequences = ClassificationItemSequence.from_minibatch(sample)
    sequence_ids = [sequence.id for sequence in batch_sequences]
    sequence_labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
        sequences=batch_sequences,
        target_indices=model_config.get_target_indices())
    sequence_inputs = seq_model.get_input_tensors(batch_sequences)
    return ScalarModelInputsAndLabels[List[ClassificationItemSequence],
                                      torch.Tensor](
                                          model_inputs=sequence_inputs,
                                          labels=sequence_labels,
                                          subject_ids=sequence_ids,
                                          data_item=batch_sequences)
def get_scalar_model_inputs_and_labels(model: torch.nn.Module,
                                       target_indices: List[int],
                                       sample: Dict[str, Any]) -> ScalarModelInputsAndLabels:
    """
    Extracts the model input tensors, labels and subject IDs from a data loader sample, for models that
    predict scalars.
    :param model: The instantiated PyTorch model.
    :param target_indices: If this list is non-empty, the model is treated as a sequence model, and the model
    inputs and labels are built for a model that predicts those specific positions in the sequence. If the list
    is empty, the model is treated as a normal (non-sequence) model.
    :param sample: A training sample, as returned by a PyTorch data loader (dictionary mapping from field name to value)
    :return: An instance of ScalarModelInputsAndLabels, containing the list of model input tensors,
    label tensor, subject IDs, and the data item reconstructed from the data loader output
    """
    if not target_indices:
        # Non-sequence model: the sample is a single (batched) ScalarItem.
        plain_model: DeviceAwareModule[ScalarItem, torch.Tensor] = model  # type: ignore
        batch_item = ScalarItem.from_dict(sample)
        item_ids = [str(meta.id) for meta in batch_item.metadata]  # type: ignore
        item_inputs = plain_model.get_input_tensors(batch_item)
        return ScalarModelInputsAndLabels[ScalarItem](
            model_inputs=item_inputs,
            labels=batch_item.label,
            subject_ids=item_ids,
            data_item=batch_item
        )

    # Sequence model: the sample is a minibatch of item sequences.
    seq_model: DeviceAwareModule[List[ClassificationItemSequence], torch.Tensor] = model  # type: ignore
    batch_sequences = ClassificationItemSequence.from_minibatch(sample)
    sequence_ids = [sequence.id for sequence in batch_sequences]
    sequence_labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
        sequences=batch_sequences,
        target_indices=target_indices
    )
    sequence_inputs = seq_model.get_input_tensors(batch_sequences)
    return ScalarModelInputsAndLabels[List[ClassificationItemSequence]](
        model_inputs=sequence_inputs,
        labels=sequence_labels,
        subject_ids=sequence_ids,
        data_item=batch_sequences
    )
def test_multi_segmentation_encoder() -> None:
    """
    Checks the output sizes of MultiSegmentationEncoder, both when the channels are encoded jointly and
    when they are encoded separately, and checks the shape of the model input tensor that the encoder
    creates from a ScalarItem.
    """
    scan_size = (25, 33, 65)
    batch_size = 3
    num_image_channels = 2
    total_segmentation_channels = num_image_channels * HDF5_NUM_SEGMENTATION_CLASSES
    model_input_shape = (batch_size, total_segmentation_channels) + scan_size

    # Case 1: all channels are encoded jointly.
    encoder = MultiSegmentationEncoder(num_image_channels=num_image_channels,
                                       encode_channels_jointly=True)
    model_input = torch.ones(model_input_shape)
    aggregated = encoder.encode_and_aggregate(model_input)
    channels_when_joint = _expected_output_channels(total_segmentation_channels)
    assert aggregated.size() == (batch_size, channels_when_joint, 1, 1, 1)
    prediction = encoder(model_input)
    assert prediction.size() == (batch_size, 1)

    # Case 2: each image channel is encoded separately.
    encoder = MultiSegmentationEncoder(num_image_channels=num_image_channels,
                                       encode_channels_jointly=False)
    model_input = torch.ones(model_input_shape)
    aggregated = encoder.encode_and_aggregate(model_input)
    channels_per_image_channel = _expected_output_channels(
        HDF5_NUM_SEGMENTATION_CLASSES)
    # The features generated for each image channel are concatenated across the 2 image channels.
    assert aggregated.size() == (batch_size, channels_per_image_channel * 2, 1, 1, 1)
    prediction = encoder(model_input)
    assert prediction.size() == (batch_size, 1)

    # Test that the encoder can correctly convert from a scalar data item to the one-hot encoded model input tensor
    segmentations = torch.ones((batch_size, num_image_channels, *scan_size))
    scalar_item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                             label=torch.empty(1),
                             numerical_non_image_features=torch.empty(1),
                             categorical_non_image_features=torch.empty(1),
                             images=torch.empty(1),
                             segmentations=segmentations)
    input_tensors = encoder.get_input_tensors(scalar_item)
    assert len(input_tensors) == 1
    expected_shape = (batch_size,
                      HDF5_NUM_SEGMENTATION_CLASSES * num_image_channels,
                      *scan_size)
    assert input_tensors[0].shape == expected_shape
def test_dataloader_speed(test_output_dirs: OutputFolderForTests,
                          num_dataload_workers: int, shuffle: bool) -> None:
    """
    Test how dataloaders work when using multiple processes.

    Iterates twice over a 4-subject dataset with batch size 1 and checks that, apart from the very first
    batch, every item is available in under 0.1 seconds. Also checks the order in which items are returned.
    :param test_output_dirs: Not used in the body of this test.
    :param num_dataload_workers: Number of workers used both for the data loader and for the dataset reader.
    :param shuffle: If True, the data loader shuffles the items.
    """
    ml_util.set_random_seed(0)
    # The dataset should only contain the file name stem, without extension.
    csv_string = StringIO("""subject,channel,path,value,scalar1
S1,image,4be9beed-5861-fdd2-72c2-8dd89aadc1ef
S1,label,,True,1.0
S2,image,6ceacaf8-abd2-ffec-2ade-d52afd6dd1be
S2,label,,True,2.0
S3,image,61bc9d73-9fbb-bd7d-c06b-eeffbafabcc4
S3,label,,False,3.0
S4,image,61bc9d73-9fbb-bd7d-c06b-eeffbafabcc4
S4,label,,False,3.0
""")
    # No image channels are configured: the labels and the numerical feature "scalar1" are both
    # read from the rows of the "label" channel.
    args = ScalarModelBase(image_channels=[],
                           label_channels=["label"],
                           label_value_column="value",
                           non_image_feature_channels=["label"],
                           numerical_columns=["scalar1"],
                           num_dataload_workers=num_dataload_workers,
                           num_dataset_reader_workers=num_dataload_workers,
                           avoid_process_spawn_in_data_loaders=True,
                           should_validate=False)
    dataset = ScalarDataset(args,
                            data_frame=pd.read_csv(csv_string, dtype=str))
    assert len(dataset) == 4
    num_epochs = 2
    total_start_time = time.time()
    # The same loader object is iterated once per epoch.
    loader = dataset.as_data_loader(shuffle=shuffle, batch_size=1)
    # The order in which items are expected in each epoch, when applying shuffling, and using 1 dataloader worker
    # This was determined before making any changes to the dataloader logic
    # (that is, when the as_data_loader method returns an instance of DataLoader, rather than RepeatDataLoader)
    expected_item_order = [
        ["S2", "S1", "S4", "S3"],
        ["S4", "S3", "S1", "S2"],
    ]
    for epoch in range(num_epochs):
        actual_item_order = []
        print(f"Starting epoch {epoch}")
        epoch_start_time = time.time()
        item_start_time = time.time()
        for i, item_dict in enumerate(loader):
            # Time that was spent waiting for the loader to hand out this item.
            item_load_time = time.time() - item_start_time
            item = ScalarItem.from_dict(item_dict)
            # noinspection PyTypeChecker
            sample_id = item.metadata[0].id  # type: ignore
            print(
                f"Loading item {i} with ID = {sample_id} in {item_load_time:0.8f} sec"
            )
            if shuffle:
                actual_item_order.append(sample_id)
            else:
                # Without shuffling, items must come back in the order of the CSV file.
                assert sample_id == f"S{i + 1}"
            if not (epoch == 0 and i == 0):
                assert item_load_time < 0.1, f"We should only see significant item load times in the first batch " \
                                             f"of the first epoch, but got loading time of {item_load_time:0.2f} sec" \
                                             f" in epoch {epoch} batch {i}"
            # Sleep a bit so that the worker process can fill in items
            if num_dataload_workers > 0:
                time.sleep(0.05)
            # Restart the clock only after the sleep, so that the sleep is not counted as load time.
            item_start_time = time.time()
        # The expected shuffling order is only known for the single-worker case.
        if shuffle and num_dataload_workers == 1:
            assert actual_item_order == expected_item_order[
                epoch], f"Item in wrong order for epoch {epoch}"
        total_epoch_time = time.time() - epoch_start_time
        print(f"Total time for epoch {epoch}: {total_epoch_time} sec")
    total_time = time.time() - total_start_time
    print(f"Total time for all epochs: {total_time} sec")