def __call__(self, item: ScalarItem) -> ScalarItem:
    if self.transform.for_segmentation_input_maps:
        if item.segmentations is None:
            raise ValueError("A segmentation data augmentation transform has been "
                             "specified, but no segmentations have been loaded.")
        return item.clone_with_overrides(
            segmentations=self.transform(item.segmentations))
    else:
        return item.clone_with_overrides(
            images=self.transform(item.images))
def __call__(self, item: ScalarItem) -> ScalarItem:
    if self.image_transform is not None:
        if self.segmentation_transform is not None:
            return item.clone_with_overrides(
                images=self.image_transform(item.images),
                segmentations=self.segmentation_transform(item.segmentations))
        return item.clone_with_overrides(
            images=self.image_transform(item.images))
    if self.segmentation_transform is not None:
        return item.clone_with_overrides(
            segmentations=self.segmentation_transform(item.segmentations))
    return item
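# A minimal usage sketch for the transform wrapper above, assuming its __call__
# lives on a class (named ImageAndSegmentationTransform here purely for
# illustration) that stores `image_transform` and `segmentation_transform`.
# The flip lambdas stand in for real augmentation callables.
transform = ImageAndSegmentationTransform(
    image_transform=lambda t: t.flip(dims=[-1]),
    segmentation_transform=lambda t: t.flip(dims=[-1]))  # keep maps aligned with images
item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                  images=torch.rand(1, 4, 5, 6),
                  segmentations=torch.zeros(1, 4, 5, 6),
                  label=torch.tensor([1.0]),
                  numerical_non_image_features=torch.tensor([]),
                  categorical_non_image_features=torch.tensor([]))
transformed = transform(item)  # images and segmentations are flipped consistently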
def get_input_tensors(self, item: ScalarItem) -> List[torch.Tensor]:
    """
    Transforms a classification item into a list of torch.Tensors that the forward pass can consume.

    :param item: ClassificationItem
    :return: A list of model input tensors.
    """
    use_gpu = self.is_model_on_gpu()
    result_dtype = torch.float16 if self.use_mixed_precision and use_gpu else torch.float32
    if self.imaging_feature_type in (ImagingFeatureType.Segmentation,
                                     ImagingFeatureType.ImageAndSegmentation):
        if item.segmentations is None:
            raise ValueError("Expected item.segmentations to not be None")
        # Special case for loading individual positions in the sequence model: the images arrive
        # as [C, Z, X, Y], but segmentation_to_one_hot expects [B, C, Z, X, Y].
        segmentation_multilabel = item.segmentations
        is_4dim = segmentation_multilabel.ndimension() == 4
        if is_4dim:
            segmentation_multilabel = segmentation_multilabel.unsqueeze(dim=0)
        segmentation_one_hot = segmentation_to_one_hot(segmentation_multilabel,
                                                       use_gpu=use_gpu,
                                                       result_dtype=result_dtype)
        if is_4dim:
            segmentation_one_hot = segmentation_one_hot.squeeze(dim=0)
        input_tensors = [segmentation_one_hot]
        if self.imaging_feature_type == ImagingFeatureType.ImageAndSegmentation:
            input_tensors.append(item.images.to(dtype=result_dtype, copy=True))
            _dim = 0 if item.images.ndimension() == 4 else 1
            input_tensors = [torch.cat(input_tensors, dim=_dim)]
    else:
        input_tensors = [item.images.to(dtype=result_dtype, copy=True)]
    if self.image_and_non_image_features_aggregator:
        input_tensors.append(item.get_all_non_imaging_features())
    return input_tensors
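# Shape-bookkeeping sketch for the 4D special case above: a single sequence
# position arrives as [C, Z, X, Y], so a batch axis is added before one-hot
# encoding and removed again afterwards. torch.nn.functional.one_hot is used
# here only as a stand-in for segmentation_to_one_hot, which is assumed to map
# class indices in [B, C, Z, X, Y] to one-hot channels.
import torch
import torch.nn.functional as F

seg = torch.randint(0, 3, (2, 4, 5, 6))          # [C, Z, X, Y] with 3 classes
is_4dim = seg.ndimension() == 4
if is_4dim:
    seg = seg.unsqueeze(dim=0)                    # -> [1, C, Z, X, Y]
one_hot = F.one_hot(seg, num_classes=3)           # -> [1, C, Z, X, Y, 3]
one_hot = one_hot.movedim(-1, 2).flatten(1, 2)    # -> [1, C * 3, Z, X, Y]
if is_4dim:
    one_hot = one_hot.squeeze(dim=0)              # back to the batch-free layout
assert one_hot.shape == (2 * 3, 4, 5, 6)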
def _create(features: List) -> torch.Tensor:
    return ScalarItem(segmentations=torch.empty(0),
                      metadata=GeneralSampleMetadata(id="foo"),
                      images=torch.tensor([]),
                      label=torch.tensor([]),
                      categorical_non_image_features=torch.tensor(features).float(),
                      numerical_non_image_features=torch.tensor(features).float()
                      ).get_all_non_imaging_features()
def __call__(self, item: ScalarItem) -> ScalarItem:
    return item.clone_with_overrides(
        images=torch.tensor(mri_window(image_in=item.images.numpy(),
                                       output_range=self.output_range,
                                       mask=None,
                                       sharpen=self.sharpen,
                                       tail=self.tail)[0],
                            dtype=item.images.dtype,
                            device=item.images.device))
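# A minimal sketch of what an intensity-window transform like the one above
# typically computes: clip intensities to a window and rescale linearly into
# output_range. The real mri_window additionally handles masking, sharpening,
# and tails; this simplified version is an assumption, not its implementation.
import numpy as np

def simple_window(image_in: np.ndarray,
                  level: float,
                  width: float,
                  output_range: tuple = (0.0, 1.0)) -> np.ndarray:
    lo, hi = level - width / 2, level + width / 2
    clipped = np.clip(image_in, lo, hi)
    scaled = (clipped - lo) / (hi - lo)  # now in [0, 1]
    return output_range[0] + scaled * (output_range[1] - output_range[0])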
def get_input_tensors(self, item: ScalarItem) -> List[torch.Tensor]:
    """
    Transforms a classification item into a list of torch.Tensors that the forward pass can consume.

    :param item: ClassificationItem
    :return: A list with a single tensor: the images if present, otherwise the non-imaging features.
    """
    if item.images.numel() > 0:
        return [item.images]
    else:
        return [item.get_all_non_imaging_features()]
def _create_scalar_items(length: int, label_value: float = 1.0) -> List[ScalarItem]:
    return [ScalarItem(metadata=GeneralSampleMetadata(id="foo", sequence_position=x),
                       numerical_non_image_features=torch.tensor([]),
                       categorical_non_image_features=torch.tensor([]),
                       label=torch.tensor([label_value]),
                       images=torch.tensor([]),
                       segmentations=torch.tensor([]))
            for x in range(length)]
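# Usage sketch for the helper above: build a 3-item sequence with increasing
# sequence_position values and wrap it in a ClassificationItemSequence, as the
# tests in this file do. Reading sequence_position off the metadata assumes it
# is stored as a plain attribute.
items = _create_scalar_items(length=3, label_value=0.0)
assert [item.metadata.sequence_position for item in items] == [0, 1, 2]
sequence = ClassificationItemSequence(id="foo", items=items)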
def test_standardize_features() -> None:
    """
    Test if the non-image features can be normalized to mean 0, std 1.
    """
    set_random_seed(1234)
    expected_mean = torch.tensor([[123, 2, 3], [4, 5, 6]])
    expected_std = torch.tensor([[0, 2, 3], [3, 4, 4]])
    feature_size = (2, 3)
    sequences: List[ClassificationItemSequence] = []
    for s in range(1000):
        items = []
        seq_length = torch.randint(low=3, high=6, size=(1,)).item()
        for i in range(seq_length):  # type: ignore
            # All features are random Gaussian, apart from feature 0 which is constant.
            # Normalization must be able to deal with constant features when dividing by standard deviation.
            features = torch.randn(size=feature_size, dtype=torch.float32) * expected_std + expected_mean
            # Randomly put some infinite values in the vector
            features[s % 2, s % 3] = np.inf if torch.rand(1) > 0.9 else features[s % 2, s % 3]
            features[0, 0] = expected_mean[0, 0]
            item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                              numerical_non_image_features=features,
                              categorical_non_image_features=features,
                              label=torch.tensor([]),
                              images=torch.tensor([]),
                              segmentations=torch.tensor([]))
            items.append(item)
        sequences.append(ClassificationItemSequence(id="foo", items=items))
    mean_std = FeatureStatistics.from_data_sources(sequences)
    assert mean_std.mean.shape == feature_size
    assert mean_std.std.shape == feature_size
    assert_tensors_equal(mean_std.mean, expected_mean, 0.07)
    assert_tensors_equal(mean_std.std, expected_std, 0.07)
    # After standardization, the mean should be 0 and the std should be 1.
    standardized_seq = mean_std.standardize(sequences)
    mean_std_from_standardized = FeatureStatistics.from_data_sources(standardized_seq)
    # After normalization, the mean should be 0, apart from the constant feature, which is left untouched,
    # hence its mean is the original feature value.
    expected_mean_from_standardized = torch.zeros(feature_size)
    expected_mean_from_standardized[0, 0] = expected_mean[0, 0]
    expected_std_from_standardized = torch.ones(feature_size)
    expected_std_from_standardized[0, 0] = 0.0
    assert_tensors_equal(mean_std_from_standardized.mean, expected_mean_from_standardized, abs=1e-5)
    assert_tensors_equal(mean_std_from_standardized.std, expected_std_from_standardized, abs=1e-5)
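# A minimal sketch of the standardization behaviour the test above asserts:
# z-score each feature, but guard against division by zero so that constant
# features (std == 0) pass through unchanged. This mirrors the expected values
# in the test; FeatureStatistics' actual implementation may differ in detail.
def standardize_safe(features: torch.Tensor,
                     mean: torch.Tensor,
                     std: torch.Tensor) -> torch.Tensor:
    # Where std is zero, divide by 1 instead so the constant feature is untouched.
    safe_std = torch.where(std == 0, torch.ones_like(std), std)
    return torch.where(std == 0, features, (features - mean) / safe_std)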
def get_scalar_model_inputs_and_labels(model_config: ScalarModelBase,
                                       model: torch.nn.Module,
                                       sample: Dict[str, Any]) -> ScalarModelInputsAndLabels:
    """
    For a model that predicts scalars, gets the model input tensors from a sample returned by the data loader.

    :param model_config: The configuration object for the model.
    :param model: The instantiated PyTorch model.
    :param sample: A training sample, as returned by a PyTorch data loader (dictionary mapping from field name
        to value).
    :return: An instance of ScalarModelInputsAndLabels, containing the list of model input tensors, the label
        tensor, the subject IDs, and the data item reconstructed from the data loader output.
    """
    if isinstance(model, DataParallelModel):
        model = model.get_module()
    if isinstance(model_config, SequenceModelBase):
        sequence_model: DeviceAwareModule[List[ClassificationItemSequence], torch.Tensor] = model  # type: ignore
        sequences = ClassificationItemSequence.from_minibatch(sample)
        subject_ids = [x.id for x in sequences]
        labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
            sequences=sequences,
            target_indices=model_config.get_target_indices())
        model_inputs = sequence_model.get_input_tensors(sequences)
        return ScalarModelInputsAndLabels[List[ClassificationItemSequence], torch.Tensor](
            model_inputs=model_inputs,
            labels=labels,
            subject_ids=subject_ids,
            data_item=sequences)
    else:
        scalar_model: DeviceAwareModule[ScalarItem, torch.Tensor] = model  # type: ignore
        scalar_item = ScalarItem.from_dict(sample)
        subject_ids = [str(x.id) for x in scalar_item.metadata]  # type: ignore
        model_inputs = scalar_model.get_input_tensors(scalar_item)
        return ScalarModelInputsAndLabels[ScalarItem, torch.Tensor](
            model_inputs=model_inputs,
            labels=scalar_item.label,
            subject_ids=subject_ids,
            data_item=scalar_item)
def get_scalar_model_inputs_and_labels(model: torch.nn.Module,
                                       target_indices: List[int],
                                       sample: Dict[str, Any]) -> ScalarModelInputsAndLabels:
    """
    For a model that predicts scalars, gets the model input tensors from a sample returned by the data loader.

    :param model: The instantiated PyTorch model.
    :param target_indices: If this list is non-empty, assume that the model is a sequence model, and build the
        model inputs and labels for a model that predicts those specific positions in the sequence. If the list
        is empty, assume that the model is a normal (non-sequence) model.
    :param sample: A training sample, as returned by a PyTorch data loader (dictionary mapping from field name
        to value).
    :return: An instance of ScalarModelInputsAndLabels, containing the list of model input tensors, the label
        tensor, the subject IDs, and the data item reconstructed from the data loader output.
    """
    if target_indices:
        sequence_model: DeviceAwareModule[List[ClassificationItemSequence], torch.Tensor] = model  # type: ignore
        sequences = ClassificationItemSequence.from_minibatch(sample)
        subject_ids = [x.id for x in sequences]
        labels = ClassificationItemSequence.create_labels_tensor_for_minibatch(
            sequences=sequences,
            target_indices=target_indices)
        model_inputs = sequence_model.get_input_tensors(sequences)
        return ScalarModelInputsAndLabels[List[ClassificationItemSequence]](
            model_inputs=model_inputs,
            labels=labels,
            subject_ids=subject_ids,
            data_item=sequences)
    else:
        scalar_model: DeviceAwareModule[ScalarItem, torch.Tensor] = model  # type: ignore
        scalar_item = ScalarItem.from_dict(sample)
        subject_ids = [str(x.id) for x in scalar_item.metadata]  # type: ignore
        model_inputs = scalar_model.get_input_tensors(scalar_item)
        return ScalarModelInputsAndLabels[ScalarItem](
            model_inputs=model_inputs,
            labels=scalar_item.label,
            subject_ids=subject_ids,
            data_item=scalar_item)
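# A usage sketch for the function above inside a hypothetical training step.
# `model`, `criterion`, and `sample` are assumptions: any scalar model, a loss
# such as BCEWithLogitsLoss, and one batch from the dataset's data loader.
# Unpacking model_inputs with * assumes the model's forward accepts the tensors
# returned by get_input_tensors as positional arguments.
def _example_training_step(model: torch.nn.Module,
                           criterion: torch.nn.Module,
                           sample: Dict[str, Any]) -> torch.Tensor:
    # An empty target_indices list selects the plain (non-sequence) code path.
    inputs_and_labels = get_scalar_model_inputs_and_labels(model,
                                                           target_indices=[],
                                                           sample=sample)
    logits = model(*inputs_and_labels.model_inputs)
    return criterion(logits, inputs_and_labels.labels)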
def test_multi_segmentation_encoder() -> None:
    scan_size = (25, 33, 65)
    batch_size = 3
    num_image_channels = 2
    encoder = MultiSegmentationEncoder(num_image_channels=num_image_channels,
                                       encode_channels_jointly=True)
    x = torch.ones((batch_size, num_image_channels * HDF5_NUM_SEGMENTATION_CLASSES) + scan_size)
    y = encoder.encode_and_aggregate(x)
    final_output_channels = _expected_output_channels(num_image_channels * HDF5_NUM_SEGMENTATION_CLASSES)
    assert y.size() == (batch_size, final_output_channels, 1, 1, 1)
    full_output = encoder(x)
    assert full_output.size() == (batch_size, 1)
    encoder = MultiSegmentationEncoder(num_image_channels=num_image_channels,
                                       encode_channels_jointly=False)
    x = torch.ones((batch_size, num_image_channels * HDF5_NUM_SEGMENTATION_CLASSES) + scan_size)
    y = encoder.encode_and_aggregate(x)
    final_output_channels = _expected_output_channels(HDF5_NUM_SEGMENTATION_CLASSES)
    # Each image channel generates 7 features; those 7 features are concatenated across the 2 image channels.
    assert y.size() == (batch_size, final_output_channels * 2, 1, 1, 1)
    full_output = encoder(x)
    assert full_output.size() == (batch_size, 1)
    # Test that the encoder can correctly convert from a scalar data item to the one-hot encoded model input tensor
    scalar_item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                             label=torch.empty(1),
                             numerical_non_image_features=torch.empty(1),
                             categorical_non_image_features=torch.empty(1),
                             images=torch.empty(1),
                             segmentations=torch.ones((batch_size, num_image_channels, *scan_size)))
    input_tensors = encoder.get_input_tensors(scalar_item)
    assert len(input_tensors) == 1
    assert input_tensors[0].shape == (batch_size, HDF5_NUM_SEGMENTATION_CLASSES * num_image_channels, *scan_size)
def test_dataloader_speed(test_output_dirs: OutputFolderForTests,
                          num_dataload_workers: int,
                          shuffle: bool) -> None:
    """
    Test how dataloaders work when using multiple processes.
    """
    ml_util.set_random_seed(0)
    # The dataset should only contain the file name stem, without extension.
    csv_string = StringIO("""subject,channel,path,value,scalar1
S1,image,4be9beed-5861-fdd2-72c2-8dd89aadc1ef
S1,label,,True,1.0
S2,image,6ceacaf8-abd2-ffec-2ade-d52afd6dd1be
S2,label,,True,2.0
S3,image,61bc9d73-9fbb-bd7d-c06b-eeffbafabcc4
S3,label,,False,3.0
S4,image,61bc9d73-9fbb-bd7d-c06b-eeffbafabcc4
S4,label,,False,3.0
""")
    args = ScalarModelBase(image_channels=[],
                           label_channels=["label"],
                           label_value_column="value",
                           non_image_feature_channels=["label"],
                           numerical_columns=["scalar1"],
                           num_dataload_workers=num_dataload_workers,
                           num_dataset_reader_workers=num_dataload_workers,
                           avoid_process_spawn_in_data_loaders=True,
                           should_validate=False)
    dataset = ScalarDataset(args, data_frame=pd.read_csv(csv_string, dtype=str))
    assert len(dataset) == 4
    num_epochs = 2
    total_start_time = time.time()
    loader = dataset.as_data_loader(shuffle=shuffle, batch_size=1)
    # The order in which items are expected in each epoch, when applying shuffling and using 1 dataloader worker.
    # This was determined before making any changes to the dataloader logic
    # (that is, when the as_data_loader method returned an instance of DataLoader, rather than RepeatDataLoader).
    expected_item_order = [
        ["S2", "S1", "S4", "S3"],
        ["S4", "S3", "S1", "S2"],
    ]
    for epoch in range(num_epochs):
        actual_item_order = []
        print(f"Starting epoch {epoch}")
        epoch_start_time = time.time()
        item_start_time = time.time()
        for i, item_dict in enumerate(loader):
            item_load_time = time.time() - item_start_time
            item = ScalarItem.from_dict(item_dict)
            # noinspection PyTypeChecker
            sample_id = item.metadata[0].id  # type: ignore
            print(f"Loading item {i} with ID = {sample_id} in {item_load_time:0.8f} sec")
            if shuffle:
                actual_item_order.append(sample_id)
            else:
                assert sample_id == f"S{i + 1}"
            if not (epoch == 0 and i == 0):
                assert item_load_time < 0.1, f"We should only see significant item load times in the first batch " \
                                             f"of the first epoch, but got a loading time of {item_load_time:0.2f} sec " \
                                             f"in epoch {epoch} batch {i}"
            # Sleep a bit so that the worker process can fill in items
            if num_dataload_workers > 0:
                time.sleep(0.05)
            item_start_time = time.time()
        if shuffle and num_dataload_workers == 1:
            assert actual_item_order == expected_item_order[epoch], f"Items in wrong order for epoch {epoch}"
        total_epoch_time = time.time() - epoch_start_time
        print(f"Total time for epoch {epoch}: {total_epoch_time} sec")
    total_time = time.time() - total_start_time
    print(f"Total time for all epochs: {total_time} sec")