def test_standardize_features_when_singleton(is_sequence: bool) -> None:
    """
    Test how feature standardize copes with datasets that only have 1 entry.
    """
    numerical = torch.ones((1, 3))
    categorical = torch.tensor([[0, 1, 1], [1, 0, 0]])
    # Both data source flavours take the same constructor arguments.
    source_kwargs = dict(metadata=GeneralSampleMetadata(id="foo"),
                         numerical_non_image_features=numerical,
                         categorical_non_image_features=categorical,
                         label=torch.tensor([]),
                         channel_files=[])
    sources: Union[ListOfSequences, List[ScalarDataSource]]
    if is_sequence:
        sources = [ClassificationItemSequence(id="foo",
                                              items=[SequenceDataSource(**source_kwargs)])]
    else:
        sources = [ScalarDataSource(**source_kwargs)]
    mean_std = FeatureStatistics.from_data_sources(sources)
    assert_tensors_equal(mean_std.mean, numerical)
    # Standard deviation can't be computed because there is only one element, hence becomes nan.
    assert torch.all(torch.isnan(mean_std.std))
    # When applying such a standardization to the sequences, they should not be changed (similar to features that
    # are constant)
    standardized = mean_std.standardize(sources)
    first_item = standardized[0].items[0] if is_sequence else standardized[0]
    assert_tensors_equal(first_item.numerical_non_image_features, numerical)
    assert_tensors_equal(first_item.categorical_non_image_features, categorical)
def test_predict_ensemble(batch_size: int) -> None:
    """
    Run an ensemble of five pipelines (three always predicting 0, two always predicting 1)
    and check that averaging produces the expected posterior.
    """

    def _load_constant_model(config):
        # Build the model for the given config and load it for inference.
        model_and_info = ModelAndInfo(config=config,
                                      model_execution_mode=ModelExecutionMode.TEST,
                                      is_mean_teacher=False,
                                      checkpoint_path=None)
        assert model_and_info.try_create_model_load_from_checkpoint_and_adjust()
        return model_and_info.model

    config_0 = ConstantScalarConfig(0.)
    model_0 = _load_constant_model(config_0)
    config_1 = ConstantScalarConfig(1.)
    model_1 = _load_constant_model(config_1)
    pipelines = [ScalarInferencePipeline(model_0, config_0, 0, i) for i in range(3)]
    pipelines += [ScalarInferencePipeline(model_1, config_1, 0, i) for i in range(3, 5)]
    ensemble = ScalarEnsemblePipeline(pipelines, config_0, EnsembleAggregationType.Average)
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": torch.zeros((batch_size, 1)),
            "images": torch.zeros(((batch_size, 1) + config_0.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}
    results = ensemble.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.equal(results.labels, torch.zeros((batch_size, 1)))
    # 3 models return 0, 2 return 1, so predicted should be ((sigmoid(0)*3)+(sigmoid(1)*2))/5
    assert torch.allclose(results.model_outputs, torch.full((batch_size, 1), 0.592423431))
def test_predict_non_ensemble(batch_size: int, empty_labels: bool) -> None:
    """
    Single-pipeline inference with a model that always outputs 1: the prediction
    for every subject must be sigmoid(1).
    """
    config = ConstantScalarConfig(1.)
    model_and_info = ModelAndInfo(config=config,
                                  model_execution_mode=ModelExecutionMode.TEST,
                                  checkpoint_path=None)
    assert model_and_info.try_create_model_load_from_checkpoint_and_adjust()
    pipeline = ScalarInferencePipeline(model_and_info.model, config, 0, 0)
    actual_labels = torch.zeros((batch_size, 1))
    if empty_labels:
        # NaN labels simulate subjects without ground truth.
        actual_labels = actual_labels * np.nan
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": actual_labels,
            "images": torch.zeros(((batch_size, 1) + config.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}
    results = pipeline.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.allclose(results.labels, actual_labels, equal_nan=True)
    # The model always returns 1, so predicted should be sigmoid(1)
    assert torch.allclose(results.model_outputs, torch.full((batch_size, 1), 0.731058578))
def test_predict_non_ensemble(batch_size: int, empty_labels: bool) -> None:
    """
    Single-pipeline inference with a Lightning model that always outputs 1: the
    posterior for every subject must be sigmoid(1).
    """
    config = ConstantScalarConfig(1.)
    model = create_lightning_model(config, set_optimizer_and_scheduler=False)
    assert isinstance(model, ScalarLightning)
    pipeline = ScalarInferencePipeline(model, config, 0)
    actual_labels = torch.zeros((batch_size, 1))
    if empty_labels:
        # NaN labels simulate subjects without ground truth.
        actual_labels = actual_labels * np.nan
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": actual_labels,
            "images": torch.zeros(((batch_size, 1) + config.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}
    results = pipeline.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.allclose(results.labels, actual_labels, equal_nan=True)
    # The model always returns 1, so predicted should be sigmoid(1)
    assert torch.allclose(results.posteriors, torch.full((batch_size, 1), 0.731058578))
def _create(features: List) -> SequenceDataSource:
    """Build a SequenceDataSource that carries only the given numerical features."""
    return SequenceDataSource(
        numerical_non_image_features=torch.tensor(features).float(),
        categorical_non_image_features=torch.tensor([]),
        label=torch.tensor([]),
        channel_files=[],
        metadata=GeneralSampleMetadata(id="foo"))
def test_predict_non_ensemble(batch_size: int, empty_labels: bool) -> None:
    """
    Single-pipeline inference with a ScalarOnesModel that always outputs 1: the
    prediction for every subject must be sigmoid(1).
    """
    config = ClassificationModelForTesting()
    model: Any = ScalarOnesModel(config.expected_image_size_zyx, 1.)
    # Adjust the model for (potential) multi-GPU execution; the wrapper's return value is not needed here.
    update_model_for_multiple_gpus(ModelAndInfo(model),
                                   args=config,
                                   execution_mode=ModelExecutionMode.TEST)
    pipeline = ScalarInferencePipeline(model, config, 0, 0)
    actual_labels = torch.zeros((batch_size, 1))
    if empty_labels:
        # NaN labels simulate subjects without ground truth.
        actual_labels = actual_labels * np.nan
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": actual_labels,
            "images": torch.zeros(((batch_size, 1) + config.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}
    results = pipeline.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.allclose(results.labels, actual_labels, equal_nan=True)
    # The model always returns 1, so predicted should be sigmoid(1)
    assert torch.allclose(results.model_outputs, torch.full((batch_size, 1), 0.731058578))
def _create(pos: int) -> ScalarDataSource:
    """Create an otherwise empty ScalarDataSource at the given sequence position."""
    # The same empty tensor is deliberately reused for all tensor-valued fields.
    empty = torch.empty(0)
    return ScalarDataSource(metadata=GeneralSampleMetadata(id="", sequence_position=pos),
                            numerical_non_image_features=empty,
                            categorical_non_image_features=empty,
                            label=empty,
                            channel_files=[])
def _create(features: List) -> torch.Tensor:
    """Build a ScalarItem from the given features and return its combined non-imaging feature tensor."""
    item = ScalarItem(
        metadata=GeneralSampleMetadata(id="foo"),
        images=torch.tensor([]),
        segmentations=torch.empty(0),
        label=torch.tensor([]),
        numerical_non_image_features=torch.tensor(features).float(),
        categorical_non_image_features=torch.tensor(features).float())
    return item.get_all_non_imaging_features()
def _create(id: str, sequence_position: int, file: Optional[str],
            metadata: str) -> SequenceDataSource:
    """Create a SequenceDataSource with the given id, position, single channel file, and "M" metadata prop."""
    sample_metadata = GeneralSampleMetadata(id=id,
                                            sequence_position=sequence_position,
                                            props={"M": metadata})
    return SequenceDataSource(metadata=sample_metadata,
                              channel_files=[file],
                              numerical_non_image_features=torch.tensor([]),
                              categorical_non_image_features=torch.tensor([]),
                              label=torch.tensor([]))
def test_item_is_valid(channel_files: List[Optional[str]],
                       numerical_features: torch.Tensor,
                       categorical_features: torch.Tensor,
                       is_valid: bool) -> None:
    """Check that ScalarDataSource.is_valid matches the expected flag for the given inputs."""
    source = ScalarDataSource(metadata=GeneralSampleMetadata(id="foo"),
                              label=torch.empty(0),
                              channel_files=channel_files,
                              numerical_non_image_features=numerical_features,
                              categorical_non_image_features=categorical_features)
    assert source.is_valid() == is_valid
def test_standardize_features() -> None:
    """
    Test if the non-image features can be normalized to mean 0, std 1.

    Builds 1000 random sequences whose features follow a known Gaussian, then checks
    that FeatureStatistics recovers the mean/std and that standardization produces
    (approximately) zero-mean, unit-std features, leaving the constant feature untouched.
    """
    # Fix the seed: the exact call order of torch.randint/randn/rand below must stay reproducible.
    set_random_seed(1234)
    expected_mean = torch.tensor([[123, 2, 3], [4, 5, 6]])
    expected_std = torch.tensor([[0, 2, 3], [3, 4, 4]])
    feature_size = (2, 3)
    sequences: List[ClassificationItemSequence] = []
    for s in range(1000):
        items = []
        # Each sequence has a random length between 3 and 5.
        seq_length = torch.randint(low=3, high=6, size=(1, )).item()
        for i in range(seq_length):  # type: ignore
            # All features are random Gaussian, apart from feature 0 which is constant.
            # Normalization must be able to deal with constant features when dividing by standard deviation.
            features = torch.randn(size=feature_size, dtype=torch.float32) * expected_std + expected_mean
            # Randomly put some infinite values in the vector
            features[s % 2, s % 3] = np.inf if torch.rand(1) > 0.9 else features[s % 2, s % 3]
            # Feature (0, 0) is constant across all items (expected_std there is 0).
            features[0, 0] = expected_mean[0, 0]
            item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                              numerical_non_image_features=features,
                              categorical_non_image_features=features,
                              label=torch.tensor([]),
                              images=torch.tensor([]),
                              segmentations=torch.tensor([]))
            items.append(item)
        sequences.append(ClassificationItemSequence(id="foo", items=items))
    mean_std = FeatureStatistics.from_data_sources(sequences)
    assert mean_std.mean.shape == feature_size
    assert mean_std.std.shape == feature_size
    # Statistics are estimated from random draws, hence the loose 0.07 tolerance.
    assert_tensors_equal(mean_std.mean, expected_mean, 0.07)
    assert_tensors_equal(mean_std.std, expected_std, 0.07)
    # After normalization, mean should be 0, and std should be 1.
    standardized_seq = mean_std.standardize(sequences)
    mean_std_from_standardized = FeatureStatistics.from_data_sources(standardized_seq)
    # After normalization, the mean should be 0, apart from the constant feature, which should be left untouched,
    # hence its mean is the original feature value.
    expected_mean_from_standardized = torch.zeros(feature_size)
    expected_mean_from_standardized[0, 0] = expected_mean[0, 0]
    expected_std_from_standardized = torch.ones(feature_size)
    expected_std_from_standardized[0, 0] = 0.0
    assert_tensors_equal(mean_std_from_standardized.mean, expected_mean_from_standardized, abs=1e-5)
    assert_tensors_equal(mean_std_from_standardized.std, expected_std_from_standardized, abs=1e-5)
def _create_scalar_items(length: int, label_value: float = 1.0) -> List[ScalarItem]:
    """Build `length` ScalarItems with consecutive sequence positions and the given label value."""
    items = []
    for position in range(length):
        items.append(ScalarItem(metadata=GeneralSampleMetadata(id="foo", sequence_position=position),
                                numerical_non_image_features=torch.tensor([]),
                                categorical_non_image_features=torch.tensor([]),
                                label=torch.tensor([label_value]),
                                images=torch.tensor([]),
                                segmentations=torch.tensor([])))
    return items
def _create_item(id: str, sequence_position: int, metadata: str,
                 label: Optional[float] = None) -> SequenceDataSource:
    """
    Create a SequenceDataSource with a fixed channel file, the given id/position,
    an "M" metadata property, and an optional scalar label.

    :param id: Subject identifier stored in the sample metadata.
    :param sequence_position: Position of this item within its sequence.
    :param metadata: Value stored under the "M" metadata property.
    :param label: Optional label value; when None, the label tensor is empty.
    """
    # Bug fix: the original used `if label`, which treated a valid label of 0.0 as
    # missing (0.0 is falsy) and silently produced an empty label tensor.
    label_tensor = torch.tensor([label]) if label is not None else torch.tensor([])
    return SequenceDataSource(
        channel_files=["foo"],
        numerical_non_image_features=torch.tensor([]),
        categorical_non_image_features=torch.tensor([]),
        label=label_tensor,
        metadata=GeneralSampleMetadata(id=id, sequence_position=sequence_position,
                                       props={"M": metadata}))
def test_predict_ensemble(batch_size: int) -> None:
    """
    Run an ensemble of five Lightning pipelines (three always predicting 0, two always
    predicting 1) and check that averaging produces the expected posterior.
    """
    config_0 = ConstantScalarConfig(0.)
    model_0 = create_lightning_model(config_0, set_optimizer_and_scheduler=False)
    assert isinstance(model_0, ScalarLightning)
    config_1 = ConstantScalarConfig(1.)
    model_1 = create_lightning_model(config_1, set_optimizer_and_scheduler=False)
    assert isinstance(model_1, ScalarLightning)
    pipelines = [ScalarInferencePipeline(model_0, config_0, i) for i in range(3)]
    pipelines += [ScalarInferencePipeline(model_1, config_1, i) for i in range(3, 5)]
    ensemble = ScalarEnsemblePipeline(pipelines, config_0, EnsembleAggregationType.Average)
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": torch.zeros((batch_size, 1)),
            "images": torch.zeros(((batch_size, 1) + config_0.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}
    results = ensemble.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.equal(results.labels, torch.zeros((batch_size, 1)))
    # 3 models return 0, 2 return 1, so predicted should be ((sigmoid(0)*3)+(sigmoid(1)*2))/5
    assert torch.allclose(results.posteriors, torch.full((batch_size, 1), 0.592423431))
def test_multi_segmentation_encoder() -> None:
    """
    Check output shapes of MultiSegmentationEncoder for jointly and separately encoded
    channels, and that a ScalarItem's segmentations convert to the expected one-hot input.
    """
    scan_size = (25, 33, 65)
    batch_size = 3
    num_image_channels = 2
    # Case 1: all channels are encoded jointly.
    encoder = MultiSegmentationEncoder(num_image_channels=num_image_channels,
                                       encode_channels_jointly=True)
    one_hot = torch.ones((batch_size, num_image_channels * HDF5_NUM_SEGMENTATION_CLASSES) + scan_size)
    aggregated = encoder.encode_and_aggregate(one_hot)
    joint_channels = _expected_output_channels(num_image_channels * HDF5_NUM_SEGMENTATION_CLASSES)
    assert aggregated.size() == (batch_size, joint_channels, 1, 1, 1)
    assert encoder(one_hot).size() == (batch_size, 1)
    # Case 2: each channel is encoded separately.
    encoder = MultiSegmentationEncoder(num_image_channels=num_image_channels,
                                       encode_channels_jointly=False)
    one_hot = torch.ones((batch_size, num_image_channels * HDF5_NUM_SEGMENTATION_CLASSES) + scan_size)
    aggregated = encoder.encode_and_aggregate(one_hot)
    separate_channels = _expected_output_channels(HDF5_NUM_SEGMENTATION_CLASSES)
    # Each image channel generates 7 features, we concatenate those 7 features for the 2 image channels
    assert aggregated.size() == (batch_size, separate_channels * 2, 1, 1, 1)
    assert encoder(one_hot).size() == (batch_size, 1)
    # Test that the encoder can correctly convert from a scalar data item to the one-hot encoded model input tensor
    scalar_item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                             label=torch.empty(1),
                             numerical_non_image_features=torch.empty(1),
                             categorical_non_image_features=torch.empty(1),
                             images=torch.empty(1),
                             segmentations=torch.ones((batch_size, num_image_channels, *scan_size)))
    input_tensors = encoder.get_input_tensors(scalar_item)
    assert len(input_tensors) == 1
    assert input_tensors[0].shape == (batch_size,
                                      HDF5_NUM_SEGMENTATION_CLASSES * num_image_channels,
                                      *scan_size)
def test_predict_ensemble(batch_size: int) -> None:
    """
    Run an ensemble of five ScalarOnesModel pipelines (three always predicting 0, two
    always predicting 1) and check that averaging produces the expected posterior.
    """
    config = ClassificationModelForTesting()
    model_returns_0: Any = ScalarOnesModel(config.expected_image_size_zyx, 0.)
    model_returns_1: Any = ScalarOnesModel(config.expected_image_size_zyx, 1.)
    # Adjust both models for (potential) multi-GPU execution before building pipelines.
    model_returns_0 = update_model_for_multiple_gpus(ModelAndInfo(model_returns_0),
                                                     args=config,
                                                     execution_mode=ModelExecutionMode.TEST).model
    model_returns_1 = update_model_for_multiple_gpus(ModelAndInfo(model_returns_1),
                                                     args=config,
                                                     execution_mode=ModelExecutionMode.TEST).model
    pipelines = [ScalarInferencePipeline(model_returns_0, config, 0, i) for i in range(3)]
    pipelines += [ScalarInferencePipeline(model_returns_1, config, 0, i) for i in range(3, 5)]
    ensemble = ScalarEnsemblePipeline(pipelines, config, EnsembleAggregationType.Average)
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": torch.zeros((batch_size, 1)),
            "images": torch.zeros(((batch_size, 1) + config.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}
    results = ensemble.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.equal(results.labels, torch.zeros((batch_size, 1)))
    # 3 models return 0, 2 return 1, so predicted should be ((sigmoid(0)*3)+(sigmoid(1)*2))/5
    assert torch.allclose(results.model_outputs, torch.full((batch_size, 1), 0.592423431))
def load_single_data_source(subject_rows: pd.DataFrame,
                            subject_id: str,
                            label_value_column: str,
                            channel_column: str,
                            image_channels: Optional[List[str]] = None,
                            image_file_column: Optional[str] = None,
                            label_channels: Optional[List[str]] = None,
                            transform_labels: Union[Callable, List[Callable]] = LabelTransformation.identity,
                            non_image_feature_channels: Optional[Dict] = None,
                            numerical_columns: Optional[List[str]] = None,
                            categorical_data_encoder: Optional[CategoricalToOneHotEncoder] = None,
                            metadata_columns: Optional[Set[str]] = None,
                            is_classification_dataset: bool = True,
                            sequence_position_numeric: Optional[int] = None) -> T:
    """
    Converts a set of dataset rows for a single subject to a ScalarDataSource instance, which contains the
    labels, the non-image features, and the paths to the image files.

    :param subject_rows: All dataset rows that belong to the same subject.
    :param subject_id: The identifier of the subject that is being processed.
    :param label_value_column: The column that contains the value for the label scalar or vector.
    :param channel_column: The name of the column that contains the row identifier ("channels").
    :param image_channels: The names of all channels (stored in the CSV_CHANNEL_HEADER column of the dataframe)
        that are expected to be loaded from disk later because they are large images.
    :param image_file_column: The name of the column that contains the image file names.
    :param label_channels: The name of the channel where the label scalar or vector is read from.
    :param transform_labels: A label transformation or a list of label transformations to apply to the labels.
        If a list is provided, the transformations are applied in order from left to right.
    :param non_image_feature_channels: A dictionary of the names of all channels where additional scalar values
        should be read from. The keys should map each feature to its channels.
    :param numerical_columns: The names of all columns where additional scalar values should be read from.
    :param categorical_data_encoder: Encoding scheme for categorical data.
    :param metadata_columns: A list of columns that will be added to the item metadata as key/value pairs.
    :param is_classification_dataset: If True, labels are parsed with extract_label_classification, otherwise
        with extract_label_regression.
    :param sequence_position_numeric: Numeric position of the data source in a data sequence. Assumed to be
        a non-sequential dataset item if None provided (default).
    :return: A SequenceDataSource if sequence_position_numeric is given, otherwise a ScalarDataSource.
    """

    def _get_row_for_channel(channel: Optional[str]) -> Dict[str, str]:
        # Look up the single dataframe row that matches the given channel for this subject.
        return _get_single_channel_row(subject_rows, channel, subject_id, channel_column)

    def _get_label_as_tensor(channel: Optional[str]) -> torch.Tensor:
        # Read the label value from the given channel's row and parse it into a 1-element float tensor.
        extract_fn = extract_label_classification if is_classification_dataset else extract_label_regression
        label_row = _get_row_for_channel(channel)
        label_string = label_row[label_value_column]
        return torch.tensor([extract_fn(label_string=label_string, sample_id=subject_id)],
                            dtype=torch.float)

    def _apply_label_transforms(labels: Any) -> Any:
        """
        Apply the transformations in order.
        """
        if isinstance(transform_labels, List):
            for transform in transform_labels:
                labels = transform(labels)
            label = labels
        else:
            label = transform_labels(labels)
        return label

    def create_none_list(x: Optional[List]) -> List:
        # Turn a missing/empty channel list into [None] so the loops below run exactly once.
        return [None] if x is None or len(x) == 0 else x

    def get_none_list_from_dict(non_image_channels: Dict[str, List[str]],
                                feature: str) -> Sequence[Optional[str]]:
        """
        Return either the list of channels for a given column, or, if None was passed as
        numerical channels (i.e. there are no channels to be specified), return [None].

        :param non_image_channels: Dict mapping feature names to their channels.
        :param feature: Feature name for which to return the channels.
        :return: List of channels for the given feature.
        """
        if non_image_channels == {}:
            return [None]
        else:
            return non_image_channels[feature]

    def is_empty(x: Optional[List]) -> bool:
        return x is None or len(x) == 0

    def none_if_missing_in_csv(x: Any) -> Optional[str]:
        # If the CSV contains missing values they turn into NaN here, but mark them as None rather.
        return None if isinstance(x, float) and np.isnan(x) else x

    subject_rows = subject_rows.fillna('')
    labels = []
    # Collect one label tensor per label channel (or a single one from the default row).
    if label_channels:
        for channel in label_channels:
            labels.append(_get_label_as_tensor(channel))
    else:
        labels.append(_get_label_as_tensor(None))
    label = _apply_label_transforms(labels)
    # Metadata key/value pairs are read from the first label channel's row.
    channel_for_metadata = label_channels[0] if label_channels else None
    label_row = _get_row_for_channel(channel_for_metadata)
    metadata = GeneralSampleMetadata(id=subject_id,
                                     props={key: none_if_missing_in_csv(label_row[key])
                                            for key in metadata_columns or set()})
    image_files = []
    if image_file_column:
        for image_channel in create_none_list(image_channels):
            # Alternative: restrict rows to given channels first, then read out the relevant columns.
            file_path = _get_row_for_channel(image_channel)[image_file_column]
            image_files.append(none_if_missing_in_csv(file_path))
    numerical_columns = numerical_columns or []
    categorical_columns = categorical_data_encoder.get_supported_dataset_column_names() if categorical_data_encoder \
        else []
    _feature_columns = numerical_columns + categorical_columns
    if not non_image_feature_channels:
        non_image_feature_channels = {}
    numerical = []
    categorical = {}
    # Gather numerical values (flattened into one list) and categorical values (per column) across channels.
    if not is_empty(_feature_columns):
        for column in _feature_columns:
            # For sequence items, the channel is the item's numeric position; otherwise use the configured channels.
            list_channels: Sequence[Optional[str]] = [str(sequence_position_numeric)] \
                if sequence_position_numeric is not None else get_none_list_from_dict(non_image_feature_channels,
                                                                                     column)
            numerical_col, categorical_col = [], []
            for channel in list_channels:  # type: ignore
                row = _get_row_for_channel(channel)
                prefix = f"Channel {channel}, column {column}"
                if column in numerical_columns:
                    numerical_col.append(_string_to_float(row[column], error_message_prefix=prefix))
                else:
                    categorical_col.append(row[column])
            if column in numerical_columns:
                numerical.extend(numerical_col)
            else:
                categorical[column] = categorical_col
    categorical_non_image_features = categorical_data_encoder.encode(categorical) \
        if categorical_data_encoder else torch.tensor(list(categorical.values()))
    datasource: Union[SequenceDataSource, ScalarDataSource]
    if sequence_position_numeric is not None:
        metadata.sequence_position = sequence_position_numeric
        datasource = SequenceDataSource(
            label=label,
            channel_files=image_files,
            numerical_non_image_features=torch.tensor(numerical).float(),
            categorical_non_image_features=categorical_non_image_features.float(),
            metadata=metadata
        )
        return datasource  # type: ignore
    datasource = ScalarDataSource(
        label=label,
        channel_files=image_files,
        numerical_non_image_features=torch.tensor(numerical).float(),
        categorical_non_image_features=categorical_non_image_features.float(),
        metadata=metadata
    )
    return datasource  # type: ignore