def test_patient_metadata() -> None:
    """
    Load a dataset in which every patient metadata column is populated,
    and verify that tags are dropped when a subject's tag values disagree.
    :return:
    """
    csv_path = full_ml_test_data_path("dataset_with_full_header.csv")
    dataframe = pd.read_csv(csv_path, dtype=str)
    subject_id = "511"
    institution = "85aaee5f-f5f3-4eae-b6cd-26b0070156d8"
    series = "22ef9c5e149650f9cb241d1aa622ad1731b91d1a1df770c05541228b47845ae4"
    tags = "FOO;BAR"
    parsed = PatientMetadata.from_dataframe(dataframe, subject_id)
    assert parsed is not None
    assert parsed.patient_id == subject_id
    assert parsed.institution == institution
    assert parsed.series == series
    assert parsed.tags_str == tags
    # When the subject's rows carry two different tag values, there is no single
    # tag string any more: tags_str becomes None, but the other fields still load.
    dataframe['tags'] = ["something", ""]
    parsed = PatientMetadata.from_dataframe(dataframe, subject_id)
    assert parsed.series == series
    assert parsed.institution == institution
    assert parsed.tags_str is None
def test_min_patient_metadata() -> None:
    """
    Load a dataset where only the required columns are present: all optional
    metadata fields should come back as None.
    """
    dataframe = pd.read_csv(full_ml_test_data_path("dataset.csv"), dtype=str)
    dataframe = dataframe.drop(columns="institutionId")
    subject_id = "1"
    parsed = PatientMetadata.from_dataframe(dataframe, subject_id)
    assert parsed.patient_id == subject_id
    assert parsed.series is None
    assert parsed.institution is None
    assert parsed.tags_str is None
def test_get_all_metadata(default_config: ModelConfigBase) -> None:
    """
    Metadata parsed from the training split matches the expected per-subject values.
    """
    train_df = default_config.get_dataset_splits().train
    for subject, institution in [('1', "1"), ('2', "2")]:
        expected = PatientMetadata(patient_id=subject, institution=institution)
        assert PatientMetadata.from_dataframe(train_df, subject) == expected
def load_dataset_sources(dataframe: pd.DataFrame,
                         local_dataset_root_folder: Path,
                         image_channels: List[str],
                         ground_truth_channels: List[str],
                         mask_channel: Optional[str],
                         allow_incomplete_labels: bool = False
                         ) -> Dict[str, PatientDatasetSource]:
    """
    Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file.
    The dataframe contains per-patient per-channel image information, relative to a root directory.
    This method converts that into a per-patient dictionary, that contains absolute file paths
    separated for for image channels, ground truth channels, and mask channels.

    :param dataframe: A dataframe read directly from a dataset CSV file.
    :param local_dataset_root_folder: The root folder that contains all images.
    :param image_channels: The names of the image channels that should be used in the result.
    :param ground_truth_channels: The names of the ground truth channels that should be used in the result.
    :param mask_channel: The name of the mask channel that should be used in the result. This can be None.
    :param allow_incomplete_labels: Boolean flag. If false, all ground truth files must be provided.
        If true, ground truth files are optional. Default value is false.
    :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource.
    """
    expected_headers = {CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER}
    # Sanity-check the CSV columns before doing any work.
    actual_headers = list(dataframe)
    if not expected_headers.issubset(actual_headers):
        raise ValueError(
            "The dataset CSV file should contain at least these columns: {}, but got: {}"
            .format(expected_headers, actual_headers))
    # Unique subjects, processed in sorted order.
    unique_ids: List[str] = sorted(pd.unique(dataframe[CSV_SUBJECT_HEADER]))
    if not local_dataset_root_folder.is_dir():
        raise ValueError("The dataset root folder does not exist: {}".format(
            local_dataset_root_folder))

    def paths_for_channels(subject: str,
                           channels: List[str],
                           allow_incomplete_labels_flag: bool) -> List[Optional[Path]]:
        # Resolve each channel id of the given subject to an absolute file path.
        if len(set(channels)) < len(channels):
            raise ValueError(f"ids have duplicated entries: {channels}")
        subject_rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == subject]
        # convert_channels_to_file_paths performs the per-channel sanity checks.
        paths, failed_channel_info = convert_channels_to_file_paths(
            channels, subject_rows, local_dataset_root_folder, subject,
            allow_incomplete_labels_flag)
        if failed_channel_info:
            raise ValueError(failed_channel_info)
        return paths

    def mask_or_none(subject: str) -> Optional[Path]:
        # The mask channel is optional; resolve it only when configured.
        if mask_channel is None:
            return None
        mask_paths = paths_for_channels(subject, [mask_channel], allow_incomplete_labels)
        return mask_paths[0] if mask_paths else None

    dataset_sources = {}
    for patient_id in unique_ids:
        metadata = PatientMetadata.from_dataframe(dataframe, patient_id)
        dataset_sources[patient_id] = PatientDatasetSource(
            metadata=metadata,
            # Image channels are always mandatory, hence the hard-coded False.
            image_channels=paths_for_channels(patient_id, image_channels, False),  # type: ignore
            mask_channel=mask_or_none(patient_id),
            ground_truth_channels=paths_for_channels(
                patient_id, ground_truth_channels, allow_incomplete_labels),  # type: ignore
            allow_incomplete_labels=allow_incomplete_labels)
    return dataset_sources
def load_dataset_sources(dataframe: pd.DataFrame,
                         local_dataset_root_folder: Path,
                         image_channels: List[str],
                         ground_truth_channels: List[str],
                         mask_channel: Optional[str]) -> Dict[int, PatientDatasetSource]:
    """
    Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file.
    The dataframe contains per-patient per-channel image information, relative to a root directory.
    This method converts that into a per-patient dictionary, that contains absolute file paths
    separated for for image channels, ground truth channels, and mask channels.

    :param dataframe: A dataframe read directly from a dataset CSV file.
    :param local_dataset_root_folder: The root folder that contains all images.
    :param image_channels: The names of the image channels that should be used in the result.
    :param ground_truth_channels: The names of the ground truth channels that should be used in the result.
    :param mask_channel: The name of the mask channel that should be used in the result. This can be None.
    :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource.
    """
    expected_headers = {CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER}
    # Sanity-check the CSV columns before doing any work.
    actual_headers = list(dataframe)
    if not expected_headers.issubset(actual_headers):
        raise ValueError(
            "The dataset CSV file should contain at least these columns: {}, but got: {}"
            .format(expected_headers, actual_headers))
    # Unique subjects, processed in sorted order.
    unique_ids = sorted(pd.unique(dataframe[CSV_SUBJECT_HEADER]))
    if not local_dataset_root_folder.is_dir():
        raise ValueError("The dataset root folder does not exist: {}".format(
            local_dataset_root_folder))

    def channel_paths(subject, channels: List[str]) -> List[Path]:
        # Resolve each channel id of the given subject to an absolute,
        # existing file path; any inconsistency is a hard error.
        if len(set(channels)) < len(channels):
            raise ValueError(f"ids have duplicated entries: {channels}")
        subject_rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == subject]
        resolved: List[Path] = []
        for channel_id in channels:
            matching = subject_rows.loc[subject_rows[CSV_CHANNEL_HEADER] == channel_id]
            if len(matching) == 0:
                raise ValueError(
                    f"Patient {subject} does not have channel '{channel_id}'"
                )
            if len(matching) > 1:
                raise ValueError(
                    f"Patient {subject} has more than one entry for channel '{channel_id}'"
                )
            image_path = local_dataset_root_folder / matching[CSV_PATH_HEADER].values[0]
            if not image_path.is_file():
                raise ValueError(
                    f"The dataset references a file that does not exist: {image_path}"
                )
            resolved.append(image_path)
        return resolved

    dataset_sources = {}
    for patient_id in unique_ids:
        metadata = PatientMetadata.from_dataframe(dataframe, patient_id)
        # Resolve in the same order as the constructor arguments so any
        # validation error surfaces for the same channel group as before.
        images = channel_paths(patient_id, image_channels)  # type: ignore
        mask = None if mask_channel is None else channel_paths(patient_id, [mask_channel])[0]
        labels = channel_paths(patient_id, ground_truth_channels)  # type: ignore
        dataset_sources[patient_id] = PatientDatasetSource(
            metadata=metadata,
            image_channels=images,
            mask_channel=mask,
            ground_truth_channels=labels)
    return dataset_sources