Code example #1
def test_patient_metadata() -> None:
    """
    Loading a dataset where all patient metadata columns are present
    :return:
    """
    file = full_ml_test_data_path("dataset_with_full_header.csv")
    df = pd.read_csv(file, dtype=str)
    subject = "511"
    expected_institution = "85aaee5f-f5f3-4eae-b6cd-26b0070156d8"
    expected_series = "22ef9c5e149650f9cb241d1aa622ad1731b91d1a1df770c05541228b47845ae4"
    expected_tags = "FOO;BAR"
    metadata = PatientMetadata.from_dataframe(df, subject)
    assert metadata is not None
    assert metadata.patient_id == subject
    assert metadata.institution == expected_institution
    assert metadata.series == expected_series
    assert metadata.tags_str == expected_tags

    # Now modify the dataset such that there is no single value for tags. Tags should no longer be
    # populated, but the other fields should be.
    df['tags'] = ["something", ""]
    metadata = PatientMetadata.from_dataframe(df, subject)
    assert metadata.series == expected_series
    assert metadata.institution == expected_institution
    assert metadata.tags_str is None
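
For orientation, here is a minimal sketch of the dataframe shape that PatientMetadata.from_dataframe appears to expect. Only the institutionId and tags column names occur verbatim in these tests; the subject and series column names below are assumptions.

import pandas as pd

# Hypothetical two-row dataframe. PatientMetadata comes from the project
# under test; the columns flagged below are assumed, not confirmed, names.
df = pd.DataFrame({
    "subject": ["511", "511"],              # assumed subject column name
    "institutionId": ["inst-a", "inst-a"],  # column referenced by the tests
    "seriesId": ["series-1", "series-1"],   # assumed series column name
    "tags": ["FOO;BAR", "FOO;BAR"],         # column referenced by the tests
})
metadata = PatientMetadata.from_dataframe(df, "511")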
Code example #2
def test_min_patient_metadata() -> None:
    """
    Loading a dataset where only required columns are present
    """
    df = pd.read_csv(full_ml_test_data_path("dataset.csv"), dtype=str)
    df = df.drop(columns="institutionId")
    patient_id = "1"
    metadata = PatientMetadata.from_dataframe(df, patient_id)
    assert metadata.patient_id == patient_id
    assert metadata.series is None
    assert metadata.institution is None
    assert metadata.tags_str is None
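
Taken together with example #1, this test suggests that from_dataframe treats institution, series, and tags as optional: institutionId is dropped here, the dataset evidently has no series or tags columns, and all three fields come back as None.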
Code example #3
def test_get_all_metadata(default_config: ModelConfigBase) -> None:
    df = default_config.get_dataset_splits().train
    assert PatientMetadata.from_dataframe(df, '1') == PatientMetadata(
        patient_id='1', institution="1")
    assert PatientMetadata.from_dataframe(df, '2') == PatientMetadata(
        patient_id='2', institution="2")
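
The direct equality comparison here implies that PatientMetadata supports value-based equality (for example, as a dataclass or via a custom __eq__), with unset fields defaulting to None as in the minimal-metadata test above.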
Code example #4
def load_dataset_sources(
        dataframe: pd.DataFrame,
        local_dataset_root_folder: Path,
        image_channels: List[str],
        ground_truth_channels: List[str],
        mask_channel: Optional[str],
        allow_incomplete_labels: bool = False
) -> Dict[str, PatientDatasetSource]:
    """
    Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file.
    The dataframe contains per-patient, per-channel image information, relative to a root directory.
    This method converts that into a per-patient dictionary that contains absolute file paths,
    separated into image channels, ground truth channels, and mask channels.
    :param dataframe: A dataframe read directly from a dataset CSV file.
    :param local_dataset_root_folder: The root folder that contains all images.
    :param image_channels: The names of the image channels that should be used in the result.
    :param ground_truth_channels: The names of the ground truth channels that should be used in the result.
    :param mask_channel: The name of the mask channel that should be used in the result. This can be None.
    :param allow_incomplete_labels: Boolean flag. If false, all ground truth files must be provided. If true, ground
                                    truth files are optional. Default value is false.
    :return: A dictionary mapping from a string subject ID to a PatientDatasetSource.
    """
    expected_headers = {
        CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER
    }
    # Validate that the CSV file contains the required columns
    actual_headers = list(dataframe)
    if not expected_headers.issubset(actual_headers):
        raise ValueError(
            "The dataset CSV file should contain at least these columns: {}, but got: {}"
            .format(expected_headers, actual_headers))

    # Collect the sorted list of unique subject IDs
    unique_ids: List[str] = sorted(pd.unique(dataframe[CSV_SUBJECT_HEADER]))
    if not local_dataset_root_folder.is_dir():
        raise ValueError("The dataset root folder does not exist: {}".format(
            local_dataset_root_folder))

    def get_mask_channel_or_default() -> Optional[Path]:
        if mask_channel is None:
            return None
        paths = get_paths_for_channel_ids(
            channels=[mask_channel],
            allow_incomplete_labels_flag=allow_incomplete_labels)
        if len(paths) == 0:
            return None
        else:
            return paths[0]

    def get_paths_for_channel_ids(
            channels: List[str],
            allow_incomplete_labels_flag: bool) -> List[Optional[Path]]:
        if len(set(channels)) < len(channels):
            raise ValueError(f"ids have duplicated entries: {channels}")
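        # Note: patient_id is bound via closure to the loop variable of the
        # for-loop over unique_ids below.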
        rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id]
        # Convert channel names to file paths; this also performs a second
        # sanity check on the channel data
        paths, failed_channel_info = convert_channels_to_file_paths(
            channels, rows, local_dataset_root_folder, patient_id,
            allow_incomplete_labels_flag)

        if failed_channel_info:
            raise ValueError(failed_channel_info)

        return paths

    dataset_sources = {}
    for patient_id in unique_ids:
        metadata = PatientMetadata.from_dataframe(dataframe, patient_id)
        dataset_sources[patient_id] = PatientDatasetSource(
            metadata=metadata,
            image_channels=get_paths_for_channel_ids(
                channels=image_channels,  # type: ignore
                allow_incomplete_labels_flag=False),
            mask_channel=get_mask_channel_or_default(),
            ground_truth_channels=get_paths_for_channel_ids(
                channels=ground_truth_channels,  # type: ignore
                allow_incomplete_labels_flag=allow_incomplete_labels),
            allow_incomplete_labels=allow_incomplete_labels)

    return dataset_sources
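
A minimal sketch of how this variant might be invoked; the CSV file name, dataset root, and channel names below are illustrative assumptions rather than values taken from the original code.

import pandas as pd
from pathlib import Path

# Hypothetical inputs; real channel names depend on the dataset CSV.
df = pd.read_csv("dataset.csv", dtype=str)
sources = load_dataset_sources(
    dataframe=df,
    local_dataset_root_folder=Path("/data/my_dataset"),
    image_channels=["ct"],
    ground_truth_channels=["region"],
    mask_channel=None,             # no mask channel in this sketch
    allow_incomplete_labels=True,  # tolerate missing ground truth files
)
for subject_id, source in sources.items():
    print(subject_id, source.image_channels)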
Code example #5
def load_dataset_sources(
        dataframe: pd.DataFrame, local_dataset_root_folder: Path,
        image_channels: List[str], ground_truth_channels: List[str],
        mask_channel: Optional[str]) -> Dict[int, PatientDatasetSource]:
    """
    Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file.
    The dataframe contains per-patient, per-channel image information, relative to a root directory.
    This method converts that into a per-patient dictionary that contains absolute file paths,
    separated into image channels, ground truth channels, and mask channels.
    :param dataframe: A dataframe read directly from a dataset CSV file.
    :param local_dataset_root_folder: The root folder that contains all images.
    :param image_channels: The names of the image channels that should be used in the result.
    :param ground_truth_channels: The names of the ground truth channels that should be used in the result.
    :param mask_channel: The name of the mask channel that should be used in the result. This can be None.
    :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource.
    """
    expected_headers = {
        CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER
    }
    # Validate that the CSV file contains the required columns
    actual_headers = list(dataframe)
    if not expected_headers.issubset(actual_headers):
        raise ValueError(
            "The dataset CSV file should contain at least these columns: {}, but got: {}"
            .format(expected_headers, actual_headers))

    # Collect the sorted list of unique subject IDs
    unique_ids = sorted(pd.unique(dataframe[CSV_SUBJECT_HEADER]))
    if not local_dataset_root_folder.is_dir():
        raise ValueError("The dataset root folder does not exist: {}".format(
            local_dataset_root_folder))

    def get_mask_channel_or_default() -> Optional[Path]:
        if mask_channel is None:
            return None
        else:
            return get_paths_for_channel_ids(channels=[mask_channel])[0]

    def get_paths_for_channel_ids(channels: List[str]) -> List[Path]:
        if len(set(channels)) < len(channels):
            raise ValueError(f"ids have duplicated entries: {channels}")

        paths: List[Path] = []
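        # Note: patient_id is bound via closure to the loop variable of the
        # for-loop over unique_ids below.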
        rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id]
        for channel_id in channels:
            row = rows.loc[rows[CSV_CHANNEL_HEADER] == channel_id]
            if len(row) == 0:
                raise ValueError(
                    f"Patient {patient_id} does not have channel '{channel_id}'"
                )
            elif len(row) > 1:
                raise ValueError(
                    f"Patient {patient_id} has more than one entry for channel '{channel_id}'"
                )
            image_path = local_dataset_root_folder / row[
                CSV_PATH_HEADER].values[0]
            if not image_path.is_file():
                raise ValueError(
                    f"The dataset references a file that does not exist: {image_path}"
                )
            paths.append(image_path)
        return paths

    dataset_sources = {}
    for patient_id in unique_ids:
        metadata = PatientMetadata.from_dataframe(dataframe, patient_id)
        dataset_sources[patient_id] = PatientDatasetSource(
            metadata=metadata,
            image_channels=get_paths_for_channel_ids(
                channels=image_channels),  # type: ignore
            mask_channel=get_mask_channel_or_default(),
            ground_truth_channels=get_paths_for_channel_ids(
                channels=ground_truth_channels)  # type: ignore
        )

    return dataset_sources
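
Note the difference between the two load_dataset_sources variants: example #4 adds an allow_incomplete_labels flag, delegates path resolution to convert_channels_to_file_paths, and lets its helper return List[Optional[Path]] so that missing ground truth files can be tolerated; example #5 resolves paths inline and raises a ValueError as soon as a channel is missing, duplicated, or points to a file that does not exist.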