Exemple #1
0
 def _create(features: List) -> SequenceDataSource:
     """
     Build a SequenceDataSource with id "foo" whose only populated field is the numerical non-image features.
     :param features: Values that are converted to a float tensor and stored as the numerical features.
     :return: A data source with no channel files, an empty label, and empty categorical features.
     """
     numerical_features = torch.tensor(features).float()
     sample_metadata = GeneralSampleMetadata(id="foo")
     # Label and categorical features each get their own empty tensor, so the two fields never share storage.
     return SequenceDataSource(metadata=sample_metadata,
                               channel_files=[],
                               label=torch.tensor([]),
                               categorical_non_image_features=torch.tensor([]),
                               numerical_non_image_features=numerical_features)
Exemple #2
0
 def _create(id: str, sequence_position: int, file: Optional[str],
             metadata: str) -> SequenceDataSource:
     """
     Build a SequenceDataSource that carries a single channel file and no label or non-image features.
     :param id: Identifier stored in the item metadata.
     :param sequence_position: Sequence position stored in the item metadata.
     :param file: The only entry of the channel file list; may be None.
     :param metadata: Value stored in the metadata properties under the key "M".
     :return: The newly created data source.
     """
     sample_metadata = GeneralSampleMetadata(id=id,
                                             sequence_position=sequence_position,
                                             props={"M": metadata})
     return SequenceDataSource(channel_files=[file],
                               numerical_non_image_features=torch.tensor([]),
                               categorical_non_image_features=torch.tensor([]),
                               label=torch.tensor([]),
                               metadata=sample_metadata)
Exemple #3
0
def _create_item(id: str,
                 sequence_position: int,
                 metadata: str,
                 label: Optional[float] = None) -> SequenceDataSource:
    """
    Build a SequenceDataSource with a fixed channel file "foo", no non-image features, and an optional label.
    :param id: Identifier stored in the item metadata.
    :param sequence_position: Sequence position stored in the item metadata.
    :param metadata: Value stored in the metadata properties under the key "M".
    :param label: The label value. If None (default), the label tensor is empty; any other value, including
    0.0, produces a one-element label tensor.
    :return: The newly created data source.
    """
    # Compare against None explicitly: a plain truthiness test would treat a valid label of 0.0 as "no label".
    label_tensor = torch.tensor([label]) if label is not None else torch.tensor([])
    return SequenceDataSource(
        channel_files=["foo"],
        numerical_non_image_features=torch.tensor([]),
        categorical_non_image_features=torch.tensor([]),
        label=label_tensor,
        metadata=GeneralSampleMetadata(id=id,
                                       sequence_position=sequence_position,
                                       props={"M": metadata}))
Exemple #4
0
def test_standardize_features_when_singleton(is_sequence: bool) -> None:
    """
    Check that feature standardization handles a dataset that contains only a single entry.
    :param is_sequence: If True, the single entry is wrapped in a ClassificationItemSequence; otherwise a
    plain ScalarDataSource is used.
    """
    numerical_features = torch.ones((1, 3))
    categorical_features = torch.tensor([[0, 1, 1], [1, 0, 0]])
    item: Union[SequenceDataSource, ScalarDataSource]
    sources: Union[ListOfSequences, List[ScalarDataSource]]
    if is_sequence:
        item = SequenceDataSource(
            channel_files=[],
            label=torch.tensor([]),
            categorical_non_image_features=categorical_features,
            numerical_non_image_features=numerical_features,
            metadata=GeneralSampleMetadata(id="foo"))
        sources = [ClassificationItemSequence(id="foo", items=[item])]
    else:
        item = ScalarDataSource(
            channel_files=[],
            label=torch.tensor([]),
            categorical_non_image_features=categorical_features,
            numerical_non_image_features=numerical_features,
            metadata=GeneralSampleMetadata(id="foo"))
        sources = [item]
    # The statistics are computed the same way for both kinds of sources.
    mean_std = FeatureStatistics.from_data_sources(sources)

    # With one element, the mean is that element itself.
    assert_tensors_equal(mean_std.mean, numerical_features)
    # A standard deviation can't be computed from a single element, hence it becomes nan.
    assert torch.all(torch.isnan(mean_std.std))

    # Applying such a standardization must leave the features untouched (similar to features that are constant).
    first_standardized = mean_std.standardize(sources)[0]
    # For sequences, the data source sits inside the sequence's item list.
    standardized_item = first_standardized.items[0] if is_sequence else first_standardized  # type: ignore
    assert_tensors_equal(standardized_item.numerical_non_image_features,
                         numerical_features)
    assert_tensors_equal(standardized_item.categorical_non_image_features,
                         categorical_features)
Exemple #5
0
def load_single_data_source(subject_rows: pd.DataFrame,
                            subject_id: str,
                            label_value_column: str,
                            channel_column: str,
                            image_channels: Optional[List[str]] = None,
                            image_file_column: Optional[str] = None,
                            label_channels: Optional[List[str]] = None,
                            transform_labels: Union[Callable, List[Callable]] = LabelTransformation.identity,
                            non_image_feature_channels: Optional[Dict] = None,
                            numerical_columns: Optional[List[str]] = None,
                            categorical_data_encoder: Optional[CategoricalToOneHotEncoder] = None,
                            metadata_columns: Optional[Set[str]] = None,
                            is_classification_dataset: bool = True,
                            sequence_position_numeric: Optional[int] = None) -> T:
    """
    Converts the dataset rows of a single subject into a data source object that holds the labels, the
    non-image features, the metadata, and the paths to the image files.
    :param subject_rows: All dataset rows that belong to the same subject. Missing values are replaced by
    empty strings before any row is read.
    :param subject_id: The identifier of the subject that is being processed.
    :param label_value_column: The column that contains the value for the label scalar or vector.
    :param channel_column: The name of the column that contains the row identifier ("channels").
    :param image_channels: The names of all channels (stored in the CSV_CHANNEL_HEADER column of the dataframe)
    that are expected to be loaded from disk later because they are large images. If None or empty, a single
    row lookup with channel None is made.
    :param image_file_column: The name of the column that contains the image file names. If not provided,
    the list of image files is empty.
    :param label_channels: The names of the channels where the label scalar or vector is read from. If None or
    empty, the label is read with channel None.
    :param transform_labels: A label transformation or a list of label transformations to apply to the labels.
    If a list is provided, the transformations are applied in order from left to right.
    :param non_image_feature_channels: A dictionary of the names of all channels where additional scalar
    values should be read from. The keys should map each feature to its channels.
    :param numerical_columns: The names of all columns where additional scalar values should be read from.
    :param categorical_data_encoder: Encoding scheme for categorical data. The categorical columns are the
    dataset columns that the encoder reports as supported.
    :param metadata_columns: A set of columns that will be added to the item metadata as key/value pairs.
    :param is_classification_dataset: If True, labels are parsed as classification labels, otherwise as
    regression labels.
    :param sequence_position_numeric: Numeric position of the data source in a data sequence. Assumed to be
    a non-sequential dataset item if None provided (default).
    :return: A SequenceDataSource if sequence_position_numeric is given, otherwise a ScalarDataSource.
    """

    def _row_for(channel: Optional[str]) -> Dict[str, str]:
        # `subject_rows` is resolved at call time, hence this reads the frame after `fillna` has been applied.
        return _get_single_channel_row(subject_rows, channel, subject_id, channel_column)

    def _label_tensor(channel: Optional[str]) -> torch.Tensor:
        """
        Reads the label of the given channel and wraps the parsed value in a one-element float tensor.
        """
        if is_classification_dataset:
            parse_label = extract_label_classification
        else:
            parse_label = extract_label_regression
        raw_label = _row_for(channel)[label_value_column]
        parsed_label = parse_label(label_string=raw_label, sample_id=subject_id)
        return torch.tensor([parsed_label], dtype=torch.float)

    def _run_label_transforms(labels: Any) -> Any:
        """
        Applies the label transformations in order, feeding the result of each one into the next.
        """
        pipeline = transform_labels if isinstance(transform_labels, List) else [transform_labels]
        for step in pipeline:
            labels = step(labels)
        return labels

    def _channels_or_none(channels: Optional[List]) -> List:
        """
        Returns the given channels, or [None] if there are none.
        """
        if channels is None or len(channels) == 0:
            return [None]
        return channels

    def _feature_channels(channel_map: Dict[str, List[str]], feature: str) -> Sequence[Optional[str]]:
        """
        Returns the channels for a given feature, or [None] if the mapping is empty, i.e. there are no
        channels to be specified.
        :param channel_map: Dict mapping feature names to their channels
        :param feature: feature name for which to return the channels
        :return: List of channels for the given feature.
        """
        return [None] if channel_map == {} else channel_map[feature]

    def _none_if_nan(value: Any) -> Optional[str]:
        # Missing values in the CSV can show up as float NaN; report those as None instead.
        if isinstance(value, float) and np.isnan(value):
            return None
        return value

    subject_rows = subject_rows.fillna('')

    # Labels: one tensor per label channel (or a single one for channel None), then the transformations.
    label = _run_label_transforms([_label_tensor(channel) for channel in (label_channels or [None])])

    # Metadata is taken from the row of the first label channel.
    metadata_row = _row_for(label_channels[0] if label_channels else None)
    metadata_props = {key: _none_if_nan(metadata_row[key]) for key in metadata_columns or set()}
    metadata = GeneralSampleMetadata(id=subject_id, props=metadata_props)

    # Image files: one entry per image channel, only if the file column is known.
    image_files = []
    if image_file_column:
        image_files = [_none_if_nan(_row_for(image_channel)[image_file_column])
                       for image_channel in _channels_or_none(image_channels)]

    numerical_columns = numerical_columns or []
    if categorical_data_encoder:
        categorical_columns = categorical_data_encoder.get_supported_dataset_column_names()
    else:
        categorical_columns = []
    feature_columns = numerical_columns + categorical_columns
    non_image_feature_channels = non_image_feature_channels or {}

    # Non-image features: numerical values go into one flat list, categorical values are kept per column.
    numerical = []
    categorical = {}
    for column in feature_columns:
        is_numerical = column in numerical_columns
        channels: Sequence[Optional[str]]
        if sequence_position_numeric is not None:
            # For sequence items, the sequence position is used as the channel.
            channels = [str(sequence_position_numeric)]
        else:
            channels = _feature_channels(non_image_feature_channels, column)
        column_values = []
        for channel in channels:  # type: ignore
            row = _row_for(channel)
            prefix = f"Channel {channel}, column {column}"
            raw_value = row[column]
            if is_numerical:
                column_values.append(_string_to_float(raw_value, error_message_prefix=prefix))
            else:
                column_values.append(raw_value)
        if is_numerical:
            numerical.extend(column_values)
        else:
            categorical[column] = column_values

    if categorical_data_encoder:
        categorical_non_image_features = categorical_data_encoder.encode(categorical)
    else:
        categorical_non_image_features = torch.tensor(list(categorical.values()))

    # Both data source classes are constructed from the same fields; only sequence items carry a position.
    is_sequence_item = sequence_position_numeric is not None
    if is_sequence_item:
        metadata.sequence_position = sequence_position_numeric
    source_class = SequenceDataSource if is_sequence_item else ScalarDataSource
    datasource: Union[SequenceDataSource, ScalarDataSource] = source_class(
        label=label,
        channel_files=image_files,
        numerical_non_image_features=torch.tensor(numerical).float(),
        categorical_non_image_features=categorical_non_image_features.float(),
        metadata=metadata
    )
    return datasource  # type: ignore