def test_replace():
    data = np.arange(5)
    mapping = {0: 10, 1: 20, 2: 30, 3: 40, 4: 30}
    output = volume.replace(data, mapping)
    assert_array_equal(output, [10, 20, 30, 40, 30])

    # Test that overlapping keys and values gives correct result.
    data = np.arange(5)
    mapping = {0: 1, 1: 2, 2: 3, 3: 4}
    output = volume.replace(data, mapping)
    assert_array_equal(output, [1, 2, 3, 4, 4])

    data = np.arange(8).reshape(2, 2, 2)
    mapping = {0: 100, 100: 10, 10: 5, 3: 5}
    outputs = volume.replace(data, mapping, zero=False)
    expected = data.copy()
    expected[0, 0, 0] = 100
    expected[0, 1, 1] = 5
    assert_array_equal(outputs, expected)

    # Zero values not in mapping values.
    outputs = volume.replace(data, mapping, zero=True)
    expected = np.zeros_like(data)
    expected[0, 0, 0] = 100
    expected[0, 1, 1] = 5
    expected[1, 0, 1] = 5
    assert_array_equal(outputs, expected)
def test_replace():
    data = np.arange(5)
    mapping = {
        0: 10,
        1: 20,
        2: 30,
        3: 40,
        4: 30,
    }
    data = replace(data, mapping)
    assert_array_equal(data, [10, 20, 30, 40, 30])

    # Test that overlapping keys and values gives correct result.
    data = np.arange(5)
    mapping = {
        0: 1,
        1: 2,
        2: 3,
        3: 4,
    }
    data = replace(data, mapping)
    assert_array_equal(data, [1, 2, 3, 4, 4])
Exemple #3
0
def get_dataset(
    file_pattern,
    n_classes,
    batch_size,
    volume_shape,
    scalar_label=False,
    block_shape=None,
    n_epochs=None,
    mapping=None,
    augment=False,
    shuffle_buffer_size=None,
    num_parallel_calls=AUTOTUNE,
):
    """Return `tf.data.Dataset` that preprocesses data for training or prediction.

    Labels are preprocessed for binary or multiclass segmentation according to
    `n_classes`.

    Parameters
    ----------
    file_pattern: str, expression that can be globbed to get TFRecords files
        for this dataset. For example 'data/training_*.tfrecords'.
    n_classes: int, number of classes to segment. Values of 1 and 2 indicate
        binary segmentation (foreground vs background), and values greater than
        2 indicate multiclass segmentation.
    batch_size: int, number of elements per batch.
    volume_shape: tuple of length 3, the shape of every volume in the TFRecords
        files. Every volume must have the same shape.
    scalar_label: boolean, if `True`, labels are scalars.
    block_shape: tuple of length 3, the shape of the non-overlapping sub-volumes
        to take from the full volumes. If None, do not separate the full volumes
        into sub-volumes. Separating into non-overlapping sub-volumes is useful
        (sometimes even necessary) to overcome memory limitations depending on
        the number of model parameters.
    n_epochs: int, number of epochs for the dataset to repeat. If None, the
        dataset will be repeated indefinitely.
    mapping: dict, mapping to replace label values. Values equal to a key in
        the mapping are replaced with the corresponding values in the mapping.
        Values not in `mapping.keys()` are replaced with zeros.
    augment: boolean, if true, apply random rigid transformations to the
        features and labels. The rigid transformations are applied to the full
        volumes.
    shuffle_buffer_size: int, buffer of full volumes to shuffle. If this is not
        None, then the list of files found by 'file_pattern' is also shuffled
        at every iteration.
    num_parallel_calls: int, number of parallel calls to make for data loading
        and processing.

    Returns
    -------
    `tf.data.Dataset` of features and labels. If block_shape is not None, the
    shape of features is `(batch_size, *block_shape, 1)` and the shape of labels
    is `(batch_size, *block_shape, n_classes)`. If block_shape is None, then
    the shape of features is `(batch_size, *volume_shape, 1)` and the shape of
    labels is `(batch_size, *volume_shape, n_classes)`. If `scalar_label` is `True,
    the shape of labels is always `(batch_size,)`.
    """

    files = glob.glob(file_pattern)
    if not files:
        raise ValueError(
            "no files found for pattern '{}'".format(file_pattern))

    # Create dataset of all TFRecord files. After this point, the dataset will have
    # two value per iteration: (feature, label).
    shuffle = bool(shuffle_buffer_size)
    compressed = _is_gzipped(files[0])
    dataset = tfrecord_dataset(
        file_pattern=file_pattern,
        volume_shape=volume_shape,
        shuffle=shuffle,
        scalar_label=scalar_label,
        compressed=compressed,
        num_parallel_calls=num_parallel_calls,
    )

    # Standard-score the features.
    dataset = dataset.map(lambda x, y: (standardize(x), y))

    # Separate into blocks, if requested.
    if block_shape is not None:
        if not scalar_label:
            dataset = dataset.map(
                lambda x, y:
                (to_blocks(x, block_shape), to_blocks(y, block_shape)),
                num_parallel_calls=num_parallel_calls,
            )
            # This step is necessary because separating into blocks adds a dimension.
            dataset = dataset.unbatch()
        if scalar_label:

            def _f(x, y):
                x = to_blocks(x, block_shape)
                n_blocks = x.shape[0]
                y = tf.repeat(y, n_blocks)
                return (x, y)

            dataset = dataset.map(_f, num_parallel_calls=num_parallel_calls)
            # This step is necessary because separating into blocks adds a dimension.
            dataset = dataset.unbatch()

    # Augment examples if requested.
    if augment:
        if not scalar_label:
            dataset = dataset.map(
                lambda x, y: tf.cond(
                    tf.random.uniform((1, )) > 0.5,
                    true_fn=lambda: apply_random_transform(x, y),
                    false_fn=lambda: (x, y),
                ),
                num_parallel_calls=num_parallel_calls,
            )
        else:
            dataset = dataset.map(
                lambda x, y: tf.cond(
                    tf.random.uniform((1, )) > 0.5,
                    true_fn=lambda: apply_random_transform_scalar_labels(x, y),
                    false_fn=lambda: (x, y),
                ),
                num_parallel_calls=num_parallel_calls,
            )

    # Binarize or replace labels according to mapping.
    if not scalar_label:
        if n_classes < 1:
            raise ValueError("n_classes must be > 0.")
        elif n_classes == 1:
            dataset = dataset.map(lambda x, y:
                                  (x, tf.expand_dims(binarize(y), -1)))
        elif n_classes == 2:
            dataset = dataset.map(lambda x, y:
                                  (x, tf.one_hot(binarize(y), n_classes)))
        elif n_classes > 2:
            if mapping is not None:
                dataset = dataset.map(lambda x, y:
                                      (x, replace(y, mapping=mapping)))
            dataset = dataset.map(lambda x, y: (x, tf.one_hot(y, n_classes)))

    # Add grayscale channel to features.
    # TODO: in the future, multi-channel features should be supported.
    dataset = dataset.map(lambda x, y: (tf.expand_dims(x, -1), y))

    # Prefetch data to overlap data production with data consumption. The
    # TensorFlow documentation suggests prefetching `batch_size` elements.
    dataset = dataset.prefetch(buffer_size=batch_size)

    # Batch the dataset, so each iteration gives `batch_size` elements. We drop
    # the remainder so that when training on multiple GPUs, the batch will
    # always be evenly divisible by the number of GPUs. Otherwise, the last
    # batch might have fewer than `batch_size` elements and will cause errors.
    if batch_size is not None:
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)

    # Optionally shuffle. We also optionally shuffle the list of files.
    # The TensorFlow recommend shuffling and then repeating.
    if shuffle_buffer_size:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    # Repeat the dataset for n_epochs. If n_epochs is None, then repeat
    # indefinitely. If n_epochs is 1, then the dataset will only be iterated
    # through once.
    dataset = dataset.repeat(n_epochs)

    return dataset
def validate_from_filepath(
    filepath,
    predictor,
    block_shape,
    n_classes,
    mapping_y,
    return_variance=False,
    return_entropy=False,
    return_array_from_images=False,
    n_samples=1,
    normalizer=normalize_zero_one,
    batch_size=4,
    dtype=DT_X,
):
    """Computes dice for a prediction compared to a ground truth image.

    Args:
        filepath: tuple, tupel of paths to existing neuroimaging volume (index 0)
         and ground truth (index 1).
        predictor: TensorFlow Predictor object, predictor from previously
            trained model.
        n_classes: int, number of classifications the model is trained to output.
        mapping_y: path-like, path to csv mapping file per command line argument.
        block_shape: tuple of len 3, shape of blocks on which to predict.
        return_variance: Boolean. If set True, it returns the running population 
            variance along with mean. Note, if the n_samples is smaller or equal to 1,
            the variance will not be returned; instead it will return None
        return_entropy: Boolean. If set True, it returns the running entropy.
            along with mean.       
        return_array_from_images: Boolean. If set True and the given input is either image,
            filepath, or filepaths, it will return arrays of [mean, variance, entropy]
            instead of images of them. Also, if the input is array, it will
            simply return array, whether or not this flag is True or False.
        n_samples: The number of sampling. If set as 1, it will just return the 
            single prediction value.
        normalizer: callable, function that accepts an ndarray and returns an
            ndarray. Called before separating volume into blocks.
        batch_size: int, number of sub-volumes per batch for prediction.
        dtype: str or dtype object, dtype of features.

    Returns:
        `nibabel.spatialimages.SpatialImage` or arrays of predictions of 
        mean, variance(optional), and entropy (optional).
    """
    if not Path(filepath[0]).is_file():
        raise FileNotFoundError("could not find file {}".format(filepath[0]))
    img = nib.load(filepath[0])
    y = read_volume(filepath[1], dtype=np.int32)

    outputs = _predict(inputs=img,
                       predictor=predictor,
                       block_shape=block_shape,
                       return_variance=return_variance,
                       return_entropy=return_entropy,
                       return_array_from_images=return_array_from_images,
                       n_samples=n_samples,
                       normalizer=normalizer,
                       batch_size=batch_size)
    prediction_image = outputs[0].get_data()
    y = replace(y, read_mapping(mapping_y))
    dice = get_dice_for_images(prediction_image, y, n_classes)
    return outputs, dice