Example #1
def convert(*, csv, tfrecords_template, volume_shape, volumes_per_shard,
            to_ras, gzip, verify_volumes, num_parallel_calls, verbose):
    """Convert medical imaging volumes to TFRecords.

    Volumes must all be the same shape. This will overwrite existing TFRecords files.
    """
    # TODO: improve docs.
    volume_filepaths = _read_csv(csv)
    # Both -1 and None mean "use all available cores".
    if num_parallel_calls == -1 or num_parallel_calls is None:
        num_parallel_calls = _get_all_cpus()

    _dirname = os.path.dirname(os.path.abspath(tfrecords_template))
    if not os.path.exists(_dirname):
        raise ValueError("directory does not exist: {}".format(_dirname))

    if verify_volumes:
        invalid_pairs = _verify_features_labels(
            volume_filepaths=volume_filepaths,
            volume_shape=volume_shape,
            check_shape=True,
            check_labels_int=True,
            check_labels_gte_zero=True,
            num_parallel_calls=None,
            verbose=1)

        if not invalid_pairs:
            click.echo(click.style('Passed verification.', fg='green'))
        else:
            click.echo(click.style('Failed verification.', fg='red'))
            click.echo(
                "Found {} invalid pairs of volumes. These files might not all have shape {}, the labels might not be an integer type or coercible to integer type, or the labels might not be >= 0."
                .format(len(invalid_pairs), volume_shape))
            for pair in invalid_pairs:
                click.echo(pair[0])
                click.echo(pair[1])
            sys.exit(-1)

    _convert(volume_filepaths=volume_filepaths,
             tfrecords_template=tfrecords_template,
             volumes_per_shard=volumes_per_shard,
             to_ras=to_ras,
             gzip_compressed=gzip,
             num_parallel_calls=num_parallel_calls,
             verbose=verbose)

    click.echo(click.style('Finished conversion to TFRecords.', fg='green'))
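The `_read_csv` helper used above is not part of this snippet. A minimal sketch of what it might look like, assuming a two-column CSV of features/labels filepaths with a header row (both assumptions):

import csv as csv_module  # aliased because the CLI parameter is named 'csv'

def _read_csv(filepath, skip_header=True):
    # Sketch only: returns a list of [features_path, labels_path] pairs.
    with open(filepath, newline='') as f:
        reader = csv_module.reader(f)
        if skip_header:
            next(reader, None)
        return [row for row in reader]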
Example #2
def convert(volume_filepaths,
            tfrecords_template="tfrecords/data_shard-{shard:03d}.tfrecords",
            volumes_per_shard=100,
            to_ras=True,
            gzip_compressed=True,
            num_parallel_calls=None,
            verbose=1):
    """Convert a list of features and labels volumes to TFRecords. The volume
    data and the affine for each volume are saved, so the original images can
    be recreated. This method uses multiple cores but can still take a considerable
    amount of time for large datasets. Gzip compression should be used, as it
    dramatically decreases the size of the resulting tfrecords file.

    This method will save multiple TFRecords files if the length of `volume_filepaths` is
    greater than `volumes_per_shard`. Sharding the data into multiple TFRecords files
    is beneficial for at least two reasons:

        1. Allows for better shuffling of large datasets because we can shuffle
            the files and not only the underlying data.
        2. Enables use of parallel reading of TFRecords files with the `tf.data` API,
            which can decrease overall data processing time.

    For example, if one is converting 100 pairs of files (i.e., length of
    `volume_filepaths` is 100) and `volumes_per_shard` is 40, then three
    TFRecords files will be created. Because the pairs are divided with
    `numpy.array_split`, the shards are nearly equal in size: the first shard
    will contain 34 pairs, and the other two will contain 33 pairs each.

    Parameters
    ----------
    volume_filepaths: nested list. Every sublist in the list should contain two
        items: path to features volume and path to labels volume, in that order.
    tfrecords_template: string template, path to save new tfrecords file with the string formatting
        key 'shard'. The shard index is entered at this key when each shard's
        filepath is created. Extension should be '.tfrecords'.
    volumes_per_shard: int, number of pairs of volumes per tfrecords file.
    to_ras: boolean, if true, reorient volumes to RAS with `nibabel.as_closest_canonical`.
    gzip_compressed: boolean, if true, compress data with Gzip. This is highly
        recommended, as it dramatically reduces file size.
    num_parallel_calls: int, number of processes to use for multiprocessing. If
        None, will use all available processes.
    verbose: {0, 1, 2}, verbosity of the progress bar. 0 is silent, 1 is verbose,
        and 2 is semi-verbose.

    Returns
    -------
    None
    """
    try:
        _filled = tfrecords_template.format(shard=0)
    except Exception:
        raise ValueError(
            "invalid 'tfrecords_template'. This template must contain the key 'shard'.")
    # `str.format` silently ignores unused keyword arguments, so also confirm
    # that the 'shard' key changes the path; otherwise all shards would
    # overwrite the same file.
    if _filled == tfrecords_template.format(shard=1):
        raise ValueError(
            "invalid 'tfrecords_template'. This template must contain the key 'shard'.")

    tfrecords_template = os.path.abspath(tfrecords_template)
    _dirname = os.path.dirname(tfrecords_template)
    if not os.path.exists(_dirname):
        raise ValueError("directory does not exist: {}".format(_dirname))

    n_shards = math.ceil(len(volume_filepaths) / volumes_per_shard)
    # Include the unique tfrecords filepath for each file, because that's what
    # the map function expects.
    volume_filepaths_shards = [[
        tfrecords_template.format(shard=idx),
        shard.tolist()
    ] for idx, shard in enumerate(np.array_split(volume_filepaths, n_shards))]
    map_fn = functools.partial(_convert,
                               to_ras=to_ras,
                               gzip_compressed=gzip_compressed)

    print("Converting {} pairs of files to {} TFRecords.".format(
        len(volume_filepaths), len(volume_filepaths_shards)))
    progbar = tf.keras.utils.Progbar(len(volume_filepaths_shards),
                                     verbose=verbose)
    progbar.update(0)
    if num_parallel_calls is None:
        num_parallel_calls = _get_all_cpus()

    if num_parallel_calls == 1:
        for vf in volume_filepaths_shards:
            map_fn(vf)
            progbar.add(1)
    else:
        with multiprocessing.Pool(num_parallel_calls) as p:
            for _ in p.imap(map_fn, volume_filepaths_shards, chunksize=2):
                progbar.add(1)
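A brief usage sketch of `convert` with hypothetical filepaths; each sublist pairs a features volume with its labels volume:

import os

# Hypothetical inputs; replace with real NIfTI filepaths.
volume_filepaths = [
    ["sub-01_T1w.nii.gz", "sub-01_aseg.nii.gz"],
    ["sub-02_T1w.nii.gz", "sub-02_aseg.nii.gz"],
    ["sub-03_T1w.nii.gz", "sub-03_aseg.nii.gz"],
]
# The template's directory must already exist.
os.makedirs("tfrecords", exist_ok=True)
# math.ceil(3 / 2) == 2 shards: data_shard-000.tfrecords and
# data_shard-001.tfrecords, holding 2 and 1 pairs respectively.
convert(volume_filepaths,
        tfrecords_template="tfrecords/data_shard-{shard:03d}.tfrecords",
        volumes_per_shard=2,
        num_parallel_calls=1)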
Example #3
def verify_features_labels(volume_filepaths,
                           volume_shape=(256, 256, 256),
                           check_shape=True,
                           check_labels_int=True,
                           check_labels_gte_zero=True,
                           num_parallel_calls=None,
                           verbose=1):
    """Verify a list of files. This function is meant to be run before
    converting volumes to TFRecords.

    Parameters
    ----------
    volume_filepaths: nested list. Every sublist in the list should contain two
        items: path to features volume and path to labels volume, in that order.
    volume_shape: tuple of three ints. Shape that both volumes should be.
    check_shape: boolean, if true, validate that the shape of both volumes is
        equal to 'volume_shape'.
    check_labels_int: boolean, if true, validate that every labels volume is an
        integer type or can be safely converted to an integer type.
    check_labels_gte_zero: boolean, if true, validate that every labels volume
        has values greater than or equal to zero.
    num_parallel_calls: int, number of processes to use for multiprocessing. If
        None, will use all available processes.
    verbose: {0, 1, 2}, verbosity of the progress bar. 0 is silent, 1 is verbose,
        and 2 is semi-verbose.

    Returns
    -------
    List of invalid pairs of filepaths. If the list is empty, all filepaths are
    valid.
    """

    for pair in volume_filepaths:
        if len(pair) != 2:
            raise ValueError(
                "all items in 'volume_filepaths' must have length of 2, but"
                " found at least one item with length != 2.")
        if not os.path.exists(pair[0]):
            raise ValueError("file does not exist: {}".format(pair[0]))
        if not os.path.exists(pair[1]):
            raise ValueError("file does not exist: {}".format(pair[1]))

    print("Verifying {} pairs of volumes".format(len(volume_filepaths)))
    progbar = tf.keras.utils.Progbar(len(volume_filepaths), verbose=verbose)
    progbar.update(0)
    map_fn = functools.partial(_verify_features_labels_pair,
                               volume_shape=volume_shape,
                               check_shape=check_shape,
                               check_labels_int=check_labels_int,
                               check_labels_gte_zero=check_labels_gte_zero)
    if num_parallel_calls is None:
        num_parallel_calls = _get_all_cpus()

    outputs = []
    if num_parallel_calls == 1:
        for vf in volume_filepaths:
            valid = map_fn(vf)
            outputs.append(valid)
            progbar.add(1)
    else:
        with multiprocessing.Pool(num_parallel_calls) as p:
            for valid in p.imap(map_fn, volume_filepaths, chunksize=2):
                outputs.append(valid)
                progbar.add(1)
    invalid_files = [
        pair for valid, pair in zip(outputs, volume_filepaths) if not valid
    ]
    return invalid_files
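The per-pair worker `_verify_features_labels_pair` is not included above. One plausible sketch using nibabel, performing the checks the docstring describes (the body here is an assumption; only the name and signature come from the call site above):

import nibabel as nib
import numpy as np

def _verify_features_labels_pair(pair, *, volume_shape, check_shape,
                                 check_labels_int, check_labels_gte_zero):
    # Sketch only; the actual implementation is not shown in this snippet.
    features_img = nib.load(pair[0])
    labels_img = nib.load(pair[1])
    if check_shape and not (features_img.shape == tuple(volume_shape)
                            and labels_img.shape == tuple(volume_shape)):
        return False
    labels = labels_img.get_fdata()
    if check_labels_int and not np.array_equal(labels, np.round(labels)):
        return False
    if check_labels_gte_zero and labels.min() < 0:
        return False
    return True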
Example #4
def test_get_all_cpus():
    os.environ.pop('SLURM_CPUS_ON_NODE', None)
    assert nbutils._get_all_cpus() == multiprocessing.cpu_count()
    os.environ['SLURM_CPUS_ON_NODE'] = "128"
    try:
        assert nbutils._get_all_cpus() == 128
    finally:
        del os.environ['SLURM_CPUS_ON_NODE']  # avoid leaking into other tests
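The test pins down the expected behavior of `_get_all_cpus`: honor SLURM's CPU allocation when the variable is set, otherwise fall back to the machine's CPU count. A minimal sketch consistent with the test:

import multiprocessing
import os

def _get_all_cpus():
    # Prefer the SLURM allocation on cluster nodes; otherwise use the
    # local CPU count.
    return int(os.environ.get('SLURM_CPUS_ON_NODE',
                              multiprocessing.cpu_count()))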
Example #5
def train(*, model, tfrecords_pattern, n_classes, batch_size, volume_shape,
          block_shape, n_epochs, initial_epoch, n_volumes, loss, learning_rate,
          shuffle_buffer_size, mapping, augment, model_kwds, multi_gpu,
          num_parallel_calls):
    """Train a model."""

    if num_parallel_calls == -1:
        num_parallel_calls = _get_all_cpus()

    dataset = _get_dataset(file_pattern=tfrecords_pattern,
                           n_classes=n_classes,
                           batch_size=batch_size,
                           volume_shape=volume_shape,
                           block_shape=block_shape,
                           n_epochs=n_epochs,
                           mapping=mapping,
                           augment=augment,
                           shuffle_buffer_size=shuffle_buffer_size,
                           num_parallel_calls=num_parallel_calls)

    steps_per_epoch = _get_steps_per_epoch(n_volumes=n_volumes,
                                           volume_shape=volume_shape,
                                           block_shape=block_shape,
                                           batch_size=batch_size)

    # TODO: validation dataset

    if multi_gpu:
        # As of March 05, 2019, keras optimizers only have experimental support
        # with MirroredStrategy training.
        optimizer = tf.train.AdamOptimizer(learning_rate)
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate)

    # Searches custom nobrainer losses as well as standard tf.keras losses.
    loss = _get_loss(loss)
    # Instantiate loss object if it is a class.
    if inspect.isclass(loss):
        loss = loss()

    if model_kwds is None:
        model_kwds = {}

    click.echo('Beginning to train model in directory {}'.format(os.getcwd()))
    click.echo('Parameters:')
    pprint.pprint({
        'model': model,
        'tfrecords_pattern': tfrecords_pattern,
        'n_classes': n_classes,
        'batch_size': batch_size,
        'volume_shape': volume_shape,
        'block_shape': block_shape,
        'n_epochs': n_epochs,
        'initial_epoch': initial_epoch,
        'n_volumes': n_volumes,
        'loss': loss,
        'learning_rate': learning_rate,
        'shuffle_buffer_size': shuffle_buffer_size,
        'mapping': mapping,
        'augment': augment,
        'model_kwds': model_kwds,
        'multi_gpu': multi_gpu,
        'num_parallel_calls': num_parallel_calls
    })

    history = _train(
        model=model,
        dataset=dataset,
        optimizer=optimizer,
        loss=loss,
        steps_per_epoch=steps_per_epoch,
        model_kwds=model_kwds,
        n_epochs=n_epochs,
        initial_epoch=initial_epoch,
        # Use default metrics and callbacks.
        metrics=None,
        callbacks=None,
        multi_gpu=multi_gpu,
        devices=None)

    click.echo(click.style('Finished training model.', fg='green'))
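The `_get_steps_per_epoch` helper is not shown here. Assuming each volume is split into non-overlapping blocks of `block_shape` (an assumption suggested by the parameters), the arithmetic would be:

import math

def _get_steps_per_epoch(n_volumes, volume_shape, block_shape, batch_size):
    # Sketch only. Number of non-overlapping blocks that fit in one volume:
    blocks_per_volume = 1
    for vol_dim, block_dim in zip(volume_shape, block_shape):
        blocks_per_volume *= vol_dim // block_dim
    # One training step consumes one batch of blocks.
    return math.ceil(n_volumes * blocks_per_volume / batch_size)

For example, 10 volumes of shape (256, 256, 256) with blocks of shape (128, 128, 128) give 8 blocks per volume, so with a batch size of 2 this is 40 steps per epoch.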