def test_horovod_adasum_multiple_allreduce_gpu_nccl(self):
        """Test on GPU using NCCL that the Adasum correctly computes 2D tensors."""
        hvd.init()
        # TODO support non-MPI Adasum operation
        if not hvd.mpi_enabled() or not hvd.gpu_available(
                'tensorflow') or not hvd.nccl_built():
            self.skipTest("MPI, GPU or NCCL not available")

        rank = hvd.rank()
        rank_tensors = []
        size = hvd.size()
        # TODO support testing with a non-power-of-2 number of ranks
        if not is_power2(size):
            self.skipTest("MPI size is not a power of 2")

        local_size = hvd.local_size()

        # Only run on homogeneous cluster
        if not hvd.is_homogeneous():
            self.skipTest("Horovod cluster is not homogeneous")

        num_nodes = int(size / local_size)
        for _ in range(size):
            rank_tensors.append([
                np.random.random_sample((2, 2)),
                np.random.random_sample((2, 2))
            ])
        sum_local_ranks_tensor = []
        for i in range(num_nodes):
            sum_local_ranks_tensor.append([np.zeros((2, 2)), np.zeros((2, 2))])
            for j in range(local_size):
                # accumulate the tensors contributed by node i's local ranks
                sum_local_ranks_tensor[i] = np.add(
                    sum_local_ranks_tensor[i],
                    rank_tensors[i * local_size + j])

        answer = reference_tree_reduction(sum_local_ranks_tensor, num_nodes)
        answer = np.true_divide(answer, local_size)
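        # With NCCL, Adasum sums within each node via NCCL and then applies the
        # Adasum tree reduction across nodes. The Adasum operation is positively
        # homogeneous (Adasum(c*a, c*b) = c*Adasum(a, b) for c > 0), so dividing
        # by local_size after the tree reduction, as above, is equivalent to
        # averaging within each node first.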
        for dtype in [tf.float16, tf.float32, tf.float64]:
            with tf.device("/gpu:{}".format(hvd.local_rank())):
                tensors = [tf.constant(t) for t in rank_tensors[rank]]
                # cast to the corresponding dtype
                tensors = [tf.cast(tensor, dtype) for tensor in tensors]
                # run the Adasum allreduce on each tensor
                reduced_tensors = [
                    self.evaluate(hvd.allreduce(tensor, op=hvd.Adasum))
                    for tensor in tensors
                ]
                # cast expected result to the type of the tensorflow values
                np_type = dtype.as_numpy_dtype
                tmp = [t.astype(np_type) for t in answer]
                self.assertAllCloseAccordingToType(tmp, reduced_tensors)
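
The test above leans on two helpers that the snippet doesn't include: is_power2 and reference_tree_reduction. A minimal sketch of what they might look like, using the scalar Adasum rule Adasum(a, b) = (1 - a.b / (2*||a||^2)) * a + (1 - a.b / (2*||b||^2)) * b; treat this as an illustration, not necessarily Horovod's exact test helpers:

import math
import numpy as np

def is_power2(num):
    # True for 1, 2, 4, 8, ...
    return num != 0 and (num & (num - 1)) == 0

def adasum_reference_operation(a, b):
    # Scale each operand down by half of its projection onto the other so that
    # correlated contributions are not double-counted.
    anormsq = np.inner(a.ravel(), a.ravel())
    bnormsq = np.inner(b.ravel(), b.ravel())
    dot = np.dot(a.ravel(), b.ravel())
    acoeff = 1.0 - dot / (2.0 * anormsq) if anormsq != 0 else 1.0
    bcoeff = 1.0 - dot / (2.0 * bnormsq) if bnormsq != 0 else 1.0
    return acoeff * a + bcoeff * b

def reference_tree_reduction(tensors, num_ranks):
    # Pairwise (binary-tree) Adasum over a power-of-2 number of entries,
    # applied elementwise to each entry's list of tensors.
    if num_ranks == 1:
        return tensors[0]
    temp = list(tensors)
    for level in range(int(math.log2(num_ranks))):
        for i in range(num_ranks // 2 ** (level + 1)):
            temp[i] = [adasum_reference_operation(a, b)
                       for a, b in zip(temp[2 * i], temp[2 * i + 1])]
    return temp[0]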
Example #2
def log_final_result(value, error):
    if hvd.rank() > 0:
        return
    import horovod
    attrs = {
        'framework': 'horovod',
        'version': horovod.__version__,
        'np': hvd.size(),
        'bs': args.batch_size,
        'model': args.model,
    }
    try:
        attrs['nccl_built'] = hvd.nccl_built()
    except Exception:
        # nccl_built() may be unavailable in some Horovod versions/builds
        pass
    log_detailed_result(value, error, attrs)
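
log_detailed_result and the global args come from the surrounding benchmark script and aren't shown here. A hypothetical stand-in, just so the snippet can run on its own, might emit the measurement and its metadata as one JSON line:

import json

def log_detailed_result(value, error, attrs):
    # Hypothetical helper: print the result and its error together with the
    # benchmark attributes as a single JSON record.
    record = dict(attrs, value=value, error=error)
    print(json.dumps(record, sort_keys=True))
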
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The downloaded images have shape (num_samples, 28, 28), so reshape them
    # to (num_samples, 784) to feed into our network, and normalize the pixel
    # values to the range [0, 1].
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need lr scaling when the effective batch size
    # increases. With NCCL, however, gradients are first averaged within each
    # node, so scale the lr by local_size to compensate.
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(args.lr * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(
        opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=args.num_steps // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
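
conv_model and train_input_generator are defined elsewhere in this example. The generator plausibly looks like the sketch below (modeled on the Horovod MNIST example: reshuffle every epoch, yield fixed-size batches; the default batch_size here is an assumption):

def train_input_generator(x_train, y_train, batch_size=64):
    # Endlessly reshuffle the training set and yield consecutive batches.
    assert len(x_train) == len(y_train)
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size]
            index += batch_size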
Example #4
# Assumed preceding context: pin each process to its local GPU when CUDA is on.
config = tf.ConfigProto()
if args.cuda:
    config.gpu_options.visible_device_list = str(hvd.local_rank())
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
model = getattr(applications, args.model)(weights=None)

lr_scaler = hvd.size()
# By default, Adasum doesn't need lr scaling when the effective batch size
# increases. With NCCL, however, gradients are first averaged within each
# node, so scale the lr by local_size to compensate.
if args.use_adasum:
    lr_scaler = hvd.local_size() if args.cuda and hvd.nccl_built() else 1

opt = tf.train.GradientDescentOptimizer(0.01 * lr_scaler)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
opt = hvd.DistributedOptimizer(
    opt,
    compression=compression,
    op=hvd.Adasum if args.use_adasum else hvd.Average)

init = tf.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)
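
In graph mode these two ops still need to be executed before training: run init on every worker, then bcast_op so all workers start from rank 0's weights. A minimal sketch of that startup; train_op is a stand-in for whatever op the benchmark actually minimizes:

with tf.Session(config=config) as session:
    session.run(init)      # initialize variables on every worker
    session.run(bcast_op)  # broadcast rank 0's variables to the other workers
    for _ in range(10):        # illustrative step count
        session.run(train_op)  # train_op: hypothetical, built from opt above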