def test_train_model_lr_schedule(self):
        """Fit a tiny model with the hvd learning-rate callbacks and check them.

        A callback appended after the hvd callbacks reads the optimizer's
        learning rate at the end of every epoch and asserts it against the
        bound expected for that epoch range. After training, the local loss
        history is compared with the one broadcast from rank 0.
        """
        initial_lr = 0.1 * hvd.size()
        opt = hvd.DistributedOptimizer(tf.keras.optimizers.Adam())

        def linear_multiplier(epoch):
            return epoch

        def at_most(value, ceiling):
            # True when `value` is below `ceiling` or equal to it within
            # np.isclose's default tolerance.
            return value <= ceiling or np.isclose(value, ceiling)

        model = keras.models.Sequential([
            keras.layers.Dense(2, input_shape=(3, )),
            keras.layers.RepeatVector(3),
            keras.layers.ThresholdedReLU(0.5),
        ])
        model.compile(loss=keras.losses.mean_squared_error,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      experimental_run_tf_function=False)
        x = np.random.random((10, 3))
        y = np.random.random((10, 3, 2))

        class StoreLearningRateCallback(tf.keras.callbacks.Callback):
            def on_epoch_end(self, epoch, logs=None):
                current_lr = self.model.optimizer.lr.numpy()

                # Warmup epochs: the rate must not exceed the target rate.
                if 0 <= epoch < 5:
                    assert at_most(current_lr, initial_lr)

                # Constant-multiplier schedules. Epochs 5 and 10 themselves
                # are not checked.
                if 5 < epoch < 10:
                    assert at_most(current_lr, initial_lr * 1e-1)
                if 10 < epoch < 15:
                    assert at_most(current_lr, initial_lr * 1e-2)

                # Callable-multiplier schedule: the rate must match exactly
                # (within np.isclose's tolerance).
                if 15 <= epoch < 20:
                    assert np.isclose(current_lr,
                                      initial_lr * linear_multiplier(epoch))

        # (multiplier, start_epoch, end_epoch) for each schedule callback.
        schedule_specs = [
            (1e-1, 5, 10),
            (1e-2, 10, 15),
            (linear_multiplier, 15, 20),
        ]

        # BroadcastGlobalVariablesCallback gets no assertion of its own; it
        # only has to run without hanging or raising.
        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.MetricAverageCallback(),
            hvd.callbacks.LearningRateWarmupCallback(initial_lr=initial_lr,
                                                     warmup_epochs=5),
        ]
        callbacks.extend(
            hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr,
                                                       multiplier=multiplier,
                                                       start_epoch=first,
                                                       end_epoch=last)
            for multiplier, first, last in schedule_specs)
        callbacks.append(StoreLearningRateCallback())

        train_history = model.fit(x,
                                  y,
                                  steps_per_epoch=5,
                                  callbacks=callbacks,
                                  epochs=20)

        # The loss history on this worker must match the one held by rank 0.
        local_losses = tf.convert_to_tensor(train_history.history["loss"],
                                            dtype=tf.float32)
        root_losses = hvd.broadcast(local_losses, root_rank=0)
        self.assertAllClose(root_losses, local_losses)
# Start from epoch 0 unless checkpointing is enabled below.
resume_from_epoch = 0

if args.use_checkpointing:
    # Checkpoint and TensorBoard callbacks are registered on the root worker
    # only.
    if hvd.rank() == 0:
        callbacks.extend([
            keras.callbacks.ModelCheckpoint(args.checkpoint_format),
            keras.callbacks.TensorBoard(args.log_dir),
        ])

    # Each worker computes a restart epoch, then takes the value broadcast
    # from the first process.
    resume_from_epoch = hvd.broadcast(restart_epoch(args), 0)


# Create/load the model.
model = create_model(resume_from_epoch)

# Train the model.
model.fit_generator(train_iter,
                    # Keep the total number of steps the same despite the increased number of workers.
                    steps_per_epoch=len(train_iter) // hvd.size(),
                    callbacks=callbacks,
                    epochs=args.epochs,
                    verbose=verbose,
                    workers=4,
                    initial_epoch=resume_from_epoch,
                    validation_data=test_iter,  
Example #3
0
import tensorflow as tf

import horovod.tensorflow.keras as hvd
hvd.init()

# Ensure only 1 process downloads the data on each node: the process with
# local rank 0 loads first and then joins the broadcast, while every other
# process joins the broadcast before loading.
is_first_on_node = hvd.local_rank() == 0
if not is_first_on_node:
    hvd.broadcast(0, 0)
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
if is_first_on_node:
    hvd.broadcast(0, 0)

# Data partition for different workers: each rank keeps one contiguous,
# equally sized slice of the training set.
num_pics_per_rank = x_train.shape[0] // hvd.size()
pic_begin = num_pics_per_rank * hvd.rank()
pic_end = pic_begin + num_pics_per_rank
rank_rows = slice(pic_begin, pic_end)
x_train = x_train[rank_rows]
y_train = y_train[rank_rows]

# Divide the image arrays by 255.0.
x_train = x_train / 255.0
x_test = x_test / 255.0

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(10))

# The Dense layer has no activation, so the loss is told it receives logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Scale the learning rate by hvd.size() (the number of workers) and wrap the
# optimizer with the DistributedOptimizer class provided by Horovod.
Example #4
0
                        .map(preprocess, num_parallel_calls=AUTOTUNE)
                        .batch(args.val_batch_size))
    
# Look for a pre-existing checkpoint from which to resume training. Epochs are
# scanned from args.epochs down to 1, so the first file found is the newest.
existing_checkpoints_dir = pathlib.Path(args.read_checkpoints_from)
checkpoint_filepath = None
initial_epoch = 0
for _most_recent_epoch in range(args.epochs, 0, -1):
    _checkpoint_filepath = f"{existing_checkpoints_dir}/checkpoint-epoch-{_most_recent_epoch:02d}.h5"
    if os.path.exists(_checkpoint_filepath):
        checkpoint_filepath = _checkpoint_filepath
        initial_epoch = _most_recent_epoch
        break

# Make sure that all workers agree to resume training from the same epoch.
# The broadcast result has to be bound to `initial_epoch` itself; binding it
# to any other name leaves each worker with its locally discovered epoch.
# NOTE(review): `checkpoint_filepath` is still each worker's local result and
# is not reconciled with the broadcast epoch here - confirm that is intended.
initial_epoch = hvd.broadcast(initial_epoch, root_rank=0, name='initial_epoch')
# `intial_epoch` (sic) was the name previously bound here; it is kept as an
# alias in case code outside this view still reads the misspelled name.
intial_epoch = initial_epoch

_loss_fn = keras.losses.CategoricalCrossentropy()

# Adjust the initial learning rate based on the number of "effective GPUs":
# the global batch size (per-worker batch size times hvd.size()) expressed in
# whole multiples of args.base_batch_size.
_global_batch_size = args.batch_size * hvd.size()
_n_effective_gpus = _global_batch_size // args.base_batch_size
# NOTE(review): the floor division yields 0 (and therefore a learning rate of
# 0) whenever the global batch size is below args.base_batch_size - confirm
# that configuration cannot occur.
_initial_lr = args.base_lr * _n_effective_gpus
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
_optimizer = keras.optimizers.SGD(learning_rate=_initial_lr,
                                  momentum=args.momentum)
_distributed_optimizer = hvd.DistributedOptimizer(_optimizer)

_metrics = [
    keras.metrics.CategoricalAccuracy(),
    keras.metrics.TopKCategoricalAccuracy(k=5)