Code example #1
callbacks = [
    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),

    # Horovod: average metrics among workers at the end of every epoch.
    hvd.callbacks.MetricAverageCallback(),

    # Horovod: scale the learning rate up gradually during the first warmup epochs.
    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=verbose,
                                             initial_lr=initial_lr),

    # Horovod: after the warmup, reduce the learning rate by 10x on the 30th, 60th and 80th epochs.
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.,
                                               initial_lr=initial_lr),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1, initial_lr=initial_lr),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2, initial_lr=initial_lr),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3, initial_lr=initial_lr),
]

# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format))
    callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

# Train the model. The training will randomly sample 1 / N batches of training data and
# 3 / N batches of validation data on every worker, where N is the number of workers.
# Over-sampling of validation data helps to increase probability that every validation
# example will be evaluated.
model.fit_generator(train_iter,
                    steps_per_epoch=len(train_iter) // hvd.size(),
                    callbacks=callbacks,
                    epochs=args.epochs,
                    verbose=verbose,
                    workers=4,
                    initial_epoch=resume_from_epoch,
                    validation_data=test_iter,
                    validation_steps=3 * len(test_iter) // hvd.size())

# Evaluate the model on the full data set.
score = hvd.allreduce(model.evaluate_generator(test_iter, len(test_iter), workers=4))
if verbose:
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
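
The four LearningRateScheduleCallback entries above form a piecewise-constant schedule: Horovod sets lr = initial_lr * multiplier for whichever segment contains the current epoch. A minimal plain-Python sketch of the resulting schedule (not the Horovod API, shown only to make the segments explicit):

def scheduled_lr(epoch, initial_lr):
    # Mirrors the multipliers used by the callbacks above; the warmup epochs are
    # handled separately by LearningRateWarmupCallback.
    if epoch < 30:
        return initial_lr * 1.0
    if epoch < 60:
        return initial_lr * 1e-1
    if epoch < 80:
        return initial_lr * 1e-2
    return initial_lr * 1e-3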
Code example #2
File: cifar10.py  Project: gautamsharma0095/arsenal
def train_model(model, xy_train, xy_test,
                tensorboard_dir,
                data_augmentation=False, epochs=200, batch_size=32):

    x_train, y_train = xy_train
    x_test, y_test = xy_test
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                batch_size=batch_size,
                epochs=epochs,
                validation_data=(x_test, y_test),
                verbose=2,
                shuffle=True)
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            featurewise_std_normalization=False,  # divide inputs by std of the dataset
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
            width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
            height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
            horizontal_flip=True,  # randomly flip images horizontally
            vertical_flip=False)  # randomly flip images vertically

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        ]

        verbose = 0

        # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
        if hvd.rank() == 0:
            checkpoint = os.path.join(OUTPUT_DIR,
                                      'checkpoint-{epoch}.h5')
            callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint))
            callbacks.append(keras.callbacks.TensorBoard(log_dir=tensorboard_dir))
            verbose = 2

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                        batch_size=batch_size),
                            steps_per_epoch=x_train.shape[0] // batch_size,
                            epochs=epochs,
                            verbose=verbose,
                            callbacks=callbacks,
                            validation_data=(x_test, y_test))

        # Evaluate model with test data set and share sample prediction results
        evaluation = hvd.allreduce(model.evaluate_generator(datagen.flow(x_test, y_test,
                                                                         batch_size=batch_size),
                                                            steps=x_test.shape[0] // batch_size))
        if hvd.rank() == 0:
            print('Model Accuracy = %.2f' % (evaluation[1]))
            riseml.report_result(accuracy=float(evaluation[1]))
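
Each worker evaluates a different subset of test batches, so the hvd.allreduce wrapped around evaluate_generator averages the per-worker [loss, accuracy] vectors and every rank ends up with the same aggregated score. A single-process sketch of that averaging, with made-up per-worker numbers purely for illustration:

import numpy as np

# Hypothetical [loss, accuracy] pairs returned by evaluate_generator on two workers.
worker_scores = np.array([[0.42, 0.88],
                          [0.46, 0.86]])

# hvd.allreduce(score) with the default average=True yields this mean on every rank.
aggregated = worker_scores.mean(axis=0)
print('Model Accuracy = %.2f' % aggregated[1])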
Code example #3
def main(args):
    # Initialize Horovod.
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
    
    fold = args.data_path.split("fold_")[1]
    if hvd.rank() == 0:
        print("================================")
        if args.use_lovasz:
            print("Fine-tuning with Lovasz loss")
        print("Fold {}".format(fold))
        
    # Find the best saved model.
    best_model_file = 'weights/{}/fold_{}_{epoch}_best.h5'.format(args.model, fold, epoch='{epoch}')
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(best_model_file.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    if hvd.rank() == 0:
        print("Last model saved: {}".format(best_model_file.format(epoch=resume_from_epoch)))
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')
    # Horovod: print verbose logs only on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    # Create the dataset.
    dataset = TGSDataset(data_path=args.data_path, batch_size=args.batch_size)
    input_shape = (args.target_size, args.target_size)
    mask_shape = (101, 101)
    train_data_generator = dataset.get_train_data_generator(input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    val_data_generator = dataset.get_val_data_generator(input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    train_step_size = dataset.train_step_size // hvd.size()
    val_step_size = dataset.val_step_size // hvd.size()
    #Create model
    model = make_model(args.model, (args.target_size, args.target_size, 3), 2)

    #load weights
    if resume_from_epoch > 0:
        model.load_weights(best_model_file.format(epoch=resume_from_epoch))
        
    size = hvd.size()
    opt = hvd.DistributedOptimizer(SGD(lr=args.learning_rate * size, momentum=0.9, nesterov=True))

    #Loss
    loss = losses.c_lovasz_loss if args.use_lovasz else losses.c_binary_crossentropy
    
    model.compile(loss=loss,
                  optimizer=opt,
                  metrics=[metrics.c_binary_accuracy, metrics.c_iou])

    #h5 model
    best_model = ModelCheckpointMGPU(model, filepath=best_model_file, monitor='val_loss',
                                     verbose=1,
                                     mode='min',
                                     period=1,
                                     save_best_only=True,
                                     save_weights_only=True)
    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=True)
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
        callbacks.append(best_model)
    
    #Fit model
    history = model.fit_generator(train_data_generator,
                        steps_per_epoch=train_step_size,
                        callbacks=callbacks,
                        epochs=args.epochs,
                        verbose=verbose,
                        workers=4,
                        initial_epoch=resume_from_epoch,
                        validation_data=val_data_generator,
                        validation_steps=val_step_size)
  

    score = hvd.allreduce(model.evaluate_generator(val_data_generator, val_step_size, workers=4))
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
Code example #4
callbacks = [
    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),

    # Horovod: average metrics among workers at the end of every epoch.
    hvd.callbacks.MetricAverageCallback(),

    # Horovod: scale the learning rate up gradually during the first warmup epochs.
    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=warmup_epochs, verbose=verbose),

    # Horovod: after the warmup, reduce the learning rate by 10x on the 30th, 60th and 80th epochs.
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=warmup_epochs, end_epoch=30, multiplier=1.),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
]

# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
    callbacks.append(keras.callbacks.TensorBoard(log_dir))

# Train the model. The training will randomly sample 1 / N batches of training data and
# 3 / N batches of validation data on every worker, where N is the number of workers.
# Over-sampling of validation data helps to increase probability that every validation
# example will be evaluated.
model.fit_generator(train_iter,
                    steps_per_epoch=len(train_iter) // hvd.size(),
                    callbacks=callbacks,
                    epochs=epochs,
                    verbose=verbose,
                    workers=4,
                    initial_epoch=resume_from_epoch,
                    validation_data=test_iter,
                    validation_steps=3 * len(test_iter) // hvd.size())

# Evaluate the model on the full data set.
score = hvd.allreduce(model.evaluate_generator(test_iter, len(test_iter), workers=4))
if verbose:
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
Code example #5
def main():
    verbose = 1
    logger = _get_logger()
    if _DISTRIBUTED:
        # Horovod: initialize Horovod.
        hvd.init()
        logger.info("Runnin Distributed")
        verbose = 1 if hvd.rank() == 0 else 0

    logger.info("Tensorflow version {}".format(tf.__version__))
    K.set_session(tf.Session(config=_get_runconfig()))

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = 0
    if _DISTRIBUTED:
        resume_from_epoch = hvd.broadcast(resume_from_epoch,
                                          0,
                                          name="resume_from_epoch")

    if _FAKE:
        train_iter = _fake_data_iterator_from()
    else:
        train_iter = _training_data_iterator_from()
        test_iter = _validation_data_iterator_from() if _VALIDATION else None

    model = _create_model()

    params = {"learning_rate": _LR, "momentum": 0.9}

    opt = _get_optimizer(params)
    model.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=opt,
        metrics=["accuracy", "top_k_categorical_accuracy"],
    )

    model_dir = _get_model_dir()
    checkpoint_format = os.path.join(model_dir, "checkpoint-{epoch}.h5")

    callbacks = _get_hooks()
    callbacks.append(LoggerCallback(logger, len(train_iter) * _BATCHSIZE))

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if _is_master():
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
        # callbacks.append(keras.callbacks.TensorBoard(log_dir))

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast weights to other workers.
    if resume_from_epoch > 0 and _is_master():
        model.load_weights(checkpoint_format.format(epoch=resume_from_epoch))

    logger.info("Training...")
    # Train the model. The training will randomly sample 1 / N batches of training data and
    # 3 / N batches of validation data on every worker, where N is the number of workers.
    # Over-sampling of validation data helps to increase probability that every validation
    # example will be evaluated.
    num_workers = hvd.size() if _DISTRIBUTED else 1
    model.fit_generator(
        train_iter,
        steps_per_epoch=len(train_iter) // num_workers,
        callbacks=callbacks,
        epochs=_EPOCHS,
        verbose=verbose,
        workers=_NUM_WORKERS,
        max_queue_size=_MAX_QUEUE_SIZE,
        use_multiprocessing=_MULTIPROCESSING,
        initial_epoch=resume_from_epoch,
    )

    if _FAKE is False and _VALIDATION:
        # Evaluate the model on the full data set.
        with Timer(output=logger.info, prefix="Testing"):
            logger.info("Testing...")
            score = hvd.allreduce(
                model.evaluate_generator(test_iter, len(test_iter),
                                         workers=10))
            if verbose:
                print("Test loss:", score[0])
                print("Test accuracy:", score[1])
Code example #6
File: allreduce2.py  Project: shykoe/wenzheng
#          \date   2019-07-30 17:05:04.755084
#   \Description    nc horovodrun -np 2  python allreduce.py
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

import horovod.keras as hvd
import numpy as np
hvd.init()
hvd_r = int(hvd.rank())
assert hvd.size() == 2
# Each process takes a small slice of the array; the slices are then combined across processes.
test_array = np.array(range(100))
# This rank's slice.
span = int(100 / hvd.size())
#x=np.mean(test_array[hvd_r * span: (hvd_r + 1) * span])

x = test_array[hvd_r * span:(hvd_r + 1) * span]

# Sum the slices elementwise across all processes (average=False returns a sum, not a mean).
y = hvd.allreduce(x, average=False)

# Only one process prints the result.
if hvd_r == 0:
    print(y, len(y), sum(y))
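
Because average=False is passed, the allreduce above returns an elementwise sum of the two rank-local slices rather than their mean. A single-process NumPy sketch of what every rank receives in this two-process run:

import numpy as np

test_array = np.array(range(100))
rank0_slice = test_array[0:50]      # slice computed on rank 0
rank1_slice = test_array[50:100]    # slice computed on rank 1

# hvd.allreduce(x, average=False) hands this elementwise sum back to both ranks.
summed = rank0_slice + rank1_slice
print(summed, len(summed), summed.sum())  # 50 elements summing to 4950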
Code example #7
def main():
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.compat.v1.Session(config=config))

    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = hvd.broadcast(resume_from_epoch,
                                      0,
                                      name='resume_from_epoch')

    # Horovod: print logs on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    # Training data iterator.
    train_gen = image.ImageDataGenerator()
    #width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
    #preprocessing_function=keras.applications.resnet50.preprocess_input)
    train_iter = train_gen.flow_from_directory(args.train,
                                               batch_size=args.batch_size,
                                               target_size=(224, 224))

    # Validation data iterator.
    test_gen = image.ImageDataGenerator()
    #zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
    test_iter = test_gen.flow_from_directory(args.val,
                                             batch_size=args.val_batch_size,
                                             target_size=(224, 224))

    # train iterator for tfrecord
    train_iter_tf = iterator(args.train_dir)
    val_iter_tf = iterator(args.val_dir)

    # timeline
    #timeline = tf.train.ProfilerHook(save_steps=500, output_dir='./timeline')
    #run_options  = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
    #run_metadata = tf.compat.v1.RunMetadata()

    # Set up standard ResNet-50 model.
    model = keras.applications.resnet50.ResNet50(weights=None)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast both model and optimizer weights
    # to other workers.
    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model(
            args.checkpoint_format.format(epoch=resume_from_epoch),
            compression=compression)
    else:
        # ResNet-50 model that is included with Keras is optimized for inference.
        # Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5

        model = keras.models.Model.from_config(model_config)

        # Horovod: adjust learning rate based on number of GPUs.
        opt = keras.optimizers.SGD(lr=args.base_lr * hvd.size(),
                                   momentum=args.momentum)

        # Horovod: add Horovod Distributed Optimizer.
        opt = hvd.DistributedOptimizer(opt, compression=compression)

        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=opt,
                      metrics=['accuracy', 'top_k_categorical_accuracy'])
        #              options=run_options,
        #              run_metadata=run_metadata
        #              )

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=args.warmup_epochs, verbose=verbose),

        # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
        hvd.callbacks.LearningRateScheduleCallback(
            start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=30,
                                                   end_epoch=60,
                                                   multiplier=1e-1),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=60,
                                                   end_epoch=80,
                                                   multiplier=1e-2),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=80,
                                                   multiplier=1e-3),
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    # Train the model. The training will randomly sample 1 / N batches of training data and
    # 3 / N batches of validation data on every worker, where N is the number of workers.
    # Over-sampling of validation data helps to increase probability that every validation
    # example will be evaluated.

    print('----  train  len------ :', len(train_iter))
    print('----  test   len------ :', len(test_iter))
    total_train_step = len(train_iter)
    total_val_step = len(test_iter)

    #model.fit_generator(train_iter,
    model.fit(
        train_iter_tf,
        #steps_per_epoch=40037 // hvd.size(),
        steps_per_epoch=total_train_step // hvd.size(),
        callbacks=callbacks,
        epochs=args.epochs,
        verbose=verbose,
        workers=8,
        initial_epoch=resume_from_epoch,
        validation_data=val_iter_tf,
        validation_steps=3 * total_val_step // hvd.size())

    # timeline tracing
    #trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    #with open ('./timeline.keras.json','w') as f:
    #     f.write(trace.generate_chrome_trace_format())

    # Evaluate the model on the full data set.
    score = hvd.allreduce(
        model.evaluate_generator(test_iter, len(test_iter), workers=4))
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
Code example #8
def main(_):
    '''Main routine for Horovod Tensorflow Mnist example.'''
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=str(hvd.local_rank()))
    config = tf.ConfigProto(gpu_options=gpu_options)

    batch_size = 100

    # Download and load MNIST dataset.
    if hvd.rank() == 0:
        # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR)
        image, label = get_data_mnist(batch_size)

    # hvd.allreduce(tf.constant([0]), average=False)  # Barrier (not working)
    with tf.Session(config=config):
        # download/unzip in rank 0 only.
        hvd_keras.allreduce([0], name="Barrier")

    if hvd.rank() != 0:
        # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR)
        image, label = get_data_mnist(batch_size)

    # Build model...
    # with tf.name_scope('input'):
    #     image = tf.placeholder(tf.float32, [None, 784], name='image')
    #     label = tf.placeholder(tf.float32, [None], name='label')

    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    # global_step = tf.contrib.framework.get_or_create_global_step()
    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            # image_, label_ = mnist.train.next_batch(100)
            # mon_sess.run(train_op, feed_dict={image: image_, label: label_})
            mon_sess.run(train_op)
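
The rank-0-only download followed by hvd_keras.allreduce([0], name="Barrier") acts as a makeshift barrier: one process fetches the data, and the collective forces every rank to wait before reading it. A minimal sketch of that idiom as a reusable helper, assuming horovod.keras is available and download_fn / load_fn are placeholder callables; as in the example above, a TensorFlow session must already be active when the allreduce runs:

import horovod.keras as hvd_keras

def load_data_once(download_fn, load_fn):
    # Only rank 0 touches the network; every other rank waits at the allreduce.
    if hvd_keras.rank() == 0:
        download_fn()
    hvd_keras.allreduce([0], name="Barrier")
    # All ranks can now safely read the files rank 0 just downloaded.
    return load_fn()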
Code example #9
File: allreduce.py  Project: shykoe/wenzheng
#          \date   2019-07-30 17:05:04.755084
#   \Description    nc horovodrun -np 2  python allreduce.py
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

import horovod.keras as hvd
import numpy as np

hvd.init()
hvd_r = int(hvd.rank())
assert hvd.size() == 2
# Each process computes the mean of its own slice of the array.
test_array = np.array(range(100))
# This rank's slice.
span = int(100 / hvd.size())
x = np.mean(test_array[hvd_r * span:(hvd_r + 1) * span])
#x=test_array[hvd_r * span: (hvd_r + 1) * span]

# Average the per-rank means across all processes.
y = hvd.allreduce(x)

# Only one process prints the result.
if hvd_r == 0:
    print("mean of the big array is %f" % y)
Code example #10
def main():
    parser = argparse.ArgumentParser(
        description='Keras Fashion MNIST Example',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--log-dir', default='./logs',
                        help='tensorboard log directory')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='input batch size for training')
    parser.add_argument('--val-batch-size', type=int, default=32,
                        help='input batch size for validation')
    parser.add_argument('--epochs', type=int, default=40,
                        help='number of epochs to train')
    parser.add_argument('--base-lr', type=float, default=0.01,
                        help='learning rate for a single GPU')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='SGD momentum')
    parser.add_argument('--wd', type=float, default=0.000005,
                        help='weight decay')
    # TODO: Step 9 part 1: register `--warmup-epochs`
    parser.add_argument('--warmup-epochs', type=float, default=5,
                        help='number of warmup epochs')

    GRAPHDEF_FILE = 'graphdef'
    parser.add_argument(
        '--savegraph', action='store', nargs='?',
        const=GRAPHDEF_FILE,
        help='Save graphdef pb and pbtxt files. '
        '(default: {})'.format(GRAPHDEF_FILE))

    parser.add_argument(
        '--profrun', action='store_true',
        help='Run for nsys/dlprof profiling. Runs only a few steps.')

    args = parser.parse_args()

    # Checkpoints will be written in the log directory.
    args.checkpoint_format = \
        os.path.join(args.log_dir, 'checkpoint-{epoch}.h5')

    print('AMP MIXED', os.environ.get("TF_ENABLE_AUTO_MIXED_PRECISION"))

    # TODO: Step 2 work here: initialize horovod
    hvd.init()

    # TODO: Step 3 work here: pin GPUs
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # TODO: Step 4 work here: broadcast `resume_from_epoch` from first process
    # to all others
    with tf.Session(config=config):
        resume_from_epoch = \
            hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # TODO: Step 5 work here: only set `verbose` to `1` if this is the
    # first worker
    verbose = 1 if hvd.rank() == 0 else 0

    # Input image dimensions
    img_rows, img_cols = 28, 28
    num_classes = 10

    # Download and load FASHION MNIST dataset.
    if hvd.rank() == 0:
        # Load Fashion MNIST data.
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

    with tf.Session(config=config):
        # download/unzip in rank 0 only.
        hvd.allreduce([0], name="Barrier")

    if hvd.rank() != 0:
        # Load Fashion MNIST data.
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

    if K.image_data_format() == 'channels_first':
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    # Convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # Training data iterator.
    train_gen = image.ImageDataGenerator(
        featurewise_center=True, featurewise_std_normalization=True,
        horizontal_flip=True, width_shift_range=0.2, height_shift_range=0.2)
    train_gen.fit(x_train)
    train_iter = train_gen.flow(x_train, y_train, batch_size=args.batch_size)

    # Validation data iterator.
    test_gen = image.ImageDataGenerator(
        featurewise_center=True, featurewise_std_normalization=True)
    test_gen.mean = train_gen.mean
    test_gen.std = train_gen.std
    test_iter = test_gen.flow(x_test, y_test, batch_size=args.val_batch_size)

    base_lr = args.base_lr
    LR = base_lr * hvd.size()

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # if resume_from_epoch > 0 and hvd.rank() == 0:
    if resume_from_epoch > 0:
        # TODO: Step 6 work here: only execute the `if` statement if this is
        # the first worker
        # If this is only done in rank 0 get following errors:
        #     horovod/common/operations.cc:764] One or more tensors were
        #     submitted to be reduced, gathered or broadcasted by subset of
        #     ranks and are waiting for remainder of ranks
        model = keras.models.load_model(
            args.checkpoint_format.format(epoch=resume_from_epoch))
    else:
        # Set up standard WideResNet-16-10 model.
        model = WideResidualNetwork(
            depth=16, width=10, weights=None, input_shape=input_shape,
            classes=num_classes, dropout_rate=0.01)

        # WideResNet model that is included with Keras is optimized for
        # inference. Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5

        model = keras.models.Model.from_config(model_config)

        # TODO: Step 7 part 1 work here: increase the base learning rate by the
        # number of workers
        opt = keras.optimizers.SGD(
            lr=LR, momentum=args.momentum)

        # TODO: Step 7 part 2 work here: Wrap the optimizer in a Horovod
        # distributed optimizer
        opt_dist = hvd.DistributedOptimizer(opt)

        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=opt_dist,
                      metrics=['accuracy'])

    def lr_schedule(epoch):
        # global LR
        if epoch < 15:
            return LR
        if epoch < 25:
            return 1e-1 * LR
        if epoch < 35:
            return 1e-2 * LR
        return 1e-3 * LR

    warmup_epochs = args.warmup_epochs
    callbacks = [
        # TODO: Step 8: broadcast initial variable states from the first
        # worker to all others
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # TODO: Step 12: average the metrics among workers at the end of every
        # epoch
        hvd.callbacks.MetricAverageCallback(),

        # TODO: Step 9 part 2: implement a LR warmup over `args.warmup_epochs`
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=warmup_epochs, verbose=verbose),

        # TODO: Step 9 part 3: replace with the Horovod learning rate
        # scheduler, taking care not to start until after warmup is complete
        hvd.callbacks.LearningRateScheduleCallback(
            lr_schedule, start_epoch=warmup_epochs)
    ]

    if hvd.rank() == 0:
        # TODO: Step 10: only append these 2 callbacks to `callbacks` if they
        # are to be executed by the first worker
        callbacks.append(
            keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    # Train the model.
    number_of_workers = hvd.size()
    steps_per_epoch = len(train_iter) // number_of_workers
    validation_steps = 3 * len(test_iter) // number_of_workers

    # For a profiling run, train only a few steps per epoch.
    if args.profrun:
        steps_per_epoch = 4

    model.fit_generator(train_iter,
                        # TODO: Step 11 part 1: keep the total number of steps
                        # the same in spite of an increased number of workers
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        epochs=args.epochs,
                        verbose=verbose,
                        workers=number_of_workers,
                        initial_epoch=resume_from_epoch,
                        validation_data=test_iter,
                        # TODO: Step 11 part 2: Set this value to be
                        # 3 * num_test_iterations / number_of_workers
                        validation_steps=validation_steps)

    # Evaluate the model on the full data set.
    score = model.evaluate_generator(test_iter, len(test_iter),
                                     workers=number_of_workers)

    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

    if hvd.rank() == 0 and args.savegraph:
        graphdef_file = args.savegraph

        session = K.get_session()
        graph_def = session.graph.as_graph_def()
        with open('{}.pb'.format(graphdef_file), 'wb') as f:
            f.write(graph_def.SerializeToString())
        with open('{}.pbtxt'.format(graphdef_file), 'w') as f:
            f.write(str(graph_def))
Code example #11
def main(argv=None):
    '''Train a simple deep CNN on the CIFAR10 small images dataset on multigpu
    (and optionally multinode+multigpu) systems via Horovod implementation.
    '''
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__
    # CLI parser
    # args = parser_(argv[1:], desc)
    args = parser_(desc)

    # Initialize Horovod.
    hvd.init()

    logdevp = args.logdevp  # For debugging
    log_device_placement, allow_soft_placement = (True, True) \
        if _DEVPROF or logdevp else (False, False)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(local_rank,
                                                      gpu_local_rank))

    # Pin GPU to local rank. Typically one GPU per process unless
    # oversubscribing GPUs (experimental MPS). In model parallelism it's
    # possible to have multiple GPUs per process.
    # visible_device_list = str(hvd.local_rank())
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=str(gpu_local_rank))
    config = tf.ConfigProto(log_device_placement=log_device_placement,
                            allow_soft_placement=allow_soft_placement,
                            gpu_options=gpu_options)
    KB.set_session(tf.Session(config=config))

    hvdsize = hvd.size()

    checkpt = args.checkpt
    filepath = checkpt

    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs

    datadir = args.datadir

    # The data, shuffled and split between train and test sets:
    if hvd.rank() == 0:
        # download only in rank0 i.e. single process
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)

    hvd_keras.allreduce([0], name="Barrier")
    if hvd.rank() != 0:
        # Data should be downloaded already so load in the other ranks.
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)

    train_samples = x_train.shape[0]
    test_samples = x_test.shape[0]
    steps_per_epoch = train_samples // batch_size // hvdsize

    print_rank0('{} train samples'.format(train_samples), hvd)
    print_rank0('{} test samples'.format(test_samples), hvd)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    if not args.use_dataset_api:
        traingen = ImageDataGenerator()
        if args.aug:
            print_rank0('Using real-time data augmentation.', hvd)
            # This will do preprocessing and realtime data augmentation:
            traingen = ImageDataGenerator(
                # set input mean to 0 over the dataset
                featurewise_center=False,
                # set each sample mean to 0
                samplewise_center=False,
                # divide inputs by std of the dataset
                featurewise_std_normalization=False,
                # divide each input by its std
                samplewise_std_normalization=False,
                # apply ZCA whitening
                zca_whitening=False,
                # randomly rotate images in the range (degrees, 0 to 180)
                rotation_range=0,
                # randomly shift images horizontally (fraction of total width)
                width_shift_range=0.1,
                # randomly shift images vertically (fraction of total height)
                height_shift_range=0.1,
                # randomly flip images horizontally
                horizontal_flip=True,
                # randomly flip images vertically
                vertical_flip=False)

            # Compute quantities required for feature-wise normalization
            # (std, mean, and principal components if ZCA whitening is applied)
            traingen.fit(x_train)

        model = make_model(x_train.shape[1:], num_classes, filepath)
    else:
        print_rank0('USING TF DATASET API.', hvd)
        dataset = wrap_as_tfdataset(x_train,
                                    y_train,
                                    args.aug,
                                    batch_size,
                                    gpu_local_rank,
                                    prefetch_to_device=True,
                                    comm=hvd_keras)
        iterator = dataset.make_one_shot_iterator()

        # Model creation using tensors from the get_next() graph node.
        inputs, targets = iterator.get_next()
        x_train_input = KL.Input(tensor=inputs)

        model_init = make_model(x_train_input, num_classes, filepath)
        x_train_out = model_init.output

        model = Model(inputs=[x_train_input], outputs=[x_train_out])

    # Let's train the model using RMSprop
    lr = 0.0001 * hvdsize

    # opt = KO.RMSprop(lr=lr, decay=1e-6)
    # opt = hvd_keras.DistributedOptimizer(opt)

    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'],
        target_tensors=None if not args.use_dataset_api else [targets])

    if hvd.rank() == 0:
        model.summary()

    callbacks = []
    if checkpt and hvd.rank() == 0:
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='loss',
                                     mode='min',
                                     verbose=1,
                                     save_best_only=True)
        callbacks.append(checkpoint)

    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    # Broadcast initial variable states from rank 0 to all other procs.
    # This is necessary to ensure consistent initialization of all
    # workers when training is started with random weights or restored
    # from a checkpoint.
    # Callback when using horovod.keras as hvd
    # callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    KB.get_session().run(hvd.broadcast_global_variables(0))

    if not args.use_dataset_api:
        start_time = time.time()
        # Fit the model on the batches generated by traingen.flow().
        model.fit_generator(
            traingen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test) if hvd.rank() == 0 else None,
            verbose=hvd.rank() == 0,
            callbacks=callbacks)
    else:
        # augmentation incorporated in the Dataset pipeline
        start_time = time.time()
        # Validation during training can be incorporated via callback:
        # noqa ref: https://github.com/keras-team/keras/blob/c8bef99ec7a2032b9bea6e9a1260d05a2b6a80f1/examples/mnist_tfrecord.py#L56
        model.fit(steps_per_epoch=steps_per_epoch,
                  epochs=epochs,
                  verbose=hvd.rank() == 0,
                  callbacks=callbacks)

    if hvd.rank() != 0:
        return

    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    test_model = model
    if args.use_dataset_api:
        # Create a test-model without Dataset pipeline in the model graph.
        test_model = make_model(x_test.shape[1:], num_classes)
        test_model.compile(loss=keras_losses.categorical_crossentropy,
                           optimizer=opt,
                           metrics=['accuracy'])
        print('SETTING WEIGHTS FOR EVAL WITH DATASET API...')
        test_model.set_weights(model.get_weights())
        print('WEIGHTS SET!!!')

    metrics = test_model.evaluate(x_test, y_test)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))
Code example #12
        model.fit(steps_per_epoch=steps_per_epoch,
                  epochs=epochs,
                  verbose=hvd.rank() == 0,
                  callbacks=callbacks)

    if hvd.rank() != 0:
        return

    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    test_model = model
    if args.use_dataset_api:
        # Create a test-model without Dataset pipeline in the model graph.
        test_model = make_model(x_test.shape[1:], num_classes)
        test_model.compile(loss=keras_losses.categorical_crossentropy,
                           optimizer=opt,
                           metrics=['accuracy'])
        print('SETTING WEIGHTS FOR EVAL WITH DATASET API...')
        test_model.set_weights(model.get_weights())
        print('WEIGHTS SET!!!')

    metrics = test_model.evaluate(x_test, y_test)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))


if __name__ == '__main__':
    main()
    # join all ranks and cleanup Keras/Tensorflow session.
    hvd_keras.allreduce([0], name="Barrier")
    KB.clear_session()