    initial_lr=initial_lr),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60,
                                               multiplier=1e-1, initial_lr=initial_lr),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80,
                                               multiplier=1e-2, initial_lr=initial_lr),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=80,
                                               multiplier=1e-3, initial_lr=initial_lr),
]

# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format))
    callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

# Train the model. The training will randomly sample 1 / N batches of training data and
# 3 / N batches of validation data on every worker, where N is the number of workers.
# Over-sampling of validation data helps to increase probability that every validation
# example will be evaluated.
model.fit_generator(train_iter,
                    steps_per_epoch=len(train_iter) // hvd.size(),
                    callbacks=callbacks,
                    epochs=args.epochs,
                    verbose=verbose,
                    workers=4,
                    initial_epoch=resume_from_epoch,
                    validation_data=test_iter,
                    validation_steps=3 * len(test_iter) // hvd.size())

# Evaluate the model on the full data set.
score = hvd.allreduce(model.evaluate_generator(test_iter, len(test_iter), workers=4))
if verbose:
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
def train_model(model, xy_train, xy_test, tensorboard_dir,
                data_augmentation=False, epochs=200, batch_size=32):
    x_train, y_train = xy_train
    x_test, y_test = xy_test
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  verbose=2,
                  shuffle=True)
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            featurewise_std_normalization=False,  # divide inputs by std of the dataset
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
            width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
            height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        ]

        verbose = 0
        # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
        if hvd.rank() == 0:
            checkpoint = os.path.join(OUTPUT_DIR, 'checkpoint-{epoch}.h5')
            callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint))
            callbacks.append(keras.callbacks.TensorBoard(log_dir=tensorboard_dir))
            verbose = 2

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size),
                            steps_per_epoch=x_train.shape[0] // batch_size,
                            epochs=epochs,
                            verbose=verbose,
                            callbacks=callbacks,
                            validation_data=(x_test, y_test))

        # Evaluate model with test data set and share sample prediction results
        evaluation = hvd.allreduce(model.evaluate_generator(
            datagen.flow(x_test, y_test, batch_size=batch_size),
            steps=x_test.shape[0] // batch_size))
        if hvd.rank() == 0:
            print('Model Accuracy = %.2f' % (evaluation[1]))
            riseml.report_result(accuracy=float(evaluation[1]))
def main(args):
    # Initialize Horovod.
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    fold = args.data_path.split("fold_")[1]
    if hvd.rank() == 0:
        print("================================")
        if args.use_lovasz:
            print("Fine tuning with Lovasz loss")
        print("Fold {}".format(fold))

    # Find the best saved model
    best_model_file = 'weights/{}/fold_{}_{epoch}_best.h5'.format(args.model, fold, epoch='{epoch}')
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(best_model_file.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    if hvd.rank() == 0:
        print("Last model saved: {}".format(best_model_file.format(epoch=resume_from_epoch)))
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # Verbose mode on the first worker only
    if hvd.rank() == 0:
        verbose = 1
    else:
        verbose = 0

    # Create dataset
    dataset = TGSDataset(data_path=args.data_path, batch_size=args.batch_size)
    input_shape = (args.target_size, args.target_size)
    mask_shape = (101, 101)
    train_data_generator = dataset.get_train_data_generator(input_size=input_shape,
                                                            mask_size=mask_shape,
                                                            seed=np.random.rand())
    val_data_generator = dataset.get_val_data_generator(input_size=input_shape,
                                                        mask_size=mask_shape,
                                                        seed=np.random.rand())
    train_step_size = dataset.train_step_size // hvd.size()
    val_step_size = dataset.val_step_size // hvd.size()

    # Create model
    model = make_model(args.model, (args.target_size, args.target_size, 3), 2)

    # Load weights
    if resume_from_epoch > 0:
        model.load_weights(best_model_file.format(epoch=resume_from_epoch))

    # Horovod: scale the learning rate by the number of workers and wrap the optimizer.
    size = hvd.size()
    opt = hvd.DistributedOptimizer(SGD(lr=args.learning_rate * size, momentum=0.9, nesterov=True))

    # Loss
    loss = losses.c_lovasz_loss if args.use_lovasz else losses.c_binary_crossentropy
    model.compile(loss=loss, optimizer=opt, metrics=[metrics.c_binary_accuracy, metrics.c_iou])

    # h5 model checkpoint
    best_model = ModelCheckpointMGPU(model, filepath=best_model_file, monitor='val_loss',
                                     verbose=1, mode='min', period=1,
                                     save_best_only=True, save_weights_only=True)

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=True),
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
        callbacks.append(best_model)

    # Fit model
    history = model.fit_generator(train_data_generator,
                                  steps_per_epoch=train_step_size,
                                  callbacks=callbacks,
                                  epochs=args.epochs,
                                  verbose=verbose,
                                  workers=4,
                                  initial_epoch=resume_from_epoch,
                                  validation_data=val_data_generator,
                                  validation_steps=val_step_size)

    score = hvd.allreduce(model.evaluate_generator(val_data_generator, val_step_size, workers=4))
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=warmup_epochs, end_epoch=30, multiplier=1.),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
]

# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
    callbacks.append(keras.callbacks.TensorBoard(log_dir))

# Train the model. The training will randomly sample 1 / N batches of training data and
# 3 / N batches of validation data on every worker, where N is the number of workers.
# Over-sampling of validation data helps to increase probability that every validation
# example will be evaluated.
model.fit_generator(train_iter,
                    steps_per_epoch=len(train_iter) // hvd.size(),
                    callbacks=callbacks,
                    epochs=epochs,
                    verbose=verbose,
                    workers=4,
                    initial_epoch=resume_from_epoch,
                    validation_data=test_iter,
                    validation_steps=3 * len(test_iter) // hvd.size())

# Evaluate the model on the full data set.
score = hvd.allreduce(model.evaluate_generator(test_iter, len(test_iter), workers=4))
if verbose:
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
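# The four schedule callbacks above amount to a standard step decay applied after
# warmup: full learning rate until epoch 30, then 10x drops at epochs 30, 60 and 80.
# A minimal sketch of the effective multiplier as a plain function (the epoch
# boundaries are taken from the callbacks above; the warmup_epochs default of 5 is
# an assumption matching the "first five epochs" comment used elsewhere in these
# examples):
def lr_multiplier(epoch, warmup_epochs=5):
    """Multiplier applied to the scaled base learning rate by the schedule
    callbacks above (the linear ramp during warmup itself is ignored here)."""
    if epoch < warmup_epochs:
        return 1.0   # handled by LearningRateWarmupCallback
    if epoch < 30:
        return 1.0
    if epoch < 60:
        return 1e-1
    if epoch < 80:
        return 1e-2
    return 1e-3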
def main():
    verbose = 1
    logger = _get_logger()
    if _DISTRIBUTED:
        # Horovod: initialize Horovod.
        hvd.init()
        logger.info("Running Distributed")
        verbose = 1 if hvd.rank() == 0 else 0

    logger.info("Tensorflow version {}".format(tf.__version__))
    K.set_session(tf.Session(config=_get_runconfig()))

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = 0
    if _DISTRIBUTED:
        resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name="resume_from_epoch")

    if _FAKE:
        train_iter = _fake_data_iterator_from()
    else:
        train_iter = _training_data_iterator_from()
        test_iter = _validation_data_iterator_from() if _VALIDATION else None

    model = _create_model()

    params = {"learning_rate": _LR, "momentum": 0.9}
    opt = _get_optimizer(params)

    model.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=opt,
        metrics=["accuracy", "top_k_categorical_accuracy"],
    )

    model_dir = _get_model_dir()
    checkpoint_format = os.path.join(model_dir, "checkpoint-{epoch}.h5")

    callbacks = _get_hooks()
    callbacks.append(LoggerCallback(logger, len(train_iter) * _BATCHSIZE))

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if _is_master():
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
        # callbacks.append(keras.callbacks.TensorBoard(log_dir))

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast weights to other workers.
    if resume_from_epoch > 0 and _is_master():
        model.load_weights(checkpoint_format.format(epoch=resume_from_epoch))

    logger.info("Training...")
    # Train the model. The training will randomly sample 1 / N batches of training data and
    # 3 / N batches of validation data on every worker, where N is the number of workers.
    # Over-sampling of validation data helps to increase probability that every validation
    # example will be evaluated.
    num_workers = hvd.size() if _DISTRIBUTED else 1
    model.fit_generator(
        train_iter,
        steps_per_epoch=len(train_iter) // num_workers,
        callbacks=callbacks,
        epochs=_EPOCHS,
        verbose=verbose,
        workers=_NUM_WORKERS,
        max_queue_size=_MAX_QUEUE_SIZE,
        use_multiprocessing=_MULTIPROCESSING,
        initial_epoch=resume_from_epoch,
    )

    if _FAKE is False and _VALIDATION:
        # Evaluate the model on the full data set.
        with Timer(output=logger.info, prefix="Testing"):
            logger.info("Testing...")
            score = hvd.allreduce(
                model.evaluate_generator(test_iter, len(test_iter), workers=10))
        if verbose:
            print("Test loss:", score[0])
            print("Test accuracy:", score[1])
# \date 2019-07-30 17:05:04.755084
# \Description nc horovodrun -np 2 python allreduce.py
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

import horovod.keras as hvd
import numpy as np

hvd.init()
hvd_r = int(hvd.rank())
assert hvd.size() == 2

# Each process computes a small part of something; the partial results are then
# combined across processes.
test_array = np.array(range(100))

# Compute a small part
span = int(100 / hvd.size())
# x = np.mean(test_array[hvd_r * span:(hvd_r + 1) * span])
x = test_array[hvd_r * span:(hvd_r + 1) * span]

# Sum (not average, because average=False) the slices from all processes
y = hvd.allreduce(x, average=False)

# Only one process prints out the result
if hvd_r == 0:
    print(y, len(y), sum(y))
def main():
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.compat.v1.Session(config=config))

    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # Horovod: print logs on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    # Training data iterator.
    train_gen = image.ImageDataGenerator()
    # width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
    # preprocessing_function=keras.applications.resnet50.preprocess_input)
    train_iter = train_gen.flow_from_directory(args.train,
                                               batch_size=args.batch_size,
                                               target_size=(224, 224))

    # Validation data iterator.
    test_gen = image.ImageDataGenerator()
    # zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
    test_iter = test_gen.flow_from_directory(args.val,
                                             batch_size=args.val_batch_size,
                                             target_size=(224, 224))

    # train iterator for tfrecord
    train_iter_tf = iterator(args.train_dir)
    val_iter_tf = iterator(args.val_dir)

    # timeline
    # timeline = tf.train.ProfilerHook(save_steps=500, output_dir='./timeline')
    # run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
    # run_metadata = tf.compat.v1.RunMetadata()

    # Set up standard ResNet-50 model.
    model = keras.applications.resnet50.ResNet50(weights=None)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast both model and optimizer weights
    # to other workers.
    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model(args.checkpoint_format.format(epoch=resume_from_epoch),
                               compression=compression)
    else:
        # ResNet-50 model that is included with Keras is optimized for inference.
        # Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5
        model = keras.models.Model.from_config(model_config)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = keras.optimizers.SGD(lr=args.base_lr * hvd.size(), momentum=args.momentum)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt, compression=compression)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=['accuracy', 'top_k_categorical_accuracy'])
                  # options=run_options,
                  # run_metadata=run_metadata
                  # )

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=args.warmup_epochs, verbose=verbose),

        # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
        hvd.callbacks.LearningRateScheduleCallback(
            start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    # Train the model. The training will randomly sample 1 / N batches of training data and
    # 3 / N batches of validation data on every worker, where N is the number of workers.
    # Over-sampling of validation data helps to increase probability that every validation
    # example will be evaluated.
    print('---- train len------ :', len(train_iter))
    print('---- test len------ :', len(test_iter))
    total_train_step = len(train_iter)
    total_val_step = len(test_iter)

    # model.fit_generator(train_iter,
    model.fit(train_iter_tf,
              # steps_per_epoch=40037 // hvd.size(),
              steps_per_epoch=total_train_step // hvd.size(),
              callbacks=callbacks,
              epochs=args.epochs,
              verbose=verbose,
              workers=8,
              initial_epoch=resume_from_epoch,
              validation_data=val_iter_tf,
              validation_steps=3 * total_val_step // hvd.size())

    # timeline tracing
    # trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    # with open('./timeline.keras.json', 'w') as f:
    #     f.write(trace.generate_chrome_trace_format())

    # Evaluate the model on the full data set.
    score = hvd.allreduce(model.evaluate_generator(test_iter, len(test_iter), workers=4))
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
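# The warmup callback used above follows the recipe from
# https://arxiv.org/abs/1706.02677: start near the single-worker learning rate and
# grow it linearly to the scaled rate over the first few epochs, so the large
# effective batch does not destabilize early training. A rough illustrative sketch
# of that idea only (an assumption for illustration, not Horovod's actual
# LearningRateWarmupCallback implementation):
def warmup_lr(epoch, base_lr, num_workers, warmup_epochs=5):
    """Linearly interpolate from base_lr to base_lr * num_workers."""
    if epoch >= warmup_epochs:
        return base_lr * num_workers
    progress = float(epoch + 1) / warmup_epochs
    return base_lr * (1.0 + progress * (num_workers - 1))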
def main(_):
    '''Main routine for Horovod Tensorflow Mnist example.'''
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=str(hvd.local_rank()))
    config = tf.ConfigProto(gpu_options=gpu_options)

    batch_size = 100

    # Download and load MNIST dataset.
    if hvd.rank() == 0:
        # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR)
        image, label = get_data_mnist(batch_size)

    # hvd.allreduce(tf.constant([0]), average=False)  # Barrier (not working)
    with tf.Session(config=config):
        # download/unzip in rank 0 only.
        hvd_keras.allreduce([0], name="Barrier")

    if hvd.rank() != 0:
        # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR)
        image, label = get_data_mnist(batch_size)

    # Build model...
    # with tf.name_scope('input'):
    #     image = tf.placeholder(tf.float32, [None, 784], name='image')
    #     label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    # global_step = tf.contrib.framework.get_or_create_global_step()
    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            # image_, label_ = mnist.train.next_batch(100)
            # mon_sess.run(train_op, feed_dict={image: image_, label: label_})
            mon_sess.run(train_op)
# \date 2019-07-30 17:05:04.755084
# \Description nc horovodrun -np 2 python allreduce.py
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

import horovod.keras as hvd
import numpy as np

hvd.init()
hvd_r = int(hvd.rank())
assert hvd.size() == 2

# Each process computes a small part of something and then the partial results
# are averaged across processes.
test_array = np.array(range(100))

# Compute a small part
span = int(100 / hvd.size())
x = np.mean(test_array[hvd_r * span:(hvd_r + 1) * span])
# x = test_array[hvd_r * span:(hvd_r + 1) * span]

# Compute the average over all processes
y = hvd.allreduce(x)

# Only one process prints out the result
if hvd_r == 0:
    print("mean of the big array is %f" % y)
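# Note on the two allreduce variants above: averaging the per-rank means (this file)
# and summing the raw slices (the average=False variant) agree only because every
# rank holds an equally sized slice of the array. A quick single-process check of
# that arithmetic (plain NumPy, no Horovod needed):
import numpy as np

data = np.arange(100)
halves = [data[:50], data[50:]]
mean_of_means = np.mean([half.mean() for half in halves])
assert np.isclose(mean_of_means, data.mean())  # holds because both slices have 50 elements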
def main():
    parser = argparse.ArgumentParser(
        description='Keras Fashion MNIST Example',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--log-dir', default='./logs',
                        help='tensorboard log directory')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='input batch size for training')
    parser.add_argument('--val-batch-size', type=int, default=32,
                        help='input batch size for validation')
    parser.add_argument('--epochs', type=int, default=40,
                        help='number of epochs to train')
    parser.add_argument('--base-lr', type=float, default=0.01,
                        help='learning rate for a single GPU')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='SGD momentum')
    parser.add_argument('--wd', type=float, default=0.000005,
                        help='weight decay')

    # TODO: Step 9 part 1: register `--warmup-epochs`
    parser.add_argument('--warmup-epochs', type=float, default=5,
                        help='number of warmup epochs')

    GRAPHDEF_FILE = 'graphdef'
    parser.add_argument('--savegraph', action='store', nargs='?', const=GRAPHDEF_FILE,
                        help='Save graphdef pb and pbtxt files. '
                             '(default: {})'.format(GRAPHDEF_FILE))
    parser.add_argument('--profrun', action='store_true',
                        help='Run for nsys/dlprof profiling. Runs only a few steps.')

    args = parser.parse_args()

    # Checkpoints will be written in the log directory.
    args.checkpoint_format = os.path.join(args.log_dir, 'checkpoint-{epoch}.h5')

    print('AMP MIXED', os.environ.get("TF_ENABLE_AUTO_MIXED_PRECISION"))

    # TODO: Step 2 work here: initialize horovod
    hvd.init()

    # TODO: Step 3 work here: pin GPUs
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # TODO: Step 4 work here: broadcast `resume_from_epoch` from first process
    # to all others
    with tf.Session(config=config):
        resume_from_epoch = \
            hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # TODO: Step 5 work here: only set `verbose` to `1` if this is the
    # first worker
    verbose = 1 if hvd.rank() == 0 else 0

    # Input image dimensions
    img_rows, img_cols = 28, 28
    num_classes = 10

    # Download and load FASHION MNIST dataset.
    if hvd.rank() == 0:
        # Load Fashion MNIST data.
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

    with tf.Session(config=config):
        # download/unzip in rank 0 only.
        hvd.allreduce([0], name="Barrier")

    if hvd.rank() != 0:
        # Load Fashion MNIST data.
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

    if K.image_data_format() == 'channels_first':
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    # Convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # Training data iterator.
    train_gen = image.ImageDataGenerator(
        featurewise_center=True, featurewise_std_normalization=True,
        horizontal_flip=True, width_shift_range=0.2, height_shift_range=0.2)
    train_gen.fit(x_train)
    train_iter = train_gen.flow(x_train, y_train, batch_size=args.batch_size)

    # Validation data iterator.
    test_gen = image.ImageDataGenerator(
        featurewise_center=True, featurewise_std_normalization=True)
    test_gen.mean = train_gen.mean
    test_gen.std = train_gen.std
    test_iter = test_gen.flow(x_test, y_test, batch_size=args.val_batch_size)

    base_lr = args.base_lr
    LR = base_lr * hvd.size()

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # if resume_from_epoch > 0 and hvd.rank() == 0:
    if resume_from_epoch > 0:
        # TODO: Step 6 work here: only execute the `if` statement if this is
        # the first worker
        # If this is only done in rank 0 get following errors:
        #     horovod/common/operations.cc:764] One or more tensors were
        #     submitted to be reduced, gathered or broadcasted by subset of
        #     ranks and are waiting for remainder of ranks
        model = keras.models.load_model(
            args.checkpoint_format.format(epoch=resume_from_epoch))
    else:
        # Set up standard WideResNet-16-10 model.
        model = WideResidualNetwork(depth=16, width=10, weights=None,
                                    input_shape=input_shape, classes=num_classes,
                                    dropout_rate=0.01)

        # WideResNet model that is included with Keras is optimized for
        # inference. Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5
        model = keras.models.Model.from_config(model_config)

    # TODO: Step 7 part 1 work here: increase the base learning rate by the
    # number of workers
    opt = keras.optimizers.SGD(lr=LR, momentum=args.momentum)

    # TODO: Step 7 part 2 work here: Wrap the optimizer in a Horovod
    # distributed optimizer
    opt_dist = hvd.DistributedOptimizer(opt)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt_dist,
                  metrics=['accuracy'])

    def lr_schedule(epoch):
        # global LR
        if epoch < 15:
            return LR
        if epoch < 25:
            return 1e-1 * LR
        if epoch < 35:
            return 1e-2 * LR
        return 1e-3 * LR

    warmup_epochs = args.warmup_epochs
    callbacks = [
        # TODO: Step 8: broadcast initial variable states from the first
        # worker to all others
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # TODO: Step 12: average the metrics among workers at the end of every
        # epoch
        hvd.callbacks.MetricAverageCallback(),

        # TODO: Step 9 part 2: implement a LR warmup over `args.warmup_epochs`
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=warmup_epochs, verbose=verbose),

        # TODO: Step 9 part 3: replace with the Horovod learning rate
        # scheduler, taking care not to start until after warmup is complete
        hvd.callbacks.LearningRateScheduleCallback(
            lr_schedule, start_epoch=warmup_epochs),
    ]

    if hvd.rank() == 0:
        # TODO: Step 10: only append these 2 callbacks to `callbacks` if they
        # are to be executed by the first worker
        callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    # Train the model.
    number_of_workers = hvd.size()
    steps_per_epoch = len(train_iter) // number_of_workers
    validation_steps = 3 * len(test_iter) // number_of_workers

    # Train the model.
    if args.profrun:
        steps_per_epoch = 4

    model.fit_generator(train_iter,
                        # TODO: Step 11 part 1: keep the total number of steps
                        # the same in spite of an increased number of workers
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        epochs=args.epochs,
                        verbose=verbose,
                        workers=number_of_workers,
                        initial_epoch=resume_from_epoch,
                        validation_data=test_iter,
                        # TODO: Step 11 part 2: Set this value to be
                        #     3 * num_test_iterations / number_of_workers
                        validation_steps=validation_steps)

    # Evaluate the model on the full data set.
    score = model.evaluate_generator(test_iter, len(test_iter),
                                     workers=number_of_workers)
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

    if hvd.rank() == 0 and args.savegraph:
        graphdef_file = args.savegraph
        session = K.get_session()
        graph_def = session.graph.as_graph_def()
        with open('{}.pb'.format(graphdef_file), 'wb') as f:
            f.write(graph_def.SerializeToString())
        with open('{}.pbtxt'.format(graphdef_file), 'w') as f:
            f.write(str(graph_def))
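# Why steps_per_epoch is divided by the number of workers: each worker draws its own
# batches, so the samples seen per epoch across the whole job are roughly
# steps_per_epoch * batch_size * number_of_workers; dividing by the worker count
# keeps that total close to one pass over the dataset. An illustrative
# back-of-the-envelope check with assumed numbers (60000 Fashion MNIST training
# images, batch size 32, 4 workers):
train_samples, batch_size, workers = 60000, 32, 4
batches_in_dataset = train_samples // batch_size   # 1875 batches per full pass
steps_per_worker = batches_in_dataset // workers   # 468 steps on each worker
samples_per_epoch = steps_per_worker * batch_size * workers
assert samples_per_epoch <= train_samples          # 59904, roughly one full pass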
def main(argv=None):
    '''Train a simple deep CNN on the CIFAR10 small images dataset on multigpu
    (and optionally multinode+multigpu) systems via Horovod implementation.
    '''
    argv = sys.argv if argv is None else sys.argv.extend(argv)
    desc = main.__doc__

    # CLI parser
    # args = parser_(argv[1:], desc)
    args = parser_(desc)

    # Initialize Horovod.
    hvd.init()

    logdevp = args.logdevp  # For debugging
    log_device_placement, allow_soft_placement = (True, True) \
        if _DEVPROF or logdevp else (False, False)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(local_rank, gpu_local_rank))

    # Pin GPU to local rank. Typically one GPU per process unless
    # oversubscribing GPUs (experimental MPS). In model parallelism it's
    # possible to have multiple GPUs per process.
    # visible_device_list = str(hvd.local_rank())
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=str(gpu_local_rank))
    config = tf.ConfigProto(log_device_placement=log_device_placement,
                            allow_soft_placement=allow_soft_placement,
                            gpu_options=gpu_options)
    KB.set_session(tf.Session(config=config))

    hvdsize = hvd.size()

    checkpt = args.checkpt
    filepath = checkpt
    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs
    datadir = args.datadir

    # The data, shuffled and split between train and test sets:
    if hvd.rank() == 0:
        # download only in rank0 i.e. single process
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)
    hvd_keras.allreduce([0], name="Barrier")
    if hvd.rank() != 0:
        # Data should be downloaded already so load in the other ranks.
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)

    train_samples = x_train.shape[0]
    test_samples = x_test.shape[0]
    steps_per_epoch = train_samples // batch_size // hvdsize
    print_rank0('{} train samples'.format(train_samples), hvd)
    print_rank0('{} test samples'.format(test_samples), hvd)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    if not args.use_dataset_api:
        traingen = ImageDataGenerator()
        if args.aug:
            print_rank0('Using real-time data augmentation.', hvd)
            # This will do preprocessing and realtime data augmentation:
            traingen = ImageDataGenerator(
                # set input mean to 0 over the dataset
                featurewise_center=False,
                # set each sample mean to 0
                samplewise_center=False,
                # divide inputs by std of the dataset
                featurewise_std_normalization=False,
                # divide each input by its std
                samplewise_std_normalization=False,
                # apply ZCA whitening
                zca_whitening=False,
                # randomly rotate images in the range (degrees, 0 to 180)
                rotation_range=0,
                # randomly shift images horizontally (fraction of total width)
                width_shift_range=0.1,
                # randomly shift images vertically (fraction of total height)
                height_shift_range=0.1,
                # randomly flip images
                horizontal_flip=True,
                # randomly flip images
                vertical_flip=False)

            # Compute quantities required for feature-wise normalization
            # (std, mean, and principal components if ZCA whitening is applied)
            traingen.fit(x_train)

        model = make_model(x_train.shape[1:], num_classes, filepath)
    else:
        print_rank0('USING TF DATASET API.', hvd)
        dataset = wrap_as_tfdataset(x_train, y_train, args.aug, batch_size,
                                    gpu_local_rank, prefetch_to_device=True,
                                    comm=hvd_keras)
        iterator = dataset.make_one_shot_iterator()

        # Model creation using tensors from the get_next() graph node.
        inputs, targets = iterator.get_next()
        x_train_input = KL.Input(tensor=inputs)
        model_init = make_model(x_train_input, num_classes, filepath)
        x_train_out = model_init.output
        model = Model(inputs=[x_train_input], outputs=[x_train_out])

    # Let's train the model using RMSprop
    lr = 0.0001 * hvdsize
    # opt = KO.RMSprop(lr=lr, decay=1e-6)
    # opt = hvd_keras.DistributedOptimizer(opt)
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'],
        target_tensors=None if not args.use_dataset_api else [targets])

    if hvd.rank() == 0:
        model.summary()

    callbacks = []
    if checkpt and hvd.rank() == 0:
        checkpoint = ModelCheckpoint(filepath, monitor='loss', mode='min',
                                     verbose=1, save_best_only=True)
        callbacks.append(checkpoint)

    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    # Broadcast initial variable states from rank 0 to all other procs.
    # This is necessary to ensure consistent initialization of all
    # workers when training is started with random weights or restored
    # from a checkpoint.
    # Callback when using horovod.keras as hvd:
    # callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    KB.get_session().run(hvd.broadcast_global_variables(0))

    if not args.use_dataset_api:
        start_time = time.time()
        # Fit the model on the batches generated by traingen.flow().
        model.fit_generator(
            traingen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test) if hvd.rank() == 0 else None,
            verbose=hvd.rank() == 0,
            callbacks=callbacks)
    else:
        # augmentation incorporated in the Dataset pipeline
        start_time = time.time()
        # Validation during training can be incorporated via callback:
        # noqa ref: https://github.com/keras-team/keras/blob/c8bef99ec7a2032b9bea6e9a1260d05a2b6a80f1/examples/mnist_tfrecord.py#L56
        model.fit(steps_per_epoch=steps_per_epoch,
                  epochs=epochs,
                  verbose=hvd.rank() == 0,
                  callbacks=callbacks)

    if hvd.rank() != 0:
        return

    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    test_model = model
    if args.use_dataset_api:
        # Create a test-model without Dataset pipeline in the model graph.
        test_model = make_model(x_test.shape[1:], num_classes)
        test_model.compile(loss=keras_losses.categorical_crossentropy,
                           optimizer=opt,
                           metrics=['accuracy'])
        print('SETTING WEIGHTS FOR EVAL WITH DATASET API...')
        test_model.set_weights(model.get_weights())
        print('WEIGHTS SET!!!')

    metrics = test_model.evaluate(x_test, y_test)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))
if __name__ == '__main__':
    main()
    # join all ranks and cleanup Keras/Tensorflow session.
    hvd_keras.allreduce([0], name="Barrier")
    KB.clear_session()