def main(argv=None):
    # Initialize Horovod.
    hvd.init()

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    KB.set_session(tf.Session(config=config))
    # print('LOCAL RANK, OVERALL RANK: {}, {}'.format(hvd.local_rank(),
    #                                                 hvd.rank()))

    ngpus = hvd.size()

    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = _parser(desc)

    num_devices_tfrecord = 1
    height, width = 224, 224  # Image dimensions; resized if they don't match.
    distort_color = args.distort_color
    data_dir = args.datadir
    batch_size = args.batch_size  # * ngpus
    epochs = args.epochs
    imgs_per_epoch = args.imgs_per_epoch

    # Fit the model using data from the TFRecord data tensors.
    device_minibatches = RecordInputImagenetPreprocessor.device_minibatches
    images_tfrecord, labels_tfrecord, nrecords = device_minibatches(
        num_devices_tfrecord, data_dir, batch_size,
        height, width, distort_color, val=False)
    images_tfrecord = images_tfrecord[0]
    labels_tfrecord = labels_tfrecord[0]

    # CASTING FOR KERAS
    # labels[device_num] = tf.cast(labels_tfrecord, dtype)
    nclasses = 1000
    labels_tfrecord = tf.one_hot(labels_tfrecord, nclasses)

    nimgs_to_use = imgs_per_epoch if imgs_per_epoch > 0 else nrecords
    steps_per_epoch = nimgs_to_use // batch_size // hvd.size()
    # steps_per_epoch = 100

    # batch_shape = images_tfrecord.get_shape().as_list()
    # images = Input(tensor=images_tfrecord, batch_shape=x_batch_shape)
    images = Input(tensor=images_tfrecord)
    model = ResNet50(input_tensor=images, weights=None)
    if hvd.rank() == 0:
        model.summary()

        print('Num images: {}'.format(nrecords))

        if nimgs_to_use < nrecords:
            print('Using {} images per epoch'.format(nimgs_to_use))

        # print('IMAGES_TFRECORD: {}'.format(images_tfrecord))
        # print('LABELS_TFRECORD: {}'.format(labels_tfrecord))

    # Add Horovod Distributed Optimizer from nvcnn.py
    # momentum = 0.9
    # lr = 0.1
    # learning_rate = tf.train.exponential_decay(
    #     lr,
    #     self.global_step,
    #     decay_steps=FLAGS.lr_decay_epochs * nstep_per_epoch,
    #     decay_rate=FLAGS.lr_decay_rate,
    #     staircase=True)
    # opt = tf.train.MomentumOptimizer(self.learning_rate, momentum,
    #                                  use_nesterov=True)

    # lr = 0.001 * ngpus
    # opt = tf.train.AdamOptimizer()
    # opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    # opt = KO.TFOptimizer(opt)  # Required for tf.train based optimizers

    opt = KO.Adam()
    opt = hvd_keras.DistributedOptimizer(opt)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=opt,
        # metrics=['accuracy'],
        target_tensors=[labels_tfrecord])

    # Broadcast variables from rank 0 to all other processes.
    KB.get_session().run(hvd.broadcast_global_variables(0))

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(ngpus * batch_size)]

    # RecordInput is a yield op which doesn't use queue runners or queues.
    # Start the queue runners.
    # sess = KB.get_session()
    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])
    # coord = tf.train.Coordinator()
    # threads = tf.train.start_queue_runners(sess, coord)

    start_time = time.time()
    model.fit(
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks,
        verbose=1)  # verbose=hvd.rank() == 0)
    elapsed_time = time.time() - start_time

    if hvd.rank() == 0:
        print('[{}] finished in {} s'
              .format('TRAINING', round(elapsed_time, 3)))

        # loss = model.evaluate(None, None, steps=steps_per_epoch_val)

        images_tfrecord_val, labels_tfrecord_val, nrecords_val = \
            device_minibatches(num_devices_tfrecord, data_dir, batch_size,
                               height, width, distort_color, val=True)
        images_tfrecord_val = images_tfrecord_val[0]
        labels_tfrecord_val = labels_tfrecord_val[0]
        labels_tfrecord_val = tf.one_hot(labels_tfrecord_val, nclasses)

        # print('IMAGES_TFRECORD_VAL: {}'.format(images_tfrecord_val))
        # print('labels_tfrecord_val: {}'.format(labels_tfrecord_val))

        steps_per_epoch_val = nrecords_val // batch_size

        images_val = Input(tensor=images_tfrecord_val)
        model_val = model
        model_val.layers[0] = KL.InputLayer(input_tensor=images_val)
        model_val.compile(
            loss='categorical_crossentropy',
            optimizer=opt,
            metrics=['accuracy'],
            target_tensors=[labels_tfrecord_val])
        # model.summary()
        loss = model_val.evaluate(x=None, y=None, steps=steps_per_epoch_val)

        print('\nNum images evaluated, steps: {}, {}'.format(
            nrecords_val, steps_per_epoch_val))
        print('\nTest loss, acc: {}'.format(loss))
        # print('\nTest accuracy: {0}'.format(acc))

    # Clean up the TF session.
    # coord.request_stop()
    # coord.join(threads)

    KB.clear_session()  # do this for Horovod
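# A minimal entry point (an assumption; the original launch code is not shown
# in this section). Horovod scripts like the one above are typically started
# with one process per GPU, e.g. `mpirun -np 4 python <script>.py` or
# `horovodrun -np 4 python <script>.py`.
if __name__ == '__main__':
    main()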
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()
    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    # steps_per_epoch = train_samples // batch_size // hvdsize
    speedupopt = args.speedup
    if speedupopt == SpeedupOpts.imgspersec:
        steps_per_epoch = train_samples // batch_size
    else:
        steps_per_epoch = int(
            round(float(train_samples) / batch_size / hvdsize + 0.5))

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    ldict = make_shared_layers_dict(
        img_chns, img_rows, img_cols, batch_size, filters,
        num_conv, intermediate_dim, latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared among vae, encoder, and generator.

    x = Input(tensor=x_train_batch)
    vae = make_vae(ldict, x)
    # : :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)

    # opt = RMSprop(lr)
    # Add Horovod Distributed Optimizer.
    # opt = hvd_keras.DistributedOptimizer(opt)  # , use_locking=True)

    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs,
            callbacks=callbacks)

    if hvd.rank() == 0:
        x = Input(shape=original_img_size)
        vae_val = make_vae(ldict, x)
        vae_val.compile(optimizer=opt, loss=None)
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        x = Input(shape=original_img_size)
        z_mean, _ = get_encoded(ldict, x)
        encoder = Model(x, z_mean)
        # : :type encoder: Model

        decoder_input = Input(shape=(latent_dim,))
        x_decoded_mean_squash = get_decoded(ldict, decoder_input)
        generator = Model(decoder_input, x_decoded_mean_squash)
        # : :type generator: Model

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian to produce values of
        # the latent variables z, since the prior of the latent space is
        # Gaussian.
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size:(i + 1) * digit_size,
                       j * digit_size:(j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
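# The steps_per_epoch computation above rounds half-up via float arithmetic.
# A sketch of an equivalent integer ceiling (`ceil_div` is a hypothetical
# helper, not part of the original script) that avoids float rounding:
def ceil_div(num, den):
    '''Ceiling division using only integer arithmetic.'''
    return -(-num // den)

# e.g. 60000 MNIST samples, batch 128, 4 ranks -> 118 steps per rank
assert ceil_div(60000, 128 * 4) == 118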
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu
    enqueue = args.enqueue
    usenccl = args.nccl
    syncopt = args.syncopt

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = 32
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    # (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes).squeeze()
    y_test = to_categorical(y_test, num_classes).squeeze()

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    print(x_train.shape, 'train shape')
    # with tf.device('/cpu:0'):
    model_init = make_model(x_train.shape, num_classes,
                            filepath if checkpt_flag else None)
    # model_init = partial(make_model, x_train.shape, num_classes,
    #                      filepath if checkpt_flag else None)

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]

    lr = 0.0001
    if mgpu > 1 or mgpu == -1:
        gpus_list = get_available_gpus(mgpu)
        ngpus = len(gpus_list)
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        batch_size = batch_size * ngpus
        # lr = lr * ngpus
        # batch_size = 40000  # split over four devices works fine no grad avg
        # batch_size = 25000  # split over four devices works fine w/ grad avg

        # Data-Parallelize the model via function or class.
        model = make_parallel(model_init, gpus_list, usenccl=usenccl,
                              syncopt=syncopt, enqueue=enqueue)
        # model = ModelMGPU(serial_model=model_init, gdev_list=gpus_list,
        #                   syncopt=syncopt, usenccl=usenccl, enqueue=enqueue)
        print_mgpu_modelsummary(model)

        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)
    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        print(model.summary())

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True,
                  callbacks=callbacks)
        # Fit the model on the batches generated by datagen.flow().
        # mygen = mygenerator(nsamples, batch_size, x_train, y_train)
        # model.fit_generator(mygen,
        #                     steps_per_epoch=steps_per_epoch,
        #                     epochs=epochs,
        #                     validation_data=(x_test, y_test),
        #                     callbacks=callbacks)
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(
            datagen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test),
            callbacks=callbacks)

    model_init.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])
    metrics = model_init.evaluate(x=x_test, y=y_test, batch_size=batch_size)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))
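# The script above scales the global batch by ngpus but keeps lr fixed (its
# `lr = lr * ngpus` line is commented out). A common heuristic, shown here as
# a sketch rather than something the script does, is to scale the learning
# rate linearly with the number of data-parallel workers (Goyal et al., 2017):
def scaled_lr(base_lr, ngpus):
    '''Linear learning-rate scaling for data-parallel training.'''
    return base_lr * ngpus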
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    mgpu = 0 if getattr(args, 'mgpu', None) is None else args.mgpu

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    gdev_list = get_available_gpus(mgpu or 1)
    ngpus = len(gdev_list)

    batch_size_1gpu = 32
    batch_size = batch_size_1gpu * ngpus
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    logdevp = args.logdevp

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = y_test.shape[0]
    steps_per_epoch = train_samples // batch_size
    # validations_steps = test_samples // batch_size
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Squeeze is to deal with to_categorical bug in Keras 2.1.0 which
    # was fixed in Keras 2.1.1
    y_train = to_categorical(y_train, num_classes).astype(np.float32).squeeze()
    y_test = to_categorical(y_test, num_classes).astype(np.float32).squeeze()

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number of elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example. An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension. If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force input pipeline to CPU:0 to avoid data operations ending up on GPU
    # and resulting in a slow down for multigpu case due to comm overhead.
    with tf.device('/cpu:0'):
        # if no augmentation can go directly from numpy arrays
        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[x_train, y_train],
        #     # tensors=[x_train, y_train.astype(np.int32)],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     enqueue_many=enqueue_many,
        #     num_threads=8)

        input_images = tf.constant(x_train.reshape(train_samples, -1))
        input_labels = tf.constant(y_train)  # already in proper shape
        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        # and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, y_test], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order of their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect, see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(distorted_image,
                                                         max_delta=63)
            distorted_image = tf.image.random_contrast(distorted_image,
                                                       lower=0.2, upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch(
            [image, label],
            batch_size=batch_size,
            capacity=capacity,
            num_threads=8)

        # https://stackoverflow.com/a/43613376/3457624
        x_test_batch, y_test_batch = tf.train.batch(
            [test_image, test_label],
            batch_size=test_samples,  # if converting to numpy first
            # batch_size=batch_size,  # if using tensors
            capacity=capacity,
            # num_threads=8,
            num_threads=1,  # set to 1 to make deterministic
            name='test_batch',
            shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = []

    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        # config.gpu_options.allow_growth = True
        tfsess = tf.Session(config=config)
        KB.set_session(tfsess)

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    # model_init.summary()
    model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    lr = 0.0001 * ngpus
    if ngpus > 1:
        model = make_parallel(model_init, gdev_list)
    else:
        # Must re-instantiate model per API below otherwise doesn't work.
        model = model_init

    opt = RMSprop(lr=lr, decay=1e-6)

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])
    print_mgpu_modelsummary(model)  # will print non-mgpu model as well

    if checkpt_flag:
        checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1,
                                     save_best_only=True)
        callbacks += [checkpoint]

    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    # Start the queue runners.
    sess = KB.get_session()
    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])

    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.

    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validation_steps=validations_steps if val_in_train else None,
        validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks)
    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    weights_file = filepath if checkpt_flag else './saved_cifar10_wt.h5'
    if not checkpt_flag:  # checkpoint callback not used; save weights now
        model.save_weights(weights_file)

    KB.clear_session()

    # Second Session. Demonstrate that the model works.
    # test_model = make_model(x_test.shape[1:], num_classes,
    #                         weights_file=weights_file)
    test_model = make_model(x_test.shape[1:], num_classes)
    test_model.load_weights(weights_file)
    test_model.compile(loss='categorical_crossentropy',
                       optimizer=opt,
                       metrics=['accuracy'])

    if data_augmentation:
        # Need to run x_test through per_image_standardization otherwise
        # results get messed up.
        x_processed, y_processed = sess.run([x_test_batch, y_test_batch])
        # DEBUGGING
        # xdiff = np.abs(x_test - x_processed)
        # print('MAX XDIFF: {}'.format(np.max(xdiff)))
        # ydiff = np.abs(y_test - y_processed)
        # print('y_test: {}'.format(y_test[0:5, :]))
        # print('y_processed: {}'.format(y_processed[0:5, :]))
        # print('ydiff: {}'.format(ydiff[-10:, :]))
        # print('MAX YDIFF: {}'.format(np.max(np.sum(ydiff))))
        loss, acc = test_model.evaluate(x_processed, y_processed)
    else:
        loss, acc = test_model.evaluate(x_test, y_test)

    # # Demonstrate that the model works using TF pipeline directly.
    # # In tf.train.batch for test data change batch_size=batch_size
    # # instead of train_samples. Uncomment below and comment out above.
    # val_samples = x_test.shape[0]
    # steps_per_epoch_val = int(np.ceil(val_samples / float(batch_size)))
    # images_val = KL.Input(tensor=x_test_batch)
    # test_model = make_model(images_val, num_classes, weights_file)
    # test_model = Model(inputs=[images_val], outputs=[test_model.output])
    # test_model.compile(
    #     loss='categorical_crossentropy',
    #     optimizer=opt,
    #     metrics=['accuracy'],
    #     target_tensors=[y_test_batch])
    # loss, acc = test_model.evaluate(x=None, y=None,
    #                                 steps=steps_per_epoch_val)

    print('\nTest loss: {0}'.format(loss))
    print('\nTest accuracy: {0}'.format(acc))

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)
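# The script above starts queue runners before fit() and joins them at the
# end. A sketch of the same tf.train.Coordinator lifecycle wrapped in
# try/finally (assumptions: TF 1.x, an existing session and train op) so the
# reader threads are always joined even if training raises:
import tensorflow as tf

def run_with_queue_runners(sess, train_op, nsteps):
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)
    try:
        for _ in range(nsteps):
            sess.run(train_op)
    finally:
        coord.request_stop()  # signal reader threads to exit
        coord.join(threads)   # wait for them to finish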
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    # Data split if going for reduction in each iteration step. Using
    # tf-queue or dataset is better to preserve uniform random sampling.
    # nsamples = x_train.shape[0]
    # mysamples = nsamples // hvdsize
    # start_sam = hvd.local_rank() * mysamples
    # stop_sam = min((hvd.local_rank() + 1) * mysamples, nsamples)
    # x_train = x_train[start_sam:stop_sam, ...]

    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    vae, encoder, generator = make_vae_and_codec(
        original_img_size, img_chns, img_rows, img_cols, batch_size,
        filters, num_conv, intermediate_dim, latent_dim, epsilon_std)
    # : :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)

    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    vae.fit(x_train, shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(x_test, None),
            callbacks=callbacks)

    if hvd.rank() == 0:
        vae_val = vae
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian to produce values of
        # the latent variables z, since the prior of the latent space is
        # Gaussian.
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size:(i + 1) * digit_size,
                       j * digit_size:(j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
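# The commented-out split near the top of the function shards x_train by
# rank. A self-contained sketch of that idea (`shard_for_rank` is
# hypothetical; the script instead feeds every rank the full array, and its
# comment notes a tf-queue or Dataset preserves uniform sampling better):
def shard_for_rank(x, rank, nranks):
    '''Return the contiguous slice of `x` owned by `rank`.'''
    per_rank = x.shape[0] // nranks
    start = rank * per_rank
    stop = min(start + per_rank, x.shape[0])
    return x[start:stop, ...]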
def main(argv=None):
    '''Multigpu example using Keras for Cifar10 training.'''
    argv = sys.argv if argv is None else sys.argv + argv
    # CLI parser
    args = parser_(main.__doc__)

    logdevp = args.logdevp

    gpu_options = tf.GPUOptions(allow_growth=True)
    if _DEVPROF or logdevp:  # or True:
        # Setup Keras session using Tensorflow
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True,
                                gpu_options=gpu_options)
        # config.gpu_options.allow_growth = True
        KB.set_session(tf.Session(config=config))
    else:
        config = tf.ConfigProto(gpu_options=gpu_options)
        KB.set_session(tf.Session(config=config))

    mgpu = 0 if args.mgpu is None else args.mgpu
    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    syncopt = args.syncopt

    checkpt = args.checkpt
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = args.batch_size * ngpus if ngpus > 1 else args.batch_size
    num_classes = 10
    epochs = args.epochs
    datadir = args.datadir

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    if not args.use_dataset_api:
        traingen = ImageDataGenerator()
        if args.aug:
            print('Using real-time data augmentation.')
            # This will do preprocessing and realtime data augmentation:
            traingen = ImageDataGenerator(
                # set input mean to 0 over the dataset
                featurewise_center=False,
                # set each sample mean to 0
                samplewise_center=False,
                # divide inputs by std of the dataset
                featurewise_std_normalization=False,
                # divide each input by its std
                samplewise_std_normalization=False,
                # apply ZCA whitening
                zca_whitening=False,
                # randomly rotate images in the range (degrees, 0 to 180)
                rotation_range=0,
                # randomly shift images horizontally (fraction of total width)
                width_shift_range=0.1,
                # randomly shift images vertically (fraction of total height)
                height_shift_range=0.1,
                # randomly flip images
                horizontal_flip=True,
                # randomly flip images
                vertical_flip=False)

            # Compute quantities required for feature-wise normalization
            # (std, mean, and principal components if ZCA whitening is applied)
            traingen.fit(x_train)

        # x_train_input = KL.Input(shape=x_train.shape[1:])
        model_init = make_model(x_train.shape[1:], num_classes, filepath)
    else:
        print('USING TF DATASET API.')
        dataset = wrap_as_tfdataset(x_train, y_train, args.aug, batch_size)
        iterator = dataset.make_one_shot_iterator()

        # Model creation using tensors from the get_next() graph node.
        inputs, targets = iterator.get_next()
        x_train_input = KL.Input(tensor=inputs)
        model_init_ = make_model(x_train_input, num_classes, filepath)
        x_train_out = model_init_.output
        model_init = Model(inputs=[x_train_input], outputs=[x_train_out])

    lr = 0.0001
    if ngpus > 1:
        print('Using GPUs: {}'.format(', '.join(gpus_list)))
        lr = lr * ngpus

        # Data-Parallelize the model via function or class.
        if args.mgpu_type == 'kerasmgpu':
            gpus_list_int = get_available_gpus(
                ngpus, list_type=GPUListType.int_id)
            model = ModelKerasMGPU(model_init, gpus_list_int)
        else:
            model = ModelMGPU(serial_model=model_init, gdev_list=gpus_list)
        print_mgpu_modelsummary(model)

        if not syncopt:
            opt = RMSprop(lr=lr, decay=1e-6)
        else:
            opt = RMSPropMGPU(lr=lr, decay=1e-6, gdev_list=gpus_list)  # @IgnorePep8 pylint: disable=unexpected-keyword-arg
    else:
        model = model_init
        # batch_size = batch_size * 3
        # batch_size = 25000  # exhaust GPU memory. Crashes.
        print(model.summary())

        # initiate RMSprop optimizer
        opt = RMSprop(lr=lr, decay=1e-6)

    model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'],
        target_tensors=None if not args.use_dataset_api else [targets])

    callbacks = []
    if checkpt:
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks = [checkpoint]
    callbacks += [BatchTiming(), SamplesPerSec(batch_size)]

    nsamples = x_train.shape[0]
    steps_per_epoch = nsamples // batch_size

    if not args.use_dataset_api:
        start_time = time.time()
        # Fit the model on the batches generated by traingen.flow().
        model.fit_generator(
            traingen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test),
            callbacks=callbacks)
    else:
        # augmentation incorporated in the Dataset pipeline
        start_time = time.time()
        # Validation during training can be incorporated via callback:
        # noqa ref: https://github.com/keras-team/keras/blob/c8bef99ec7a2032b9bea6e9a1260d05a2b6a80f1/examples/mnist_tfrecord.py#L56
        model.fit(steps_per_epoch=steps_per_epoch,
                  epochs=epochs,
                  callbacks=callbacks)

    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    test_model = model_init
    if args.use_dataset_api:
        # Create a test-model without Dataset pipeline in the model graph.
        test_model = make_model(x_test.shape[1:], num_classes)
        print('SETTING WEIGHTS FOR EVAL WITH DATASET API...')
        test_model.set_weights(model.get_weights())
        print('WEIGHTS SET!!!')

    test_model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'])
    metrics = test_model.evaluate(x_test, y_test)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    KB.clear_session()
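# `wrap_as_tfdataset` is a project-specific helper; a minimal tf.data
# pipeline of the same shape (an assumption about its behavior, not its
# actual implementation) would look roughly like this:
import tensorflow as tf

def as_tfdataset_sketch(x, y, batch_size, buffer_size=10000):
    dset = tf.data.Dataset.from_tensor_slices((x, y))
    dset = dset.shuffle(buffer_size).repeat().batch(batch_size)
    return dset.prefetch(1)  # overlap input prep with training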
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    batch_size = 128 * ngpus
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()
    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    steps_per_epoch = int(round(float(train_samples) / batch_size + 0.5))

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    ldict = make_shared_layers_dict(
        img_chns, img_rows, img_cols, batch_size, filters,
        num_conv, intermediate_dim, latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared among vae, encoder, and generator.

    x = Input(tensor=x_train_batch)
    vae_serial = make_vae(ldict, x)
    # : :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    lr = 0.001 * ngpus
    opt = RMSprop(lr)  # 'rmsprop'
    # opt = tf.train.RMSPropOptimizer(lr)
    # opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    # vae.summary()
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs,
            callbacks=callbacks)

    x = Input(shape=original_img_size)
    vae_val = make_vae(ldict, x)
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    x = Input(shape=original_img_size)
    z_mean, _ = get_encoded(ldict, x)
    encoder = Model(x, z_mean)
    # : :type encoder: Model

    decoder_input = Input(shape=(latent_dim,))
    x_decoded_mean_squash = get_decoded(ldict, decoder_input)
    generator = Model(decoder_input, x_decoded_mean_squash)
    # : :type generator: Model

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square were transformed through
    # the inverse CDF (ppf) of the Gaussian to produce values of the latent
    # variables z, since the prior of the latent space is Gaussian.
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size:(i + 1) * digit_size,
                   j * digit_size:(j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
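# Why norm.ppf above: the VAE prior is a standard Gaussian, so mapping
# equally spaced quantiles in (0, 1) through the inverse CDF yields latent
# grid points with probability-uniform spacing. A standalone check:
import numpy as np
from scipy.stats import norm

grid = norm.ppf(np.linspace(0.05, 0.95, 15))
# The grid is symmetric about 0 and denser near the mode of the Gaussian.
assert np.allclose(grid, -grid[::-1])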
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    mgpu = 1 if getattr(args, 'mgpu', None) is None else args.mgpu

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    gpus_list = get_available_gpus(mgpu)
    ngpus = len(gpus_list)

    batch_size = 128 * ngpus
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()
    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    print('x_train.shape:', x_train.shape)

    vae_serial, encoder, generator = make_vae_and_codec(
        original_img_size, img_chns, img_rows, img_cols, batch_size,
        filters, num_conv, intermediate_dim, latent_dim, epsilon_std)
    # : :type vae: Model
    vae = make_parallel(vae_serial, gpus_list)

    lr = 0.001 * ngpus
    opt = RMSprop(lr)  # 'rmsprop'
    # opt = tf.train.RMSPropOptimizer(lr)
    # opt = TFOptimizer(opt)
    vae.compile(optimizer=opt, loss=None)
    # vae.summary()
    print_mgpu_modelsummary(vae)

    callbacks = [BatchTiming(), SamplesPerSec(batch_size)]

    vae.fit(x_train, shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks)  # ,
    # validation_data=(x_test, None))  # Not accurate for mgpu. Use vae_val.

    vae_val = vae_serial
    vae_val.compile(optimizer=opt, loss=None)
    loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size // ngpus)
    print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    # plt.show()
    plt.savefig('vae_scatter.ps')
    plt.close()

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # Linearly spaced coordinates on the unit square were transformed through
    # the inverse CDF (ppf) of the Gaussian to produce values of the latent
    # variables z, since the prior of the latent space is Gaussian.
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
            x_decoded = generator.predict(z_sample, batch_size=batch_size)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size:(i + 1) * digit_size,
                   j * digit_size:(j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    # plt.show()
    plt.savefig('vae_digit.ps')
    plt.close()
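# The generator in these VAE examples is built against a fixed batch
# dimension, which is why the decode loops above tile one latent point out to
# a full batch and keep only row 0 of the prediction. The tiling in isolation:
import numpy as np

batch_size = 128
z_sample = np.array([[0.3, -1.2]])  # a single latent point
z_batch = np.tile(z_sample, batch_size).reshape(batch_size, 2)
assert (z_batch == z_sample).all()  # every row is the same point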
def main(argv=None):
    '''
    '''
    main.__doc__ = __doc__
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    # Initialize Horovod.
    hvd.init()

    logdevp = args.logdevp  # For debugging
    log_device_placement, allow_soft_placement = (True, True) \
        if _DEVPROF or logdevp else (False, False)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto(log_device_placement=log_device_placement,
                            allow_soft_placement=allow_soft_placement)
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    KB.set_session(tf.Session(config=config))

    hvdsize = hvd.size()

    checkpt = getattr(args, 'checkpt', None)
    checkpt_flag = False if checkpt is None else True
    filepath = checkpt
    # print('CHECKPT:', checkpt)

    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir) \
        if datadir is not None else cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = x_test.shape[0]
    steps_per_epoch = train_samples // batch_size // hvdsize
    # validations_steps = test_samples // batch_size
    print(train_samples, 'train samples')
    print(test_samples, 'test samples')

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes).astype(np.float32).squeeze()
    y_test = to_categorical(y_test, num_classes).astype(np.float32).squeeze()

    # The capacity variable controls the maximum queue size
    # allowed when prefetching data for training.
    capacity = 10000

    # min_after_dequeue is the minimum number of elements in the queue
    # after a dequeue, which ensures sufficient mixing of elements.
    # min_after_dequeue = 3000

    # If `enqueue_many` is `False`, `tensors` is assumed to represent a
    # single example. An input tensor with shape `[x, y, z]` will be output
    # as a tensor with shape `[batch_size, x, y, z]`.
    #
    # If `enqueue_many` is `True`, `tensors` is assumed to represent a
    # batch of examples, where the first dimension is indexed by example,
    # and all members of `tensors` should have the same size in the
    # first dimension. If an input tensor has shape `[*, x, y, z]`, the
    # output will have shape `[batch_size, x, y, z]`.
    # enqueue_many = True

    # Force input pipeline to CPU:0 to avoid data operations ending up on GPU
    # and resulting in a slow down for multigpu case due to comm overhead.
    with tf.device('/cpu:0'):
        # if no augmentation can go directly from numpy arrays
        # x_train_batch, y_train_batch = tf.train.shuffle_batch(
        #     tensors=[x_train, y_train],
        #     # tensors=[x_train, y_train.astype(np.int32)],
        #     batch_size=batch_size,
        #     capacity=capacity,
        #     min_after_dequeue=min_after_dequeue,
        #     enqueue_many=enqueue_many,
        #     num_threads=8)

        input_images = tf.constant(x_train.reshape(train_samples, -1))
        input_labels = tf.constant(y_train)  # already in proper shape
        image, label = tf.train.slice_input_producer(
            [input_images, input_labels], shuffle=True)
        # If using num_epochs=epochs have to:
        #     sess.run(tf.local_variables_initializer())
        # and maybe also: sess.run(tf.global_variables_initializer())
        image = tf.reshape(image, x_train.shape[1:])
        # label = tf.one_hot(label, num_classes)

        test_images = tf.constant(x_test.reshape(test_samples, -1))
        test_labels = tf.constant(y_test)  # already in proper shape
        test_image, test_label = tf.train.slice_input_producer(
            [test_images, test_labels], shuffle=False)
        test_image = tf.reshape(test_image, x_train.shape[1:])

        if data_augmentation:
            print('Using real-time data augmentation.')
            # Randomly flip the image horizontally.
            distorted_image = tf.image.random_flip_left_right(image)

            # Because these operations are not commutative, consider
            # randomizing the order of their operation.
            # NOTE: since per_image_standardization zeros the mean and
            # makes the stddev unit, this likely has no effect, see
            # tensorflow#1458.
            distorted_image = tf.image.random_brightness(distorted_image,
                                                         max_delta=63)
            distorted_image = tf.image.random_contrast(distorted_image,
                                                       lower=0.2, upper=1.8)

            # Subtract off the mean and divide by the variance of the
            # pixels.
            image = tf.image.per_image_standardization(distorted_image)

            # Do this for testing as well if standardizing
            test_image = tf.image.per_image_standardization(test_image)

        # Use tf.train.batch if slice_input_producer shuffle=True,
        # otherwise use tf.train.shuffle_batch. Not sure which way is faster.
        x_train_batch, y_train_batch = tf.train.batch(
            [image, label],
            batch_size=batch_size,
            capacity=capacity,
            num_threads=8)

        x_test_batch, y_test_batch = tf.train.batch(
            [test_image, test_label],
            batch_size=test_samples,
            capacity=capacity,
            num_threads=1,
            name='test_batch',
            shared_name='test_batch')

    x_train_input = KL.Input(tensor=x_train_batch)

    callbacks = []

    model_init = make_model(x_train_input, num_classes,
                            filepath if checkpt_flag else None)
    x_train_out = model_init.output
    # model_init.summary()
    model = Model(inputs=[x_train_input], outputs=[x_train_out])

    lr = 0.0001 * hvdsize
    # opt = RMSprop(lr=lr, decay=1e-6)
    # opt = hvd_keras.DistributedOptimizer(opt)  # , use_locking=True)
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)  # Required for tf.train based optimizers

    # -------------------------------------------------------------------
    # HAVE TO GET SESSION AFTER OPTIMIZER AND RUN BROADCAST_GLOBAL_VARIABLES
    # -------------------------------------------------------------------
    sess = KB.get_session()

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  target_tensors=[y_train_batch])

    if hvd.rank() == 0:
        model.summary()

    # Broadcast initial variable states from rank 0 to all other procs.
    # This is necessary to ensure consistent initialization of all
    # workers when training is started with random weights or restored
    # from a checkpoint.
    # Callback when using horovod.keras as hvd
    # callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    sess.run(hvd.broadcast_global_variables(0))  # horovod.tensorflow as hvd

    if checkpt_flag and hvd.rank() == 0:
        checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1,
                                     save_best_only=True)
        callbacks.append(checkpoint)

    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    # Start the queue runners.
    # sess.run([tf.local_variables_initializer(),
    #           tf.global_variables_initializer()])

    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    val_in_train = False  # not sure how the validation part works during fit.

    start_time = time.time()
    model.fit(
        # validation_data=(x_test_batch, y_test_batch)
        # if val_in_train else None,  # validation data is not used???
        # validation_steps=validations_steps if val_in_train else None,
        validation_steps=val_in_train,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks,
        verbose=hvd.rank() == 0)
    elapsed_time = time.time() - start_time
    if hvd.rank() == 0:
        print('[{}] finished in {} s'
              .format('TRAINING', round(elapsed_time, 3)))

    weights_file = filepath if checkpt_flag else './saved_cifar10_wt.h5'
    if not checkpt_flag and hvd.rank() == 0:
        model.save_weights(weights_file)

    # KB.clear_session()  # don't clear session just yet.

    if hvd.rank() == 0:
        # Second Session. Demonstrate that the model works.
        # test_model = make_model(x_test.shape[1:], num_classes,
        #                         weights_file=weights_file)
        test_model = make_model(x_test.shape[1:], num_classes)
        test_model.load_weights(weights_file)
        test_model.compile(loss='categorical_crossentropy',
                           optimizer=opt,
                           metrics=['accuracy'])

        if data_augmentation:
            x_processed, y_processed = sess.run([x_test_batch, y_test_batch])
            metrics = test_model.evaluate(x_processed, y_processed)
        else:
            metrics = test_model.evaluate(x_test, y_test)

        print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    # Clean up the TF session.
    coord.request_stop()
    coord.join(threads)

    KB.clear_session()
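# When using horovod.keras (rather than horovod.tensorflow) the variable
# broadcast above can be expressed as a callback, as the comment in the
# script hints. A sketch of that wiring:
import horovod.keras as hvd_keras

hvd_keras.init()
callbacks = [hvd_keras.callbacks.BroadcastGlobalVariablesCallback(0)]
# Pass `callbacks` to model.fit(...); rank 0's initial weights are then
# broadcast to all workers before the first batch.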
def main(argv=None):
    '''Train a simple deep CNN on the CIFAR10 small images dataset on
    multigpu (and optionally multinode+multigpu) systems via Horovod
    implementation.
    '''
    argv = sys.argv if argv is None else sys.argv + argv
    desc = main.__doc__
    # CLI parser
    # args = parser_(argv[1:], desc)
    args = parser_(desc)

    # Initialize Horovod.
    hvd.init()

    logdevp = args.logdevp  # For debugging
    log_device_placement, allow_soft_placement = (True, True) \
        if _DEVPROF or logdevp else (False, False)

    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to local rank. Typically one GPU per process unless
    # oversubscribing GPUs (experimental MPS). In model parallelism it's
    # possible to have multiple GPUs per process.
    # visible_device_list = str(hvd.local_rank())
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=str(gpu_local_rank))
    config = tf.ConfigProto(log_device_placement=log_device_placement,
                            allow_soft_placement=allow_soft_placement,
                            gpu_options=gpu_options)
    KB.set_session(tf.Session(config=config))

    hvdsize = hvd.size()

    checkpt = args.checkpt
    filepath = checkpt

    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs
    datadir = args.datadir

    # The data, shuffled and split between train and test sets:
    if hvd.rank() == 0:
        # download only in rank0 i.e. single process
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)
    hvd_keras.allreduce([0], name="Barrier")
    if hvd.rank() != 0:
        # Data should be downloaded already so load in the other ranks.
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)

    train_samples = x_train.shape[0]
    test_samples = x_test.shape[0]
    steps_per_epoch = train_samples // batch_size // hvdsize

    print_rank0('{} train samples'.format(train_samples), hvd)
    print_rank0('{} test samples'.format(test_samples), hvd)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    if not args.use_dataset_api:
        traingen = ImageDataGenerator()
        if args.aug:
            print_rank0('Using real-time data augmentation.', hvd)
            # This will do preprocessing and realtime data augmentation:
            traingen = ImageDataGenerator(
                # set input mean to 0 over the dataset
                featurewise_center=False,
                # set each sample mean to 0
                samplewise_center=False,
                # divide inputs by std of the dataset
                featurewise_std_normalization=False,
                # divide each input by its std
                samplewise_std_normalization=False,
                # apply ZCA whitening
                zca_whitening=False,
                # randomly rotate images in the range (degrees, 0 to 180)
                rotation_range=0,
                # randomly shift images horizontally (fraction of total width)
                width_shift_range=0.1,
                # randomly shift images vertically (fraction of total height)
                height_shift_range=0.1,
                # randomly flip images
                horizontal_flip=True,
                # randomly flip images
                vertical_flip=False)

            # Compute quantities required for feature-wise normalization
            # (std, mean, and principal components if ZCA whitening is applied)
            traingen.fit(x_train)

        model = make_model(x_train.shape[1:], num_classes, filepath)
    else:
        print_rank0('USING TF DATASET API.', hvd)
        dataset = wrap_as_tfdataset(
            x_train, y_train, args.aug, batch_size,
            gpu_local_rank, prefetch_to_device=True, comm=hvd_keras)
        iterator = dataset.make_one_shot_iterator()

        # Model creation using tensors from the get_next() graph node.
        inputs, targets = iterator.get_next()
        x_train_input = KL.Input(tensor=inputs)
        model_init = make_model(x_train_input, num_classes, filepath)
        x_train_out = model_init.output
        model = Model(inputs=[x_train_input], outputs=[x_train_out])

    # Let's train the model using RMSprop
    lr = 0.0001 * hvdsize
    # opt = KO.RMSprop(lr=lr, decay=1e-6)
    # opt = hvd_keras.DistributedOptimizer(opt)
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    model.compile(
        loss=keras_losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy'],
        target_tensors=None if not args.use_dataset_api else [targets])

    if hvd.rank() == 0:
        model.summary()

    callbacks = []
    if checkpt and hvd.rank() == 0:
        checkpoint = ModelCheckpoint(filepath, monitor='loss', mode='min',
                                     verbose=1, save_best_only=True)
        callbacks.append(checkpoint)
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    # Broadcast initial variable states from rank 0 to all other procs.
    # This is necessary to ensure consistent initialization of all
    # workers when training is started with random weights or restored
    # from a checkpoint.
    # Callback when using horovod.keras as hvd
    # callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    KB.get_session().run(hvd.broadcast_global_variables(0))

    if not args.use_dataset_api:
        start_time = time.time()
        # Fit the model on the batches generated by traingen.flow().
        model.fit_generator(
            traingen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test) if hvd.rank() == 0 else None,
            verbose=hvd.rank() == 0,
            callbacks=callbacks)
    else:
        # augmentation incorporated in the Dataset pipeline
        start_time = time.time()
        # Validation during training can be incorporated via callback:
        # noqa ref: https://github.com/keras-team/keras/blob/c8bef99ec7a2032b9bea6e9a1260d05a2b6a80f1/examples/mnist_tfrecord.py#L56
        model.fit(steps_per_epoch=steps_per_epoch,
                  epochs=epochs,
                  verbose=hvd.rank() == 0,
                  callbacks=callbacks)

    if hvd.rank() != 0:
        return

    elapsed_time = time.time() - start_time
    print('[{}] finished in {} s'.format('TRAINING', round(elapsed_time, 3)))

    test_model = model
    if args.use_dataset_api:
        # Create a test-model without Dataset pipeline in the model graph.
        test_model = make_model(x_test.shape[1:], num_classes)
        test_model.compile(loss=keras_losses.categorical_crossentropy,
                           optimizer=opt,
                           metrics=['accuracy'])
        print('SETTING WEIGHTS FOR EVAL WITH DATASET API...')
        test_model.set_weights(model.get_weights())
        print('WEIGHTS SET!!!')

    metrics = test_model.evaluate(x_test, y_test)
    print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))
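# The rank-0 download guard above uses an allreduce as a barrier so the other
# ranks wait for the data before loading it. The same pattern in isolation
# (`rank0_then_barrier` is a hypothetical helper; assumes horovod.keras is
# imported and initialized):
import horovod.keras as hvd_keras

def rank0_then_barrier(func):
    '''Run `func` on rank 0 only, then block all ranks until it is done.'''
    if hvd_keras.rank() == 0:
        func()
    hvd_keras.allreduce([0], name='Barrier')  # collective doubles as barrier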