```python
# BytePS: add BytePS Distributed Optimizer.
opt = bps.DistributedOptimizer(opt)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [
    # BytePS: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    bps.callbacks.BroadcastGlobalVariablesCallback(0),
]

# BytePS: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
if bps.rank() == 0:
    callbacks.append(
        keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x_train, y_train,
          batch_size=batch_size,
          callbacks=callbacks,
          epochs=epochs,
          verbose=1 if bps.rank() == 0 else 0,
          validation_data=(x_test, y_test))

score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
```
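The snippet above assumes BytePS has already been initialized and each worker process pinned to a single GPU. A minimal sketch of that preamble, following the standard BytePS Keras pattern (the `allow_growth` setting is an assumption for convenience, not required by the example):

```python
import tensorflow as tf
import keras
import keras.backend as K
import byteps.keras as bps

# BytePS: initialize the library before any other BytePS call.
bps.init()

# BytePS: pin this process to one GPU, indexed by its local rank on the host,
# so that workers sharing a machine do not contend for the same device.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # assumption: allocate GPU memory on demand
config.gpu_options.visible_device_list = str(bps.local_rank())
K.set_session(tf.Session(config=config))
```

The ResNet-50 example below begins after an equivalent setup.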
```python
config.gpu_options.visible_device_list = str(bps.local_rank())
K.set_session(tf.Session(config=config))

# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(args.epochs, 0, -1):
    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# BytePS: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = bps.broadcast(resume_from_epoch, 0,
                                  name='resume_from_epoch')

# BytePS: print logs on the first worker.
verbose = 1 if bps.rank() == 0 else 0

# Training data iterator.
train_gen = image.ImageDataGenerator(
    width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5,
    horizontal_flip=True,
    preprocessing_function=keras.applications.resnet50.preprocess_input)
train_iter = train_gen.flow_from_directory(args.train_dir,
                                           batch_size=args.batch_size,
                                           target_size=(224, 224))

# Validation data iterator.
test_gen = image.ImageDataGenerator(
    zoom_range=(0.875, 0.875),
    preprocessing_function=keras.applications.resnet50.preprocess_input)
test_iter = test_gen.flow_from_directory(args.val_dir,
                                         batch_size=args.val_batch_size,
                                         target_size=(224, 224))
```
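The iterators above feed a ResNet-50 model compiled with the same BytePS pattern as the MNIST example. A sketch of how the remaining pieces might fit together, continuing from the snippet above (the SGD hyperparameters and the `fit_generator` sharding arithmetic are illustrative assumptions, not fixed by the snippet; `bps.size()` is the total number of workers):

```python
# Continues from the snippet above: args, train_iter, test_iter,
# resume_from_epoch, and verbose are already defined.
model = keras.applications.resnet50.ResNet50(weights=None)

# BytePS: only worker 0 restores the checkpoint; the broadcast callback
# below syncs the restored weights to all other workers.
if resume_from_epoch > 0 and bps.rank() == 0:
    model.load_weights(args.checkpoint_format.format(epoch=resume_from_epoch))

# BytePS: scale the base learning rate by the number of workers, since the
# effective batch size grows with bps.size(). (Assumed hyperparameters.)
opt = keras.optimizers.SGD(lr=0.0125 * bps.size(), momentum=0.9)
opt = bps.DistributedOptimizer(opt)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [bps.callbacks.BroadcastGlobalVariablesCallback(0)]
if bps.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format))

# Each worker walks len(iterator) // bps.size() batches per epoch, so the
# workers collectively cover roughly one full pass over the data per epoch.
model.fit_generator(train_iter,
                    steps_per_epoch=len(train_iter) // bps.size(),
                    callbacks=callbacks,
                    epochs=args.epochs,
                    initial_epoch=resume_from_epoch,
                    verbose=verbose,
                    validation_data=test_iter,
                    validation_steps=len(test_iter) // bps.size())
```

The linear learning-rate scaling shown here is the usual heuristic for synchronous data-parallel training: with `bps.size()` workers each consuming `args.batch_size` samples per step, the effective batch size grows proportionally, so the base rate is scaled to match.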