예제 #1
0
# BytePS: add BytePS Distributed Optimizer.
opt = bps.DistributedOptimizer(opt)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [
    # BytePS: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    bps.callbacks.BroadcastGlobalVariablesCallback(0),
]

# BytePS: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if bps.rank() == 0:
    callbacks.append(
        keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          callbacks=callbacks,
          epochs=epochs,
          verbose=1 if bps.rank() == 0 else 0,
          validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
예제 #2
0
# BytePS: add BytePS Distributed Optimizer.
opt = bps.DistributedOptimizer(opt)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [
    # BytePS: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    bps.callbacks.BroadcastGlobalVariablesCallback(0),
]

# BytePS: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if bps.rank() == 0:
    callbacks.append(
        keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          callbacks=callbacks,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
예제 #3
0
config.gpu_options.visible_device_list = str(bps.local_rank())
K.set_session(tf.Session(config=config))

# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(args.epochs, 0, -1):
    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# BytePS: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = bps.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

# BytePS: print logs on the first worker.
verbose = 1 if bps.rank() == 0 else 0

# Training data iterator.
train_gen = image.ImageDataGenerator(
    width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
    preprocessing_function=keras.applications.resnet50.preprocess_input)
train_iter = train_gen.flow_from_directory(args.train_dir,
                                           batch_size=args.batch_size,
                                           target_size=(224, 224))

# Validation data iterator.
test_gen = image.ImageDataGenerator(
    zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
test_iter = test_gen.flow_from_directory(args.val_dir,
                                         batch_size=args.val_batch_size,
                                         target_size=(224, 224))