Example #1
    def test_load_model(self):
        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            opt = keras.optimizers.RMSprop(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3,)))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            with temppath() as fname:
                model.save(fname)

                new_model = hvd.load_model(fname)
                new_opt = new_model.optimizer

            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'RMSprop')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self._check_optimizer_weights(opt, new_opt)
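This test (and Example #2 below) saves to a path obtained from a temppath() context manager that the excerpt does not define. A minimal sketch of such a helper, assuming it only needs to yield a temporary file path and clean it up afterwards:

import contextlib
import os
import tempfile

@contextlib.contextmanager
def temppath():
    # Create a named temporary file, hand its path to the caller, and
    # delete the file once the block exits.
    fd, path = tempfile.mkstemp(suffix='.h5')
    os.close(fd)
    try:
        yield path
    finally:
        os.remove(path)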
Example #2
    def test_load_model_custom_optimizers(self):
        class TestOptimizer(keras.optimizers.RMSprop):
            def __init__(self, **kwargs):
                super(TestOptimizer, self).__init__(**kwargs)

        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            opt = TestOptimizer(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3,)))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            with temppath() as fname:
                model.save(fname)

                custom_optimizers = [TestOptimizer]
                new_model = hvd.load_model(fname, custom_optimizers=custom_optimizers)
                new_opt = new_model.optimizer

            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
            self._check_optimizer_weights(opt, new_opt)
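Both tests above end with self._check_optimizer_weights(opt, new_opt), a helper the excerpt does not show. Judging from the inline assertions in Example #3 below, a minimal sketch would be:

    def _check_optimizer_weights(self, opt, new_opt):
        # The reloaded optimizer should carry the same number of slot
        # variables as the original, with identical values.
        self.assertEqual(len(opt.get_weights()), len(new_opt.get_weights()))
        for weights, new_weights in zip(opt.get_weights(),
                                        new_opt.get_weights()):
            self.assertListEqual(weights.tolist(), new_weights.tolist())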
Example #3
    def test_load_model(self):
        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            opt = keras.optimizers.RMSprop(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            _, fname = tempfile.mkstemp('.h5')
            model.save(fname)

            new_model = hvd.load_model(fname)
            new_opt = new_model.optimizer
            os.remove(fname)

            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'RMSprop')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self.assertEqual(len(opt.get_weights()),
                             len(new_opt.get_weights()))
            for weights, new_weights in zip(opt.get_weights(),
                                            new_opt.get_weights()):
                self.assertListEqual(weights.tolist(), new_weights.tolist())
Example #4
    def test_load_model_broadcast(self):
        def create_model():
            opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            return model

        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            model = create_model()

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            if hvd.rank() == 0:
                _, fname = tempfile.mkstemp('.h5')
                model.save(fname)

        K.clear_session()
        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            if hvd.rank() == 0:
                model = hvd.load_model(fname)
                os.remove(fname)
            else:
                model = create_model()

            def generator():
                while 1:
                    yield (x, y)

            if hvd.rank() == 0:
                self.assertEqual(len(model.optimizer.weights), 5)
            else:
                self.assertEqual(len(model.optimizer.weights), 0)
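            # Why 5 on rank 0: with this Keras version, SGD's optimizer.weights
            # is the iteration counter plus one momentum slot per trainable
            # variable, and the model has 4 trainables (kernel + bias for each
            # of the two Dense layers). The other ranks compiled fresh models
            # whose optimizers have not created any variables yet.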

            # No assertions, we just need to verify that it doesn't hang
            callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
            model.fit_generator(generator(),
                                steps_per_epoch=10,
                                callbacks=callbacks,
                                epochs=0,
                                verbose=0,
                                workers=4,
                                initial_epoch=1)

            self.assertEqual(len(model.optimizer.weights), 5)
Example #5
    def test_load_model_custom_objects(self):
        hvd.init()

        class TestOptimizer(keras.optimizers.RMSprop):
            def __init__(self, **kwargs):
                super(TestOptimizer, self).__init__(**kwargs)

        with self.test_session() as sess:
            K.set_session(sess)

            opt = TestOptimizer(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            _, fname = tempfile.mkstemp('.h5')
            model.save(fname)

            custom_objects = {
                'TestOptimizer':
                lambda **kwargs: hvd.DistributedOptimizer(
                    TestOptimizer(**kwargs))
            }
            new_model = hvd.load_model(fname, custom_objects=custom_objects)
            new_opt = new_model.optimizer
            os.remove(fname)

            self.assertEqual(type(new_opt).__module__, 'horovod.keras.impl')
            self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self.assertEqual(len(opt.get_weights()),
                             len(new_opt.get_weights()))
            for weights, new_weights in zip(opt.get_weights(),
                                            new_opt.get_weights()):
                self.assertListEqual(weights.tolist(), new_weights.tolist())
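This test reaches with custom_objects what Example #2 did with custom_optimizers; assuming the same fname, the shorter, roughly equivalent call (as Example #2 demonstrates) would be:

            new_model = hvd.load_model(fname, custom_optimizers=[TestOptimizer])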
Example #6
    def test_load_model_custom_objects(self):
        hvd.init()

        class TestOptimizer(keras.optimizers.RMSprop):
            def __init__(self, **kwargs):
                super(TestOptimizer, self).__init__(**kwargs)

        with self.test_session() as sess:
            K.set_session(sess)

            opt = TestOptimizer(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3,)))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            _, fname = tempfile.mkstemp('.h5')
            model.save(fname)

            custom_objects = {
                'TestOptimizer': lambda **kwargs: hvd.DistributedOptimizer(
                    TestOptimizer(**kwargs))
            }
            new_model = hvd.load_model(fname, custom_objects=custom_objects)
            new_opt = new_model.optimizer
            os.remove(fname)

            self.assertEqual(type(new_opt).__module__, 'horovod.keras')
            self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self.assertEqual(len(opt.get_weights()), len(new_opt.get_weights()))
            for weights, new_weights in zip(opt.get_weights(),
                                            new_opt.get_weights()):
                self.assertListEqual(weights.tolist(), new_weights.tolist())
Example #7
                                     height_shift_range=0.2)
train_gen.fit(x_train)
train_iter = train_gen.flow(x_train, y_train, batch_size=args.batch_size)

# Validation data iterator.
test_gen = image.ImageDataGenerator(featurewise_center=True,
                                    featurewise_std_normalization=True)
test_gen.mean = train_gen.mean
test_gen.std = train_gen.std
test_iter = test_gen.flow(x_test, y_test, batch_size=args.val_batch_size)

# Restore from a previous checkpoint, if initial_epoch is specified.
# Horovod: restore on the first worker which will broadcast both model and optimizer weights
# to other workers.
if resume_from_epoch > 0 and hvd.rank() == 0:
    model = hvd.load_model(
        args.checkpoint_format.format(epoch=resume_from_epoch))
else:
    # Set up standard WideResNet-16-10 model.
    model = WideResidualNetwork(depth=16,
                                width=10,
                                weights=None,
                                input_shape=input_shape,
                                classes=num_classes,
                                dropout_rate=0.01)

    # The stock WideResNet model definition is optimized for inference.
    # Add L2 weight decay & adjust BN settings.
    model_config = model.get_config()
    for layer, layer_config in zip(model.layers, model_config['layers']):
        if hasattr(layer, 'kernel_regularizer'):
            regularizer = keras.regularizers.l2(args.wd)
Example #8
                                         target_size=(224, 224))

# Set up standard ResNet-50 model.
model = keras.applications.resnet50.ResNet50(weights=None)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: adjust learning rate based on number of GPUs.
initial_lr = args.base_lr * hvd.size()

# Restore from a previous checkpoint, if initial_epoch is specified.
# Horovod: restore on the first worker which will broadcast both model and optimizer weights
# to other workers.
if resume_from_epoch > 0 and hvd.rank() == 0:
    model = hvd.load_model(args.checkpoint_format.format(epoch=resume_from_epoch),
                           compression=compression)
else:
    # ResNet-50 model that is included with Keras is optimized for inference.
    # Add L2 weight decay & adjust BN settings.
    model_config = model.get_config()
    for layer, layer_config in zip(model.layers, model_config['layers']):
        if hasattr(layer, 'kernel_regularizer'):
            regularizer = keras.regularizers.l2(args.wd)
            layer_config['config']['kernel_regularizer'] = \
                {'class_name': regularizer.__class__.__name__,
                 'config': regularizer.get_config()}
        if type(layer) == keras.layers.BatchNormalization:
            layer_config['config']['momentum'] = 0.9
            layer_config['config']['epsilon'] = 1e-5

    model = keras.models.Model.from_config(model_config)
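Note that keras.models.Model.from_config builds a freshly initialized network, which is fine here because this branch trains from scratch. If the edited config ever had to keep already-trained weights, they would need to be copied across; a minimal sketch, assuming the rebuild leaves the topology unchanged:

weights = model.get_weights()
model = keras.models.Model.from_config(model_config)
model.set_weights(weights)  # restore the original parameters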
Example #9
    def test_load_model_broadcast(self):
        hvd.init()

        def create_model():
            opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3,)))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            return model

        with self.test_session() as sess:
            K.set_session(sess)

            model = create_model()

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            if hvd.rank() == 0:
                _, fname = tempfile.mkstemp('.h5')
                model.save(fname)

        K.clear_session()
        with self.test_session() as sess:
            K.set_session(sess)

            if hvd.rank() == 0:
                model = hvd.load_model(fname)
                os.remove(fname)
            else:
                model = create_model()

            def generator():
                while 1:
                    yield (x, y)

            if hvd.rank() == 0:
                self.assertEqual(len(model.optimizer.weights), 5)
            else:
                self.assertEqual(len(model.optimizer.weights), 0)

            # No assertions, we just need to verify that it doesn't hang
            callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
            model.fit_generator(generator(),
                                steps_per_epoch=10,
                                callbacks=callbacks,
                                epochs=0,
                                verbose=0,
                                workers=4,
                                initial_epoch=1)

            self.assertEqual(len(model.optimizer.weights), 5)
Example #10
                                           target_size=(224, 224))

# Validation data iterator.
test_gen = image.ImageDataGenerator(
    zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
test_iter = test_gen.flow_from_directory(test_dir, batch_size=batch_size,
                                         target_size=(224, 224))

# Set up standard ResNet-50 model.
model = keras.applications.resnet50.ResNet50(weights=None)

# Restore from a previous checkpoint, if initial_epoch is specified.
# Horovod: restore on the first worker which will broadcast both model and optimizer weights
# to other workers.
if resume_from_epoch > 0 and hvd.rank() == 0:
    model = hvd.load_model(checkpoint_format.format(epoch=resume_from_epoch))
else:
    # ResNet-50 model that is included with Keras is optimized for inference.
    # Add L2 weight decay & adjust BN settings.
    model_config = model.get_config()
    for layer, layer_config in zip(model.layers, model_config['layers']):
        if hasattr(layer, 'kernel_regularizer'):
            regularizer = keras.regularizers.l2(weight_decay)
            layer_config['config']['kernel_regularizer'] = \
                {'class_name': regularizer.__class__.__name__,
                 'config': regularizer.get_config()}
        if type(layer) == keras.layers.BatchNormalization:
            layer_config['config']['momentum'] = 0.9
            layer_config['config']['epsilon'] = 1e-5

    model = keras.models.Model.from_config(model_config)
Example #11
def main():
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.compat.v1.Session(config=config))

    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = hvd.broadcast(resume_from_epoch,
                                      0,
                                      name='resume_from_epoch')

    # Horovod: print logs on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    # Training data iterator.
    train_gen = image.ImageDataGenerator()
    #width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
    #preprocessing_function=keras.applications.resnet50.preprocess_input)
    train_iter = train_gen.flow_from_directory(args.train,
                                               batch_size=args.batch_size,
                                               target_size=(224, 224))

    # Validation data iterator.
    test_gen = image.ImageDataGenerator()
    #zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
    test_iter = test_gen.flow_from_directory(args.val,
                                             batch_size=args.val_batch_size,
                                             target_size=(224, 224))

    # train iterator for tfrecord
    train_iter_tf = iterator(args.train_dir)
    val_iter_tf = iterator(args.val_dir)

    # timeline
    #timeline = tf.train.ProfilerHook(save_steps=500, output_dir='./timeline')
    #run_options  = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
    #run_metadata = tf.compat.v1.RunMetadata()

    # Set up standard ResNet-50 model.
    model = keras.applications.resnet50.ResNet50(weights=None)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast both model and optimizer weights
    # to other workers.
    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model(
            args.checkpoint_format.format(epoch=resume_from_epoch),
            compression=compression)
    else:
        # ResNet-50 model that is included with Keras is optimized for inference.
        # Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5

        model = keras.models.Model.from_config(model_config)

        # Horovod: adjust learning rate based on number of GPUs.
        opt = keras.optimizers.SGD(lr=args.base_lr * hvd.size(),
                                   momentum=args.momentum)

        # Horovod: add Horovod Distributed Optimizer.
        opt = hvd.DistributedOptimizer(opt, compression=compression)

        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=opt,
                      metrics=['accuracy', 'top_k_categorical_accuracy'])
        #              options=run_options,
        #              run_metadata=run_metadata
        #              )

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=args.warmup_epochs, verbose=verbose),

        # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
        hvd.callbacks.LearningRateScheduleCallback(
            start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=30,
                                                   end_epoch=60,
                                                   multiplier=1e-1),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=60,
                                                   end_epoch=80,
                                                   multiplier=1e-2),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=80,
                                                   multiplier=1e-3),
    ]
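    # Net effect of the schedule above, with lr0 = args.base_lr * hvd.size()
    # (see the SGD constructor below): epochs [warmup_epochs, 30) train at
    # lr0, [30, 60) at lr0 * 1e-1, [60, 80) at lr0 * 1e-2, and [80, ...)
    # at lr0 * 1e-3.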

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    # Train the model. The training will randomly sample 1 / N batches of training data and
    # 3 / N batches of validation data on every worker, where N is the number of workers.
    # Over-sampling of validation data helps to increase probability that every validation
    # example will be evaluated.

    print('---- train len ----:', len(train_iter))
    print('---- test  len ----:', len(test_iter))
    total_train_step = len(train_iter)
    total_val_step = len(test_iter)

    #model.fit_generator(train_iter,
    model.fit(
        train_iter_tf,
        #steps_per_epoch=40037 // hvd.size(),
        steps_per_epoch=total_train_step // hvd.size(),
        callbacks=callbacks,
        epochs=args.epochs,
        verbose=verbose,
        workers=8,
        initial_epoch=resume_from_epoch,
        validation_data=val_iter_tf,
        validation_steps=3 * total_val_step // hvd.size())

    # timeline tracing
    #trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    #with open ('./timeline.keras.json','w') as f:
    #     f.write(trace.generate_chrome_trace_format())

    # Evaluate the model on the full data set.
    score = hvd.allreduce(
        model.evaluate_generator(test_iter, len(test_iter), workers=4))
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
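This example feeds model.fit from iterator(args.train_dir), a TFRecord input helper the snippet never defines. A purely hypothetical sketch of such a helper using tf.data; the feature names, image size, batch size, and class count are all assumptions, not the author's actual schema:

def iterator(tfrecord_dir, batch_size=32, num_classes=1000):
    # Hypothetical TFRecord pipeline; the feature schema below is an
    # assumption, not the author's actual format.
    files = tf.data.Dataset.list_files(os.path.join(tfrecord_dir, '*.tfrecord'))
    dataset = tf.data.TFRecordDataset(files)

    def parse(example_proto):
        features = {
            'image/encoded': tf.io.FixedLenFeature([], tf.string),
            'image/label': tf.io.FixedLenFeature([], tf.int64),
        }
        parsed = tf.io.parse_single_example(example_proto, features)
        image = tf.image.resize(
            tf.io.decode_jpeg(parsed['image/encoded'], channels=3), (224, 224))
        label = tf.one_hot(parsed['image/label'], num_classes)
        return image, label

    return dataset.map(parse).shuffle(1024).batch(batch_size).repeat()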
Example #12
    zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
test_iter = test_gen.flow_from_directory(args.val_dir,
                                         batch_size=args.val_batch_size,
                                         target_size=(224, 224))

# Set up standard ResNet-50 model.
model = keras.applications.resnet50.ResNet50(weights=None)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Restore from a previous checkpoint, if initial_epoch is specified.
# Horovod: restore on the first worker which will broadcast both model and optimizer weights
# to other workers.
if resume_from_epoch > 0 and hvd.rank() == 0:
    model = hvd.load_model(args.checkpoint_format.format(epoch=resume_from_epoch),
                           compression=compression)
else:
    # ResNet-50 model that is included with Keras is optimized for inference.
    # Add L2 weight decay & adjust BN settings.
    model_config = model.get_config()
    for layer, layer_config in zip(model.layers, model_config['layers']):
        if hasattr(layer, 'kernel_regularizer'):
            regularizer = keras.regularizers.l2(args.wd)
            layer_config['config']['kernel_regularizer'] = \
                {'class_name': regularizer.__class__.__name__,
                 'config': regularizer.get_config()}
        if type(layer) == keras.layers.BatchNormalization:
            layer_config['config']['momentum'] = 0.9
            layer_config['config']['epsilon'] = 1e-5

    model = keras.models.Model.from_config(model_config)
Example #13
def train_and_predict():

    # Horovod: initialize Horovod.
    hvd.init()

    # Configure per-process CPU thread parallelism for the TensorFlow session.
    config = tf.ConfigProto()
    config.intra_op_parallelism_threads = 10
    config.inter_op_parallelism_threads = 1
    K.set_session(tf.Session(config=config))

    print('-' * 30)
    print('Loading and preprocessing train data...')
    print('-' * 30)
    imgs_train, imgs_mask_train = load_train_data()
    imgs_mask_train = imgs_mask_train[..., np.newaxis]
    #imgs_train = preprocess(imgs_train,'I')
    #imgs_mask_train = preprocess(imgs_mask_train,'M')

    print(imgs_train.shape)
    print(imgs_mask_train.shape)
    imgs_train = imgs_train.astype('float32')
    #mean = np.mean(imgs_train)  # mean for data centering
    #std = np.std(imgs_train)  # std for data normalization

    #imgs_train -= mean
    #imgs_train /= std
    imgs_train /= 255.  # scale images to [0, 1]

    imgs_mask_train = imgs_mask_train.astype('float32')
    imgs_mask_train /= 255.  # scale masks to [0, 1]

    print('-' * 30)
    print('Creating and compiling model...')
    print('-' * 30)

    #resume_from_epoch = 0
    #for try_epoch in range(100, 0, -1):
    #    if os.path.exists('/workspace/checkpoint-{epoch}.h5'.format(epoch=try_epoch)):
    #        resume_from_epoch = try_epoch
    #        break
    resume_from_epoch = int(sys.argv[1])
    print('resume_from_epoch:', resume_from_epoch)
    # Horovod: broadcast resume_from_epoch from rank 0 to the other workers.
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    verbose = 1 if hvd.rank() == 0 else 0

    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model(
            '/workspace/nddcheckpoint-{epoch}.h5'.format(epoch=resume_from_epoch),
            custom_objects={'dice_coef': dice_coef, 'dice_coef_loss': dice_coef_loss})
    else:
        model = get_unet()

    print('hvd size:', hvd.size())
    print('learning rate:', 0.00013 * hvd.size())

    print('calculating data start and end indices to distribute data to each worker...')
    number_of_examples_per_rank = imgs_train.shape[0] // hvd.size()
    remainder = imgs_train.shape[0] % hvd.size()
    if hvd.rank() < remainder:
        # The first `remainder` ranks each take one extra example.
        start_index = hvd.rank() * (number_of_examples_per_rank + 1)
        end_index = start_index + number_of_examples_per_rank + 1
    else:
        start_index = hvd.rank() * number_of_examples_per_rank + remainder
        end_index = start_index + number_of_examples_per_rank
    print('rank, start and end index:', hvd.rank(), start_index, end_index)

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.ModelCheckpoint(
            '/workspace/nddcheckpoint-{epoch}.h5', monitor='val_loss', save_best_only=True))
    
    print('-' * 30)
    print('Fitting model...')
    print('-' * 30)
    model.fit(imgs_train[start_index:end_index],
              imgs_mask_train[start_index:end_index],
              batch_size=12,
              epochs=resume_from_epoch + 10,
              shuffle=True,
              validation_split=0.01,
              initial_epoch=resume_from_epoch,
              callbacks=callbacks,
              verbose=verbose)

    if hvd.rank() == 0:
        model.save('/workspace/unetmodelfdd.h5', include_optimizer=False)
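Example #13 loads its checkpoint with dice_coef and dice_coef_loss as custom objects but never defines them. A common soft-Dice formulation that matches these names (an assumption, not necessarily the author's exact code):

from keras import backend as K

def dice_coef(y_true, y_pred, smooth=1.0):
    # Soft Dice coefficient over the flattened masks.
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)

def dice_coef_loss(y_true, y_pred):
    # Negated coefficient so that minimizing the loss maximizes overlap.
    return -dice_coef(y_true, y_pred)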