Example #1
# Imports assumed by this snippet (the original listing omits them); `Loss`,
# the custom loss function used in compile() below, is project-specific and
# assumed to be defined elsewhere in the original module.
import importlib

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.mixed_precision.experimental import LossScaleOptimizer
from tensorflow.keras.optimizers import RMSprop


def BuildModel(dataShape, modelName, learningRate):

    K.set_floatx('float16')
    K.set_epsilon(1e-4)

    input0 = tf.keras.Input(shape=(dataShape, dataShape, 3),
                            name='input_0',
                            dtype='float16')  # Scene color
    input1 = tf.keras.Input(shape=(dataShape, dataShape, 1),
                            name='input_1',
                            dtype='float16')  # Depth 0
    input2 = tf.keras.Input(shape=(dataShape, dataShape, 1),
                            name='input_2',
                            dtype='float16')  # Depth -1
    input3 = tf.keras.Input(shape=(dataShape, dataShape, 1),
                            name='input_3',
                            dtype='float16')  # Depth -2

    modelFunc = importlib.import_module('Models.' + modelName)

    model = modelFunc.MakeModel([input0, input1, input2, input3], dataShape,
                                modelName)
    print("Loaded model from disk")

    model.compile(loss=Loss,
                  optimizer=LossScaleOptimizer(
                      RMSprop(lr=learningRate, epsilon=1e-4), 1000))

    model.summary()

    return model
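
The effect of the two backend calls at the top of this example is global: with floatx set to 'float16', every Keras layer creates its variables in half precision, and the fuzz factor is raised because the float32 default of 1e-7 is effectively lost at that precision. A minimal, self-contained sketch of that behaviour (independent of BuildModel and its Models package):

import tensorflow as tf
from tensorflow.keras import backend as K

K.set_floatx('float16')
K.set_epsilon(1e-4)

layer = tf.keras.layers.Dense(4)
layer.build(input_shape=(None, 8))
print(layer.kernel.dtype)  # float16: variables are created in half precision
print(K.epsilon())         # 1e-4 instead of the float32 default 1e-7
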
Example #2
def _handle_fp16_and_distributed_optimizer(optimizer,
                                           lr_schedule,
                                           hvd_backend=None):
    if hvd_backend == "horovod":
        import horovod.tensorflow.keras as hvd
        from horovod.tensorflow import Compression
    elif hvd_backend == "byteps":
        import byteps.tensorflow.keras as hvd
        from byteps.tensorflow import Compression

    if hvd_backend:
        compression = Compression.none
        if compat.CUSTOM_GLOBAL_FLOATX == "float16":
            compression = Compression.fp16

    if lr_schedule is not None and hvd_backend is None:
        # TODO(ZhaoChengqi): pay attention to API changes
        optimizer._set_hyper("learning_rate", lr_schedule)
    # Special-case the fp16 scenario below: there is a known bug with
    # TF 2.3 + Horovod + fp16 + XLA, so a revised dynamic loss scale is used.
    if compat.CUSTOM_GLOBAL_FLOATX == "float16":
        logging.info("NOTICE: using revised DynamicLossScale under fp16")
        revised_loss_scale = training_utils.RevisedDynamicLossScale()
        if hvd_backend:
            opt = LossScaleOptimizer(optimizer, loss_scale=1)
            opt = hvd.DistributedOptimizer(opt,
                                           compression=compression,
                                           sparse_as_dense=True)
            opt._loss_scale = revised_loss_scale
            for weight in loss_scale_module.get_loss_scale_weights(
                    opt._loss_scale):
                backend.track_variable(weight)
            opt._track_trackable(opt._loss_scale, 'loss_scale', overwrite=True)
        else:
            opt = LossScaleOptimizer(optimizer, loss_scale=revised_loss_scale)
        return opt
    return optimizer
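
For context, the hvd_backend branch above relies on Horovod's standard Keras integration. A minimal, self-contained sketch of that pattern on its own (the Adam optimizer and learning rate are placeholders, and the loss-scaling wrapper handled above is omitted) might look like this:

import tensorflow as tf
import horovod.tensorflow.keras as hvd
from horovod.tensorflow import Compression

hvd.init()

# Linear LR scaling by worker count is the usual Horovod convention.
optimizer = tf.keras.optimizers.Adam(1e-3 * hvd.size())
# fp16 gradient compression mirrors the Compression.fp16 choice made above
# when compat.CUSTOM_GLOBAL_FLOATX == "float16".
optimizer = hvd.DistributedOptimizer(optimizer,
                                     compression=Compression.fp16,
                                     sparse_as_dense=True)
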
Example #3
def handle_fp16_and_distributed_optimizer(optimizer, lr_schedule, hvd_backend=None):
    if hvd_backend == "horovod":
        import horovod.tensorflow.keras as hvd
        from horovod.tensorflow import Compression
    elif hvd_backend == "byteps":
        import byteps.tensorflow.keras as hvd
        from byteps.tensorflow import Compression

    if hvd_backend:
        compression = Compression.none
        if compat.CUSTOM_GLOBAL_FLOATX == "float16":
            compression = Compression.fp16

    if lr_schedule is not None and hvd_backend is None:
        # TODO(ZhaoChengqi): pay attention to API changes
        optimizer._set_hyper("learning_rate", lr_schedule)
    # Special-case fp16: the loss-scaling setup differs before and after TF 2.4.0.
    if compat.CUSTOM_GLOBAL_FLOATX == "float16":
        if compat.IS_PREV_TF_2_4_0:
            from tensorflow.keras.mixed_precision.experimental import LossScaleOptimizer
            from tensorflow.python.keras import backend
            from tensorflow.python.training.experimental.loss_scale import get_loss_scale_weights

            revised_loss_scale = RevisedDynamicLossScale()
            if hvd_backend:
                opt = LossScaleOptimizer(optimizer, loss_scale=1)
                opt = hvd.DistributedOptimizer(opt, compression=compression, sparse_as_dense=True)
                opt._loss_scale = revised_loss_scale
                for weight in get_loss_scale_weights(opt._loss_scale):
                    backend.track_variable(weight)
                opt._track_trackable(opt._loss_scale, 'loss_scale', overwrite=True)
            else:
                opt = LossScaleOptimizer(optimizer, loss_scale=revised_loss_scale)
        else:
            if hvd_backend:
                opt = HorovodDistributedLossScaleOptimizer(inner_optimizer=optimizer,
                                                           compression=compression,
                                                           sparse_as_dense=True,
                                                           hvd_backend=hvd_backend)
            else:
                opt = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
                opt._loss_scale = RevisedDynamicLossScale(
                    initial_loss_scale=2 ** 15, growth_steps=2000, multiplier=2)
                opt._track_trackable(opt._loss_scale, "loss_scale", overwrite=True)
        return opt

    return optimizer
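
On TF 2.4 and later (the branch where compat.IS_PREV_TF_2_4_0 is false), the arguments passed to RevisedDynamicLossScale above (initial scale 2**15, growth every 2000 steps, multiplier 2) coincide with the defaults of the built-in dynamic loss scaling. A rough single-worker equivalent, without whatever extra behaviour the revised scaler adds, would therefore be something like:

import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(1e-3)  # placeholder inner optimizer
# Non-experimental API (TF 2.4+): dynamic loss scaling with the same schedule.
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    optimizer,
    dynamic=True,
    initial_scale=2 ** 15,
    dynamic_growth_steps=2000)
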
Example #4
def run(flags_obj):
  """Run ResNet Cifar-10 training and eval loop using native Keras APIs.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
  keras_utils.set_session_config(
      enable_eager=flags_obj.enable_eager,
      enable_xla=flags_obj.enable_xla,
      enable_grappler_layout_optimizer=
      flags_obj.enable_grappler_layout_optimizer)

  # Execute flag override logic for better model performance
  if flags_obj.tf_gpu_thread_mode:
    keras_common.set_gpu_thread_mode_and_count(flags_obj)
  keras_common.set_cudnn_batchnorm_mode()

  dtype = flags_core.get_tf_dtype(flags_obj)
  if dtype == 'fp16':
    raise ValueError('dtype fp16 is not supported in Keras. Use the default '
                     'value(fp32).')

  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      num_workers=distribution_utils.configure_cluster(),
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs)

  if strategy:
    # flags_obj.enable_get_next_as_optional controls whether to enable the
    # get_next_as_optional behavior in DistributedIterator. If true, the last
    # partial batch can be supported.
    strategy.extended.experimental_enable_get_next_as_optional = (
        flags_obj.enable_get_next_as_optional
    )

  strategy_scope = distribution_utils.get_strategy_scope(strategy)

  if flags_obj.use_synthetic_data:
    distribution_utils.set_up_synthetic_data()
    input_fn = keras_common.get_synth_input_fn(
        height=cifar_main.HEIGHT,
        width=cifar_main.WIDTH,
        num_channels=cifar_main.NUM_CHANNELS,
        num_classes=cifar_main.NUM_CLASSES,
        dtype=flags_core.get_tf_dtype(flags_obj),
        drop_remainder=True)
  else:
    distribution_utils.undo_set_up_synthetic_data()
    input_fn = cifar_main.input_fn

  train_input_dataset = input_fn(
      is_training=True,
      data_dir=flags_obj.data_dir,
      batch_size=flags_obj.batch_size,
      num_epochs=flags_obj.train_epochs,
      parse_record_fn=parse_record_keras,
      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
      dtype=dtype,
      # Setting drop_remainder to avoid the partial batch logic in normalization
      # layer, which triggers tf.where and leads to extra memory copy of input
      # sizes between host and GPU.
      drop_remainder=(not flags_obj.enable_get_next_as_optional))

  eval_input_dataset = None
  if not flags_obj.skip_eval:
    eval_input_dataset = input_fn(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=flags_obj.train_epochs,
        parse_record_fn=parse_record_keras)

  with strategy_scope:
    optimizer = keras_common.get_optimizer()
    from tensorflow.keras.mixed_precision.experimental import LossScaleOptimizer
    loss_scale_opt = LossScaleOptimizer(optimizer, loss_scale=10000)
    model = resnet_cifar_model.resnet20(classes=cifar_main.NUM_CLASSES)

    model.compile(loss='categorical_crossentropy',
                  optimizer=loss_scale_opt,
                  metrics=(['categorical_accuracy']
                           if flags_obj.report_accuracy_metrics else None),
                  run_eagerly=flags_obj.run_eagerly,
                  run_distributed=flags_obj.force_v2_in_keras_compile)

  callbacks = keras_common.get_callbacks(
      learning_rate_schedule, cifar_main.NUM_IMAGES['train'])

  train_steps = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size
  train_epochs = flags_obj.train_epochs

  if flags_obj.train_steps:
    train_steps = min(flags_obj.train_steps, train_steps)
    train_epochs = 1

  num_eval_steps = (cifar_main.NUM_IMAGES['validation'] //
                    flags_obj.batch_size)

  validation_data = eval_input_dataset
  if flags_obj.skip_eval:
    if flags_obj.set_learning_phase_to_train:
      # TODO(haoyuzhang): Understand slowdown of setting learning phase when
      # not using distribution strategy.
      tf.keras.backend.set_learning_phase(1)
    num_eval_steps = None
    validation_data = None

  if not strategy and flags_obj.explicit_gpu_placement:
    # TODO(b/135607227): Add device scope automatically in Keras training loop
    # when not using distribution strategy.
    no_dist_strat_device = tf.device('/device:GPU:0')
    no_dist_strat_device.__enter__()

  # TensorBoard callback
  from tensorflow.python.keras.callbacks import TensorBoard
  from time import time
  tensorboard = TensorBoard(log_dir="log/{}".format(time()), write_grads=False, histogram_freq=1)
  callbacks += [tensorboard]


  history = model.fit(train_input_dataset,
                      epochs=train_epochs,
                      steps_per_epoch=train_steps,
                      callbacks=callbacks,
                      validation_steps=num_eval_steps,
                      validation_data=validation_data,
                      validation_freq=1,
                      verbose=2)
  eval_output = None
  if not flags_obj.skip_eval:
    eval_output = model.evaluate(eval_input_dataset,
                                 steps=num_eval_steps,
                                 verbose=2)

  if not strategy and flags_obj.explicit_gpu_placement:
    no_dist_strat_device.__exit__()

  stats = keras_common.build_stats(history, eval_output, callbacks)
  return stats
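
A side note on the hard-coded loss_scale=10000 above: the experimental LossScaleOptimizer also accepts the string 'dynamic', which starts from a high scale and halves it whenever a gradient overflow is detected, so no constant has to be hand-tuned. A hedged drop-in sketch, assuming the same pre-2.4 TF version (a plain SGD stands in for keras_common.get_optimizer()):

import tensorflow as tf
from tensorflow.keras.mixed_precision.experimental import LossScaleOptimizer

optimizer = tf.keras.optimizers.SGD(0.1, momentum=0.9)  # stand-in optimizer
# 'dynamic' replaces the hand-tuned constant with an adaptive loss scale.
loss_scale_opt = LossScaleOptimizer(optimizer, loss_scale='dynamic')
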
Example #5
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using native Keras APIs.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla,
                                   enable_grappler_layout_optimizer=flags_obj.
                                   enable_grappler_layout_optimizer)

    # Execute flag override logic for better model performance
    if flags_obj.tf_gpu_thread_mode:
        keras_common.set_gpu_thread_mode_and_count(flags_obj)
    if flags_obj.data_delay_prefetch:
        keras_common.data_delay_prefetch()
    keras_common.set_cudnn_batchnorm_mode()

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == 'float16':
        policy = tf.keras.mixed_precision.experimental.Policy(
            'infer_float32_vars')
        tf.keras.mixed_precision.experimental.set_policy(policy)

    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        num_workers=distribution_utils.configure_cluster(),
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs)

    if strategy:
        # flags_obj.enable_get_next_as_optional controls whether to enable the
        # get_next_as_optional behavior in DistributedIterator. If true, the
        # last partial batch can be supported.
        strategy.extended.experimental_enable_get_next_as_optional = (
            flags_obj.enable_get_next_as_optional)

    strategy_scope = distribution_utils.get_strategy_scope(strategy)

    # pylint: disable=protected-access
    if flags_obj.use_synthetic_data:
        distribution_utils.set_up_synthetic_data()
        input_fn = keras_common.get_synth_input_fn(
            height=imagenet_main.DEFAULT_IMAGE_SIZE,
            width=imagenet_main.DEFAULT_IMAGE_SIZE,
            num_channels=imagenet_main.NUM_CHANNELS,
            num_classes=imagenet_main.NUM_CLASSES,
            dtype=dtype,
            drop_remainder=True)
    else:
        distribution_utils.undo_set_up_synthetic_data()
        input_fn = imagenet_main.input_fn

    # When `enable_xla` is True, we always drop the remainder of the batches
    # in the dataset, as XLA-GPU doesn't support dynamic shapes.
    drop_remainder = flags_obj.enable_xla

    train_input_dataset = input_fn(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=flags_obj.train_epochs,
        parse_record_fn=parse_record_keras,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        dtype=dtype,
        drop_remainder=drop_remainder,
        tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
    )

    eval_input_dataset = None
    if not flags_obj.skip_eval:
        eval_input_dataset = input_fn(is_training=False,
                                      data_dir=flags_obj.data_dir,
                                      batch_size=flags_obj.batch_size,
                                      num_epochs=flags_obj.train_epochs,
                                      parse_record_fn=parse_record_keras,
                                      dtype=dtype,
                                      drop_remainder=drop_remainder)

    lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = keras_common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_main.NUM_IMAGES['train'],
            warmup_epochs=LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in LR_SCHEDULE),
            compute_lr_on_cpu=True)

    with strategy_scope:
        optimizer = keras_common.get_optimizer(lr_schedule)
        from tensorflow.keras.mixed_precision.experimental import LossScaleOptimizer
        optimizer = LossScaleOptimizer(optimizer, loss_scale=256)
        # if dtype == 'float16':
        # TODO(reedwm): Remove manually wrapping optimizer once mixed precision
        # can be enabled with a single line of code.
        # optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        # optimizer, loss_scale=flags_core.get_loss_scale(flags_obj,
        # default_for_fp16=128))

        if flags_obj.use_trivial_model:
            model = trivial_model.trivial_model(imagenet_main.NUM_CLASSES,
                                                dtype)
        else:
            model = resnet_model.resnet18(
                num_classes=imagenet_main.NUM_CLASSES, dtype=dtype)
            # model = alex_net.alex_net(num_classes=imagenet_main.NUM_CLASSES, dtype=dtype)

        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=(['sparse_categorical_accuracy']
                               if flags_obj.report_accuracy_metrics else None),
                      run_eagerly=flags_obj.run_eagerly,
                      run_distributed=flags_obj.force_v2_in_keras_compile)

    callbacks = keras_common.get_callbacks(learning_rate_schedule,
                                           imagenet_main.NUM_IMAGES['train'])
    from tensorflow.python.keras.callbacks import TensorBoard
    from time import time
    tensorboard = TensorBoard(log_dir="log/{}".format(time()))
    callbacks += [tensorboard]

    train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
    train_epochs = flags_obj.train_epochs

    if flags_obj.train_steps:
        train_steps = min(flags_obj.train_steps, train_steps)
        train_epochs = 1

    num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)

    validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        # Only build the training graph. This reduces memory usage introduced by
        # control flow ops in layers that have different implementations for
        # training and inference (e.g., batch norm).
        if flags_obj.set_learning_phase_to_train:
            # TODO(haoyuzhang): Understand slowdown of setting learning phase when
            # not using distribution strategy.
            tf.keras.backend.set_learning_phase(1)
        num_eval_steps = None
        validation_data = None

    if not strategy and flags_obj.explicit_gpu_placement:
        # TODO(b/135607227): Add device scope automatically in Keras training loop
        # when not using distribution strategy.
        no_dist_strat_device = tf.device('/device:GPU:0')
        no_dist_strat_device.__enter__()

    history = model.fit(train_input_dataset,
                        epochs=train_epochs,
                        steps_per_epoch=train_steps,
                        callbacks=callbacks,
                        validation_steps=num_eval_steps,
                        validation_data=validation_data,
                        validation_freq=flags_obj.epochs_between_evals,
                        verbose=2)

    eval_output = None
    if not flags_obj.skip_eval:
        eval_output = model.evaluate(eval_input_dataset,
                                     steps=num_eval_steps,
                                     verbose=2)

    if not strategy and flags_obj.explicit_gpu_placement:
        no_dist_strat_device.__exit__()

    stats = keras_common.build_stats(history, eval_output, callbacks)
    return stats
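
One caveat on the float16 branch above: 'infer_float32_vars' is the policy name from the earliest TF 2.0 mixed-precision experiment and was removed in later releases. On TF 2.4+, the rough equivalent of this policy-plus-manual-wrapping combination is the global 'mixed_float16' policy, under which Model.compile applies loss scaling on its own; a hedged sketch:

import tensorflow as tf

# TF 2.4+ replacement for Policy('infer_float32_vars') plus manual wrapping.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

optimizer = tf.keras.optimizers.SGD(0.1, momentum=0.9)  # placeholder optimizer
model = tf.keras.Sequential([tf.keras.layers.Dense(10, input_shape=(32,))])
# With the mixed_float16 policy active, compile() wraps the optimizer in a
# LossScaleOptimizer automatically, so dynamic loss scaling is applied.
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)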