Example #1
def define_resnet_flags(resnet_size_choices=None,
                        dynamic_loss_scale=False,
                        fp16_implementation=False):
    """Add flags and validators for ResNet."""
    flags_core.define_base(clean=True,
                           train_epochs=True,
                           epochs_between_evals=True,
                           stop_threshold=True,
                           num_gpu=True,
                           hooks=True,
                           export_dir=True,
                           distribution_strategy=True)
    flags_core.define_performance(num_parallel_calls=False,
                                  inter_op=True,
                                  intra_op=True,
                                  synthetic_data=True,
                                  dtype=True,
                                  all_reduce_alg=True,
                                  num_packs=True,
                                  tf_gpu_thread_mode=True,
                                  datasets_num_private_threads=True,
                                  dynamic_loss_scale=dynamic_loss_scale,
                                  fp16_implementation=fp16_implementation,
                                  loss_scale=True,
                                  tf_data_experimental_slack=True,
                                  max_train_steps=True)
    flags_core.define_image()
    flags_core.define_benchmark()
    flags_core.define_distribution()
    flags.adopt_module_key_flags(flags_core)

    flags.DEFINE_enum(
        name='resnet_version',
        short_name='rv',
        default='1',
        enum_values=['1', '2'],
        help=flags_core.help_wrap(
            'Version of ResNet. (1 or 2) See README.md for details.'))
    flags.DEFINE_bool(
        name='fine_tune',
        short_name='ft',
        default=False,
        help=flags_core.help_wrap(
            'If True, do not train any parameters except for the final layer.'))
    flags.DEFINE_string(
        name='pretrained_model_checkpoint_path',
        short_name='pmcp',
        default=None,
        help=flags_core.help_wrap(
            'If not None, initialize all of the network except the final layer '
            'with these values.'))
    flags.DEFINE_boolean(name='eval_only',
                         default=False,
                         help=flags_core.help_wrap(
                             'Skip training and only perform evaluation on '
                             'the latest checkpoint.'))
    flags.DEFINE_boolean(
        name='image_bytes_as_serving_input',
        default=False,
        help=flags_core.help_wrap(
            'If True, exports a SavedModel with a serving signature that '
            'accepts JPEG image bytes instead of a fixed-size [HxWxC] tensor '
            'that represents the image. The former is easier to use for '
            'serving, at the expense of image resize/cropping being done as '
            'part of model inference. Note, this flag only applies to ImageNet '
            'and cannot be used for CIFAR.'))
    flags.DEFINE_boolean(
        name='use_train_and_evaluate',
        default=False,
        help=flags_core.help_wrap(
            'If True, uses `tf.estimator.train_and_evaluate` for the training '
            'and evaluation loop, instead of separate calls to '
            '`classifier.train` and `classifier.evaluate`, which is the '
            'default behavior.'))
    flags.DEFINE_bool(name='enable_lars',
                      default=False,
                      help=flags_core.help_wrap(
                          'Enable LARS optimizer for large batch training.'))
    flags.DEFINE_float(
        name='label_smoothing',
        default=0.0,
        help=flags_core.help_wrap(
            'Label smoothing parameter used in the softmax_cross_entropy.'))
    flags.DEFINE_float(name='weight_decay',
                       default=1e-4,
                       help=flags_core.help_wrap(
                           'Weight decay coefficient for L2 regularization.'))
    flags.DEFINE_float(name='percent',
                       default=0,
                       help=flags_core.help_wrap('Percent of data to poison.'))
    flags.DEFINE_bool(
        name='adv_train',
        default=False,
        help=flags_core.help_wrap('Whether to use adversarial training.'))

    choice_kwargs = dict(
        name='resnet_size',
        short_name='rs',
        default='50',
        help=flags_core.help_wrap('The size of the ResNet model to use.'))

    if resnet_size_choices is None:
        flags.DEFINE_string(**choice_kwargs)
    else:
        flags.DEFINE_enum(enum_values=resnet_size_choices, **choice_kwargs)
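
These helpers only register flag definitions; parsing happens when the program starts up. Below is a minimal sketch of how define_resnet_flags is typically wired into an absl entry point, assuming absl and the TensorFlow official-models flags_core utilities are importable (the main body is illustrative only):

from absl import app
from absl import flags


def main(_):
    # app.run() parses sys.argv before calling main, so the flags registered
    # above are available on flags.FLAGS here.
    flags_obj = flags.FLAGS
    print('resnet_size:', flags_obj.resnet_size)
    print('fine_tune:', flags_obj.fine_tune)


if __name__ == '__main__':
    define_resnet_flags()  # register flags before absl parses the command line
    app.run(main)
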
Example #2
def define_keras_flags(dynamic_loss_scale=True):
    """Define flags for Keras models."""
    flags_core.define_base(run_eagerly=True)
    flags_core.define_performance(num_parallel_calls=False,
                                  synthetic_data=True,
                                  dtype=True,
                                  all_reduce_alg=True,
                                  num_packs=True,
                                  tf_gpu_thread_mode=True,
                                  datasets_num_private_threads=True,
                                  dynamic_loss_scale=dynamic_loss_scale,
                                  loss_scale=True,
                                  tf_data_experimental_slack=True,
                                  enable_xla=True,
                                  force_v2_in_keras_compile=True)
    flags_core.define_image()
    flags_core.define_benchmark()
    flags_core.define_distribution()
    flags.adopt_module_key_flags(flags_core)

    flags.DEFINE_boolean(name='enable_eager',
                         default=False,
                         help='Enable eager?')
    flags.DEFINE_boolean(name='skip_eval',
                         default=False,
                         help='Skip evaluation?')
    # TODO(b/135607288): Remove this flag once we understand the root cause of
    # slowdown when setting the learning phase in Keras backend.
    flags.DEFINE_boolean(
        name='set_learning_phase_to_train',
        default=True,
        help='If skipping eval, also set the Keras learning phase to 1 (training).')
    flags.DEFINE_boolean(
        name='explicit_gpu_placement',
        default=False,
        help='If not using a distribution strategy, explicitly set the device '
        'scope for the Keras training loop.')
    flags.DEFINE_boolean(name='use_trivial_model',
                         default=False,
                         help='Whether to use a trivial Keras model.')
    flags.DEFINE_boolean(name='report_accuracy_metrics',
                         default=True,
                         help='Report metrics during training and evaluation.')
    flags.DEFINE_boolean(
        name='use_tensor_lr',
        default=False,
        help='Use learning rate tensor instead of a callback.')
    flags.DEFINE_boolean(name='enable_tensorboard',
                         default=False,
                         help='Whether to enable Tensorboard callback.')
    flags.DEFINE_integer(
        name='train_steps',
        default=None,
        help='The number of steps to run for training. If it is larger than '
        '# batches per epoch, then use # batches per epoch. When this flag is '
        'set, only one epoch is going to run for training.')
    flags.DEFINE_string(
        name='profile_steps',
        default=None,
        help='Save profiling data to model dir at given range of steps. The '
        'value must be a comma separated pair of positive integers, specifying '
        'the first and last step to profile. For example, "--profile_steps=2,4" '
        'triggers the profiler to process 3 steps, starting from the 2nd step. '
        'Note that profiler has a non-trivial performance overhead, and the '
        'output file can be gigantic if profiling many steps.')
    flags.DEFINE_boolean(
        name='data_delay_prefetch',
        default=False,
        help=
        'Add a small delay in tf.data prefetch to prioritize memory copy of '
        'other tensors over the data minibatch for the (T+1)th step. It should '
        'help improve performance when using EagerIterator and function. The codepath '
        'when enabling this feature is experimental and will be removed once the '
        'corresponding performance features are fully supported in TensorFlow.'
    )
    flags.DEFINE_boolean(
        name='batchnorm_spatial_persistent',
        default=True,
        help='Enable the spatial persistent mode for CuDNN batch norm kernel.')
    flags.DEFINE_boolean(
        name='enable_get_next_as_optional',
        default=False,
        help='Enable get_next_as_optional behavior in DistributedIterator.')
Example #3
def define_keras_flags(dynamic_loss_scale=True,
                       model=False,
                       optimizer=False,
                       pretrained_filepath=False):
    """Define flags for Keras models."""
    flags_core.define_base(clean=True,
                           num_gpu=True,
                           run_eagerly=True,
                           train_epochs=True,
                           epochs_between_evals=True,
                           distribution_strategy=True)
    flags_core.define_performance(num_parallel_calls=False,
                                  synthetic_data=True,
                                  dtype=True,
                                  all_reduce_alg=True,
                                  num_packs=True,
                                  tf_gpu_thread_mode=True,
                                  datasets_num_private_threads=True,
                                  dynamic_loss_scale=dynamic_loss_scale,
                                  loss_scale=True,
                                  fp16_implementation=True,
                                  tf_data_experimental_slack=True,
                                  enable_xla=True,
                                  force_v2_in_keras_compile=True,
                                  training_dataset_cache=True)
    flags_core.define_image()
    flags_core.define_benchmark()
    flags_core.define_distribution()
    flags.adopt_module_key_flags(flags_core)

    flags.DEFINE_boolean(name='enable_eager',
                         default=False,
                         help='Enable eager?')
    flags.DEFINE_boolean(name='skip_eval',
                         default=False,
                         help='Skip evaluation?')
    # TODO(b/135607288): Remove this flag once we understand the root cause of
    # slowdown when setting the learning phase in Keras backend.
    flags.DEFINE_boolean(
        name='set_learning_phase_to_train',
        default=True,
        help='If skipping eval, also set the Keras learning phase to 1 (training).')
    flags.DEFINE_boolean(
        name='explicit_gpu_placement',
        default=False,
        help='If not using a distribution strategy, explicitly set the device '
        'scope for the Keras training loop.')
    flags.DEFINE_boolean(name='use_trivial_model',
                         default=False,
                         help='Whether to use a trivial Keras model.')
    flags.DEFINE_boolean(name='report_accuracy_metrics',
                         default=True,
                         help='Report metrics during training and evaluation.')
    flags.DEFINE_boolean(
        name='use_tensor_lr',
        default=False,
        help='Use learning rate tensor instead of a callback.')
    flags.DEFINE_boolean(name='enable_tensorboard',
                         default=False,
                         help='Whether to enable Tensorboard callback.')
    flags.DEFINE_integer(
        name='train_steps',
        default=None,
        help='The number of steps to run for training. If it is larger than '
        '# batches per epoch, then use # batches per epoch. This flag will be '
        'ignored if train_epochs is set to be larger than 1. ')
    flags.DEFINE_string(
        name='profile_steps',
        default=None,
        help=
        'Save profiling data to model dir at given range of global steps. The '
        'value must be a comma separated pair of positive integers, specifying '
        'the first and last step to profile. For example, "--profile_steps=2,4" '
        'triggers the profiler to process 3 steps, starting from the 2nd step. '
        'Note that profiler has a non-trivial performance overhead, and the '
        'output file can be gigantic if profiling many steps.')
    flags.DEFINE_boolean(
        name='batchnorm_spatial_persistent',
        default=True,
        help='Enable the spatial persistent mode for CuDNN batch norm kernel.')
    flags.DEFINE_boolean(
        name='enable_get_next_as_optional',
        default=False,
        help='Enable get_next_as_optional behavior in DistributedIterator.')
    flags.DEFINE_boolean(
        name='enable_checkpoint_and_export',
        default=False,
        help=
        'Whether to enable a checkpoint callback and export the savedmodel.')
    flags.DEFINE_string(name='tpu',
                        default='',
                        help='TPU address to connect to.')
    flags.DEFINE_integer(
        name='steps_per_loop',
        default=1,
        help='Number of steps per graph-mode loop. Only training step happens '
        'inside the loop. Callbacks will not be called inside. Will be capped at '
        'steps per epoch.')
    flags.DEFINE_boolean(
        name='use_tf_keras_layers',
        default=False,
        help='Whether to use tf.keras.layers instead of tf.python.keras.layers. '
        'It only changes the ImageNet ResNet model layers for now. This is a '
        'temporary flag during the transition to tf.keras.layers. Do not use '
        'this flag externally; it will be removed shortly.')

    if model:
        flags.DEFINE_string(
            'model', 'resnet50_v1.5',
            'Name of model preset. (mobilenet, resnet50_v1.5)')
    if optimizer:
        flags.DEFINE_string(
            'optimizer', 'resnet50_default', 'Name of optimizer preset. '
            '(mobilenet_default, resnet50_default)')
    if pretrained_filepath:
        flags.DEFINE_string('pretrained_filepath', '', 'Pretrained file path.')
Example #4
def define_common_bert_flags():
    """Define common flags for BERT tasks."""
    flags_core.define_base(data_dir=False,
                           model_dir=True,
                           clean=False,
                           train_epochs=False,
                           epochs_between_evals=False,
                           stop_threshold=False,
                           batch_size=False,
                           num_gpu=True,
                           export_dir=False,
                           distribution_strategy=True,
                           run_eagerly=True)
    flags_core.define_distribution()
    flags.DEFINE_string('bert_config_file', None,
                        'Bert configuration file to define core bert layers.')
    flags.DEFINE_string(
        'model_export_path', None,
        'Path to the directory where the trained model will be '
        'exported.')
    flags.DEFINE_string('tpu', '', 'TPU address to connect to.')
    flags.DEFINE_string(
        'init_checkpoint', None,
        'Initial checkpoint (usually from a pre-trained BERT model).')
    flags.DEFINE_integer('num_train_epochs', 3,
                         'Total number of training epochs to perform.')
    flags.DEFINE_integer(
        'steps_per_loop', None,
        'Number of steps per graph-mode loop. Only training step '
        'happens inside the loop. Callbacks will not be called '
        'inside. If not set the value will be configured depending on the '
        'devices available.')
    flags.DEFINE_float('learning_rate', 5e-5,
                       'The initial learning rate for Adam.')
    flags.DEFINE_float('end_lr', 0.0,
                       'The end learning rate for learning rate decay.')
    flags.DEFINE_string(
        'optimizer_type', 'adamw',
        'The type of optimizer to use for training (adamw|lamb)')
    flags.DEFINE_boolean(
        'scale_loss', False,
        'Whether to divide the loss by the number of replicas inside the '
        'per-replica loss function.')
    flags.DEFINE_boolean(
        'use_keras_compile_fit', False,
        'If True, uses Keras compile/fit() API for training logic. Otherwise '
        'use custom training loop.')
    flags.DEFINE_string(
        'hub_module_url', None, 'TF-Hub path/url to Bert module. '
        'If specified, init_checkpoint flag should not be used.')
    flags.DEFINE_bool(
        'hub_module_trainable', True,
        'True to make keras layers in the hub module trainable.')

    flags_core.define_log_steps()

    # Adds flags for mixed precision and multi-worker training.
    flags_core.define_performance(
        num_parallel_calls=False,
        inter_op=False,
        intra_op=False,
        synthetic_data=False,
        max_train_steps=False,
        dtype=True,
        dynamic_loss_scale=True,
        loss_scale=True,
        all_reduce_alg=True,
        num_packs=False,
        tf_gpu_thread_mode=True,
        datasets_num_private_threads=True,
        enable_xla=True,
        fp16_implementation=True,
    )

    # Adds gin configuration flags.
    hyperparams_flags.define_gin_flags()
Example #5
def define_transformer_flags():
  """Add flags and flag validators for running transformer_main."""
  # Add common flags (data_dir, model_dir, etc.).
  flags_core.define_base(num_gpu=True, distribution_strategy=True)
  flags_core.define_performance(
      num_parallel_calls=True,
      inter_op=False,
      intra_op=False,
      synthetic_data=True,
      max_train_steps=False,
      dtype=True,
      loss_scale=True,
      all_reduce_alg=True,
      num_packs=True,
      tf_gpu_thread_mode=True,
      datasets_num_private_threads=True,
      enable_xla=True,
      force_v2_in_keras_compile=True,
      fp16_implementation=True
  )

  # Additional performance flags
  # TODO(b/76028325): Remove when generic layout optimizer is ready.
  flags.DEFINE_boolean(
      name='enable_grappler_layout_optimizer',
      default=True,
      help='Enable Grappler layout optimizer. Currently Grappler can '
           'de-optimize fp16 graphs by forcing NCHW layout for all '
           'convolutions and batch normalizations, and this flag allows '
           'disabling it.'
  )

  flags_core.define_benchmark()
  flags_core.define_device(tpu=True)
  flags_core.define_distribution()

  flags.DEFINE_integer(
      name='train_steps', short_name='ts', default=300000,
      help=flags_core.help_wrap('The number of steps used to train.'))
  flags.DEFINE_integer(
      name='steps_between_evals', short_name='sbe', default=1000,
      help=flags_core.help_wrap(
          'The number of training steps to run between evaluations. This is '
          'used if --train_steps is defined.'))
  flags.DEFINE_boolean(
      name='enable_time_history', default=True,
      help='Whether to enable TimeHistory callback.')
  flags.DEFINE_boolean(
      name='enable_tensorboard', default=False,
      help='Whether to enable Tensorboard callback.')
  flags.DEFINE_boolean(
      name='enable_metrics_in_training', default=False,
      help='Whether to enable metrics during training.')
  flags.DEFINE_string(
      name='profile_steps', default=None,
      help='Save profiling data to model dir at given range of steps. The '
      'value must be a comma separated pair of positive integers, specifying '
      'the first and last step to profile. For example, "--profile_steps=2,4" '
      'triggers the profiler to process 3 steps, starting from the 2nd step. '
      'Note that profiler has a non-trivial performance overhead, and the '
      'output file can be gigantic if profiling many steps.')
  # Set flags from the flags_core module as 'key flags' so they're listed when
  # the '-h' flag is used. Without this line, the flags defined above are
  # only shown in the full `--helpful` help text.
  flags.adopt_module_key_flags(flags_core)

  # Add transformer-specific flags
  flags.DEFINE_enum(
      name='param_set', short_name='mp', default='big',
      enum_values=PARAMS_MAP.keys(),
      help=flags_core.help_wrap(
          'Parameter set to use when creating and training the model. The '
          'parameters define the input shape (batch size and max length), '
          'model configuration (size of embedding, # of hidden layers, etc.), '
          'and various other settings. The big parameter set increases the '
          'default batch size, embedding/hidden size, and filter size. For a '
          'complete list of parameters, please see model/model_params.py.'))

  flags.DEFINE_bool(
      name='static_batch', short_name='sb', default=False,
      help=flags_core.help_wrap(
          'Whether the batches in the dataset should have static shapes. In '
          'general, this setting should be False. Dynamic shapes allow the '
          'inputs to be grouped so that the number of padding tokens is '
          'minimized, and helps model training. In cases where the input shape '
          'must be static (e.g. running on TPU), this setting will be ignored '
          'and static batching will always be used.'))
  flags.DEFINE_integer(
      name='max_length', short_name='ml', default=256,
      help=flags_core.help_wrap(
          'Max sentence length for Transformer. Default is 256. Note: Usually '
          'it is more effective to use a smaller max length if static_batch is '
          'enabled, e.g. 64.'))

  # Flags for training with steps (may be used for debugging)
  flags.DEFINE_integer(
      name='validation_steps', short_name='vs', default=64,
      help=flags_core.help_wrap('The number of steps used in validation.'))

  # BLEU score computation
  flags.DEFINE_string(
      name='bleu_source', short_name='bls', default=None,
      help=flags_core.help_wrap(
          'Path to source file containing text to translate when calculating '
          'the official BLEU score. Both --bleu_source and --bleu_ref must be '
          'set.'
      ))
  flags.DEFINE_string(
      name='bleu_ref', short_name='blr', default=None,
      help=flags_core.help_wrap(
          'Path to reference file containing the reference translations used '
          'when calculating the official BLEU score. Both --bleu_source and '
          '--bleu_ref must be set.'
      ))
  flags.DEFINE_string(
      name='vocab_file', short_name='vf', default=None,
      help=flags_core.help_wrap(
          'Path to subtoken vocabulary file. If data_download.py was used to '
          'download and encode the training data, look in the data_dir to find '
          'the vocab file.'))
  flags.DEFINE_string(
      name='mode', default='train',
      help=flags_core.help_wrap('mode: train, eval, or predict'))
  flags.DEFINE_bool(
      name='use_ctl',
      default=False,
      help=flags_core.help_wrap(
          'Whether the model runs with custom training loop.'))
  flags.DEFINE_integer(
      name='decode_batch_size',
      default=32,
      help=flags_core.help_wrap(
          'Global batch size used for Transformer autoregressive decoding on '
          'TPU.'))
  flags.DEFINE_integer(
      name='decode_max_length',
      default=97,
      help=flags_core.help_wrap(
          'Max sequence length of the decode/eval data. This is used by '
          'Transformer autoregressive decoding on TPU to have minimum '
          'paddings.'))
  flags.DEFINE_bool(
      name='padded_decode',
      default=False,
      help=flags_core.help_wrap(
          'Whether the autoregressive decoding runs with input data padded to '
          'the decode_max_length. For TPU/XLA-GPU runs, this flag has to be '
          'set due to the static shape requirement. Although CPU/GPU could also '
          'use padded_decode, it has not been tested. In addition, this method '
          'will introduce unnecessary overheads which grow quadratically with '
          'the max sequence length.'))

  flags_core.set_defaults(data_dir='/tmp/translate_ende',
                          model_dir='/tmp/transformer_model',
                          batch_size=None)

  # pylint: disable=unused-variable
  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref'],
      message='Both or neither --bleu_source and --bleu_ref must be defined.')
  def _check_bleu_files(flags_dict):
    return (flags_dict['bleu_source'] is None) == (
        flags_dict['bleu_ref'] is None)

  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref', 'vocab_file'],
      message='--vocab_file must be defined if --bleu_source and --bleu_ref '
              'are defined.')
  def _check_bleu_vocab_file(flags_dict):
    if flags_dict['bleu_source'] and flags_dict['bleu_ref']:
      return flags_dict['vocab_file'] is not None
    return True
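
The two validators at the end are worth noting: functions decorated with flags.multi_flags_validator are checked whenever the flags are parsed or reassigned, not when define_transformer_flags() itself runs. A minimal sketch of that behavior, assuming the definitions above and their imports are available (the file path is just a placeholder):

from absl import flags

define_transformer_flags()
FLAGS = flags.FLAGS

try:
  # Passing only one of the two BLEU flags violates _check_bleu_files, so
  # parsing raises an error carrying the validator's message.
  FLAGS(['transformer_main', '--bleu_source=/tmp/newstest2014.en'])
except flags.IllegalFlagValueError as e:
  print(e)
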
Example #6
def define_keras_flags(dynamic_loss_scale=True):
    """Define flags for Keras models."""
    flags_core.define_base(
        clean=True,
        num_gpu=True,
        run_eagerly=True,
        train_epochs=True,
        epochs_between_evals=True,
        distribution_strategy=True,
    )
    flags_core.define_performance(
        num_parallel_calls=False,
        synthetic_data=True,
        dtype=True,
        all_reduce_alg=True,
        num_packs=True,
        tf_gpu_thread_mode=True,
        datasets_num_private_threads=True,
        dynamic_loss_scale=dynamic_loss_scale,
        loss_scale=True,
        fp16_implementation=True,
        tf_data_experimental_slack=True,
        enable_xla=True,
        force_v2_in_keras_compile=True,
        training_dataset_cache=True,
    )
    flags_core.define_image()
    flags_core.define_benchmark()
    flags_core.define_distribution()
    flags.adopt_module_key_flags(flags_core)

    flags.DEFINE_boolean(name="enable_eager",
                         default=False,
                         help="Enable eager?")
    flags.DEFINE_boolean(name="skip_eval",
                         default=False,
                         help="Skip evaluation?")
    # TODO(b/135607288): Remove this flag once we understand the root cause of
    # slowdown when setting the learning phase in Keras backend.
    flags.DEFINE_boolean(
        name="set_learning_phase_to_train",
        default=True,
        help="If skip eval, also set Keras learning phase to 1 (training).",
    )
    flags.DEFINE_boolean(
        name="explicit_gpu_placement",
        default=False,
        help="If not using distribution strategy, explicitly set device scope "
        "for the Keras training loop.",
    )
    flags.DEFINE_boolean(name="use_trivial_model",
                         default=False,
                         help="Whether to use a trivial Keras model.")
    flags.DEFINE_boolean(
        name="report_accuracy_metrics",
        default=True,
        help="Report metrics during training and evaluation.",
    )
    flags.DEFINE_boolean(
        name="use_tensor_lr",
        default=False,
        help="Use learning rate tensor instead of a callback.")
    flags.DEFINE_boolean(name="enable_tensorboard",
                         default=False,
                         help="Whether to enable Tensorboard callback.")
    flags.DEFINE_integer(
        name="train_steps",
        default=None,
        help="The number of steps to run for training. If it is larger than "
        "# batches per epoch, then use # batches per epoch. This flag will be "
        "ignored if train_epochs is set to be larger than 1. ",
    )
    flags.DEFINE_string(
        name="profile_steps",
        default=None,
        help="Save profiling data to model dir at given range of steps. The "
        "value must be a comma separated pair of positive integers, specifying "
        'the first and last step to profile. For example, "--profile_steps=2,4" '
        "triggers the profiler to process 3 steps, starting from the 2nd step. "
        "Note that profiler has a non-trivial performance overhead, and the "
        "output file can be gigantic if profiling many steps.",
    )
    flags.DEFINE_boolean(
        name="data_delay_prefetch",
        default=False,
        help=
        "Add a small delay in tf.data prefetch to prioritize memory copy of "
        "other tensors over the data minibatch for the (T+1)th step. It should "
        "help improve performance using EagerIterator and function. The codepath "
        "when enabling this feature is experimental and will be removed once the "
        "corresponding performance features are fully supported in TensorFlow.",
    )
    flags.DEFINE_boolean(
        name="batchnorm_spatial_persistent",
        default=True,
        help="Enable the spacial persistent mode for CuDNN batch norm kernel.",
    )
    flags.DEFINE_boolean(
        name="enable_get_next_as_optional",
        default=False,
        help="Enable get_next_as_optional behavior in DistributedIterator.",
    )
    flags.DEFINE_boolean(
        name="enable_checkpoint_and_export",
        default=False,
        help=
        "Whether to enable a checkpoint callback and export the savedmodel.",
    )
    flags.DEFINE_string(name="tpu",
                        default="",
                        help="TPU address to connect to.")
    flags.DEFINE_integer(
        name="steps_per_loop",
        default=1,
        help="Number of steps per graph-mode loop. Only training step happens "
        "inside the loop. Callbacks will not be called inside. Will be capped at "
        "steps per epoch.",
    )
Example #7
def define_common_bert_flags():
  """Define common flags for BERT tasks."""
  flags_core.define_base(
      data_dir=False,
      model_dir=True,
      clean=False,
      train_epochs=False,
      epochs_between_evals=False,
      stop_threshold=False,
      batch_size=False,
      num_gpu=True,
      hooks=False,
      export_dir=False,
      distribution_strategy=True,
      run_eagerly=True)
  flags_core.define_distribution()
  flags.DEFINE_string('bert_config_file', None,
                      'Bert configuration file to define core bert layers.')
  flags.DEFINE_string(
      'model_export_path', None,
      'Path to the directory where the trained model will be '
      'exported.')
  flags.DEFINE_string('tpu', '', 'TPU address to connect to.')
  flags.DEFINE_string(
      'init_checkpoint', None,
      'Initial checkpoint (usually from a pre-trained BERT model).')
  flags.DEFINE_integer('num_train_epochs', 3,
                       'Total number of training epochs to perform.')
  flags.DEFINE_integer(
      'steps_per_loop', 200,
      'Number of steps per graph-mode loop. Only training step '
      'happens inside the loop. Callbacks will not be called '
      'inside.')
  flags.DEFINE_float('learning_rate', 5e-5,
                     'The initial learning rate for Adam.')
  flags.DEFINE_boolean(
      'scale_loss', False,
      'Whether to divide the loss by the number of replicas inside the '
      'per-replica loss function.')
  flags.DEFINE_boolean(
      'use_keras_compile_fit', False,
      'If True, uses Keras compile/fit() API for training logic. Otherwise '
      'use custom training loop.')
  flags.DEFINE_string(
      'hub_module_url', None, 'TF-Hub path/url to Bert module. '
      'If specified, init_checkpoint flag should not be used.')
  flags.DEFINE_enum(
      'model_type', 'bert', ['bert', 'albert'],
      'Specifies the type of the model. '
      'If "bert", will use canonical BERT; if "albert", will use ALBERT model.')

  # Adds flags for mixed precision training.
  flags_core.define_performance(
      num_parallel_calls=False,
      inter_op=False,
      intra_op=False,
      synthetic_data=False,
      max_train_steps=False,
      dtype=True,
      dynamic_loss_scale=True,
      loss_scale=True,
      all_reduce_alg=False,
      num_packs=False,
      enable_xla=True,
      fp16_implementation=True,
  )
Example #8
def define_keras_flags(dynamic_loss_scale=True,
                       model=False,
                       optimizer=False,
                       pretrained_filepath=False):
    """Define flags for Keras models."""
    flags_core.define_base(clean=True,
                           num_gpu=True,
                           run_eagerly=True,
                           train_epochs=True,
                           epochs_between_evals=True,
                           distribution_strategy=True)
    flags_core.define_performance(num_parallel_calls=False,
                                  synthetic_data=True,
                                  dtype=True,
                                  all_reduce_alg=True,
                                  num_packs=True,
                                  tf_gpu_thread_mode=True,
                                  datasets_num_private_threads=True,
                                  dynamic_loss_scale=dynamic_loss_scale,
                                  loss_scale=True,
                                  fp16_implementation=True,
                                  tf_data_experimental_slack=True,
                                  enable_xla=True,
                                  training_dataset_cache=True)
    flags_core.define_image()
    flags_core.define_benchmark()
    flags_core.define_distribution()
    flags.adopt_module_key_flags(flags_core)

    flags.DEFINE_boolean(name='enable_eager',
                         default=False,
                         help='Enable eager?')
    flags.DEFINE_boolean(name='skip_eval',
                         default=False,
                         help='Skip evaluation?')
    # TODO(b/135607288): Remove this flag once we understand the root cause of
    # slowdown when setting the learning phase in Keras backend.
    flags.DEFINE_boolean(
        name='set_learning_phase_to_train',
        default=True,
        help='If skipping eval, also set the Keras learning phase to 1 (training).')
    flags.DEFINE_boolean(
        name='explicit_gpu_placement',
        default=False,
        help='If not using a distribution strategy, explicitly set the device '
        'scope for the Keras training loop.')
    flags.DEFINE_boolean(name='use_trivial_model',
                         default=False,
                         help='Whether to use a trivial Keras model.')
    flags.DEFINE_boolean(name='report_accuracy_metrics',
                         default=True,
                         help='Report metrics during training and evaluation.')
    flags.DEFINE_boolean(
        name='use_tensor_lr',
        default=True,
        help='Use learning rate tensor instead of a callback.')
    flags.DEFINE_boolean(name='enable_tensorboard',
                         default=False,
                         help='Whether to enable Tensorboard callback.')
    flags.DEFINE_integer(
        name='train_steps',
        default=None,
        help='The number of steps to run for training. If it is larger than '
        '# batches per epoch, then use # batches per epoch. This flag will be '
        'ignored if train_epochs is set to be larger than 1. ')
    flags.DEFINE_boolean(
        name='batchnorm_spatial_persistent',
        default=True,
        help='Enable the spatial persistent mode for CuDNN batch norm kernel.')
    flags.DEFINE_boolean(
        name='enable_get_next_as_optional',
        default=False,
        help='Enable get_next_as_optional behavior in DistributedIterator.')
    flags.DEFINE_boolean(
        name='enable_checkpoint_and_export',
        default=False,
        help=
        'Whether to enable a checkpoint callback and export the savedmodel.')
    flags.DEFINE_string(name='tpu',
                        default='',
                        help='TPU address to connect to.')
    flags.DEFINE_integer(
        name='steps_per_loop',
        default=500,
        help='Number of steps per training loop. Only training step happens '
        'inside the loop. Callbacks will not be called inside. Will be capped at '
        'steps per epoch.')
    flags.DEFINE_boolean(
        name='use_tf_while_loop',
        default=True,
        help='Whether to build a tf.while_loop inside the training loop on the '
        'host. Setting it to True is critical to have peak performance on '
        'TPU.')

    if model:
        flags.DEFINE_string(
            'model', 'resnet50_v1.5',
            'Name of model preset. (mobilenet, resnet50_v1.5)')
    if optimizer:
        flags.DEFINE_string(
            'optimizer', 'resnet50_default', 'Name of optimizer preset. '
            '(mobilenet_default, resnet50_default)')
        # TODO(kimjaehong): Replace as general hyper-params not only for mobilenet.
        flags.DEFINE_float(
            'initial_learning_rate_per_sample', 0.00007,
            'Initial value of learning rate per sample for '
            'mobilenet_default.')
        flags.DEFINE_float(
            'lr_decay_factor', 0.94,
            'Learning rate decay factor for mobilenet_default.')
        flags.DEFINE_float(
            'num_epochs_per_decay', 2.5,
            'Number of epochs per decay for mobilenet_default.')
    if pretrained_filepath:
        flags.DEFINE_string('pretrained_filepath', '', 'Pretrained file path.')
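
As in the earlier variants, the keyword arguments of define_keras_flags gate which optional preset flags get registered at all. A minimal usage sketch, assuming absl and the flags_core utilities are importable (the flag values below are placeholders):

from absl import flags

define_keras_flags(model=True, optimizer=True, pretrained_filepath=True)
FLAGS = flags.FLAGS

# Parse an explicit argv list; the preset flags exist only because the
# corresponding keyword arguments were True above.
FLAGS(['train', '--model=mobilenet', '--optimizer=mobilenet_default'])
print(FLAGS.model, FLAGS.optimizer, FLAGS.pretrained_filepath)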