def input_fn_train(num_epochs):
  return input_function(
      is_training=True, data_dir=flags_obj.data_dir,
      batch_size=distribution_utils.per_device_batch_size(
          flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
      num_epochs=num_epochs,
      num_gpus=flags_core.get_num_gpus(flags_obj))

def input_fn_eval():
  return input_function(
      is_training=False,
      data_dir=flags_obj.data_dir,
      batch_size=distribution_utils.per_device_batch_size(
          flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
      num_epochs=1,
      dtype=flags_core.get_tf_dtype(flags_obj))

def input_fn_train(num_epochs):
  return input_function(
      is_training=True,
      data_dir=flags_obj.data_dir,
      batch_size=distribution_utils.per_device_batch_size(
          flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
      num_epochs=num_epochs,
      dtype=flags_core.get_tf_dtype(flags_obj),
      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
      num_parallel_batches=flags_obj.datasets_num_parallel_batches)
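
These fragments all rely on distribution_utils.per_device_batch_size to split the global batch size across GPUs. A minimal sketch of the assumed semantics (the official-models helper also rejects batch sizes that do not divide evenly):

def per_device_batch_size(batch_size, num_gpus):
  """Per-device batch size for multi-GPU runs (assumed semantics)."""
  if num_gpus <= 1:
    return batch_size
  if batch_size % num_gpus:
    raise ValueError(
        "Batch size (%d) must be divisible by the number of GPUs (%d)."
        % (batch_size, num_gpus))
  return batch_size // num_gpus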
def construct_estimator(flags_obj, params, schedule_manager):
  """Construct an estimator from either Estimator or TPUEstimator.

  Args:
    flags_obj: The FLAGS object parsed from command line.
    params: A dict of run specific parameters.
    schedule_manager: A schedule.Manager object containing the run schedule.

  Returns:
    An estimator object to be used for training and eval.
  """
  if not params["use_tpu"]:
    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_core.get_num_gpus(flags_obj),
        all_reduce_alg=flags_obj.all_reduce_alg)
    return tf.estimator.Estimator(
        model_fn=model_fn, model_dir=flags_obj.model_dir, params=params,
        config=tf.estimator.RunConfig(train_distribute=distribution_strategy))

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      tpu=flags_obj.tpu,
      zone=flags_obj.tpu_zone,
      project=flags_obj.tpu_gcp_project
  )

  tpu_config = tf.contrib.tpu.TPUConfig(
      iterations_per_loop=schedule_manager.single_iteration_train_steps,
      num_shards=flags_obj.num_tpu_shards)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=flags_obj.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config)

  return tf.contrib.tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=params["use_tpu"] and flags_obj.tpu != tpu_util.LOCAL,
      train_batch_size=schedule_manager.batch_size,
      eval_batch_size=schedule_manager.batch_size,
      params={
          # TPUEstimator needs to populate batch_size itself due to sharding.
          key: value for key, value in params.items() if key != "batch_size"},
      config=run_config)
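
The dict comprehension above omits "batch_size" because TPUEstimator populates that key itself, with the per-shard batch size. A hedged sketch of what a TPU input_fn then sees (make_dataset is a hypothetical helper):

def input_fn(params):
  # TPUEstimator fills in params["batch_size"] with the per-shard value,
  # which is why construct_estimator must not supply that key itself.
  batch_size = params["batch_size"]
  return make_dataset(batch_size)  # make_dataset: hypothetical helper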
Example #5
def parse_flags(flags_obj):
  """Convenience function to turn flags into params."""
  num_gpus = flags_core.get_num_gpus(flags_obj)
  num_devices = FLAGS.num_tpu_shards if FLAGS.tpu else num_gpus or 1

  batch_size = (flags_obj.batch_size + num_devices - 1) // num_devices

  eval_divisor = (rconst.NUM_EVAL_NEGATIVES + 1) * num_devices
  eval_batch_size = flags_obj.eval_batch_size or flags_obj.batch_size
  eval_batch_size = ((eval_batch_size + eval_divisor - 1) //
                     eval_divisor * eval_divisor // num_devices)

  return {
      "train_epochs": flags_obj.train_epochs,
      "batches_per_step": num_devices,
      "use_seed": flags_obj.seed is not None,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": flags_obj.learning_rate,
      "mf_dim": flags_obj.num_factors,
      "model_layers": [int(layer) for layer in flags_obj.layers],
      "mf_regularization": flags_obj.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in flags_obj.mlp_regularization],
      "num_neg": flags_obj.num_neg,
      "num_gpus": num_gpus,
      "use_tpu": flags_obj.tpu is not None,
      "tpu": flags_obj.tpu,
      "tpu_zone": flags_obj.tpu_zone,
      "tpu_gcp_project": flags_obj.tpu_gcp_project,
      "beta1": flags_obj.beta1,
      "beta2": flags_obj.beta2,
      "epsilon": flags_obj.epsilon,
      "match_mlperf": flags_obj.ml_perf,
      "use_xla_for_gpu": flags_obj.use_xla_for_gpu,
      "clone_model_in_keras_dist_strat":
          flags_obj.clone_model_in_keras_dist_strat,
      "epochs_between_evals": FLAGS.epochs_between_evals,
      "turn_off_distribution_strategy": FLAGS.turn_off_distribution_strategy,
  }
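
A quick worked example of the eval batch-size rounding above, assuming rconst.NUM_EVAL_NEGATIVES == 999 (so each user contributes 1,000 eval examples):

num_devices = 2
eval_divisor = (999 + 1) * num_devices  # 2000
eval_batch_size = 1500
rounded_up = (eval_batch_size + eval_divisor - 1) // eval_divisor * eval_divisor
per_device = rounded_up // num_devices  # 2000 // 2 == 1000 examples per device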
def run_mnist(flags_obj):
    """Run MNIST training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.
  """
    model_helpers.apply_clean(flags_obj)
    model_function = model_fn

    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_core.get_num_gpus(flags_obj),
        all_reduce_alg=flags_obj.all_reduce_alg)

    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                        session_config=session_config)

    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    mnist_classifier = tf.estimator.Estimator(model_fn=model_function,
                                              model_dir=flags_obj.model_dir,
                                              config=run_config,
                                              params={
                                                  'data_format': data_format,
                                              })

    # Set up training and evaluation input functions.
    def train_input_fn():
        """Prepare data for training."""

        # When choosing shuffle buffer sizes, larger sizes result in better
        # randomness, while smaller sizes use less memory. MNIST is a small
        # enough dataset that we can easily shuffle the full epoch.
        ds = dataset.train(flags_obj.data_dir)
        ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size)

        # Iterate through the dataset a set number (`epochs_between_evals`) of times
        # during each training session.
        ds = ds.repeat(flags_obj.epochs_between_evals)
        return ds

    def eval_input_fn():
        return dataset.test(flags_obj.data_dir).batch(
            flags_obj.batch_size).make_one_shot_iterator().get_next()

    # Set up hook that outputs training logs every 100 steps.
    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    # Train and evaluate model.
    for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
        mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
        eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
        print('\nEvaluation results:\n\t%s\n' % eval_results)

        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             eval_results['accuracy']):
            break

    # Export the model
    if flags_obj.export_dir is not None:
        image = tf.placeholder(tf.float32, [None, 28, 28])
        input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
            'image':
            image,
        })
        mnist_classifier.export_savedmodel(flags_obj.export_dir,
                                           input_fn,
                                           strip_default_attrs=True)
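
A hedged sketch of querying the exported SavedModel with TF 1.x's contrib predictor (the path below is an assumption; export_savedmodel writes to a timestamped subdirectory of flags_obj.export_dir):

import numpy as np
import tensorflow as tf

saved_model_dir = "/tmp/mnist_export/1541234567"  # assumed timestamped export
predict_fn = tf.contrib.predictor.from_saved_model(saved_model_dir)
batch = np.zeros([1, 28, 28], dtype=np.float32)   # placeholder input image
print(predict_fn({'image': batch}))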
Example #7
def run_mnist(flags_obj):
  """Run MNIST training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  model_helpers.apply_clean(flags_obj)
  model_function = model_fn

  # Get number of GPUs as defined by the --num_gpus flags and the number of
  # GPUs available on the machine.
  num_gpus = flags_core.get_num_gpus(flags_obj)
  multi_gpu = num_gpus > 1

  if multi_gpu:
    # Validate that the batch size can be split into devices.
    distribution_utils.per_device_batch_size(flags_obj.batch_size, num_gpus)

    # There are two steps required if using multi-GPU: (1) wrap the model_fn,
    # and (2) wrap the optimizer. The first happens here, and (2) happens
    # in the model_fn itself when the optimizer is defined.
    model_function = tf.contrib.estimator.replicate_model_fn(
        model_fn, loss_reduction=tf.losses.Reduction.MEAN,
        devices=["/device:GPU:%d" % d for d in range(num_gpus)])

  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  mnist_classifier = tf.estimator.Estimator(
      model_fn=model_function,
      params={
          'data_format': data_format,
          'multi_gpu': multi_gpu
      })

  # Set up training and evaluation input functions.
  def train_input_fn():
    """Prepare data for training."""

    # When choosing shuffle buffer sizes, larger sizes result in better
    # randomness, while smaller sizes use less memory. MNIST is a small
    # enough dataset that we can easily shuffle the full epoch.
    ds = dataset.train(flags_obj.data_dir)

    def invert(image, label):
      return (image * -1.0) + 1.0, label

    def brightness(image, label):
      return tf.image.random_brightness(image, max_delta=0.2), label 

    if INVERT:
      inverted = ds.map(invert)
      ds = ds.concatenate(inverted)

    if BRIGHTNESS:
      ds = ds.concatenate(ds.map(brightness)) 

    ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size)

    # Iterate through the dataset a set number (`epochs_between_evals`) of times
    # during each training session.
    ds = ds.repeat(flags_obj.epochs_between_evals)
    return ds

  def eval_input_fn():
    return dataset.test(flags_obj.data_dir).batch(
        flags_obj.batch_size).make_one_shot_iterator().get_next()

  # Set up hook that outputs training logs every 100 steps.
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      batch_size=flags_obj.batch_size)

  # Train and evaluate model.
  for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print('\nEvaluation results:\n\t%s\n' % eval_results)

    if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                         eval_results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    image = tf.placeholder(tf.float32, [None, 28, 28])
    input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
        'image': image,
    })
    mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn)

  def our_test_fn():
    images = []
    for i in list(range(1,10)) + ['dog']:
      images.append(np.array(imageio.imread('{}.png'.format(i)).ravel()/255.0, dtype='float32'))
    images = np.array(images)
    return tf.convert_to_tensor(images)

  # Check our own examples
  predictions = mnist_classifier.predict(input_fn=our_test_fn)
  table = []
  for i in list(range(1, 10)) + ['dog']:
    prediction = next(predictions)
    if i == 'dog':
      print("{}. CNN thinks it's a {} ({:.1f}%)".format(i, prediction['classes'], prediction['probabilities'][prediction['classes']]*100))
    else:
      print("{} at {:.1f}. CNN thinks it's a {} ({:.1f}%)".format(i, prediction['probabilities'][i]*100, prediction['classes'], prediction['probabilities'][prediction['classes']]*100))
    table.append((i, prediction['probabilities']))
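
The INVERT/BRIGHTNESS augmentation above grows the dataset by concatenating transformed copies: inversion doubles it, and the brightness pass then doubles the result again, for 4x the original size when both flags are set. A standalone demonstration of the pattern:

import tensorflow as tf

ds = tf.data.Dataset.range(4)
ds = ds.concatenate(ds.map(lambda x: x + 100))  # 8 elements
ds = ds.concatenate(ds.map(lambda x: x * 10))   # 16 elements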
Example #8
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                        session_config=session_config)

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj)
        })

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + '-synthetic'

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train():
        return input_function(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=flags_obj.epochs_between_evals,
            num_gpus=flags_core.get_num_gpus(flags_obj))

    def input_fn_eval():
        return input_function(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=1)

    total_training_cycle = (flags_obj.train_epochs //
                            flags_obj.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info('Starting a training cycle: %d/%d', cycle_index,
                        total_training_cycle)

        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks,
                         max_steps=flags_obj.max_train_steps)

        tf.logging.info('Starting to evaluate.')

        # flags_obj.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic data, which
        # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
        # eval (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags_obj.max_train_steps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags_obj.batch_size)
        classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
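
For context, a hedged sketch of how a model's main() typically wires resnet_main; the names cifar10_model_fn and input_fn follow the CIFAR-10 example in the official models and should be treated as assumptions:

def main(_):
  resnet_main(flags.FLAGS, cifar10_model_fn, input_fn, 'CIFAR-10',
              shape=[32, 32, 3])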
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]  # Select the network scale: "base" or "big".
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["use_synthetic_data"] = flags_obj.use_synthetic_data  # 什么叫使用合成数据?

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_device_batch_size(
            params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(  # Manages the training schedule: total steps, steps between evals, etc.
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)  # Remove saved data and models from prior runs, if cleaning is enabled by flag.

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(  # Hooks that log information during training.
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()  # Logger for benchmark results.
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(
        flags_obj, params,
        schedule_manager)  # Returns a tf.estimator.Estimator used to train and evaluate the model.
    run_loop(
        estimator=estimator,  # The estimator that trains and evaluates the model.
        # Training arguments
        schedule_manager=schedule_manager,  # Controls how many steps to train and how often to evaluate.
        train_hooks=train_hooks,  # Logging hooks.
        benchmark_logger=benchmark_logger,  # Benchmark logger.
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,  # Files used for BLEU calculation.
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)  # Vocabulary file.

    if flags_obj.export_dir:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file})
Example #10
def input_fn_eval():
  return input_function(
      is_training=False, data_dir=flags_obj.data_dir,
      batch_size=per_device_batch_size(
          flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
      num_epochs=1)
Example #11
def run_deep_speech(_):
  """Run deep speech training and eval loop."""
  tf.set_random_seed(flags_obj.seed)
  # Data preprocessing
  tf.logging.info("Data preprocessing...")
  train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
  eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

  # Number of label classes. Label string is "[a-z]' -"
  num_classes = len(train_speech_dataset.speech_labels)

  # Use distribution strategy for multi-gpu training
  num_gpus = flags_core.get_num_gpus(flags_obj)
  distribution_strategy = distribution_utils.get_distribution_strategy(num_gpus)
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      model_dir=flags_obj.model_dir,
      config=run_config,
      params={
          "num_classes": num_classes,
      }
  )

  # Benchmark logging
  run_params = {
      "batch_size": flags_obj.batch_size,
      "train_epochs": flags_obj.train_epochs,
      "rnn_hidden_size": flags_obj.rnn_hidden_size,
      "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
      "rnn_type": flags_obj.rnn_type,
      "is_bidirectional": flags_obj.is_bidirectional,
      "use_bias": flags_obj.use_bias
  }

  dataset_name = "LibriSpeech"
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info("deep_speech", dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  per_device_batch_size = distribution_utils.per_device_batch_size(
      flags_obj.batch_size, num_gpus)

  def input_fn_train():
    return dataset.input_fn(
        per_device_batch_size, train_speech_dataset)

  def input_fn_eval():
    return dataset.input_fn(
        per_device_batch_size, eval_speech_dataset)

  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: %d/%d",
                    cycle_index + 1, total_training_cycle)

    # Perform batch_wise dataset shuffling
    train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle(
        train_speech_dataset.entries, cycle_index, flags_obj.sortagrad,
        flags_obj.batch_size)

    estimator.train(input_fn=input_fn_train, hooks=train_hooks)

    # Evaluation
    tf.logging.info("Starting to evaluate...")

    eval_results = evaluate_model(
        estimator, eval_speech_dataset.speech_labels,
        eval_speech_dataset.entries, input_fn_eval)

    # Log the WER and CER results.
    benchmark_logger.log_evaluation_result(eval_results)
    tf.logging.info(
        "Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
            cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY]))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(
        flags_obj.wer_threshold, eval_results[_WER_KEY]):
      break
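
A hedged sketch of what batch_wise_dataset_shuffle is assumed to do, following the SortaGrad idea from Deep Speech 2: keep the duration-sorted order for the first epoch, then shuffle whole batches so each batch still groups similar-length utterances:

import numpy as np

def batch_wise_dataset_shuffle(entries, epoch, sortagrad, batch_size):
  if epoch == 0 and sortagrad:
    return entries  # first epoch: keep the duration-sorted order
  batches = [entries[i:i + batch_size]
             for i in range(0, len(entries), batch_size)]
  np.random.shuffle(batches)
  return [entry for batch in batches for entry in batch]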
Example #12
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(np.ceil(
        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
        (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
                                 ncf_dataset.num_users / eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  params = {
      "use_seed": FLAGS.seed is not None,
      "hash_pipeline": FLAGS.hash_pipeline,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": FLAGS.learning_rate,
      "num_users": num_users,
      "num_items": num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "num_neg": FLAGS.num_neg,
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
      "beta1": FLAGS.beta1,
      "beta2": FLAGS.beta2,
      "epsilon": FLAGS.epsilon,
      "match_mlperf": FLAGS.ml_perf,
      "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      "use_estimator": FLAGS.use_estimator,
  }
  if FLAGS.use_estimator:
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus, model_dir=FLAGS.model_dir,
        iterations=num_train_steps, params=params,
        batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
  else:
    runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
                                         num_eval_steps, FLAGS.use_while_loop)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)


  eval_input_fn = None
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    if FLAGS.use_estimator:
      train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=True)

      if batch_count != num_train_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(batch_count, num_train_steps))

      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=num_train_steps)
      if train_record_dir:
        tf.gfile.DeleteRecursively(train_record_dir)

      tf.logging.info("Beginning evaluation.")
      if eval_input_fn is None:
        eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=False)

        if eval_batch_count != num_eval_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  eval_batch_count, num_eval_steps))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = eval_estimator.evaluate(eval_input_fn,
                                             steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")
    else:
      runner.train()
      tf.logging.info("Beginning evaluation.")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = runner.eval()
      tf.logging.info("Evaluation complete.")
    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
Example #13
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)
        movielens_dataset.construct_train_eval_csv(data_dir=FLAGS.data_dir,
                                                   dataset=FLAGS.dataset)

    tf.logging.info("Data preprocessing...")
    ncf_dataset = movielens_dataset.data_preprocessing(FLAGS.data_dir,
                                                       FLAGS.dataset,
                                                       FLAGS.num_neg)

    model_helpers.apply_clean(flags.FLAGS)

    # Create NeuMF model and convert it to Estimator
    tf.logging.info("Creating Estimator from Keras model...")
    layers = [int(layer) for layer in FLAGS.layers]
    mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
    keras_model = neumf_model.NeuMF(ncf_dataset.num_users,
                                    ncf_dataset.num_items, FLAGS.num_factors,
                                    layers, FLAGS.batch_size,
                                    FLAGS.mf_regularization,
                                    mlp_regularization)
    num_gpus = flags_core.get_num_gpus(FLAGS)
    estimator = convert_keras_to_estimator(keras_model, num_gpus,
                                           FLAGS.model_dir)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
    )
    run_params = {
        "batch_size": FLAGS.batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    # Training and evaluation cycle
    def get_train_input_fn():
        return movielens_dataset.get_input_fn(
            True,
            distribution_utils.per_device_batch_size(FLAGS.batch_size,
                                                     num_gpus), ncf_dataset,
            FLAGS.data_dir, FLAGS.dataset, FLAGS.epochs_between_evals)

    def get_pred_input_fn():
        return movielens_dataset.get_input_fn(
            False,
            distribution_utils.per_device_batch_size(FLAGS.batch_size,
                                                     num_gpus), ncf_dataset,
            FLAGS.data_dir, FLAGS.dataset, 1)

    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        # Train the model
        estimator.train(input_fn=get_train_input_fn(), hooks=train_hooks)

        # Evaluate the model
        eval_results = evaluate_model(estimator, FLAGS.batch_size, num_gpus,
                                      ncf_dataset, get_pred_input_fn())

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        hr = eval_results[_HR_KEY]
        ndcg = eval_results[_NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
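
convert_keras_to_estimator is not shown in this example; a minimal sketch under assumed semantics (compile the Keras model, then hand it to tf.keras.estimator.model_to_estimator with an optional distribution strategy):

def convert_keras_to_estimator(keras_model, num_gpus, model_dir):
  keras_model.compile(optimizer="adam", loss="binary_crossentropy")
  distribution = distribution_utils.get_distribution_strategy(num_gpus=num_gpus)
  run_config = tf.estimator.RunConfig(train_distribute=distribution)
  return tf.keras.estimator.model_to_estimator(
      keras_model=keras_model, model_dir=model_dir, config=run_config)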
def run_keras_model_benchmark(_):
    """Run the benchmark on keras model."""
    # Ensure a valid model name was supplied via command line argument
    if FLAGS.model not in MODELS.keys():
        raise AssertionError("The --model command line argument should "
                             "be a key in the `MODELS` dictionary.")

    # Check if eager execution is enabled
    if FLAGS.eager:
        tf.logging.info("Eager execution is enabled...")
        tf.enable_eager_execution()

    # Load the model
    tf.logging.info("Benchmark on {} model...".format(FLAGS.model))
    keras_model = MODELS[FLAGS.model]

    # Get dataset
    dataset_name = "ImageNet"
    if FLAGS.use_synthetic_data:
        tf.logging.info("Using synthetic dataset...")
        dataset_name += "_Synthetic"
        train_dataset = dataset.generate_synthetic_input_dataset(
            FLAGS.model, FLAGS.batch_size)
        val_dataset = dataset.generate_synthetic_input_dataset(
            FLAGS.model, FLAGS.batch_size)
        model = keras_model(weights=None)
    else:
        tf.logging.info("Using CIFAR-10 dataset...")
        dataset_name = "CIFAR-10"
        ds = dataset.Cifar10Dataset(FLAGS.batch_size)
        train_dataset = ds.train_dataset
        val_dataset = ds.test_dataset
        model = keras_model(weights=None,
                            input_shape=ds.input_shape,
                            classes=ds.num_classes)

    num_gpus = flags_core.get_num_gpus(FLAGS)

    distribution = None
    # Use distribution strategy
    if FLAGS.dist_strat:
        distribution = distribution_utils.get_distribution_strategy(
            num_gpus=num_gpus)
    elif num_gpus > 1:
        # Run with multi_gpu_model
        # If eager execution is enabled, only one GPU is utilized even if multiple
        # GPUs are provided.
        if FLAGS.eager:
            tf.logging.warning(
                "{} GPUs are provided, but only one GPU is utilized as "
                "eager execution is enabled.".format(num_gpus))
        model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

    # The Adam optimizer, among others, doesn't work well with distribution
    # strategies (b/113076709), so use GradientDescentOptimizer here.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"],
                  distribute=distribution)

    # Create benchmark logger for benchmark logging
    run_params = {
        "batch_size": FLAGS.batch_size,
        "synthetic_data": FLAGS.use_synthetic_data,
        "train_epochs": FLAGS.train_epochs,
        "num_train_images": FLAGS.num_train_images,
        "num_eval_images": FLAGS.num_eval_images,
    }

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name=FLAGS.model,
                                  dataset_name=dataset_name,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    # Create callbacks that log metric values about the training and evaluation
    callbacks = model_callbacks.get_model_callbacks(
        FLAGS.callbacks,
        batch_size=FLAGS.batch_size,
        metric_logger=benchmark_logger)
    # Train and evaluate the model
    history = model.fit(train_dataset,
                        epochs=FLAGS.train_epochs,
                        callbacks=callbacks,
                        validation_data=val_dataset,
                        steps_per_epoch=int(
                            np.ceil(FLAGS.num_train_images /
                                    FLAGS.batch_size)),
                        validation_steps=int(
                            np.ceil(FLAGS.num_eval_images / FLAGS.batch_size)))

    tf.logging.info("Logging the evaluation results...")
    for epoch in range(FLAGS.train_epochs):
        eval_results = {
            "accuracy":
            history.history["val_acc"][epoch],
            "loss":
            history.history["val_loss"][epoch],
            tf.GraphKeys.GLOBAL_STEP:
            (epoch + 1) * np.ceil(FLAGS.num_eval_images / FLAGS.batch_size)
        }
        benchmark_logger.log_evaluation_result(eval_results)

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
def run_keras_model_benchmark(_):
  """Run the benchmark on keras model."""
  # Ensure a valid model name was supplied via command line argument
  if FLAGS.model not in MODELS.keys():
    raise AssertionError("The --model command line argument should "
                         "be a key in the `MODELS` dictionary.")

  # Load the model
  tf.logging.info("Benchmark on {} model...".format(FLAGS.model))
  keras_model = MODELS[FLAGS.model]
  model = keras_model(weights=None)

  # Get dataset
  dataset_name = "ImageNet"
  if FLAGS.use_synthetic_data:
    tf.logging.info("Using synthetic dataset...")
    dataset_name += "_Synthetic"
    train_num_images = FLAGS.batch_size
    val_num_images = FLAGS.batch_size
    train_dataset = dataset.generate_synthetic_input_dataset(
        FLAGS.model, train_num_images)
    val_dataset = dataset.generate_synthetic_input_dataset(
        FLAGS.model, val_num_images)
  else:
    raise ValueError("Only synthetic dataset is supported!")

  # If run with multiple GPUs
  num_gpus = flags_core.get_num_gpus(FLAGS)
  if num_gpus > 0:
    model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

  # Configure the model
  model.compile(loss="categorical_crossentropy",
                optimizer="sgd",
                metrics=["accuracy"])

  # Create benchmark logger for benchmark logging
  run_params = {
      "batch_size": FLAGS.batch_size,
      "synthetic_data": FLAGS.use_synthetic_data,
      "train_epochs": FLAGS.train_epochs
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name=FLAGS.model,
      dataset_name=dataset_name,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  # Create callbacks that log metric values about the training and evaluation
  callbacks = model_callbacks.get_model_callbacks(
      FLAGS.callbacks,
      batch_size=FLAGS.batch_size,
      metric_logger=benchmark_logger)
  # Train and evaluate the model
  history = model.fit(
      train_dataset,
      epochs=FLAGS.train_epochs,
      callbacks=callbacks,
      validation_data=val_dataset,
      steps_per_epoch=int(np.ceil(train_num_images / FLAGS.batch_size)),
      validation_steps=int(np.ceil(val_num_images / FLAGS.batch_size))
  )

  tf.logging.info("Logging the evaluation results...")
  for epoch in range(FLAGS.train_epochs):
    eval_results = {
        "accuracy": history.history["val_acc"][epoch],
        "loss": history.history["val_loss"][epoch],
        tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
            train_num_images/FLAGS.batch_size)
    }
    benchmark_logger.log_evaluation_result(eval_results)

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
Example #16
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max([FLAGS.batch_size, eval_per_user]))
    if eval_batch_size % eval_per_user:
        eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None)

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "use_seed": FLAGS.seed is not None,
            "hash_pipeline": FLAGS.hash_pipeline,
            "batch_size": batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": ncf_dataset.num_users,
            "num_items": ncf_dataset.num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "num_neg": FLAGS.num_neg,
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
            "beta1": FLAGS.beta1,
            "beta2": FLAGS.beta2,
            "epsilon": FLAGS.epsilon,
            "match_mlperf": FLAGS.ml_perf,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    approx_train_steps = int(ncf_dataset.num_train_positives *
                             (1 + FLAGS.num_neg) // FLAGS.batch_size)
    pred_input_fn = data_preprocessing.make_pred_input_fn(
        ncf_dataset=ncf_dataset)

    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        # Train the model
        train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

        if np.abs(approx_train_steps - batch_count) > 1:
            tf.logging.warning(
                "Estimated ({}) and reported ({}) number of batches differ by more "
                "than one".format(approx_train_steps, batch_count))

        train_estimator.train(input_fn=train_input_fn,
                              hooks=train_hooks,
                              steps=batch_count)
        tf.gfile.DeleteRecursively(train_record_dir)

        tf.logging.info("Beginning evaluation.")
        eval_results = eval_estimator.evaluate(pred_input_fn)
        tf.logging.info("Evaluation complete.")

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        hr = eval_results[rconst.HR_KEY]
        ndcg = eval_results[rconst.NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
Example #17
def main(_):
    # Data preprocessing
    # The file name of training and test dataset
    train_fname = os.path.join(
        FLAGS.data_dir, FLAGS.dataset + "-" + constants.TRAIN_RATINGS_FILENAME)
    test_fname = os.path.join(
        FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_RATINGS_FILENAME)
    neg_fname = os.path.join(FLAGS.data_dir,
                             FLAGS.dataset + "-" + constants.TEST_NEG_FILENAME)

    assert os.path.exists(train_fname), (
        "Run data_download.py first to download and extract {} dataset".format(
            FLAGS.dataset))

    tf.logging.info("Data preprocessing...")
    ncf_dataset = dataset.data_preprocessing(train_fname, test_fname,
                                             neg_fname, FLAGS.num_neg)

    # Create NeuMF model and convert it to Estimator
    tf.logging.info("Creating Estimator from Keras model...")
    layers = [int(layer) for layer in FLAGS.layers]
    mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
    keras_model = neumf_model.NeuMF(ncf_dataset.num_users,
                                    ncf_dataset.num_items, FLAGS.num_factors,
                                    layers, FLAGS.batch_size,
                                    FLAGS.mf_regularization,
                                    mlp_regularization)
    num_gpus = flags_core.get_num_gpus(FLAGS)
    estimator = convert_keras_to_estimator(keras_model, num_gpus,
                                           FLAGS.model_dir)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
    )
    run_params = {
        "batch_size": FLAGS.batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.config_benchmark_logger(FLAGS)
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params)

    # Training and evaluation cycle
    def train_input_fn():
        return dataset.input_fn(
            True, per_device_batch_size(FLAGS.batch_size, num_gpus),
            ncf_dataset, FLAGS.epochs_between_evals)

    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        # Train the model
        estimator.train(input_fn=train_input_fn, hooks=train_hooks)

        # Evaluate the model
        eval_results = evaluate_model(estimator, FLAGS.batch_size, num_gpus,
                                      ncf_dataset)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        hr = eval_results[_HR_KEY]
        ndcg = eval_results[_NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
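
The HR and NDCG values logged above are the standard NCF ranking metrics: the held-out positive item for each user is ranked against sampled negatives, HR@K checks whether it lands in the top K, and NDCG@K discounts the hit by its rank. A minimal sketch, with K == 10 assumed:

import numpy as np

def hit_rate_and_ndcg(rank_of_positive, k=10):
  # rank_of_positive: 0-based rank of the held-out item among all candidates.
  if rank_of_positive < k:
    return 1.0, 1.0 / np.log2(rank_of_positive + 2)
  return 0.0, 0.0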
def run_deep_speech(_):
    """Run deep speech training and eval loop."""
    # Data preprocessing
    # The file name of training and test dataset
    tf.logging.info("Data preprocessing...")

    train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
    eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

    # Number of label classes. Label string is "[a-z]' -"
    num_classes = len(train_speech_dataset.speech_labels)

    # Input shape of each data example:
    # [time_steps (T), feature_bins(F), channel(C)]
    # Channel is set as 1 by default.
    input_shape = (None, train_speech_dataset.num_feature_bins, 1)

    # Create deep speech model and convert it to Estimator
    tf.logging.info("Creating Estimator from Keras model...")
    keras_model = deep_speech_model.DeepSpeech(
        input_shape, flags_obj.rnn_hidden_layers, flags_obj.rnn_type,
        flags_obj.is_bidirectional, flags_obj.rnn_hidden_size,
        flags_obj.rnn_activation, num_classes, flags_obj.use_bias)

    # Convert to estimator
    num_gpus = flags_core.get_num_gpus(flags_obj)
    estimator = convert_keras_to_estimator(keras_model, num_gpus)

    # Benchmark logging
    run_params = {
        "batch_size": flags_obj.batch_size,
        "train_epochs": flags_obj.train_epochs,
        "rnn_hidden_size": flags_obj.rnn_hidden_size,
        "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
        "rnn_activation": flags_obj.rnn_activation,
        "rnn_type": flags_obj.rnn_type,
        "is_bidirectional": flags_obj.is_bidirectional,
        "use_bias": flags_obj.use_bias
    }

    dataset_name = "LibriSpeech"
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info("deep_speech",
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               batch_size=flags_obj.batch_size)

    per_device_batch_size = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, num_gpus)

    def input_fn_train():
        return dataset.input_fn(per_device_batch_size, train_speech_dataset)

    def input_fn_eval():  # pylint: disable=unused-variable
        return dataset.input_fn(per_device_batch_size, eval_speech_dataset)

    total_training_cycle = (flags_obj.train_epochs //
                            flags_obj.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: %d/%d", cycle_index + 1,
                        total_training_cycle)

        estimator.train(input_fn=input_fn_train, hooks=train_hooks)

        # Evaluate (TODO)
        # tf.logging.info("Starting to evaluate.")

        # eval_results = evaluate_model(
        #     estimator, keras_model, data_set.speech_labels, [], input_fn_eval)

        # benchmark_logger.log_evaluation_result(eval_results)
        # If some evaluation threshold is met
        # Log the HR and NDCG results.
        # wer = eval_results[_WER_KEY]
        # cer = eval_results[_CER_KEY]
        # tf.logging.info(
        #     "Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
        #         cycle_index + 1, wer, cer))
        # if model_helpers.past_stop_threshold(FLAGS.wer_threshold, wer):
        #   break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
Example #19
    def model_fn(features, labels, mode, params):
        """Defines how to train, evaluate and predict from the transformer model."""
        num_devices = flags_core.get_num_gpus(flags_obj)
        consolidation_device = 'gpu:0'
        #    feature_shards, label_shards = replicate_model_fn._split_batch(features, labels, num_devices, device=consolidation_device)

        tower_losses = []
        tower_gradvars = []
        tower_preds = []
        for i in range(num_devices):
            worker_device = '/{}:{}'.format('gpu', i)
            device_setter = local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_devices, tf.contrib.training.byte_size_load_fn))
            with tf.variable_scope('model', reuse=bool(i != 0)):
                with tf.name_scope('tower_%d' % i) as name_scope:
                    with tf.device(device_setter):
                        # Create model and get output logits.
                        model = transformer.Transformer(
                            params, mode == tf.estimator.ModeKeys.TRAIN)
                        loss, gradvars, preds = _tower_fn(model,
                                                          features,
                                                          labels,
                                                          params=params)
                        tower_losses.append(loss)
                        tower_gradvars.append(gradvars)
                        tower_preds.append(preds)

        # Compute global loss and gradients
        gradvars = []
        with tf.name_scope('gradient_averaging'):
            all_grads = {}
            for grad, var in itertools.chain(*tower_gradvars):
                if grad is not None:
                    all_grads.setdefault(var, []).append(grad)
            for var, grads in six.iteritems(all_grads):
                with tf.device(var.device):
                    if len(grads) == 1:
                        avg_grad = grads[0]
                    else:
                        avg_grad = tf.multiply(tf.add_n(grads),
                                               1. / len(grads))
                    gradvars.append((avg_grad, var))

        with tf.device(consolidation_device):
            loss = tf.reduce_mean(tower_losses, name='loss')
            tf.identity(loss, "cross_entropy")
            logits = tf.reduce_mean(tower_preds, axis=0)
            if mode == tf.estimator.ModeKeys.PREDICT:
                return tf.estimator.EstimatorSpec(
                    tf.estimator.ModeKeys.PREDICT,
                    predictions=logits,
                    export_outputs={
                        "translate": tf.estimator.export.PredictOutput(logits)
                    })

            if mode == tf.estimator.ModeKeys.TRAIN:
                with tf.variable_scope("get_train_op"):
                    print("in get_train_op")
                    learning_rate = get_learning_rate(
                        learning_rate=params["learning_rate"],
                        hidden_size=params["hidden_size"],
                        learning_rate_warmup_steps=params[
                            "learning_rate_warmup_steps"])
                    optimizer = tf.contrib.opt.LazyAdamOptimizer(
                        learning_rate,
                        beta1=params["optimizer_adam_beta1"],
                        beta2=params["optimizer_adam_beta2"],
                        epsilon=params["optimizer_adam_epsilon"])
                    optimizer = tf.train.SyncReplicasOptimizer(
                        optimizer, replicas_to_aggregate=num_devices)
                    sync_hook = optimizer.make_session_run_hook(is_chief)

                    # apply_gradients already increments the global step, so a
                    # separate update op would double-count it.
                    train_op = optimizer.apply_gradients(
                        gradvars, global_step=tf.train.get_global_step())
                    metric_dict = {"learning_rate": learning_rate}
                    metric_dict["minibatch_loss"] = loss
                    record_scalars(metric_dict)
                    return tf.estimator.EstimatorSpec(
                        mode=mode,
                        loss=loss,
                        training_hooks=[sync_hook],
                        train_op=train_op)
            elif mode == tf.estimator.ModeKeys.EVAL:
                return tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=loss,
                    predictions={"predictions": logits},
                    eval_metric_ops=metrics.get_eval_metrics(
                        logits, labels, params))
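The gradient-averaging section of model_fn above collects each variable's gradients from every tower and averages them on the device that owns the variable. A self-contained sketch of the same aggregation, using plain floats in place of tensors:

import itertools

def average_tower_gradvars(tower_gradvars):
    """Averages per-tower (grad, var) lists into one (grad, var) list."""
    all_grads = {}
    for grad, var in itertools.chain(*tower_gradvars):
        if grad is not None:
            all_grads.setdefault(var, []).append(grad)
    return [(sum(grads) / len(grads), var)
            for var, grads in all_grads.items()]

# Two towers share variable 'w'; gradients 1.0 and 3.0 average to 2.0.
assert average_tower_gradvars([[(1.0, 'w')], [(3.0, 'w')]]) == [(2.0, 'w')]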
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using native Keras APIs.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.
  """
    if flags_obj.enable_eager:
        tf.enable_eager_execution()

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        raise ValueError('dtype fp16 is not supported in Keras. Use the '
                         'default value (fp32).')

    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    per_device_batch_size = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))

    # pylint: disable=protected-access
    if flags_obj.use_synthetic_data:
        input_fn = keras_common.get_synth_input_fn(
            height=imagenet_main.DEFAULT_IMAGE_SIZE,
            width=imagenet_main.DEFAULT_IMAGE_SIZE,
            num_channels=imagenet_main.NUM_CHANNELS,
            num_classes=imagenet_main.NUM_CLASSES,
            dtype=flags_core.get_tf_dtype(flags_obj))
    else:
        input_fn = imagenet_main.input_fn

    train_input_dataset = input_fn(is_training=True,
                                   data_dir=flags_obj.data_dir,
                                   batch_size=per_device_batch_size,
                                   num_epochs=flags_obj.train_epochs,
                                   parse_record_fn=parse_record_keras)

    eval_input_dataset = input_fn(is_training=False,
                                  data_dir=flags_obj.data_dir,
                                  batch_size=per_device_batch_size,
                                  num_epochs=flags_obj.train_epochs,
                                  parse_record_fn=parse_record_keras)

    strategy = distribution_utils.get_distribution_strategy(
        flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)

    strategy_scope = keras_common.get_strategy_scope(strategy)

    with strategy_scope:
        optimizer = keras_common.get_optimizer()
        model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES)

        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['sparse_categorical_accuracy'])

    time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks(
        learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])

    train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
    train_epochs = flags_obj.train_epochs

    if flags_obj.train_steps:
        train_steps = min(flags_obj.train_steps, train_steps)
        train_epochs = 1

    num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)

    validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        # Only build the training graph. This reduces memory usage introduced by
        # control flow ops in layers that have different implementations for
        # training and inference (e.g., batch norm).
        tf.keras.backend.set_learning_phase(1)
        num_eval_steps = None
        validation_data = None

    history = model.fit(
        train_input_dataset,
        epochs=train_epochs,
        steps_per_epoch=train_steps,
        callbacks=[time_callback, lr_callback, tensorboard_callback],
        validation_steps=num_eval_steps,
        validation_data=validation_data,
        verbose=1)

    eval_output = None
    if not flags_obj.skip_eval:
        eval_output = model.evaluate(eval_input_dataset,
                                     steps=num_eval_steps,
                                     verbose=1)
    stats = keras_common.build_stats(history, eval_output, time_callback)
    return stats
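The steps-per-epoch arithmetic above uses integer division, so the final partial batch of each epoch is dropped. A worked example, assuming the usual ImageNet training-set size:

num_train_images = 1281167  # imagenet_main.NUM_IMAGES['train']
batch_size = 256
steps_per_epoch = num_train_images // batch_size
assert steps_per_epoch == 5004  # 143 images remain un-batched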
def run(flags_obj):
    """Run ResNet Cifar-10 training and eval loop using native Keras APIs.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
    if flags_obj.enable_eager:
        tf.enable_eager_execution()

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        raise ValueError('dtype fp16 is not supported in Keras. Use the '
                         'default value (fp32).')

    per_device_batch_size = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))

    if flags_obj.use_synthetic_data:
        input_fn = keras_common.get_synth_input_fn(
            height=cifar_main.HEIGHT,
            width=cifar_main.WIDTH,
            num_channels=cifar_main.NUM_CHANNELS,
            num_classes=cifar_main.NUM_CLASSES,
            dtype=flags_core.get_tf_dtype(flags_obj))
    else:
        input_fn = cifar_main.input_fn

    train_input_dataset = input_fn(is_training=True,
                                   data_dir=flags_obj.data_dir,
                                   batch_size=per_device_batch_size,
                                   num_epochs=flags_obj.train_epochs,
                                   parse_record_fn=parse_record_keras)

    eval_input_dataset = input_fn(is_training=False,
                                  data_dir=flags_obj.data_dir,
                                  batch_size=per_device_batch_size,
                                  num_epochs=flags_obj.train_epochs,
                                  parse_record_fn=parse_record_keras)

    optimizer = keras_common.get_optimizer()
    strategy = distribution_utils.get_distribution_strategy(
        flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)

    model = resnet_cifar_model.resnet56(classes=cifar_main.NUM_CLASSES)

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['categorical_accuracy'],
                  distribute=strategy)

    time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks(
        learning_rate_schedule, cifar_main.NUM_IMAGES['train'])

    train_steps = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size
    train_epochs = flags_obj.train_epochs

    if flags_obj.train_steps:
        train_steps = min(flags_obj.train_steps, train_steps)
        train_epochs = 1

    num_eval_steps = (cifar_main.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)

    validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        num_eval_steps = None
        validation_data = None

    history = model.fit(
        train_input_dataset,
        epochs=train_epochs,
        steps_per_epoch=train_steps,
        callbacks=[time_callback, lr_callback, tensorboard_callback],
        validation_steps=num_eval_steps,
        validation_data=validation_data,
        verbose=1)
    eval_output = None
    if not flags_obj.skip_eval:
        eval_output = model.evaluate(eval_input_dataset,
                                     steps=num_eval_steps,
                                     verbose=1)
    stats = keras_common.build_stats(history, eval_output)
    return stats
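Unlike the ImageNet example above, this CIFAR-10 variant passes the strategy through model.compile(distribute=...), an older experimental hook. A minimal sketch of the strategy-scope style used by the ImageNet example, assuming the same helper modules:

strategy = distribution_utils.get_distribution_strategy(
    flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)
with keras_common.get_strategy_scope(strategy):
    model = resnet_cifar_model.resnet56(classes=cifar_main.NUM_CLASSES)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras_common.get_optimizer(),
                  metrics=['categorical_accuracy'])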
Exemple #22
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
    Dict of results of the run.
  """

    model_helpers.apply_clean(flags.FLAGS)

    # Ensures flag override logic is only executed if explicitly triggered.
    if flags_obj.tf_gpu_thread_mode:
        override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

    # Creates session config. allow_soft_placement = True, is required for
    # multi-GPU and is not harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

    # Creates a `RunConfig` that checkpoints every 24 hours which essentially
    # results in checkpoints determined only by `epochs_between_evals`.
    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                        session_config=session_config,
                                        save_checkpoints_secs=60 * 60 * 24)

    # Initializes model with all but the dense layer from pretrained ResNet.
    if flags_obj.pretrained_model_checkpoint_path is not None:
        warm_start_settings = tf.estimator.WarmStartSettings(
            flags_obj.pretrained_model_checkpoint_path,
            vars_to_warm_start='^(?!.*dense)')
    else:
        warm_start_settings = None

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        warm_start_from=warm_start_settings,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj),
            'fine_tune': flags_obj.fine_tune
        })

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + '-synthetic'

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    train_hooks = list(train_hooks) + lottery.hooks_from_flags(
        flags_obj.flag_values_dict())

    def input_fn_train(num_epochs):
        return input_function(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=num_epochs,
            dtype=flags_core.get_tf_dtype(flags_obj),
            datasets_num_private_threads=flags_obj.datasets_num_private_threads,
            num_parallel_batches=flags_obj.datasets_num_parallel_batches)

    def input_fn_eval():
        return input_function(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=1,
            dtype=flags_core.get_tf_dtype(flags_obj))

    if flags_obj.lth_generate_predictions:
        ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)

        if flags_obj.lth_no_pruning:
            m_hooks = []
        else:
            m_hooks = lottery.hooks_from_flags(flags_obj.flag_values_dict())

        eval_results = classifier.predict(
            input_fn=input_fn_eval,
            checkpoint_path=ckpt,
            hooks=m_hooks,
        )

        assert flags_obj.lth_prediction_result_dir
        # Each CIFAR-10 test record is one label byte followed by a
        # 32 * 32 * 3 byte image, so striding by the record size recovers
        # every example's label.
        with tf.gfile.Open(os.path.join(flags_obj.data_dir, 'test_batch.bin'),
                           'rb') as f:
            labels = list(f.read()[::32 * 32 * 3 + 1])

        eval_results = list(eval_results)
        if not tf.gfile.Exists(flags_obj.lth_prediction_result_dir):
            tf.gfile.MakeDirs(flags_obj.lth_prediction_result_dir)
        with tf.gfile.Open(
                os.path.join(flags_obj.lth_prediction_result_dir,
                             'predictions'), 'wb') as f:
            for label, res in zip(labels, eval_results):
                res['label'] = label
            pickle.dump(eval_results, f)
        return

    try:
        cpr = tf.train.NewCheckpointReader(
            tf.train.latest_checkpoint(flags_obj.model_dir))
        current_step = cpr.get_tensor('global_step')
    except:  # pylint: disable=bare-except
        # No usable checkpoint yet; start training from step 0.
        current_step = 0

    while current_step < flags_obj.max_train_steps:
        next_checkpoint = min(current_step + 10000, flags_obj.max_train_steps)
        # Repeat the dataset for up to 1000 epochs; max_steps, not the epoch
        # count, bounds each training call.
        classifier.train(input_fn=lambda: input_fn_train(1000),
                         hooks=train_hooks,
                         max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info('Starting to evaluate.')
        eval_results = classifier.evaluate(input_fn=input_fn_eval)
        benchmark_logger.log_evaluation_result(eval_results)

    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        export_dtype = flags_core.get_tf_dtype(flags_obj)
        if flags_obj.image_bytes_as_serving_input:
            input_receiver_fn = functools.partial(image_bytes_serving_input_fn,
                                                  shape,
                                                  dtype=export_dtype)
        else:
            input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
                shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
        classifier.export_savedmodel(flags_obj.export_dir,
                                     input_receiver_fn,
                                     strip_default_attrs=True)
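The vars_to_warm_start pattern '^(?!.*dense)' above relies on a negative lookahead: it matches any variable name that does not contain 'dense', so every layer except the final dense classifier is restored from the pretrained checkpoint. A self-contained check of the regex (variable names here are illustrative):

import re

pattern = re.compile(r'^(?!.*dense)')
names = ['resnet_model/conv2d/kernel', 'resnet_model/dense/kernel']
warm_started = [n for n in names if pattern.match(n)]
assert warm_started == ['resnet_model/conv2d/kernel']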
Exemple #23
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  # Override the input/output vocabulary sizes for this dataset.
  params['vocab_size_in'] = 6100
  params['vocab_size_out'] = 25
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  mode = tf.estimator.ModeKeys.TRAIN

  # Build placeholders for the manual feed_dict training loop.
  inputs_ph = tf.placeholder(tf.int32, shape=(None, None), name='inputs')
  targets_ph = tf.placeholder(tf.int32, shape=(None, None), name='targets')
  targets_ph_2 = tf.placeholder(tf.int32, shape=(None, None), name='targets2')

  loss_M, train_op_M = model_fn(inputs_ph, targets_ph, targets_ph_2, mode, params)


  tf.logging.info('Using GPU in decoder')
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
  sess = tf.Session(config=tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False,
      gpu_options=gpu_options))
  with sess.as_default():
      sess.run(tf.global_variables_initializer())
      saver = tf.train.Saver()

      if os.path.exists(os.path.join(FLAGS.model_dir, "checkpoint")):
          saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_dir))


      cnt = 0
      for ii in range(schedule_manager.train_eval_iterations):
          tf.logging.info("Starting iteration %d" % (ii + 1))
          # Shuffle the pre-loaded training data each iteration.
          random.shuffle(datall)
          for data in datall:
              feed = {inputs_ph: data['inputs'],
                      targets_ph: data['targets'],
                      targets_ph_2: data['targets2']}

              loss, _ = sess.run([loss_M, train_op_M], feed_dict=feed)
              cnt += 1

              if cnt % 100 == 0:
                  print('loss at %d: %s' % (cnt, loss))

              if cnt % 2000 == 0:
                  filename = os.path.join(
                      FLAGS.model_dir, "model_{}.ckpt".format(cnt))
                  saver.save(sess, filename)
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # The following flags override the learning rate, warmup steps, max_length,
  # and vocab_size values from the parameter file.

  params["learning_rate"] = flags_obj.learning_rate
  params["learning_rate_warmup_steps"] = flags_obj.learning_rate_warmup_steps
  params["max_length"] = flags_obj.max_length
  params["vocab_size"] = flags_obj.vocab_size

  # added for selecting the learning rate scheme
  params["lr_scheme"] = flags_obj.lr_scheme
  params["warmup_init_lr"] = flags_obj.warmup_init_lr

  # added for selecting the optimizer algorithm
  params["opt_alg"] = flags_obj.opt_alg

  # added to provide optimizer parameters
  params["optimizer_sgd_momentum"] = flags_obj.optimizer_sgd_momentum
  params["optimizer_rms_decay"] = flags_obj.optimizer_rms_decay
  params["optimizer_rms_momentum"] = flags_obj.optimizer_rms_momentum
  params["optimizer_rms_epsilon"] = flags_obj.optimizer_rms_epsilon
  
  # added to override the layer_postprocess_dropout value
  params["layer_postprocess_dropout"] = flags_obj.layer_postprocess_dropout

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  # Per-device batch splitting is commented out here because this variant
  # does not use a distribution strategy:
  # if not params["use_tpu"]:
  #   params["batch_size"] = distribution_utils.per_device_batch_size(
  #       params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )

  # added for horovod
  bcast_hook = hvd.BroadcastGlobalVariablesHook(0)


  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # added for horovod
  train_hooks.append(bcast_hook)


  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)


  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file})
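The bcast_hook above is one piece of a standard Horovod setup; the rest (process initialization and gradient allreduce) is assumed to live elsewhere, typically in the model_fn. A hedged sketch of the usual companion pieces:

import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()  # one process per GPU
optimizer = tf.train.GradientDescentOptimizer(0.001)
optimizer = hvd.DistributedOptimizer(optimizer)   # allreduce gradients
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)  # sync initial weights from rank 0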
Exemple #25
def resnet_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  if flags_core.get_num_gpus(flags_obj) == 0:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  elif flags_core.get_num_gpus(flags_obj) == 1:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  else:
    distribution = tf.contrib.distribute.MirroredStrategy(
        num_gpus=flags_core.get_num_gpus(flags_obj)
    )

  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      session_config=session_config)

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj)
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info('resnet', dataset_name, run_params)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      batch_size=flags_obj.batch_size)

  def input_fn_train():
    return input_function(
        is_training=True, data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=flags_obj.epochs_between_evals)

  def input_fn_eval():
    return input_function(
        is_training=False, data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1)

  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info('Starting a training cycle: %d/%d',
                    cycle_index, total_training_cycle)

    classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                     max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data, which
    # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
    # eval (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)

    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape, batch_size=flags_obj.batch_size)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
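classifier.export_savedmodel above writes a timestamped SavedModel directory. A sketch of the resulting layout (the directory name is the export timestamp, chosen at save time; values illustrative):

# <export_dir>/1554321098/
#     saved_model.pb
#     variables/
#         variables.data-00000-of-00001
#         variables.index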
Exemple #26
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)
    eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
    ncf_dataset = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf)

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "batch_size": batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": ncf_dataset.num_users,
            "num_items": ncf_dataset.num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
    )
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    approx_train_steps = int(ncf_dataset.num_train_positives *
                             (1 + FLAGS.num_neg) // FLAGS.batch_size)
    pred_input_fn = data_preprocessing.make_pred_input_fn(
        ncf_dataset=ncf_dataset)

    total_training_cycle = (1 if FLAGS.inference_only else
                            FLAGS.train_epochs // FLAGS.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        if not FLAGS.inference_only:
            # Train the model
            train_input_fn, train_record_dir, batch_count = \
                data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

            if np.abs(approx_train_steps - batch_count) > 1:
                tf.logging.warning(
                    "Estimated ({}) and reported ({}) number of batches differ by more "
                    "than one".format(approx_train_steps, batch_count))
            train_estimator.train(input_fn=train_input_fn,
                                  hooks=train_hooks,
                                  steps=batch_count)
            tf.gfile.DeleteRecursively(train_record_dir)

        # Evaluate the model
        eval_results = evaluate_model(eval_estimator, ncf_dataset,
                                      pred_input_fn)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        hr = eval_results[_HR_KEY]
        ndcg = eval_results[_NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # Export SavedModel
        if FLAGS.export_savedmodel:
            eval_estimator.export_savedmodel(FLAGS.model_dir,
                                             serving_input_receiver_fn)
            print("SavedModel successfully exported to: {}/<timestamp>".format(
                FLAGS.model_dir))

        # Some of the NumPy vector math can be quite large and likes to stay in
        # memory for a while.
        gc.collect()

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
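The approx_train_steps estimate above counts one positive plus num_neg sampled negatives per training example. A worked example with hypothetical MovieLens-sized numbers:

num_train_positives = 994169  # hypothetical count
num_neg = 4
batch_size = 2048
approx_train_steps = num_train_positives * (1 + num_neg) // batch_size
assert approx_train_steps == 2427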
Exemple #27
def run_keras_model_benchmark(_):
  """Run the benchmark on keras model."""
  # Ensure a valid model name was supplied via command line argument
  if FLAGS.model not in MODELS.keys():
    raise AssertionError("The --model command line argument should "
                         "be a key in the `MODELS` dictionary.")

  # Check if eager execution is enabled
  if FLAGS.eager:
    tf.logging.info("Eager execution is enabled...")
    tf.enable_eager_execution()

  # Load the model
  tf.logging.info("Benchmark on {} model...".format(FLAGS.model))
  keras_model = MODELS[FLAGS.model]
  model = keras_model(weights=None)

  # Get dataset
  dataset_name = "ImageNet"
  if FLAGS.use_synthetic_data:
    tf.logging.info("Using synthetic dataset...")
    dataset_name += "_Synthetic"
    train_dataset = dataset.generate_synthetic_input_dataset(
        FLAGS.model, FLAGS.batch_size)
    val_dataset = dataset.generate_synthetic_input_dataset(
        FLAGS.model, FLAGS.batch_size)
  else:
    raise ValueError("Only synthetic dataset is supported!")

  num_gpus = flags_core.get_num_gpus(FLAGS)

  distribution = None
  # Use distribution strategy
  if FLAGS.dist_strat:
    distribution = distribution_utils.get_distribution_strategy(
        num_gpus=num_gpus)
  elif num_gpus > 1:
    # Run with multi_gpu_model
    # If eager execution is enabled, only one GPU is utilized even if multiple
    # GPUs are provided.
    if FLAGS.eager:
      tf.logging.warning(
          "{} GPUs are provided, but only one GPU is utilized as "
          "eager execution is enabled.".format(num_gpus))
    model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

  # The Adam optimizer (and some others) doesn't work well with distribution
  # strategy (b/113076709), so use GradientDescentOptimizer here.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
  model.compile(loss="categorical_crossentropy",
                optimizer=optimizer,
                metrics=["accuracy"],
                distribute=distribution)

  # Create benchmark logger for benchmark logging
  run_params = {
      "batch_size": FLAGS.batch_size,
      "synthetic_data": FLAGS.use_synthetic_data,
      "train_epochs": FLAGS.train_epochs,
      "num_train_images": FLAGS.num_train_images,
      "num_eval_images": FLAGS.num_eval_images,
  }

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name=FLAGS.model,
      dataset_name=dataset_name,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  # Create callbacks that log metric values about the training and evaluation
  callbacks = model_callbacks.get_model_callbacks(
      FLAGS.callbacks,
      batch_size=FLAGS.batch_size,
      metric_logger=benchmark_logger)
  # Train and evaluate the model
  history = model.fit(
      train_dataset,
      epochs=FLAGS.train_epochs,
      callbacks=callbacks,
      validation_data=val_dataset,
      steps_per_epoch=int(np.ceil(FLAGS.num_train_images / FLAGS.batch_size)),
      validation_steps=int(np.ceil(FLAGS.num_eval_images / FLAGS.batch_size))
  )

  tf.logging.info("Logging the evaluation results...")
  for epoch in range(FLAGS.train_epochs):
    eval_results = {
        "accuracy": history.history["val_acc"][epoch],
        "loss": history.history["val_loss"][epoch],
        tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
            FLAGS.num_eval_images/FLAGS.batch_size)
    }
    benchmark_logger.log_evaluation_result(eval_results)

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
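The evaluation-logging loop above indexes history.history by epoch. For a model compiled with metrics=['accuracy'] and given validation_data, that dict in this TF 1.x-era Keras API looks roughly like the following (values illustrative):

history_sketch = {
    'loss': [2.31, 1.87],
    'acc': [0.12, 0.25],
    'val_loss': [2.10, 1.75],
    'val_acc': [0.15, 0.28],
}
assert history_sketch['val_acc'][1] == 0.28  # metrics of epoch index 1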
Exemple #28
    def __init__(self, flags_obj):
        """Init function

    Args:
      flags_obj: Object containing parsed flag values, i.e., FLAGS.
    """
        self.flags_obj = flags_obj

        # Add flag-defined parameters to params object
        num_gpus = flags_core.get_num_gpus(flags_obj)
        self.params = params = misc.get_model_params(flags_obj.param_set,
                                                     num_gpus)
        params["num_gpus"] = num_gpus
        params["data_dir"] = flags_obj.data_dir
        params["val_data_dir"] = flags_obj.val_data_dir
        params["model_dir"] = flags_obj.model_dir
        params["static_batch"] = flags_obj.static_batch
        params["max_input_length"] = flags_obj.max_input_length
        params["max_target_length"] = flags_obj.max_target_length
        params["decode_batch_size"] = flags_obj.decode_batch_size
        params["decode_max_length"] = flags_obj.decode_max_length
        params["padded_decode"] = flags_obj.padded_decode
        params["num_parallel_calls"] = (flags_obj.num_parallel_calls
                                        or tf.data.experimental.AUTOTUNE)

        params["use_synthetic_data"] = flags_obj.use_synthetic_data
        params["batch_size"] = flags_obj.batch_size * max(num_gpus, 1)
        logging.info('actual batch_size = {} * {}'.format(
            flags_obj.batch_size, max(num_gpus, 1)))
        params["repeat_dataset"] = None
        params["dtype"] = flags_core.get_tf_dtype(flags_obj)
        params["enable_metrics_in_training"] = (
            flags_obj.enable_metrics_in_training)
        params["num_hashes"] = flags_obj.num_hashes
        params["test_num_hashes"] = flags_obj.test_num_hashes
        params["use_full_attention_in_reformer"] = (
            flags_obj.use_full_attention_in_reformer)
        params["bucket_size"] = flags_obj.bucket_size

        if flags_obj.one_dropout is not None:
            params['layer_postprocess_dropout'] = flags_obj.one_dropout
            params['attention_dropout'] = flags_obj.one_dropout
            params['relu_dropout'] = flags_obj.one_dropout
        if flags_obj.attention_dropout is not None:
            params['attention_dropout'] = flags_obj.attention_dropout
        params['lsh_attention_dropout'] = (
            params['attention_dropout']
            if params["use_full_attention_in_reformer"]
            else flags_obj.lsh_attention_dropout)
        dropout_keys = ["layer_postprocess_dropout", "attention_dropout",
                        "lsh_attention_dropout", "relu_dropout"]
        logging.info('dropouts (postprocess, attention, lsh_attention, relu)'
                     ' = %s', [params[k] for k in dropout_keys])
        logging.info('attention_padding_strategy = %s',
                     flags_obj.attention_padding_strategy)

        assert self.flags_obj.vocab_file, 'vocab file is None'
        self.tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file(
            self.flags_obj.vocab_file)
        self.EOS_id = self.tokenizer.encode('<EOS>')[0]
        params["vocab_size"] = self.tokenizer.vocab_size
        logging.info(
            'loaded vocab from {}, vocab_size={} and EOS_id={}'.format(
                self.flags_obj.vocab_file, self.tokenizer.vocab_size,
                self.EOS_id))
        logging.info(f'training_schema = [{self.flags_obj.training_schema}]')

        if params["dtype"] == tf.float16:
            # TODO(reedwm): It's pretty ugly to set the global policy in a constructor
            # like this. What if multiple instances of Seq2SeqTask are created?
            # We should have a better way in the tf.keras.mixed_precision API of doing
            # this.
            loss_scale = flags_core.get_loss_scale(flags_obj,
                                                   default_for_fp16="dynamic")
            policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
                "mixed_float16", loss_scale=loss_scale)
            tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

        self.distribution_strategy = distribution_utils.get_distribution_strategy(
            distribution_strategy=flags_obj.distribution_strategy,
            num_gpus=num_gpus,
            tpu_address=flags_obj.tpu or "")
        logging.info("Running dtitle model with num_gpus = %d", num_gpus)

        if self.distribution_strategy:
            logging.info("For training, using distribution strategy: %s",
                         self.distribution_strategy)
        else:
            logging.info("Not using any distribution strategy.")
Exemple #29
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                percent,
                model_class,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
     Dict of results of the run.  Contains the keys `eval_results` and
    `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5.
    `train_hooks` is a list the instances of hooks used during training.
  """

    model_helpers.apply_clean(flags.FLAGS)

    # Ensures flag override logic is only executed if explicitly triggered.
    if flags_obj.tf_gpu_thread_mode:
        override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

    # Configures cluster spec for distribution strategy.
    num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
                                                       flags_obj.task_index)

    # Creates session config. allow_soft_placement = True, is required for
    # multi-GPU and is not harmful for other modes.
    session_config = tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_core.get_num_gpus(flags_obj),
        num_workers=num_workers,
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs)

    # Creates a `RunConfig` that checkpoints every 24 hours which essentially
    # results in checkpoints determined only by `epochs_between_evals`.
    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                        session_config=session_config,
                                        save_checkpoints_secs=60 * 60 * 24,
                                        save_checkpoints_steps=None)

    # Initializes model with all but the dense layer from pretrained ResNet.
    if flags_obj.pretrained_model_checkpoint_path is not None:
        warm_start_settings = tf.estimator.WarmStartSettings(
            flags_obj.pretrained_model_checkpoint_path,
            vars_to_warm_start='^(?!.*dense)')
    else:
        warm_start_settings = None
    params = {
        'resnet_size': int(flags_obj.resnet_size),
        'data_format': 'channels_last',
        'batch_size': flags_obj.batch_size,
        'resnet_version': int(flags_obj.resnet_version),
        'loss_scale': flags_core.get_loss_scale(flags_obj,
                                                default_for_fp16=128),
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'fine_tune': flags_obj.fine_tune,
        'num_workers': num_workers,
        'adv_train': False,
        'attack': False,
    }
    classifier = tf.compat.v1.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        warm_start_from=warm_start_settings,
        params=params)
    params['adv_train'] = True
    classifier_adv = tf.compat.v1.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        warm_start_from=warm_start_settings,
        params=params)
    params['adv_train'] = False
    params['attack'] = True
    classifier_attack = tf.compat.v1.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        warm_start_from=warm_start_settings,
        params=params)
    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
        'num_workers': num_workers,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + '-synthetic'

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train(num_epochs, input_context=None):
        return input_function(
            is_training=True,
            percent=percent,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_replica_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=num_epochs,
            dtype=flags_core.get_tf_dtype(flags_obj),
            datasets_num_private_threads=flags_obj.
            datasets_num_private_threads,
            input_context=input_context)

    def input_fn_eval():
        return input_function(
            is_training=False,
            percent=0,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_replica_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=1,
            dtype=flags_core.get_tf_dtype(flags_obj))

    def input_fn_eval_attack():
        return input_function(
            is_training=False,
            percent=100,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_replica_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=1,
            dtype=flags_core.get_tf_dtype(flags_obj))

    train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
                    flags_obj.train_epochs)
    use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1
    if use_train_and_evaluate:
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda input_context=None: input_fn_train(
                train_epochs, input_context=input_context),
            hooks=train_hooks,
            max_steps=flags_obj.max_train_steps)
        eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
        tf.compat.v1.logging.info('Starting to train and evaluate.')
        tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
        # tf.estimator.train_and_evalute doesn't return anything in multi-worker
        # case.
        eval_results = {}
    else:
        if train_epochs == 0:
            # If --eval_only is set, perform a single loop with zero train epochs.
            schedule, n_loops = [0], 1
        else:
            # Compute the number of times to loop while training. All but the last
            # pass will train for `epochs_between_evals` epochs, while the last will
            # train for the number needed to reach `training_epochs`. For instance if
            #   train_epochs = 25 and epochs_between_evals = 10
            # schedule will be set to [10, 10, 5]. That is to say, the loop will:
            #   Train for 10 epochs and then evaluate.
            #   Train for another 10 epochs and then evaluate.
            #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
            n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
            schedule = [
                flags_obj.epochs_between_evals for _ in range(int(n_loops))
            ]
            schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.

        for cycle_index, num_train_epochs in enumerate(schedule):
            tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index,
                                      int(n_loops))

            if num_train_epochs:
                # Since we are calling classifier.train immediately in each loop, the
                # value of num_train_epochs in the lambda function will not be changed
                # before it is used. So it is safe to ignore the pylint error here
                # pylint: disable=cell-var-from-loop
                if flags_obj.adv_train:
                    classifier_adv.train(
                        input_fn=lambda input_context=None: input_fn_train(
                            num_train_epochs, input_context=input_context),
                        hooks=train_hooks,
                        max_steps=flags_obj.max_train_steps)
                else:
                    classifier.train(
                        input_fn=lambda input_context=None: input_fn_train(
                            num_train_epochs, input_context=input_context),
                        hooks=train_hooks,
                        max_steps=flags_obj.max_train_steps)

            # flags_obj.max_train_steps is generally associated with testing and
            # profiling. As a result it is frequently called with synthetic data,
            # which will iterate forever. Passing steps=flags_obj.max_train_steps
            # allows the eval (which is generally unimportant in those circumstances)
            # to terminate.  Note that eval will run for max_train_steps each loop,
            # regardless of the global_step count.
            tf.compat.v1.logging.info('Starting to evaluate clean.')
            eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                               steps=flags_obj.max_train_steps)
            tf.compat.v1.logging.info('Starting to evaluate adv.')
            eval_results_adv = classifier_adv.evaluate(
                input_fn=input_fn_eval, steps=flags_obj.max_train_steps)
            tf.compat.v1.logging.info('Starting to evaluate attack.')
            eval_results_attack = classifier_attack.evaluate(
                input_fn=input_fn_eval_attack, steps=flags_obj.max_train_steps)
            print('########################## clean ##########################')
            benchmark_logger.log_evaluation_result(eval_results)
            print('########################## adv ############################')
            benchmark_logger.log_evaluation_result(eval_results_adv)
            print('########################## attack #########################')
            benchmark_logger.log_evaluation_result(eval_results_attack)

            if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                                 eval_results['accuracy']):
                break

    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        export_dtype = flags_core.get_tf_dtype(flags_obj)
        if flags_obj.image_bytes_as_serving_input:
            input_receiver_fn = functools.partial(image_bytes_serving_input_fn,
                                                  shape,
                                                  dtype=export_dtype)
        else:
            input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
                shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
        classifier.export_savedmodel(flags_obj.export_dir,
                                     input_receiver_fn,
                                     strip_default_attrs=True)

    stats = {}
    stats['eval_results'] = eval_results
    stats['eval_attack_results'] = eval_results_attack
    stats['eval_adv_results'] = eval_results_adv
    stats['train_hooks'] = train_hooks

    return stats
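# The epoch schedule above is easy to check in isolation. A minimal,
# self-contained sketch, with train_epochs=25 and epochs_between_evals=10
# chosen purely for illustration:
import math

train_epochs, epochs_between_evals = 25, 10
n_loops = math.ceil(train_epochs / epochs_between_evals)
schedule = [epochs_between_evals for _ in range(int(n_loops))]
schedule[-1] = train_epochs - sum(schedule[:-1])  # Correct for over-counting.
assert schedule == [10, 10, 5]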
Exemple #30
def run_mnist(flags_obj):
  """Run MNIST training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  model_helpers.apply_clean(flags_obj)
  model_function = model_fn

  # Get number of GPUs as defined by the --num_gpus flags and the number of
  # GPUs available on the machine.
  num_gpus = flags_core.get_num_gpus(flags_obj)
  multi_gpu = num_gpus > 1

  if multi_gpu:
    # Validate that the batch size can be split into devices.
    distribution_utils.per_device_batch_size(flags_obj.batch_size, num_gpus)

    # There are two steps required if using multi-GPU: (1) wrap the model_fn,
    # and (2) wrap the optimizer. The first happens here, and (2) happens
    # in the model_fn itself when the optimizer is defined.
    model_function = tf.contrib.estimator.replicate_model_fn(
        model_fn, loss_reduction=tf.losses.Reduction.MEAN,
        devices=["/device:GPU:%d" % d for d in range(num_gpus)])

  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  mnist_classifier = tf.estimator.Estimator(
      model_fn=model_function,
      model_dir=flags_obj.model_dir,
      params={
          'data_format': data_format,
          'multi_gpu': multi_gpu
      })

  # Set up training and evaluation input functions.
  def train_input_fn():
    """Prepare data for training."""

    # When choosing shuffle buffer sizes, larger sizes result in better
    # randomness, while smaller sizes use less memory. MNIST is a small
    # enough dataset that we can easily shuffle the full epoch.
    ds = dataset.train(flags_obj.data_dir)
    ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size)

    # Iterate through the dataset a set number (`epochs_between_evals`) of times
    # during each training session.
    ds = ds.repeat(flags_obj.epochs_between_evals)
    return ds

  def eval_input_fn():
    return dataset.test(flags_obj.data_dir).batch(
        flags_obj.batch_size).make_one_shot_iterator().get_next()

  # Set up hook that outputs training logs every 100 steps.
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  # Train and evaluate model.
  for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print('\nEvaluation results:\n\t%s\n' % eval_results)

    if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                         eval_results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    image = tf.placeholder(tf.float32, [None, 28, 28])
    input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
        'image': image,
    })
    mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn)
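# distribution_utils.per_device_batch_size is only called above to validate
# the flag values. A minimal sketch of its contract (an assumption, not the
# library's exact code):
def per_device_batch_size(batch_size, num_gpus):
    """Split the global batch across GPUs, failing loudly on a remainder."""
    if num_gpus <= 1:
        return batch_size
    if batch_size % num_gpus:
        raise ValueError('batch_size (%d) must be divisible by num_gpus (%d).'
                         % (batch_size, num_gpus))
    return batch_size // num_gpus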
Exemple #31
    def __init__(self, flags_obj):
        """Init function of TransformerMain.

    Args:
      flags_obj: Object containing parsed flag values, i.e., FLAGS.

    Raises:
      ValueError: if not using static batch for input data on TPU.
    """
        self.flags_obj = flags_obj
        self.predict_model = None

        # Add flag-defined parameters to params object
        num_gpus = flags_core.get_num_gpus(flags_obj)
        self.params = params = misc.get_model_params(flags_obj.param_set,
                                                     num_gpus)

        params["num_gpus"] = num_gpus
        params["use_ctl"] = flags_obj.use_ctl
        params["data_dir"] = flags_obj.data_dir
        params["model_dir"] = flags_obj.model_dir
        params["static_batch"] = flags_obj.static_batch
        params["max_length"] = flags_obj.max_length
        params["decode_batch_size"] = flags_obj.decode_batch_size
        params["decode_max_length"] = flags_obj.decode_max_length
        params["padded_decode"] = flags_obj.padded_decode
        params["num_parallel_calls"] = (flags_obj.num_parallel_calls
                                        or tf.data.experimental.AUTOTUNE)

        params["use_synthetic_data"] = flags_obj.use_synthetic_data
        params["batch_size"] = flags_obj.batch_size or params[
            "default_batch_size"]
        params["repeat_dataset"] = None
        params["dtype"] = flags_core.get_tf_dtype(flags_obj)
        params["enable_tensorboard"] = flags_obj.enable_tensorboard
        params[
            "enable_metrics_in_training"] = flags_obj.enable_metrics_in_training
        params["steps_between_evals"] = flags_obj.steps_between_evals
        params["enable_checkpointing"] = flags_obj.enable_checkpointing

        self.distribution_strategy = distribution_utils.get_distribution_strategy(
            distribution_strategy=flags_obj.distribution_strategy,
            num_gpus=num_gpus,
            all_reduce_alg=flags_obj.all_reduce_alg,
            num_packs=flags_obj.num_packs,
            tpu_address=flags_obj.tpu or "")
        if self.use_tpu:
            params[
                "num_replicas"] = self.distribution_strategy.num_replicas_in_sync
            if not params["static_batch"]:
                raise ValueError("TPU requires static batch for input data.")
        else:
            logging.info("Running transformer with num_gpus = %d", num_gpus)

        if self.distribution_strategy:
            logging.info("For training, using distribution strategy: %s",
                         self.distribution_strategy)
        else:
            logging.info("Not using any distribution strategy.")

        performance.set_mixed_precision_policy(
            params["dtype"],
            flags_core.get_loss_scale(flags_obj, default_for_fp16="dynamic"))
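    # The constructor above reads self.use_tpu, which is not defined in this
    # snippet. Presumably it is a property on the same class, derived from the
    # resolved distribution strategy; a hedged sketch, assuming TF 2.x-style
    # strategy classes:
    @property
    def use_tpu(self):
        # Assumption: TPU use is inferred from the strategy's type.
        if self.distribution_strategy:
            return isinstance(self.distribution_strategy,
                              tf.distribute.experimental.TPUStrategy)
        return False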
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
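# A quick way to smoke-test the export above in TF 1.x is tf.contrib's
# predictor utility. The directory name and the 'input' feature key below are
# assumptions for illustration; export_savedmodel writes a timestamped
# subdirectory under export_dir.
from tensorflow.contrib import predictor

predict_fn = predictor.from_saved_model('/tmp/transformer_export/1554321234')
outputs = predict_fn({'input': [[17, 23, 42, 1]]})  # Token ids (illustrative).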
Exemple #33
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives
                           * (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))


    # Train the model
    train_input_fn, train_record_dir, batch_count = \
      data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate the model
    eval_results = evaluate_model(
        eval_estimator, ncf_dataset, pred_input_fn)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
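# model_helpers.past_stop_threshold is the early-stopping test used in all of
# these loops. A minimal sketch of its contract, matching how it is called
# here (an assumption, not the helper's exact code):
def past_stop_threshold(stop_threshold, eval_metric):
    """Return True when a threshold is set and the metric has reached it."""
    if stop_threshold is None:
        return False
    return eval_metric >= stop_threshold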
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.

  Returns:
    Dict of results of the run. Contains the keys `eval_results`,
    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list of
    the hook instances used during training.
  """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["max_length"] = flags_obj.max_length or params['max_length']

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    total_batch_size = params["batch_size"]
    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_replica_batch_size(
            params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=total_batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(flags_obj, params, schedule_manager)
    stats = run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)

    if flags_obj.export_dir and not params["use_tpu"]:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file},
            strip_default_attrs=True)
    return stats
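# To inspect the export above without writing any model code, the SavedModel
# can be loaded directly in TF 1.x. The path below is illustrative;
# export_savedmodel writes a timestamped subdirectory under export_dir.
import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING],
        '/tmp/transformer_export/1554321234')
    print(list(meta_graph.signature_def.keys()))  # Available signatures.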
def run_mnist(flags_obj):
    """Run MNIST training and eval loop.
  Args:
    flags_obj: An object containing parsed flag values.
  """
    model_helpers.apply_clean(flags_obj)
    model_function = model_fn

    # Get number of GPUs as defined by the --num_gpus flags and the number of
    # GPUs available on the machine.
    num_gpus = flags_core.get_num_gpus(flags_obj)
    multi_gpu = num_gpus > 1

    if multi_gpu:
        # Validate that the batch size can be split into devices.
        distribution_utils.per_device_batch_size(flags_obj.batch_size,
                                                 num_gpus)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_fn,
            loss_reduction=tf.losses.Reduction.MEAN,
            devices=["/device:GPU:%d" % d for d in range(num_gpus)])

    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    mnist_classifier = tf.estimator.Estimator(model_fn=model_function,
                                              model_dir=flags_obj.model_dir,
                                              params={
                                                  'data_format': data_format,
                                                  'multi_gpu': multi_gpu
                                              })

    # Set up training and evaluation input functions.
    def train_input_fn():
        """Prepare data for training."""

        # When choosing shuffle buffer sizes, larger sizes result in better
        # randomness, while smaller sizes use less memory. MNIST is a small
        # enough dataset that we can easily shuffle the full epoch.
        ds = dataset.train(flags_obj.data_dir)
        ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size)

        # Iterate through the dataset a set number (`epochs_between_evals`) of times
        # during each training session.
        ds = ds.repeat(flags_obj.epochs_between_evals)
        return ds

    def eval_input_fn():
        return dataset.test(flags_obj.data_dir).batch(
            flags_obj.batch_size).make_one_shot_iterator().get_next()

    # Set up hook that outputs training logs every 100 steps.
    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    # Train and evaluate model.
    for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
        mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
        eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
        print('\nEvaluation results:\n\t%s\n' % eval_results)

        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             eval_results['accuracy']):
            break

    # Export the model
    if flags_obj.export_dir is not None:
        image = tf.placeholder(tf.float32, [None, 28, 28])
        input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
            'image': image,
        })
        mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn)
Exemple #36
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    print("run_transformer")
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["static_batch"] = flags_obj.static_batch
    params["allow_ffn_pad"] = True

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (flags_obj.batch_size
                            or params["default_batch_size"])

    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=False,
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
        use_tpu=False  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    #  estimator = construct_estimator(flags_obj, params, schedule_manager)

    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        intra_op_parallelism_threads=0,
        gpu_options=tf.GPUOptions(force_gpu_compatible=True))

    print("SESS_CONFIG: ", sess_config)
    config = RunConfig(session_config=sess_config,
                       model_dir=params["model_dir"])
    variable_strategy = 'GPU'
    use_distortion_for_training = True
    experiment_fn = get_experiment_fn(config.is_chief, flags_obj, params,
                                      schedule_manager, num_gpus,
                                      variable_strategy,
                                      use_distortion_for_training)

    tf.contrib.learn.learn_runner.run(experiment_fn,
                                      run_config=config,
                                      hparams=tf.contrib.training.HParams(
                                          is_chief=config.is_chief, **params))
    '''
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)
  '''

    if flags_obj.export_dir:
        # The estimator construction above is commented out in this variant,
        # so build one here; otherwise `estimator` would be undefined below.
        estimator = construct_estimator(flags_obj, params, schedule_manager)
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file},
            strip_default_attrs=True)
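# get_experiment_fn is not shown in this example. For learn_runner.run to
# work, it must return a function mapping (run_config, hparams) to a
# tf.contrib.learn.Experiment, roughly as sketched below; the input-fn names
# are hypothetical stand-ins.
def get_experiment_fn(is_chief, flags_obj, params, schedule_manager, num_gpus,
                      variable_strategy, use_distortion_for_training):
    def experiment_fn(run_config, hparams):
        estimator = tf.estimator.Estimator(
            model_fn=model_fn, config=run_config, params=params)
        return tf.contrib.learn.Experiment(
            estimator,
            train_input_fn=train_input_fn,  # hypothetical
            eval_input_fn=eval_input_fn,    # hypothetical
            train_steps=schedule_manager.train_steps)
    return experiment_fn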
Exemple #37
def run_ncf(_):
    """Run NCF training and eval loop."""
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max([FLAGS.batch_size, eval_per_user]))
    if eval_batch_size % eval_per_user:
        eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    if FLAGS.use_synthetic_data:
        ncf_dataset = None
        cleanup_fn = lambda: None
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            num_neg=FLAGS.num_neg,
            epochs_per_cycle=FLAGS.epochs_between_evals,
            match_mlperf=FLAGS.ml_perf,
            deterministic=FLAGS.seed is not None,
            use_subprocess=FLAGS.use_subprocess,
            cache_id=FLAGS.cache_id)
        num_users = ncf_dataset.num_users
        num_items = ncf_dataset.num_items
        num_train_steps = int(
            np.ceil(FLAGS.epochs_between_evals *
                    ncf_dataset.num_train_positives * (1 + FLAGS.num_neg) /
                    FLAGS.batch_size))
        num_eval_steps = int(
            np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                    eval_batch_size))

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "use_seed": FLAGS.seed is not None,
            "hash_pipeline": FLAGS.hash_pipeline,
            "batch_size": batch_size,
            "eval_batch_size": eval_batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": num_users,
            "num_items": num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "num_neg": FLAGS.num_neg,
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
            "beta1": FLAGS.beta1,
            "beta2": FLAGS.beta2,
            "epsilon": FLAGS.epsilon,
            "match_mlperf": FLAGS.ml_perf,
            "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    pred_input_fn = None
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    target_reached = False
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
        assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                value=cycle_index)

        # Train the model
        train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_input_fn(
              ncf_dataset=ncf_dataset, is_training=True)

        if batch_count != num_train_steps:
            raise ValueError(
                "Step counts do not match. ({} vs. {}) The async process is "
                "producing incorrect shards.".format(batch_count,
                                                     num_train_steps))

        train_estimator.train(input_fn=train_input_fn,
                              hooks=train_hooks,
                              steps=num_train_steps)
        if train_record_dir:
            tf.gfile.DeleteRecursively(train_record_dir)

        tf.logging.info("Beginning evaluation.")
        if pred_input_fn is None:
            pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
                ncf_dataset=ncf_dataset, is_training=False)

            if eval_batch_count != num_eval_steps:
                raise ValueError(
                    "Step counts do not match. ({} vs. {}) The async process is "
                    "producing incorrect shards.".format(
                        eval_batch_count, num_eval_steps))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = eval_estimator.evaluate(pred_input_fn,
                                               steps=num_eval_steps)
        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        tf.logging.info("Evaluation complete.")

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                                value={
                                    "epoch": cycle_index,
                                    "value": FLAGS.hr_threshold
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                                value={
                                    "epoch": cycle_index,
                                    "value": hr
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                                value={
                                    "epoch": cycle_index,
                                    "value": rconst.NUM_EVAL_NEGATIVES
                                })

        # Logged by the async process during record creation.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                                deferred=True)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                value=cycle_index)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
    cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
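# The eval_batch_size rounding above can be sanity-checked numerically.
# rconst.NUM_EVAL_NEGATIVES is assumed to be 999 here (1000 eval examples per
# user), and the requested size is chosen purely for illustration:
eval_per_user = 999 + 1                 # rconst.NUM_EVAL_NEGATIVES + 1
eval_batch_size = 100100
if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
assert eval_batch_size == 100000        # Rounded down to a multiple of 1000.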
Exemple #38
  def __init__(self, flags_obj):
    """Init function of TransformerMain.

    Args:
      flags_obj: Object containing parsed flag values, i.e., FLAGS.

    Raises:
      ValueError: if not using static batch for input data on TPU.
    """
    self.flags_obj = flags_obj
    self.predict_model = None

    # Add flag-defined parameters to params object
    num_gpus = flags_core.get_num_gpus(flags_obj)
    self.params = params = misc.get_model_params(flags_obj.param_set, num_gpus)

    params["num_gpus"] = num_gpus
    params["use_ctl"] = flags_obj.use_ctl
    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["static_batch"] = flags_obj.static_batch
    params["max_length"] = flags_obj.max_length
    params["decode_batch_size"] = flags_obj.decode_batch_size
    params["decode_max_length"] = flags_obj.decode_max_length
    params["padded_decode"] = flags_obj.padded_decode
    params["num_parallel_calls"] = (
        flags_obj.num_parallel_calls or tf.data.experimental.AUTOTUNE)

    params["use_synthetic_data"] = flags_obj.use_synthetic_data
    params["batch_size"] = flags_obj.batch_size or params["default_batch_size"]
    params["repeat_dataset"] = None
    params["dtype"] = flags_core.get_tf_dtype(flags_obj)
    params["enable_metrics_in_training"] = flags_obj.enable_metrics_in_training

    if params["dtype"] == tf.float16:
      # TODO(reedwm): It's pretty ugly to set the global policy in a constructor
      # like this. What if multiple instances of TransformerTask are created?
      # We should have a better way in the tf.keras.mixed_precision API of doing
      # this.
      loss_scale = flags_core.get_loss_scale(flags_obj,
                                             default_for_fp16="dynamic")
      policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
          "mixed_float16", loss_scale=loss_scale)
      tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

    if params["dtype"] == tf.bfloat16:
      policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
          "mixed_bfloat16")
      tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

    self.distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=num_gpus,
        tpu_address=flags_obj.tpu or "")
    if self.use_tpu:
      params["num_replicas"] = self.distribution_strategy.num_replicas_in_sync
      if not params["static_batch"]:
        raise ValueError("TPU requires static batch for input data.")
    else:
      logging.info("Running transformer with num_gpus = %d", num_gpus)

    if self.distribution_strategy:
      logging.info("For training, using distribution strategy: %s",
                   self.distribution_strategy)
    else:
      logging.info("Not using any distribution strategy.")
Exemple #39
def run_keras_model_benchmark(_):
    """Run the benchmark on keras model."""
    new_job_thread = threading.Thread(target=receive,
                                      args=(
                                          FLAGS.server_address.split(':')[0],
                                          FLAGS.port,
                                      ),
                                      daemon=True)
    new_job_thread.start()
    # Ensure a valid model name was supplied via command line argument
    if FLAGS.model not in MODELS.keys():
        raise AssertionError("The --model command line argument should "
                             "be a key in the `MODELS` dictionary.")
    # print(FLAGS.gpus_list)
    # exit()
    # Check if eager execution is enabled
    if FLAGS.eager:
        tf.logging.info("Eager execution is enabled...")
        tf.enable_eager_execution()

    # Load the model
    tf.logging.info("Benchmark on {} model...".format(FLAGS.model))
    keras_model = MODELS[FLAGS.model]
    model = keras_model(weights=None)

    # Get dataset
    dataset_name = "ImageNet"
    if FLAGS.use_synthetic_data:
        tf.logging.info("Using synthetic dataset...")
        dataset_name += "_Synthetic"
        train_dataset = dataset.generate_synthetic_input_dataset(
            FLAGS.model, FLAGS.batch_size)
        val_dataset = dataset.generate_synthetic_input_dataset(
            FLAGS.model, FLAGS.batch_size)
    else:
        raise ValueError("Only synthetic dataset is supported!")

    num_gpus = flags_core.get_num_gpus(FLAGS)

    distribution = None
    # Use distribution strategy
    if FLAGS.dist_strat:
        distribution = distribution_utils.get_distribution_strategy(
            num_gpus=num_gpus)
    elif num_gpus > 1:
        # Run with multi_gpu_model
        # If eager execution is enabled, only one GPU is utilized even if multiple
        # GPUs are provided.
        if FLAGS.eager:
            tf.logging.warning(
                "{} GPUs are provided, but only one GPU is utilized as "
                "eager execution is enabled.".format(num_gpus))
        model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

    # The Adam optimizer and some other optimizers don't work well with
    # distribution strategy (b/113076709), so use GradientDescentOptimizer
    # here.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"],
                  distribute=distribution)

    # Create benchmark logger for benchmark logging
    run_params = {
        "batch_size": FLAGS.batch_size,
        "synthetic_data": FLAGS.use_synthetic_data,
        "train_epochs": FLAGS.train_epochs,
        "num_train_images": FLAGS.num_train_images,
        "num_eval_images": FLAGS.num_eval_images,
    }

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name=FLAGS.model,
                                  dataset_name=dataset_name,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    class LossHistory(tf.keras.callbacks.Callback):
        def __init__(self):
            super(LossHistory, self).__init__()
            self.start = time.time()

        def on_train_begin(self, logs=None):
            return

        def on_epoch_end(self, epoch, logs=None):
            global training_flags, have_trained
            # Both 'g' (grow) and 's' (shrink) stop training for rescheduling.
            if job_status in ('g', 's'):
                training_flags = 1
                have_trained = epoch + 1
                self.model.stop_training = True

        def on_batch_end(self, batch, logs=None):
            global lock
            if batch == 49 and lock:
                hundred = time.time() - self.start
                # calculate the speed and unlock job
                msg = {}
                msg['id'] = FLAGS.id
                msg['status'] = 'un'
                msg['ep_tm'] = FLAGS.num_train_images * hundred / (
                    FLAGS.batch_size * 50)
                send_msg(FLAGS.server_address, msg)
                lock = False

    # Create callbacks that log metric values about the training and evaluation
    callbacks = model_callbacks.get_model_callbacks(
        FLAGS.callbacks,
        batch_size=FLAGS.batch_size,
        metric_logger=benchmark_logger)
    callbacks.append(LossHistory())
    # Train and evaluate the model
    history = model.fit(
        train_dataset,
        epochs=FLAGS.train_epochs,
        callbacks=callbacks,
        validation_data=val_dataset,
        steps_per_epoch=int(np.ceil(FLAGS.num_train_images /
                                    FLAGS.batch_size)),
    )
    ''' No need for evaluation part
    tf.logging.info("Logging the evaluation results...")
    for epoch in range(FLAGS.train_epochs):
        eval_results = {
                "accuracy": history.history["val_acc"][epoch],
                "loss": history.history["val_loss"][epoch],
                tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
                        FLAGS.num_eval_images/FLAGS.batch_size)
        }
        benchmark_logger.log_evaluation_result(eval_results)
    '''

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
    # Now end the training send back message
    msg = {}
    remain_ep = FLAGS.train_epochs - have_trained
    if training_flags == 0 or remain_ep == 0:
        msg['status'] = 'e'
        msg['id'] = FLAGS.id
        # send_msg(FLAGS.server_address, msg)
    else:
        # ask the scheduler to re-run
        # growing is needed
        gpus_loc = {}
        flags_gpu_list = [int(i) for i in FLAGS.gpus_list]
        if job_status == 'g':
            new_gpus_list = gpus + flags_gpu_list
            msg['status'] = 'g'
        else:
            new_gpus_list = list(set(flags_gpu_list).difference(set(gpus)))
            msg['status'] = 's'
        # TODO hardcoded here
        gpus_loc['localhost'] = new_gpus_list
        msg['gpus_loc'] = gpus_loc
        msg['id'] = FLAGS.id
        msg['ep'] = FLAGS.train_epochs - have_trained
        # send_msg(FLAGS.server_address, msg)

    global exit_code
    exit_code = True
    time.sleep(1)
    send_msg(FLAGS.server_address, msg)
    print('exit')
    exit()
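# receive and send_msg implement the job-control side channel used above but
# are not shown. A minimal sketch, assuming a JSON-over-TCP wire format (the
# protocol here is entirely an assumption):
import json
import socket

def send_msg(server_address, msg):
    # Assumed format of server_address: 'host:port'.
    host, port = server_address.split(':')
    with socket.create_connection((host, int(port))) as sock:
        sock.sendall(json.dumps(msg).encode('utf-8'))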
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

    model_helpers.apply_clean(flags.FLAGS)

    # Ensures flag override logic is only executed if explicitly triggered.
    if flags_obj.tf_gpu_thread_mode:
        override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

    # Creates the session config. allow_soft_placement=True is required for
    # multi-GPU and is not harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

    # Creates a `RunConfig` that checkpoints every 24 hours, which effectively
    # leaves checkpoint timing to `epochs_between_evals`.
    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                        session_config=session_config,
                                        save_checkpoints_secs=60 * 60 * 24)

    # Initializes model with all but the dense layer from pretrained ResNet.
    if flags_obj.pretrained_model_checkpoint_path is not None:
        warm_start_settings = tf.estimator.WarmStartSettings(
            flags_obj.pretrained_model_checkpoint_path,
            vars_to_warm_start='^(?!.*dense)')
    else:
        warm_start_settings = None

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        warm_start_from=warm_start_settings,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj),
            'fine_tune': flags_obj.fine_tune
        })

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + '-synthetic'

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train(num_epochs):
        return input_function(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=num_epochs,
            dtype=flags_core.get_tf_dtype(flags_obj),
            datasets_num_private_threads=(
                flags_obj.datasets_num_private_threads),
            num_parallel_batches=flags_obj.datasets_num_parallel_batches)

    def input_fn_eval():
        return input_function(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=1,
            dtype=flags_core.get_tf_dtype(flags_obj))

    if flags_obj.eval_only or not flags_obj.train_epochs:
        # If --eval_only is set, perform a single loop with zero train epochs.
        schedule, n_loops = [0], 1
    else:
        # Compute the number of times to loop while training. All but the last
        # pass will train for `epochs_between_evals` epochs, while the last will
        # train for the number needed to reach `training_epochs`. For instance if
        #   train_epochs = 25 and epochs_between_evals = 10
        # schedule will be set to [10, 10, 5]. That is to say, the loop will:
        #   Train for 10 epochs and then evaluate.
        #   Train for another 10 epochs and then evaluate.
        #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
        n_loops = math.ceil(flags_obj.train_epochs /
                            flags_obj.epochs_between_evals)
        schedule = [
            flags_obj.epochs_between_evals for _ in range(int(n_loops))
        ]
        schedule[-1] = flags_obj.train_epochs - sum(
            schedule[:-1])  # Correct for over-counting.

    # Estimator.train/evaluate manage their own tf.Session internally, so they
    # cannot be wrapped in sess.run() to collect RunMetadata (the original
    # code here would raise a TypeError). To capture Chrome-trace timelines,
    # attach a ProfilerHook instead; the save interval below is illustrative.
    profiler_hook = tf.train.ProfilerHook(save_steps=1000, output_dir='.')

    for cycle_index, num_train_epochs in enumerate(schedule):
        tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

        if num_train_epochs:
            classifier.train(
                input_fn=lambda: input_fn_train(num_train_epochs),
                hooks=train_hooks + [profiler_hook],
                max_steps=flags_obj.max_train_steps)

        tf.logging.info('Starting to evaluate.')

        # flags_obj.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic data,
        # which will iterate forever. Passing steps=flags_obj.max_train_steps
        # allows the eval (which is generally unimportant in those
        # circumstances) to terminate. Note that eval will run for
        # max_train_steps each loop, regardless of the global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags_obj.max_train_steps)
        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        export_dtype = flags_core.get_tf_dtype(flags_obj)
        if flags_obj.image_bytes_as_serving_input:
            input_receiver_fn = functools.partial(image_bytes_serving_input_fn,
                                                  shape,
                                                  dtype=export_dtype)
        else:
            input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
                shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
        classifier.export_savedmodel(flags_obj.export_dir,
                                     input_receiver_fn,
                                     strip_default_attrs=True)
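# The ProfilerHook used above writes files named timeline-<step>.json in
# Chrome trace format; they can be inspected by loading them into
# chrome://tracing in a Chromium-based browser.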
Exemple #41
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

    model_helpers.apply_clean(flags.FLAGS)

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                        session_config=session_config)

    # Initializes the model with all but the dense layer from the pretrained
    # ResNet.
    if flags_obj.pretrained_model_checkpoint_path is not None:
        warm_start_settings = tf.estimator.WarmStartSettings(
            flags_obj.pretrained_model_checkpoint_path,
            vars_to_warm_start='^(?!.*dense)')
    else:
        warm_start_settings = None

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        warm_start_from=warm_start_settings,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj),
            'fine_tune': flags_obj.fine_tune
        })

    run_params = {
        'batch_size': flags_obj.batch_size,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + '-synthetic'

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train(num_epochs):
        return input_function(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=num_epochs,
            num_gpus=flags_core.get_num_gpus(flags_obj),
            dtype=flags_core.get_tf_dtype(flags_obj))

    def input_fn_eval():
        return input_function(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=1,
            dtype=flags_core.get_tf_dtype(flags_obj))

    if flags_obj.eval_only or not flags_obj.train_epochs:
        # If --eval_only is set, perform a single loop with zero train epochs.
        schedule, n_loops = [0], 1
    else:
        # Compute the number of times to loop while training. All but the last
        # pass will train for `epochs_between_evals` epochs, while the last will
        # train for the number needed to reach `training_epochs`. For instance if
        #   train_epochs = 25 and epochs_between_evals = 10
        # schedule will be set to [10, 10, 5]. That is to say, the loop will:
        #   Train for 10 epochs and then evaluate.
        #   Train for another 10 epochs and then evaluate.
        #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
        n_loops = math.ceil(flags_obj.train_epochs /
                            flags_obj.epochs_between_evals)
        schedule = [
            flags_obj.epochs_between_evals for _ in range(int(n_loops))
        ]
        schedule[-1] = flags_obj.train_epochs - sum(
            schedule[:-1])  # Trim the last pass to avoid over-counting.

    for cycle_index, num_train_epochs in enumerate(schedule):
        tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

        if num_train_epochs:
            classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                             hooks=train_hooks,
                             max_steps=flags_obj.max_train_steps)

        tf.logging.info('Starting to evaluate.')

        # flags_obj.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic data, which
        # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
        # eval (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags_obj.max_train_steps)

        benchmark_logger.log_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags_obj.batch_size)
        classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
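
A standalone sketch of the schedule arithmetic used above, plugging in the numbers from the comment (train_epochs = 25, epochs_between_evals = 10):

import math

train_epochs, epochs_between_evals = 25, 10

n_loops = math.ceil(train_epochs / epochs_between_evals)  # 3
schedule = [epochs_between_evals] * n_loops               # [10, 10, 10]
schedule[-1] = train_epochs - sum(schedule[:-1])          # [10, 10, 5]

assert schedule == [10, 10, 5] and sum(schedule) == train_epochs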
Example #42
def run_deep_speech(_):
    """Run deep speech training and eval loop."""
    tf.set_random_seed(flags_obj.seed)
    # Data preprocessing
    tf.logging.info("Data preprocessing...")
    # Dataset generation is disabled in this variant:
    # train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
    # eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

    # Number of label classes. Label string is "[a-z]' -". Hard-coded to 30
    # because dataset generation is disabled; otherwise it would be
    # len(train_speech_dataset.speech_labels).
    num_classes = 30

    # Use distribution strategy for multi-GPU training. Note that in this
    # TPU variant the strategy is computed but unused; the RunConfig below
    # comes from tf.contrib.tpu instead.
    num_gpus = flags_core.get_num_gpus(flags_obj)
    distribution_strategy = distribution_utils.get_distribution_strategy(
        num_gpus)

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        flags_obj.tpu,
        zone=flags_obj.tpu_zone,
        project=flags_obj.gcp_project)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=flags_obj.model_dir,
        session_config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=True),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=flags_obj.iterations,
            num_shards=flags_obj.num_shards))

    # run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy)

    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        model_dir=flags_obj.model_dir,
        use_tpu=flags_obj.use_tpu,
        train_batch_size=flags_obj.batch_size,
        eval_batch_size=flags_obj.batch_size,
        params={"num_classes": num_classes},
        config=run_config)

    # Benchmark logging
    run_params = {
        "batch_size": flags_obj.batch_size,
        "train_epochs": flags_obj.train_epochs,
        "rnn_hidden_size": flags_obj.rnn_hidden_size,
        "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
        "rnn_type": flags_obj.rnn_type,
        "is_bidirectional": flags_obj.is_bidirectional,
        "use_bias": flags_obj.use_bias,
    }

    dataset_name = "Tuda Data"
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(
        "deep_speech", dataset_name, run_params, test_id=flags_obj.benchmark_test_id
    )

    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size
    )

    per_device_batch_size = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, num_gpus
    )
    # TODO

    def input_fn_train(params):
        # ds = dataset.input_fn(per_device_batch_size, train_speech_dataset)
        ds = test.input_fn(per_device_batch_size, '/content/records_test.csv')
        return ds

    def input_fn_eval(params):
        # NOTE: eval_speech_dataset is undefined while dataset generation is
        # disabled above, so calling this will raise a NameError.
        return test.input_fn(params['batch_size'], eval_speech_dataset)

    # def input_fn_predict(features, batch_size):
    #     dataset = tf.data.Dataset.from_tensor_slices(features)
    #     dataset = dataset.batch(batch_size)
    #     return dataset
    #     # Alternative:
    #     # return dataset.input_fn(per_device_batch_size, eval_speech_dataset)

    total_training_cycle = flags_obj.train_epochs // flags_obj.epochs_between_evals
    for cycle_index in range(total_training_cycle):
        tf.logging.info(
            "Starting a training cycle: %d/%d", cycle_index + 1, total_training_cycle
        )

        # Perform batch_wise dataset shuffling
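
One detail worth noting in the snippet above: TPUEstimator rejects a `batch_size` key in its params dict and instead injects the per-shard batch size into params['batch_size'] when it invokes each input_fn. A hedged sketch of an input_fn that honors that contract (test.input_fn and the CSV path come from the snippet, not a public API):

def input_fn_train_sketch(params):
    # params['batch_size'] is populated per shard by TPUEstimator.
    return test.input_fn(params['batch_size'], '/content/records_test.csv')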
Example #43
def run_mnist(flags_obj):
  """Run MNIST training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  model_helpers.apply_clean(flags_obj)
  model_function = model_fn

  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  distribution_strategy = distribution_utils.get_distribution_strategy(
      flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy, session_config=session_config)

  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  mnist_classifier = tf.estimator.Estimator(
      model_fn=model_function,
      model_dir=flags_obj.model_dir,
      config=run_config,
      params={
          'data_format': data_format,
      })

  # Set up training and evaluation input functions.
  def train_input_fn():
    """Prepare data for training."""

    # When choosing shuffle buffer sizes, larger sizes result in better
    # randomness, while smaller sizes use less memory. MNIST is a small
    # enough dataset that we can easily shuffle the full epoch.
    ds = dataset.train(flags_obj.data_dir)
    ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size)

    # Iterate through the dataset a set number (`epochs_between_evals`) of times
    # during each training session.
    ds = ds.repeat(flags_obj.epochs_between_evals)
    return ds

  def eval_input_fn():
    return dataset.test(flags_obj.data_dir).batch(
        flags_obj.batch_size).make_one_shot_iterator().get_next()

  # Set up hook that outputs training logs every 100 steps.
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  # Train and evaluate model.
  for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print('\nEvaluation results:\n\t%s\n' % eval_results)

    if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                         eval_results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    image = tf.placeholder(tf.float32, [None, 28, 28])
    input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
        'image': image,
    })
    mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn,
                                       strip_default_attrs=True)
Example #44
def run_ncf(_):
  """Run NCF training and eval loop."""
  # Data preprocessing
  # The file name of training and test dataset
  train_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TRAIN_RATINGS_FILENAME)
  test_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_RATINGS_FILENAME)
  neg_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_NEG_FILENAME)

  assert os.path.exists(train_fname), (
      "Run data_download.py first to download and extract {} dataset".format(
          FLAGS.dataset))

  tf.logging.info("Data preprocessing...")
  ncf_dataset = dataset.data_preprocessing(
      train_fname, test_fname, neg_fname, FLAGS.num_neg)

  # Create NeuMF model and convert it to Estimator
  tf.logging.info("Creating Estimator from Keras model...")
  layers = [int(layer) for layer in FLAGS.layers]
  mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
  keras_model = neumf_model.NeuMF(
      ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors,
      layers, FLAGS.batch_size, FLAGS.mf_regularization,
      mlp_regularization)
  num_gpus = flags_core.get_num_gpus(FLAGS)
  estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  # Training and evaluation cycle
  def train_input_fn():
    return dataset.input_fn(
        True,
        distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.epochs_between_evals)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    estimator.train(input_fn=train_input_fn, hooks=train_hooks)

    # Evaluate the model
    eval_results = evaluate_model(
        estimator, FLAGS.batch_size, num_gpus, ncf_dataset)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
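
The early-stopping check above relies on model_helpers.past_stop_threshold. Roughly, it behaves like the sketch below (the real helper also validates argument types and logs when the threshold is crossed):

def past_stop_threshold(stop_threshold, eval_metric):
    """Return True once eval_metric reaches stop_threshold (sketch)."""
    if stop_threshold is None:
        return False  # No threshold configured; never stop early.
    return eval_metric >= stop_threshold
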
def resnet_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to the estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purposes.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
    Dict of results of the run.
  """

  model_helpers.apply_clean(flags.FLAGS)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Creates session config. allow_soft_placement = True, is required for
  # multi-GPU and is not harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  distribution_strategy = distribution_utils.get_distribution_strategy(
      flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

  # Creates a `RunConfig` that checkpoints every 24 hours which essentially
  # results in checkpoints determined only by `epochs_between_evals`.
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
      save_checkpoints_secs=60*60*24)

  # Initializes model with all but the dense layer from pretrained ResNet.
  if flags_obj.pretrained_model_checkpoint_path is not None:
    warm_start_settings = tf.estimator.WarmStartSettings(
        flags_obj.pretrained_model_checkpoint_path,
        vars_to_warm_start='^(?!.*dense)')
  else:
    warm_start_settings = None

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      warm_start_from=warm_start_settings, params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  def input_fn_train(num_epochs):
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        dtype=flags_core.get_tf_dtype(flags_obj),
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        num_parallel_batches=flags_obj.datasets_num_parallel_batches)

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_tf_dtype(flags_obj))

  if flags_obj.eval_only or not flags_obj.train_epochs:
    # If --eval_only is set, perform a single loop with zero train epochs.
    schedule, n_loops = [0], 1
  else:
    # Compute the number of times to loop while training. All but the last
    # pass will train for `epochs_between_evals` epochs, while the last will
    # train for the number needed to reach `training_epochs`. For instance if
    #   train_epochs = 25 and epochs_between_evals = 10
    # schedule will be set to [10, 10, 5]. That is to say, the loop will:
    #   Train for 10 epochs and then evaluate.
    #   Train for another 10 epochs and then evaluate.
    #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
    n_loops = math.ceil(flags_obj.train_epochs / flags_obj.epochs_between_evals)
    schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
    schedule[-1] = flags_obj.train_epochs - sum(schedule[:-1])  # Trim overcount.

  for cycle_index, num_train_epochs in enumerate(schedule):
    tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

    if num_train_epochs:
      classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                       hooks=train_hooks, max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data, which
    # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
    # eval (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)

    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    export_dtype = flags_core.get_tf_dtype(flags_obj)
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(
          image_bytes_serving_input_fn, shape, dtype=export_dtype)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                 strip_default_attrs=True)
  return eval_results
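
distribution_utils.per_device_batch_size, used throughout these input functions, splits the global batch evenly across GPUs and rejects indivisible sizes rather than silently rounding. A hedged sketch of that behavior:

def per_device_batch_size(batch_size, num_gpus):
    """Sketch: divide the global batch size across num_gpus devices."""
    if num_gpus <= 1:
        return batch_size
    if batch_size % num_gpus:
        raise ValueError(
            'Batch size %d cannot be split evenly across %d GPUs.'
            % (batch_size, num_gpus))
    return batch_size // num_gpus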
Example #46
 def input_fn_eval():
   return input_function(
       is_training=False, data_dir=flags_obj.data_dir,
       batch_size=per_device_batch_size(
           flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
       num_epochs=1)
Example #47
def run_deep_speech(_):
    """Run deep speech training and eval loop."""
    tf.set_random_seed(flags_obj.seed)
    # Data preprocessing
    tf.logging.info("Data preprocessing...")
    train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
    eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

    # Number of label classes. Label string is "[a-z]' -"
    num_classes = len(train_speech_dataset.speech_labels)

    # Use distribution strategy for multi-gpu training
    num_gpus = flags_core.get_num_gpus(flags_obj)
    distribution_strategy = distribution_utils.get_distribution_strategy(
        num_gpus)
    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=flags_obj.model_dir,
                                       config=run_config,
                                       params={
                                           "num_classes": num_classes,
                                       })

    # Benchmark logging
    run_params = {
        "batch_size": flags_obj.batch_size,
        "train_epochs": flags_obj.train_epochs,
        "rnn_hidden_size": flags_obj.rnn_hidden_size,
        "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
        "rnn_type": flags_obj.rnn_type,
        "is_bidirectional": flags_obj.is_bidirectional,
        "use_bias": flags_obj.use_bias
    }

    dataset_name = "LibriSpeech"
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info("deep_speech",
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    per_device_batch_size = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, num_gpus)

    def input_fn_train():
        return dataset.input_fn(per_device_batch_size, train_speech_dataset)

    def input_fn_eval():
        return dataset.input_fn(per_device_batch_size, eval_speech_dataset)

    total_training_cycle = (flags_obj.train_epochs //
                            flags_obj.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: %d/%d", cycle_index + 1,
                        total_training_cycle)

        # Perform batch_wise dataset shuffling
        train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle(
            train_speech_dataset.entries, cycle_index, flags_obj.sortagrad,
            flags_obj.batch_size)

        estimator.train(input_fn=input_fn_train, hooks=train_hooks)

        # Evaluation
        tf.logging.info("Starting to evaluate...")

        eval_results = evaluate_model(estimator,
                                      eval_speech_dataset.speech_labels,
                                      eval_speech_dataset.entries,
                                      input_fn_eval)

        # Log the WER and CER results.
        benchmark_logger.log_evaluation_result(eval_results)
        tf.logging.info("Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
            cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY]))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(flags_obj.wer_threshold,
                                             eval_results[_WER_KEY]):
            break
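
The batch-wise shuffle above implements a SortaGrad-style curriculum: the first epoch keeps entries in their sorted order (typically by utterance length), and later epochs shuffle whole batches so in-batch length grouping survives. A hedged sketch of that idea, not the garden's exact implementation:

import random

def batch_wise_shuffle(entries, epoch_index, sortagrad, batch_size):
    if sortagrad and epoch_index == 0:
        return entries  # First epoch: keep the length-sorted order.
    batches = [entries[i:i + batch_size]
               for i in range(0, len(entries), batch_size)]
    random.shuffle(batches)
    return [entry for batch in batches for entry in batch]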
Example #48
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)
    movielens_dataset.construct_train_eval_csv(
        data_dir=FLAGS.data_dir, dataset=FLAGS.dataset)

  tf.logging.info("Data preprocessing...")
  ncf_dataset = movielens_dataset.data_preprocessing(
      FLAGS.data_dir, FLAGS.dataset, FLAGS.num_neg)

  model_helpers.apply_clean(flags.FLAGS)

  # Create NeuMF model and convert it to Estimator
  tf.logging.info("Creating Estimator from Keras model...")
  layers = [int(layer) for layer in FLAGS.layers]
  mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
  keras_model = neumf_model.NeuMF(
      ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors,
      layers, FLAGS.batch_size, FLAGS.mf_regularization,
      mlp_regularization)
  num_gpus = flags_core.get_num_gpus(FLAGS)
  estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  # Training and evaluation cycle
  def get_train_input_fn():
    return movielens_dataset.get_input_fn(
        True,
        distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.data_dir, FLAGS.dataset, FLAGS.epochs_between_evals)

  def get_pred_input_fn():
    return movielens_dataset.get_input_fn(
        False,
        distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.data_dir, FLAGS.dataset, 1)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    estimator.train(input_fn=get_train_input_fn(), hooks=train_hooks)

    # Evaluate the model
    eval_results = evaluate_model(
        estimator, FLAGS.batch_size, num_gpus, ncf_dataset, get_pred_input_fn())

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
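
For reference, the two metrics logged in the loop above reduce, per user with a single positive item ranked against sampled negatives, to the formulas sketched here (rank is 0-based; the _HR_KEY and _NDCG_KEY values average these over users):

import math

def hit_rate(rank, top_k=10):
    # HR@K: 1 if the positive item lands in the top K, else 0.
    return 1.0 if rank < top_k else 0.0

def ndcg(rank, top_k=10):
    # NDCG@K with one relevant item: 1 / log2(rank + 2) inside the top K.
    return 1.0 / math.log2(rank + 2) if rank < top_k else 0.0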