def test_batch_size(self):
     """Check per_device_batch_size for zero, one and seven GPUs.

     A global batch of 147 is expected to come back unchanged for 0 or 1 GPU
     and to be split into 21 per device for 7 GPUs.
     """
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(
         distribution_utils.per_device_batch_size(147, num_gpus=0), 147)
     self.assertEqual(
         distribution_utils.per_device_batch_size(147, num_gpus=1), 147)
     self.assertEqual(
         distribution_utils.per_device_batch_size(147, num_gpus=7), 21)
 def test_batch_size(self):
    """Check per_device_batch_size for zero, one and seven GPUs.

    A global batch of 147 is expected to come back unchanged for 0 or 1 GPU
    and to be split into 21 per device for 7 GPUs.
    """
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        distribution_utils.per_device_batch_size(147, num_gpus=0), 147)
    self.assertEqual(
        distribution_utils.per_device_batch_size(147, num_gpus=1), 147)
    self.assertEqual(
        distribution_utils.per_device_batch_size(147, num_gpus=7), 21)
Example #3
0
    def input_fn_train(start_index, num_steps):
        """Build the training input for a window of steps.

        Validation is driven by step counts rather than epochs, so a start
        index and a step count are forwarded and no `num_epochs` argument is
        passed (per the original author's note the epoch count is meant to
        stay at 1 -- that is decided inside `input_function`, not here).
        The image size from the flags is forwarded as well.

        Args:
            start_index: forwarded unchanged as `start_index`.
            num_steps: forwarded unchanged as `num_steps`.

        Returns:
            Whatever `input_function` returns for these arguments.
        """
        # Split the global batch across the GPUs selected by the flags.
        device_batch = distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))

        return input_function(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=device_batch,
            start_index=start_index,
            num_steps=num_steps,
            image_size=flags_obj.image_size,
            dtype=flags_core.get_tf_dtype(flags_obj),
            datasets_num_private_threads=(
                flags_obj.datasets_num_private_threads),
            num_parallel_batches=flags_obj.datasets_num_parallel_batches)
Example #4
0
 def __call__(self):
   """Return the training input built from the flags and self._num_epochs."""
   # Per-device share of the global batch size.
   device_batch = distribution_utils.per_device_batch_size(
       flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
   return input_function(
       is_training=True,
       data_dir=flags_obj.data_dir,
       batch_size=device_batch,
       num_epochs=self._num_epochs,
       num_gpus=flags_core.get_num_gpus(flags_obj))
Example #5
0
 def input_fn_eval():
   """Return the evaluation input: a single epoch, not in training mode."""
   eval_batch = distribution_utils.per_device_batch_size(
       flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
   return input_function(
       is_training=False,
       data_dir=flags_obj.data_dir,
       batch_size=eval_batch,
       num_epochs=1,
       dtype=flags_core.get_tf_dtype(flags_obj))
Example #6
0
 def input_fn_train():
   """Return the training input covering `epochs_between_evals` epochs."""
   train_batch = distribution_utils.per_device_batch_size(
       flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
   return input_function(
       is_training=True,
       data_dir=flags_obj.data_dir,
       batch_size=train_batch,
       num_epochs=flags_obj.epochs_between_evals,
       num_gpus=flags_core.get_num_gpus(flags_obj))
Example #7
0
 def input_fn_train(num_epochs):
   """Return the training input for the requested number of epochs.

   Args:
     num_epochs: forwarded unchanged as `num_epochs`.
   """
   train_batch = distribution_utils.per_device_batch_size(
       flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
   return input_function(
       is_training=True,
       data_dir=flags_obj.data_dir,
       batch_size=train_batch,
       num_epochs=num_epochs,
       num_gpus=flags_core.get_num_gpus(flags_obj))
Example #8
0
 def input_fn_eval():
     """Return the single-epoch evaluation input with the 'sg_settings' config."""
     eval_batch = distribution_utils.per_device_batch_size(
         flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
     return input_function(is_training=False,
                           data_dir=flags_obj.data_dir,
                           batch_size=eval_batch,
                           num_epochs=1,
                           sg_settings=net_data_configs['sg_settings'])
Example #9
0
 def input_fn_eval():
   """Return the evaluation input: one epoch, dtype taken from the flags."""
   per_gpu_batch = distribution_utils.per_device_batch_size(
       flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
   return input_function(is_training=False,
                         data_dir=flags_obj.data_dir,
                         batch_size=per_gpu_batch,
                         num_epochs=1,
                         dtype=flags_core.get_tf_dtype(flags_obj))
Example #10
0
 def input_fn_predict():
     """Return the prediction input: one epoch, not in training mode.

     `get_one_item` is switched off and the enclosing scope's `conf_matrix`
     is passed straight through to `input_function`.
     """
     predict_batch = distribution_utils.per_device_batch_size(
         flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
     return input_function(is_training=False,
                           data_dir=flags_obj.data_dir,
                           batch_size=predict_batch,
                           num_epochs=1,
                           get_one_item=False,
                           conf_matrix=conf_matrix)
 def input_fn_eval():
     """Return the single-epoch evaluation input using a device count of 1."""
     # The device count is hard-coded to 1 instead of asking
     # flags_core.get_num_gpus(): per the original author's note, calling
     # get_num_gpus() would occupy all GPUs.
     eval_batch = distribution_utils.per_device_batch_size(
         flags_obj.batch_size, 1)
     return input_function(is_training=False,
                           data_dir=flags_obj.data_dir,
                           batch_size=eval_batch,
                           num_epochs=1)
Example #12
0
 def input_fn_train(num_epochs):
     """Return the input for mode "train" over the requested epochs.

     Args:
         num_epochs: forwarded unchanged as `num_epochs`.
     """
     train_batch = distribution_utils.per_device_batch_size(
         flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
     return input_function(mode="train",
                           data_dir=flags_obj.data_dir,
                           batch_size=train_batch,
                           num_epochs=num_epochs,
                           num_gpus=flags_core.get_num_gpus(flags_obj),
                           dtype=flags_core.get_tf_dtype(flags_obj))
Example #13
0
 def input_fn_train(num_epochs):
   """Return the training input for the requested number of epochs.

   The dtype and the dataset threading/batching options come from the flags.

   Args:
     num_epochs: forwarded unchanged as `num_epochs`.
   """
   train_batch = distribution_utils.per_device_batch_size(
       flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
   return input_function(
       is_training=True,
       data_dir=flags_obj.data_dir,
       batch_size=train_batch,
       num_epochs=num_epochs,
       dtype=flags_core.get_tf_dtype(flags_obj),
       datasets_num_private_threads=flags_obj.datasets_num_private_threads,
       num_parallel_batches=flags_obj.datasets_num_parallel_batches)
Example #14
0
 def input_fn_train(num_epochs):
   """Return the training input for the requested number of epochs.

   The dtype and the dataset threading/batching options come from the flags.

   Args:
     num_epochs: forwarded unchanged as `num_epochs`.
   """
   per_gpu_batch = distribution_utils.per_device_batch_size(
       flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
   return input_function(
       is_training=True,
       data_dir=flags_obj.data_dir,
       batch_size=per_gpu_batch,
       num_epochs=num_epochs,
       dtype=flags_core.get_tf_dtype(flags_obj),
       datasets_num_private_threads=flags_obj.datasets_num_private_threads,
       num_parallel_batches=flags_obj.datasets_num_parallel_batches)
Example #15
0
 def input_fn_train(num_epochs):
     """Return the training input for the requested number of epochs.

     Besides the batch size and GPU count, the flag value
     `examples_per_epoch` and the 'sg_settings' entry of `net_data_configs`
     are forwarded.

     Args:
         num_epochs: forwarded unchanged as `num_epochs`.
     """
     train_batch = distribution_utils.per_device_batch_size(
         flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))
     return input_function(is_training=True,
                           data_dir=flags_obj.data_dir,
                           batch_size=train_batch,
                           num_epochs=num_epochs,
                           num_gpus=flags_core.get_num_gpus(flags_obj),
                           examples_per_epoch=flags_obj.examples_per_epoch,
                           sg_settings=net_data_configs['sg_settings'])
Example #16
0
def input_fn_cls(is_training, use_random_crop, num_epochs, flags_obj):
    """Build the supervised classification dataset as (images_dict, label).

    Args:
        is_training: passed to the filename lookup and to `input_fn`; also
            gates the mixup doubling below.
        use_random_crop: forwarded to `input_fn`.
        num_epochs: epoch count; doubled when mixup type 1 is used in training.
        flags_obj: flag container read for batch size, paths, regexes and
            preprocessing options.

    Returns:
        The zipped dataset after `flatten_input` has been mapped over it.
    """
    # With mixup type 1 in training, both the global batch size and the
    # number of epochs are doubled.
    if flags_obj.mixup_type == 1 and is_training:
        global_batch = flags_obj.batch_size * 2
        num_epochs = num_epochs * 2
    else:
        global_batch = flags_obj.batch_size

    device_batch = distribution_utils.per_device_batch_size(
        global_batch, flags_core.get_num_gpus(flags_obj))

    record_files = data_util.get_filenames(
        is_training,
        flags_obj.data_dir,
        train_regex=flags_obj.train_regex,
        val_regex=flags_obj.val_regex)
    tf.logging.info(
        'The # of Supervised tfrecords: {}'.format(len(record_files)))

    meta = data_config.get_config(flags_obj.dataset_name)
    supervised = input_fn(
        is_training,
        record_files,
        use_random_crop,
        device_batch,
        meta.num_train_files,
        meta.num_images['train'],
        meta.shuffle_buffer,
        meta.num_channels,
        num_epochs,
        flags_core.get_num_gpus(flags_obj),
        flags_core.get_tf_dtype(flags_obj),
        autoaugment_type=flags_obj.autoaugment_type,
        with_drawing_bbox=flags_obj.with_drawing_bbox,
        drop_remainder=False,
        preprocessing_type=flags_obj.preprocessing_type,
        return_logits=flags_obj.kd_temp > 0,
        dct_method=flags_obj.dct_method,
        parse_record_fn=data_util.parse_record_sup)
    datasets = [supervised]

    def flatten_input(*features):
        """Merge feature mappings into one dict, splitting off 'label'."""
        merged = {}
        for feature in features:
            for name in feature:
                if name == 'label':
                    label = feature[name]
                    continue
                merged[name] = feature[name]
        return merged, label

    zipped = tf.data.Dataset.zip(tuple(datasets))
    dataset = zipped.map(flatten_input)
    tf.logging.info('dataset = dataset.map(flatten_input)')
    tf.logging.info(dataset)
    return dataset
Example #17
0
def run_ncf(_):
  """Run NCF training and eval loop.

  Builds the data pipeline and the train/eval estimators from FLAGS, then
  alternates one training cycle with one evaluation until all cycles have run
  or `model_helpers.past_stop_threshold` reports that the hit-rate threshold
  has been passed.

  Args:
    _: Unused positional argument.
  """
  # Optionally fetch the dataset before the pipeline is instantiated.
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  # `batch_size` below is the per-device share of the global FLAGS.batch_size.
  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  # Fall back to the (global) training batch size when no eval size is set.
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  # Note: params["batch_size"] carries the per-device value, while the
  # `batch_size` keyword passed to construct_estimator is the global flag.
  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  # Parameters recorded with the benchmark run info.
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  # Expected number of training batches per cycle: every positive example is
  # paired with FLAGS.num_neg negatives, floor-divided by the global batch.
  approx_train_steps = int(ncf_dataset.num_train_positives
                           * (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)

  # Integer division: a trailing partial cycle is not run.
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))


    # Train the model
    train_input_fn, train_record_dir, batch_count = \
      data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

    # Only warn (do not fail) when the reported batch count drifts from the
    # estimate by more than one.
    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    # The training records for this cycle are removed once consumed.
    tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate the model
    eval_results = evaluate_model(
        eval_estimator, ncf_dataset, pred_input_fn)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
Example #18
0
 def pred_input_fn():
     """Return the non-training input for `ncf_dataset` at per-device batch size."""
     device_batch = distribution_utils.per_device_batch_size(
         batch_size, num_gpus)
     return dataset.input_fn(False, device_batch, ncf_dataset)
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using native Keras APIs.

    Args:
      flags_obj: An object containing parsed flag values.

    Returns:
      The value of `keras_common.build_stats` for the fit history, the
      evaluation output (None when evaluation is skipped) and the time
      callback.

    Raises:
      ValueError: If fp16 is passed as it is not currently supported.
    """
    if flags_obj.enable_eager:
        tf.enable_eager_execution()

    dtype = flags_core.get_tf_dtype(flags_obj)
    # NOTE(review): this compares the helper's result with the string 'fp16';
    # if get_tf_dtype returns a TensorFlow dtype object rather than the raw
    # flag string, this guard may never trigger -- confirm.
    if dtype == 'fp16':
        raise ValueError(
            'dtype fp16 is not supported in Keras. Use the default '
            'value(fp32).')

    # When no data format is given, pick channels_first on CUDA builds and
    # channels_last otherwise.
    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    per_device_batch_size = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))

    # pylint: disable=protected-access
    # Synthetic data replaces the real ImageNet input function when requested.
    if flags_obj.use_synthetic_data:
        input_fn = keras_common.get_synth_input_fn(
            height=imagenet_main.DEFAULT_IMAGE_SIZE,
            width=imagenet_main.DEFAULT_IMAGE_SIZE,
            num_channels=imagenet_main.NUM_CHANNELS,
            num_classes=imagenet_main.NUM_CLASSES,
            dtype=flags_core.get_tf_dtype(flags_obj))
    else:
        input_fn = imagenet_main.input_fn

    train_input_dataset = input_fn(is_training=True,
                                   data_dir=flags_obj.data_dir,
                                   batch_size=per_device_batch_size,
                                   num_epochs=flags_obj.train_epochs,
                                   parse_record_fn=parse_record_keras)

    # The eval dataset is also built with num_epochs=train_epochs; it is
    # passed to fit() as validation data as well as to evaluate() below.
    eval_input_dataset = input_fn(is_training=False,
                                  data_dir=flags_obj.data_dir,
                                  batch_size=per_device_batch_size,
                                  num_epochs=flags_obj.train_epochs,
                                  parse_record_fn=parse_record_keras)

    # NOTE(review): the strategy is built from flags_obj.num_gpus, whereas the
    # per-device batch size above uses flags_core.get_num_gpus(flags_obj) --
    # confirm the two always agree.
    strategy = distribution_utils.get_distribution_strategy(
        flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)

    strategy_scope = keras_common.get_strategy_scope(strategy)

    # The optimizer and model are created and compiled inside the strategy
    # scope.
    with strategy_scope:
        optimizer = keras_common.get_optimizer()
        model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES)

        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['sparse_categorical_accuracy'])

    time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks(
        learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])

    # Steps per epoch are derived from the global batch size (floor division).
    train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
    train_epochs = flags_obj.train_epochs

    # An explicit train_steps flag caps the step count and forces one epoch.
    if flags_obj.train_steps:
        train_steps = min(flags_obj.train_steps, train_steps)
        train_epochs = 1

    num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)

    validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        # Only build the training graph. This reduces memory usage introduced by
        # control flow ops in layers that have different implementations for
        # training and inference (e.g., batch norm).
        tf.keras.backend.set_learning_phase(1)
        num_eval_steps = None
        validation_data = None

    history = model.fit(
        train_input_dataset,
        epochs=train_epochs,
        steps_per_epoch=train_steps,
        callbacks=[time_callback, lr_callback, tensorboard_callback],
        validation_steps=num_eval_steps,
        validation_data=validation_data,
        verbose=1)

    # A final standalone evaluation runs unless evaluation is skipped.
    eval_output = None
    if not flags_obj.skip_eval:
        eval_output = model.evaluate(eval_input_dataset,
                                     steps=num_eval_steps,
                                     verbose=1)
    stats = keras_common.build_stats(history, eval_output, time_callback)
    return stats
Example #20
0
 def get_train_input_fn():
   """Return the MovieLens training input function for the current flags."""
   device_batch = distribution_utils.per_device_batch_size(
       FLAGS.batch_size, num_gpus)
   return movielens_dataset.get_input_fn(
       True,
       device_batch,
       ncf_dataset,
       FLAGS.data_dir,
       FLAGS.dataset,
       FLAGS.epochs_between_evals)
Example #21
0
def run_ncf(_):
  """Run NCF training and eval loop.

  Sets up the data pipeline (or synthetic-data constants), builds either the
  estimator pair or an `NcfModelRunner` depending on FLAGS.use_estimator, and
  then alternates training and evaluation per cycle. The loop ends early once
  `model_helpers.past_stop_threshold` reports that the hit rate has passed
  FLAGS.hr_threshold.

  Args:
    _: Unused positional argument.

  Raises:
    ValueError: In the estimator path, if the batch count reported by the
      input function differs from the expected train or eval step count.
  """
  # Real data is only downloaded when synthetic data is not requested.
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  # `batch_size` is the per-device share of the global FLAGS.batch_size.
  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  # Integer division: a trailing partial cycle is not run.
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  # Each user contributes NUM_EVAL_NEGATIVES + 1 eval examples; the eval batch
  # size is rounded down to a multiple of that so users are not split.
  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    # No pipeline is created: sizes and step counts come from constants and
    # cleanup is a no-op.
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    # Steps per cycle, rounded up. Note this divides by the global
    # FLAGS.batch_size, not the per-device `batch_size` given to the pipeline.
    num_train_steps = int(np.ceil(
        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
        (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
                                 ncf_dataset.num_users / eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  # Model/runtime parameters shared by the estimator and the runner paths.
  params = {
      "use_seed": FLAGS.seed is not None,
      "hash_pipeline": FLAGS.hash_pipeline,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": FLAGS.learning_rate,
      "num_users": num_users,
      "num_items": num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "num_neg": FLAGS.num_neg,
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
      "beta1": FLAGS.beta1,
      "beta2": FLAGS.beta2,
      "epsilon": FLAGS.epsilon,
      "match_mlperf": FLAGS.ml_perf,
      "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      "use_estimator": FLAGS.use_estimator,
  }
  # Exactly one of (train_estimator, eval_estimator) or `runner` is defined,
  # matching the branch taken inside the training loop below.
  if FLAGS.use_estimator:
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus, model_dir=FLAGS.model_dir,
        iterations=num_train_steps, params=params,
        batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
  else:
    runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
                                         num_eval_steps, FLAGS.use_while_loop)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  # Parameters recorded with the benchmark run info.
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)


  # The eval input function is created lazily on the first cycle and reused.
  eval_input_fn = None
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    # MLPerf logging is only allowed with one epoch between evaluations.
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    if FLAGS.use_estimator:
      train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=True)

      if batch_count != num_train_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(batch_count, num_train_steps))

      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=num_train_steps)
      # Remove this cycle's training records, if a directory was reported.
      if train_record_dir:
        tf.gfile.DeleteRecursively(train_record_dir)

      tf.logging.info("Beginning evaluation.")
      if eval_input_fn is None:
        eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=False)

        if eval_batch_count != num_eval_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  eval_batch_count, num_eval_steps))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = eval_estimator.evaluate(eval_input_fn,
                                             steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")
    else:
      # Non-estimator path: the runner handles both training and evaluation.
      runner.train()
      tf.logging.info("Beginning evaluation.")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = runner.eval()
      tf.logging.info("Evaluation complete.")
    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
Example #22
0
def run_ncf(_):
    """Run NCF training and eval loop.

    Sets up the data pipeline (or synthetic-data constants), builds the
    train/eval estimators, and then alternates training and evaluation per
    cycle. The loop ends early once `model_helpers.past_stop_threshold`
    reports that the hit rate has passed FLAGS.hr_threshold.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If the batch count reported by the input function differs
            from the expected train or eval step count.
    """
    # Real data is only downloaded when synthetic data is not requested.
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    # `batch_size` is the per-device share of the global FLAGS.batch_size.
    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    # Each user contributes NUM_EVAL_NEGATIVES + 1 eval examples; the eval
    # batch size is rounded down to a multiple of that so users are not split.
    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max([FLAGS.batch_size, eval_per_user]))
    if eval_batch_size % eval_per_user:
        eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    if FLAGS.use_synthetic_data:
        # No pipeline is created: sizes and step counts come from constants
        # and cleanup is a no-op.
        ncf_dataset = None
        cleanup_fn = lambda: None
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            num_neg=FLAGS.num_neg,
            epochs_per_cycle=FLAGS.epochs_between_evals,
            match_mlperf=FLAGS.ml_perf,
            deterministic=FLAGS.seed is not None,
            use_subprocess=FLAGS.use_subprocess,
            cache_id=FLAGS.cache_id)
        num_users = ncf_dataset.num_users
        num_items = ncf_dataset.num_items
        # Steps per cycle, rounded up. Note this divides by the global
        # FLAGS.batch_size, not the per-device `batch_size` given to the
        # pipeline.
        num_train_steps = int(
            np.ceil(FLAGS.epochs_between_evals *
                    ncf_dataset.num_train_positives * (1 + FLAGS.num_neg) /
                    FLAGS.batch_size))
        num_eval_steps = int(
            np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                    eval_batch_size))

    model_helpers.apply_clean(flags.FLAGS)

    # params["batch_size"] carries the per-device value, while the
    # `batch_size` keyword passed to construct_estimator is the global flag.
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "use_seed": FLAGS.seed is not None,
            "hash_pipeline": FLAGS.hash_pipeline,
            "batch_size": batch_size,
            "eval_batch_size": eval_batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": num_users,
            "num_items": num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "num_neg": FLAGS.num_neg,
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
            "beta1": FLAGS.beta1,
            "beta2": FLAGS.beta2,
            "epsilon": FLAGS.epsilon,
            "match_mlperf": FLAGS.ml_perf,
            "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})
    # Parameters recorded with the benchmark run info.
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    # The eval input function is created lazily on the first cycle and reused.
    pred_input_fn = None
    # Integer division: a trailing partial cycle is not run.
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    target_reached = False
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
        # MLPerf logging is only allowed with one epoch between evaluations.
        assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                value=cycle_index)

        # Train the model
        train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_input_fn(
              ncf_dataset=ncf_dataset, is_training=True)

        if batch_count != num_train_steps:
            raise ValueError(
                "Step counts do not match. ({} vs. {}) The async process is "
                "producing incorrect shards.".format(batch_count,
                                                     num_train_steps))

        train_estimator.train(input_fn=train_input_fn,
                              hooks=train_hooks,
                              steps=num_train_steps)
        # Remove this cycle's training records, if a directory was reported.
        if train_record_dir:
            tf.gfile.DeleteRecursively(train_record_dir)

        tf.logging.info("Beginning evaluation.")
        if pred_input_fn is None:
            pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
                ncf_dataset=ncf_dataset, is_training=False)

            if eval_batch_count != num_eval_steps:
                raise ValueError(
                    "Step counts do not match. ({} vs. {}) The async process is "
                    "producing incorrect shards.".format(
                        eval_batch_count, num_eval_steps))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = eval_estimator.evaluate(pred_input_fn,
                                               steps=num_eval_steps)
        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        tf.logging.info("Evaluation complete.")

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                                value={
                                    "epoch": cycle_index,
                                    "value": FLAGS.hr_threshold
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                                value={
                                    "epoch": cycle_index,
                                    "value": hr
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                                value={
                                    "epoch": cycle_index,
                                    "value": rconst.NUM_EVAL_NEGATIVES
                                })

        # Logged by the async process during record creation.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                                deferred=True)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                value=cycle_index)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
    cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
Example #23
0
def run_deep_speech(_):
  """Train and periodically evaluate the deep speech model.

  Configuration is read from the module-level `flags_obj`. Every cycle
  reshuffles the training entries, trains the estimator, evaluates it and
  logs WER/CER. The loop stops early when
  `model_helpers.past_stop_threshold` returns true for the measured WER.

  Args:
    _: Unused positional argument.
  """
  tf.set_random_seed(flags_obj.seed)

  # Build the training and evaluation datasets.
  tf.logging.info("Data preprocessing...")
  train_data = generate_dataset(flags_obj.train_data_dir)
  eval_data = generate_dataset(flags_obj.eval_data_dir)

  # Number of label classes. Label string is "[a-z]' -"
  label_count = len(train_data.speech_labels)

  # Multi-GPU training is driven through a distribution strategy.
  gpu_count = flags_core.get_num_gpus(flags_obj)
  strategy = distribution_utils.get_distribution_strategy(gpu_count)
  estimator_config = tf.estimator.RunConfig(train_distribute=strategy)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      model_dir=flags_obj.model_dir,
      config=estimator_config,
      params={"num_classes": label_count})

  # Record the run configuration with the benchmark logger. Each reported
  # key has the same name as the flag it is read from.
  reported_flags = (
      "batch_size",
      "train_epochs",
      "rnn_hidden_size",
      "rnn_hidden_layers",
      "rnn_type",
      "is_bidirectional",
      "use_bias",
  )
  run_params = {name: getattr(flags_obj, name) for name in reported_flags}

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      "deep_speech", "LibriSpeech", run_params,
      test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  device_batch_size = distribution_utils.per_device_batch_size(
      flags_obj.batch_size, gpu_count)

  def train_input():
    return dataset.input_fn(device_batch_size, train_data)

  def eval_input():
    return dataset.input_fn(device_batch_size, eval_data)

  cycle_total = flags_obj.train_epochs // flags_obj.epochs_between_evals
  for cycle_number in range(1, cycle_total + 1):
    tf.logging.info("Starting a training cycle: %d/%d",
                    cycle_number, cycle_total)

    # Batch-wise shuffling is keyed on the zero-based cycle index.
    train_data.entries = dataset.batch_wise_dataset_shuffle(
        train_data.entries, cycle_number - 1, flags_obj.sortagrad,
        flags_obj.batch_size)

    estimator.train(input_fn=train_input, hooks=train_hooks)

    tf.logging.info("Starting to evaluate...")
    eval_results = evaluate_model(
        estimator, eval_data.speech_labels, eval_data.entries, eval_input)

    # Report the WER and CER of this cycle.
    benchmark_logger.log_evaluation_result(eval_results)
    wer = eval_results[_WER_KEY]
    cer = eval_results[_CER_KEY]
    tf.logging.info(
        "Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
            cycle_number, wer, cer))

    # Stop as soon as the evaluation threshold is met.
    if model_helpers.past_stop_threshold(flags_obj.wer_threshold, wer):
      break
def run_mnist(flags_obj):
    """Train, periodically evaluate and optionally export the MNIST model.

    Args:
      flags_obj: An object containing parsed flag values.
    """
    model_helpers.apply_clean(flags_obj)

    # GPU count as resolved by flags_core.get_num_gpus from the flags.
    gpu_count = flags_core.get_num_gpus(flags_obj)
    multi_gpu = gpu_count > 1

    if not multi_gpu:
        model_function = model_fn
    else:
        # Called only for validation: the return value is discarded.
        distribution_utils.per_device_batch_size(flags_obj.batch_size,
                                                 gpu_count)

        # Multi-GPU needs two wrappers: the model_fn (done here) and the
        # optimizer (done inside model_fn when the optimizer is defined).
        gpu_devices = ["/device:GPU:%d" % index for index in range(gpu_count)]
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_fn,
            loss_reduction=tf.losses.Reduction.MEAN,
            devices=gpu_devices)

    # Fall back to a build-dependent layout when none was requested.
    data_format = flags_obj.data_format
    if data_format is None:
        if tf.test.is_built_with_cuda():
            data_format = 'channels_first'
        else:
            data_format = 'channels_last'

    estimator_params = {
        'data_format': data_format,
        'multi_gpu': multi_gpu
    }
    mnist_classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        params=estimator_params)

    def train_input_fn():
        """Prepare data for training."""
        # Larger shuffle buffers give better randomness, smaller ones use
        # less memory. MNIST is small enough to shuffle a full epoch.
        shuffled = dataset.train(flags_obj.data_dir).cache().shuffle(
            buffer_size=50000)
        batched = shuffled.batch(flags_obj.batch_size)

        # Each training session iterates the data `epochs_between_evals`
        # times.
        return batched.repeat(flags_obj.epochs_between_evals)

    def eval_input_fn():
        test_batches = dataset.test(flags_obj.data_dir).batch(
            flags_obj.batch_size)
        return test_batches.make_one_shot_iterator().get_next()

    # Hooks that emit the training logs.
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        batch_size=flags_obj.batch_size)

    # Alternate training and evaluation until done or the threshold is met.
    cycle_count = flags_obj.train_epochs // flags_obj.epochs_between_evals
    for _ in range(cycle_count):
        mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
        eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
        print('\nEvaluation results:\n\t%s\n' % eval_results)

        accuracy = eval_results['accuracy']
        if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                             accuracy):
            break

    # Nothing left to do unless an export directory was requested.
    if flags_obj.export_dir is None:
        return

    image = tf.placeholder(tf.float32, [None, 28, 28])
    serving_features = {'image': image}
    input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
        serving_features)
    mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn)
Example #25
0
def run_deep_speech(_):
    """Run deep speech training and eval loop."""
    tf.set_random_seed(flags_obj.seed)
    # Data preprocessing
    tf.logging.info("Data preprocessing...")
    '''
    train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
    eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)
    '''
    #train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
    #eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

    # Number of label classes. Label string is "[a-z]' -"
    num_classes = 30
    #len(train_speech_dataset.speech_labels)

    # Use distribution strategy for multi-gpu training
    num_gpus = flags_core.get_num_gpus(flags_obj)
    distribution_strategy = distribution_utils.get_distribution_strategy(num_gpus)

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      flags_obj.tpu,
      zone=flags_obj.tpu_zone,
      project=flags_obj.gcp_project
    )

    run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=flags_obj.model_dir,
      
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      
      tpu_config=tf.contrib.tpu.TPUConfig(flags_obj.iterations,flags_obj.num_shards),
    )

    #run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy)

    estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=model_fn,
      model_dir=flags_obj.model_dir,
      use_tpu=flags_obj.use_tpu,
      train_batch_size=flags_obj.batch_size,
      eval_batch_size=flags_obj.batch_size,
      params={"num_classes": num_classes,
        },
      config=run_config)

    # Benchmark logging
    run_params = {
        "batch_size": flags_obj.batch_size,
        "train_epochs": flags_obj.train_epochs,
        "rnn_hidden_size": flags_obj.rnn_hidden_size,
        "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
        "rnn_type": flags_obj.rnn_type,
        "is_bidirectional": flags_obj.is_bidirectional,
        "use_bias": flags_obj.use_bias,
    }

    dataset_name = "Tuda Data"
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(
        "deep_speech", dataset_name, run_params, test_id=flags_obj.benchmark_test_id
    )

    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size
    )

    per_device_batch_size = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, num_gpus
    )
    #TODO

    def input_fn_train(params):
        #ds = dataset.input_fn(per_device_batch_size, train_speech_dataset)
        ds = test.input_fn(per_device_batch_size,'/content/records_test.csv')
        return ds

    def input_fn_eval(params):
        return test.input_fn(params['batch_size'], eval_speech_dataset)

    #def input_fn_predict(features, batch_size):
        #dataset = tf.data.Dataset.from_tensor_slices(features)
        #dataset = dataset.batch(batch_size)
        #return dataset
    #    return dataset.input_fn(per_device_batch_size, eval_speech_dataset)

    total_training_cycle = flags_obj.train_epochs // flags_obj.epochs_between_evals
    for cycle_index in range(total_training_cycle):
        tf.logging.info(
            "Starting a training cycle: %d/%d", cycle_index + 1, total_training_cycle
        )

        # Perform batch_wise dataset shuffling
        '''
Example #26
0
def run_ncf(_):
    """Run NCF training and eval loop.

    Configuration is read from the module-level `FLAGS`. When
    `FLAGS.inference_only` is set, a single cycle runs and its training
    step is skipped. Each cycle evaluates the model, logs HR and NDCG,
    optionally exports a SavedModel, and stops early once
    `model_helpers.past_stop_threshold` returns true for the measured HR.

    Args:
      _: Unused positional argument.
    """
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)
    eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
    ncf_dataset = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf)

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "batch_size": batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": ncf_dataset.num_users,
            "num_items": ncf_dataset.num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
    )
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    # Expected batches per cycle; used only to sanity-check the count the
    # data pipeline reports.
    approx_train_steps = int(ncf_dataset.num_train_positives *
                             (1 + FLAGS.num_neg) // FLAGS.batch_size)
    pred_input_fn = data_preprocessing.make_pred_input_fn(
        ncf_dataset=ncf_dataset)

    # Inference-only runs evaluate exactly once.
    if FLAGS.inference_only:
        total_training_cycle = 1
    else:
        total_training_cycle = (FLAGS.train_epochs //
                                FLAGS.epochs_between_evals)

    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        if not FLAGS.inference_only:
            # Train the model
            train_input_fn, train_record_dir, batch_count = (
                data_preprocessing.make_train_input_fn(
                    ncf_dataset=ncf_dataset))

            if np.abs(approx_train_steps - batch_count) > 1:
                tf.logging.warning(
                    "Estimated ({}) and reported ({}) number of batches differ by more "
                    "than one".format(approx_train_steps, batch_count))
            train_estimator.train(input_fn=train_input_fn,
                                  hooks=train_hooks,
                                  steps=batch_count)
            # The records of this cycle are no longer needed once trained on.
            tf.gfile.DeleteRecursively(train_record_dir)

        # Evaluate the model
        eval_results = evaluate_model(eval_estimator, ncf_dataset,
                                      pred_input_fn)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results. These are routine per-cycle metrics,
        # so they are reported at INFO severity rather than FATAL.
        hr = eval_results[_HR_KEY]
        ndcg = eval_results[_NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # Export SavedModel
        if FLAGS.export_savedmodel:
            eval_estimator.export_savedmodel(FLAGS.model_dir,
                                             serving_input_receiver_fn)
            print("SavedModel successfully exported to: {}/<timestamp>".format(
                FLAGS.model_dir))

        # Some of the NumPy vector math can be quite large and likes to stay in
        # memory for a while.
        gc.collect()

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
Example #27
0
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

    Training and evaluation are delegated to
    `tf.contrib.learn.learn_runner.run` with an experiment function built
    by `get_experiment_fn`. When `flags_obj.export_dir` is set, a
    SavedModel is exported afterwards.

    Args:
      flags_obj: Object containing parsed flag values.
    """
    print("run_transformer")
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["static_batch"] = flags_obj.static_batch
    params["allow_ffn_pad"] = True

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # GPU and distribution settings.
    params["batch_size"] = (flags_obj.batch_size
                            or params["default_batch_size"])

    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=False,
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    # Create hooks that log information about the training and metric values
    # NOTE(review): these hooks are built but not handed to the experiment
    # below; confirm whether get_experiment_fn should receive them.
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
        use_tpu=False  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Environment settings applied before the experiment is launched.
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        intra_op_parallelism_threads=0,
        gpu_options=tf.GPUOptions(force_gpu_compatible=True))

    print("SESS_CONFIG: ", sess_config)
    config = RunConfig(session_config=sess_config,
                       model_dir=params["model_dir"])
    variable_strategy = 'GPU'
    use_distortion_for_training = True
    experiment_fn = get_experiment_fn(config.is_chief, flags_obj, params,
                                      schedule_manager, num_gpus,
                                      variable_strategy,
                                      use_distortion_for_training)

    # Train and evaluate the transformer model through the experiment.
    tf.contrib.learn.learn_runner.run(experiment_fn,
                                      run_config=config,
                                      hparams=tf.contrib.training.HParams(
                                          is_chief=config.is_chief, **params))

    if flags_obj.export_dir:
        # No estimator is created on the learn_runner path above, so build
        # one here for the export; previously this branch raised NameError.
        # NOTE(review): assumes construct_estimator is still defined in this
        # module and uses params["model_dir"], so that the export picks up
        # the checkpoints written during training -- confirm.
        estimator = construct_estimator(flags_obj, params, schedule_manager)
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file},
            strip_default_attrs=True)
Example #28
0
 def get_pred_input_fn():
     """Return the MovieLens input function for prediction (one repeat)."""
     device_batch_size = distribution_utils.per_device_batch_size(
         FLAGS.batch_size, num_gpus)
     # The leading False and trailing 1 are passed positionally, exactly as
     # movielens_dataset.get_input_fn expects them.
     return movielens_dataset.get_input_fn(
         False, device_batch_size, ncf_dataset, FLAGS.data_dir,
         FLAGS.dataset, 1)
Example #29
0
 def get_train_input_fn():
     """Return the MovieLens input function for one training cycle."""
     device_batch_size = distribution_utils.per_device_batch_size(
         FLAGS.batch_size, num_gpus)
     # The last positional argument is the number of epochs between evals.
     return movielens_dataset.get_input_fn(
         True, device_batch_size, ncf_dataset, FLAGS.data_dir,
         FLAGS.dataset, FLAGS.epochs_between_evals)
Example #30
0
def run_ncf(_):
    """Run NCF training and eval loop.

    Configuration is read from the module-level `FLAGS`. Each cycle trains
    on freshly generated records, evaluates, and logs HR and NDCG. The loop
    stops early once `model_helpers.past_stop_threshold` returns true for
    the measured HR. The cleanup callback returned by the data pipeline is
    always invoked, even when training or evaluation raises.

    Args:
      _: Unused positional argument.
    """
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    # Keep the eval batch size a multiple of the per-user example count
    # (one positive plus the eval negatives).
    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max([FLAGS.batch_size, eval_per_user]))
    if eval_batch_size % eval_per_user:
        eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None)

    # From here on the pipeline is live; the finally clause guarantees that
    # its artifacts and subprocess are cleaned up on every exit path.
    try:
        model_helpers.apply_clean(flags.FLAGS)

        train_estimator, eval_estimator = construct_estimator(
            num_gpus=num_gpus,
            model_dir=FLAGS.model_dir,
            params={
                "use_seed": FLAGS.seed is not None,
                "hash_pipeline": FLAGS.hash_pipeline,
                "batch_size": batch_size,
                "learning_rate": FLAGS.learning_rate,
                "num_users": ncf_dataset.num_users,
                "num_items": ncf_dataset.num_items,
                "mf_dim": FLAGS.num_factors,
                "model_layers": [int(layer) for layer in FLAGS.layers],
                "mf_regularization": FLAGS.mf_regularization,
                "mlp_reg_layers": [
                    float(reg) for reg in FLAGS.mlp_regularization
                ],
                "num_neg": FLAGS.num_neg,
                "use_tpu": FLAGS.tpu is not None,
                "tpu": FLAGS.tpu,
                "tpu_zone": FLAGS.tpu_zone,
                "tpu_gcp_project": FLAGS.tpu_gcp_project,
                "beta1": FLAGS.beta1,
                "beta2": FLAGS.beta2,
                "epsilon": FLAGS.epsilon,
                "match_mlperf": FLAGS.ml_perf,
            },
            batch_size=flags.FLAGS.batch_size,
            eval_batch_size=eval_batch_size)

        # Create hooks that log information about the training and metric
        # values
        train_hooks = hooks_helper.get_train_hooks(
            FLAGS.hooks,
            model_dir=FLAGS.model_dir,
            batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
            tensors_to_log={"cross_entropy": "cross_entropy"})
        run_params = {
            "batch_size": FLAGS.batch_size,
            "eval_batch_size": eval_batch_size,
            "number_factors": FLAGS.num_factors,
            "hr_threshold": FLAGS.hr_threshold,
            "train_epochs": FLAGS.train_epochs,
        }
        benchmark_logger = logger.get_benchmark_logger()
        benchmark_logger.log_run_info(model_name="recommendation",
                                      dataset_name=FLAGS.dataset,
                                      run_params=run_params,
                                      test_id=FLAGS.benchmark_test_id)

        # Expected batches per cycle; used only to sanity-check the count
        # the data pipeline reports.
        approx_train_steps = int(ncf_dataset.num_train_positives *
                                 (1 + FLAGS.num_neg) // FLAGS.batch_size)
        pred_input_fn = data_preprocessing.make_pred_input_fn(
            ncf_dataset=ncf_dataset)

        total_training_cycle = (FLAGS.train_epochs //
                                FLAGS.epochs_between_evals)
        for cycle_index in range(total_training_cycle):
            tf.logging.info("Starting a training cycle: {}/{}".format(
                cycle_index + 1, total_training_cycle))

            # Train the model
            train_input_fn, train_record_dir, batch_count = (
                data_preprocessing.make_train_input_fn(
                    ncf_dataset=ncf_dataset))

            if np.abs(approx_train_steps - batch_count) > 1:
                tf.logging.warning(
                    "Estimated ({}) and reported ({}) number of batches differ by more "
                    "than one".format(approx_train_steps, batch_count))

            train_estimator.train(input_fn=train_input_fn,
                                  hooks=train_hooks,
                                  steps=batch_count)
            # The records of this cycle are no longer needed once trained on.
            tf.gfile.DeleteRecursively(train_record_dir)

            tf.logging.info("Beginning evaluation.")
            eval_results = eval_estimator.evaluate(pred_input_fn)
            tf.logging.info("Evaluation complete.")

            # Benchmark the evaluation results
            benchmark_logger.log_evaluation_result(eval_results)
            # Log the HR and NDCG results.
            hr = eval_results[rconst.HR_KEY]
            ndcg = eval_results[rconst.NDCG_KEY]
            tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
                cycle_index + 1, hr, ndcg))

            # If some evaluation threshold is met
            if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
                break
    finally:
        cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
Example #31
0
def run_transformer(flags_obj):
  """Configure, train, evaluate and optionally export the transformer.

  Flag values are merged into the selected parameter set, an estimator is
  built from them and handed to `run_loop`. A SavedModel is exported
  afterwards when an export directory is given and no TPU is in use.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  gpu_count = flags_core.get_num_gpus(flags_obj)

  # Start from the requested parameter set; with more than one GPU the
  # "big" and "base" sets are swapped for their multi-GPU variants.
  params = PARAMS_MAP[flags_obj.param_set]
  multi_gpu = gpu_count > 1
  if multi_gpu and flags_obj.param_set == "big":
    params = model_params.BIG_MULTI_GPU_PARAMS
  elif multi_gpu and flags_obj.param_set == "base":
    params = model_params.BASE_MULTI_GPU_PARAMS

  # Copy the flag-defined settings into the params object.
  on_tpu = bool(flags_obj.tpu)  # True when a TPU was specified.
  flag_settings = (
      ("data_dir", flags_obj.data_dir),
      ("model_dir", flags_obj.model_dir),
      ("num_parallel_calls", flags_obj.num_parallel_calls),
      ("tpu", flags_obj.tpu),
      ("use_tpu", on_tpu),
      ("static_batch", flags_obj.static_batch or on_tpu),
      ("allow_ffn_pad", not on_tpu),
      ("use_synthetic_data", flags_obj.use_synthetic_data),
  )
  for key, value in flag_settings:
    params[key] = value

  # The batch size comes from the flag when given, otherwise from the
  # TPU or non-TPU default of the parameter set.
  batch_size = flags_obj.batch_size
  if not batch_size:
    if on_tpu:
      batch_size = params["default_batch_size_tpu"]
    else:
      batch_size = params["default_batch_size"]
  params["batch_size"] = batch_size

  # Off TPU the batch size is converted to a per-device value.
  if not on_tpu:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        batch_size, gpu_count)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=on_tpu,
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Hooks that log information about the training and metric values.
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=on_tpu)  # Not all hooks can run with TPUs
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate the transformer model.
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  # Exporting needs an export directory and is skipped on TPU.
  if not flags_obj.export_dir or on_tpu:
    return

  serving_input_fn = export.build_tensor_serving_input_receiver_fn(
      shape=[None], dtype=tf.int64, batch_size=None)
  # The vocab file is saved next to the model as an extra asset so inputs
  # can be encoded and outputs decoded consistently (see the "Export
  # trained model" section in the README). It is an extra rather than a
  # core asset because the model itself does not use it.
  extra_assets = {"vocab.txt": flags_obj.vocab_file}
  estimator.export_savedmodel(
      flags_obj.export_dir, serving_input_fn,
      assets_extra=extra_assets,
      strip_default_attrs=True)
Example #32
0
 def train_input_fn():
   """Return the training input for one cycle of epochs_between_evals."""
   device_batch_size = distribution_utils.per_device_batch_size(
       FLAGS.batch_size, num_gpus)
   return dataset.input_fn(
       True, device_batch_size, ncf_dataset, FLAGS.epochs_between_evals)
Example #33
0
def run_transformer(flags_obj):
  """Build a tf.Estimator for the transformer, then train and evaluate it.

  After `run_loop` finishes, the model is exported as a SavedModel if
  `flags_obj.export_dir` is set and the run did not use a TPU.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  device_count = flags_core.get_num_gpus(flags_obj)
  param_set = flags_obj.param_set

  # Pick the parameter object, preferring the multi-GPU variant of the
  # "big"/"base" sets when several GPUs are used.
  params = PARAMS_MAP[param_set]
  if device_count > 1:
    if param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  # Add flag-defined parameters to the params object.
  tpu_requested = bool(flags_obj.tpu)  # was a tpu specified.
  params.update({
      "data_dir": flags_obj.data_dir,
      "model_dir": flags_obj.model_dir,
      "num_parallel_calls": flags_obj.num_parallel_calls,
      "tpu": flags_obj.tpu,
      "use_tpu": tpu_requested,
      "static_batch": flags_obj.static_batch or tpu_requested,
      "allow_ffn_pad": not tpu_requested,
      "use_synthetic_data": flags_obj.use_synthetic_data,
  })

  # An explicit batch-size flag wins; otherwise use the default that
  # matches the device type.
  default_key = (
      "default_batch_size_tpu" if tpu_requested else "default_batch_size")
  params["batch_size"] = flags_obj.batch_size or params[default_key]

  if not tpu_requested:
    # Without a TPU the value is converted to a per-device batch size.
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], device_count)

  manager_kwargs = dict(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=tpu_requested,
      num_tpu_shards=flags_obj.num_tpu_shards)
  schedule_manager = schedule.Manager(**manager_kwargs)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values.
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=tpu_requested)  # Not all hooks can run with TPUs

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model.
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  should_export = flags_obj.export_dir and not tpu_requested
  if should_export:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export the saved model together with the vocab file. The vocab file
    # allows consistent input encoding and output decoding (see the
    # "Export trained model" section in the README for an example). The
    # model itself does not use it, hence an extra asset, not a core one.
    estimator.export_savedmodel(
        flags_obj.export_dir,
        serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
Example #34
0
 def pred_input_fn():
   """Return the prediction input built with the per-device batch size."""
   device_batch_size = distribution_utils.per_device_batch_size(
       batch_size, num_gpus)
   return dataset.input_fn(False, device_batch_size, ncf_dataset)
def run(flags_obj):
    """Train and evaluate ResNet-56 on CIFAR-10 with the native Keras API.

    Args:
      flags_obj: An object containing parsed flag values.

    Raises:
      ValueError: If the resolved dtype compares equal to 'fp16', which is
        not supported by this code path.

    Returns:
      Whatever `keras_common.build_stats` produces from the fit history, the
      evaluation output (None when evaluation is skipped) and the timing
      callback.
    """
    if flags_obj.enable_eager:
        tf.enable_eager_execution()

    if flags_core.get_tf_dtype(flags_obj) == 'fp16':
        raise ValueError(
            'dtype fp16 is not supported in Keras. Use the default '
            'value(fp32).')

    batch_per_device = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))

    # Fall back to a layout based on whether TensorFlow was built with CUDA.
    image_layout = flags_obj.data_format
    if image_layout is None:
        if tf.test.is_built_with_cuda():
            image_layout = 'channels_first'
        else:
            image_layout = 'channels_last'
    tf.keras.backend.set_image_data_format(image_layout)

    if not flags_obj.use_synthetic_data:
        make_dataset = cifar_main.input_fn
    else:
        make_dataset = keras_common.get_synth_input_fn(
            height=cifar_main.HEIGHT,
            width=cifar_main.WIDTH,
            num_channels=cifar_main.NUM_CHANNELS,
            num_classes=cifar_main.NUM_CLASSES,
            dtype=flags_core.get_tf_dtype(flags_obj))

    def build_dataset(is_training):
        # Train and eval pipelines differ only in the is_training switch.
        return make_dataset(is_training=is_training,
                            data_dir=flags_obj.data_dir,
                            batch_size=batch_per_device,
                            num_epochs=flags_obj.train_epochs,
                            parse_record_fn=parse_record_keras)

    train_ds = build_dataset(True)
    eval_ds = build_dataset(False)

    strategy = distribution_utils.get_distribution_strategy(
        flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)

    # Optimizer and model are created (and compiled) inside the strategy scope.
    with keras_common.get_strategy_scope(strategy):
        optimizer = keras_common.get_optimizer()
        model = resnet_cifar_model.resnet56(classes=cifar_main.NUM_CLASSES)
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['categorical_accuracy'])

    callbacks = keras_common.get_callbacks(
        learning_rate_schedule, cifar_main.NUM_IMAGES['train'])
    time_callback, tensorboard_callback, lr_callback = callbacks

    steps_per_epoch = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size
    epochs = flags_obj.train_epochs
    if flags_obj.train_steps:
        # An explicit step budget caps the run at a single (shortened) epoch.
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
        epochs = 1

    eval_steps = (cifar_main.NUM_IMAGES['validation'] //
                  flags_obj.batch_size)
    validation_data = eval_ds
    if flags_obj.skip_eval:
        tf.keras.backend.set_learning_phase(1)
        eval_steps = None
        validation_data = None

    history = model.fit(
        train_ds,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        callbacks=[time_callback, lr_callback, tensorboard_callback],
        validation_steps=eval_steps,
        validation_data=validation_data,
        verbose=1)

    if flags_obj.skip_eval:
        eval_output = None
    else:
        eval_output = model.evaluate(eval_ds, steps=eval_steps, verbose=1)

    return keras_common.build_stats(history, eval_output, time_callback)
Example #36
0
def run_transformer(flags_obj):
  """Train the transformer model with a hand-written session loop.

  No tf.Estimator is built here: the training graph is created directly by
  calling `model_fn` on three int32 placeholders and is driven with
  `sess.run`, feeding batches taken from the module-level `datall`.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Select the parameter set first; multi-GPU runs swap in dedicated sets.
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  # Hard-coded vocabulary sizes. They are applied AFTER the parameter set has
  # been chosen so that they also reach the multi-GPU sets (previously they
  # were written to the set that was then discarded).
  # NOTE(review): this mutates the shared module-level parameter object.
  params['vocab_size_in'] = 6100
  params['vocab_size_out'] = 25

  # Add flag-defined parameters to params object
  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  mode = tf.estimator.ModeKeys.TRAIN

  # Placeholders fed on every step; both dimensions are left dynamic.
  inputs_ph = tf.placeholder(tf.int32, shape=(None, None), name='inputs')
  targets_ph = tf.placeholder(tf.int32, shape=(None, None), name='targets')
  targets_ph_2 = tf.placeholder(tf.int32, shape=(None, None), name='targets2')

  loss_M, train_op_M = model_fn(
      inputs_ph, targets_ph, targets_ph_2, mode, params)

  print('Using GPU in Decoder')
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
  session_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=False,
      gpu_options=gpu_options)

  # Entering the Session itself (rather than `sess.as_default()`) installs it
  # as the default session AND closes it on exit, releasing its resources.
  with tf.Session(config=session_config) as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    # Resume from the latest checkpoint when one is present.
    if os.path.exists(os.path.join(FLAGS.model_dir, "checkpoint")):
      saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_dir))

    step = 0
    for iteration in range(schedule_manager.train_eval_iterations):
      tf.logging.info("Starting iteration %d", iteration + 1)
      # Visit the batches in a fresh random order on every pass.
      random.shuffle(datall)
      for data in datall:
        feed = {
            inputs_ph: data['inputs'],
            targets_ph: data['targets'],
            targets_ph_2: data['targets2'],
        }

        # The train op is run only for its side effect; its value is unused.
        loss, _ = sess.run([loss_M, train_op_M], feed_dict=feed)
        step += 1
        print(loss)

        if step % 100 == 0:
          print('loss at %d %s' % (step, loss))

        # Persist a checkpoint every 2000 steps.
        if step % 2000 == 0:
          filename = os.path.join(
              FLAGS.model_dir, "model_{}.ckpt".format(step))
          saver.save(sess, filename)
Example #37
0
 def get_pred_input_fn():
   """Return the MovieLens input function used for prediction."""
   # Split the global batch size from the flags across the available GPUs.
   device_batch = distribution_utils.per_device_batch_size(
       FLAGS.batch_size, num_gpus)
   # The leading False and trailing 1 are forwarded unchanged; their meaning
   # is defined by movielens_dataset.get_input_fn.
   return movielens_dataset.get_input_fn(
       False, device_batch, ncf_dataset, FLAGS.data_dir, FLAGS.dataset, 1)
Example #38
0
def run_deep_speech(_):
    """Train the deep speech Estimator, evaluating after every cycle.

    Configuration is read from the module-level `flags_obj`; the positional
    argument is ignored. Training stops early once the WER reported by the
    evaluation passes `flags_obj.wer_threshold`.
    """
    tf.set_random_seed(flags_obj.seed)

    # Build the train / eval datasets from their data directories.
    tf.logging.info("Data preprocessing...")
    train_set = generate_dataset(flags_obj.train_data_dir)
    eval_set = generate_dataset(flags_obj.eval_data_dir)

    # Number of label classes. Label string is "[a-z]' -"
    label_count = len(train_set.speech_labels)

    # Multi-GPU training goes through a distribution strategy.
    num_gpus = flags_core.get_num_gpus(flags_obj)
    run_config = tf.estimator.RunConfig(
        train_distribute=distribution_utils.get_distribution_strategy(
            num_gpus))

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=flags_obj.model_dir,
        config=run_config,
        params={"num_classes": label_count})

    # Flags recorded with the benchmark run, keyed by their own names.
    logged_flags = ("batch_size", "train_epochs", "rnn_hidden_size",
                    "rnn_hidden_layers", "rnn_type", "is_bidirectional",
                    "use_bias")
    run_params = {name: getattr(flags_obj, name) for name in logged_flags}

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(
        "deep_speech", "LibriSpeech", run_params,
        test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        batch_size=flags_obj.batch_size)

    device_batch = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, num_gpus)

    def train_input_fn():
        return dataset.input_fn(device_batch, train_set)

    def eval_input_fn():
        return dataset.input_fn(device_batch, eval_set)

    cycle_count = flags_obj.train_epochs // flags_obj.epochs_between_evals
    for cycle_index in range(cycle_count):
        cycle_number = cycle_index + 1
        tf.logging.info("Starting a training cycle: %d/%d", cycle_number,
                        cycle_count)

        # Re-order the training entries batch-wise before each cycle.
        train_set.entries = dataset.batch_wise_dataset_shuffle(
            train_set.entries, cycle_index, flags_obj.sortagrad,
            flags_obj.batch_size)

        estimator.train(input_fn=train_input_fn, hooks=train_hooks)

        tf.logging.info("Starting to evaluate...")
        eval_results = evaluate_model(
            estimator, eval_set.speech_labels, eval_set.entries,
            eval_input_fn)

        # Log the WER and CER results.
        benchmark_logger.log_evaluation_result(eval_results)
        wer = eval_results[_WER_KEY]
        cer = eval_results[_CER_KEY]
        tf.logging.info("Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
            cycle_number, wer, cer))

        # Stop as soon as the evaluation threshold is met.
        if model_helpers.past_stop_threshold(flags_obj.wer_threshold, wer):
            break
 def test_batch_size_with_remainder(self):
     # 147 does not divide evenly across 5 GPUs, so a ValueError is expected.
     self.assertRaises(
         ValueError,
         distribution_utils.per_device_batch_size,
         147,
         num_gpus=5)
 def test_batch_size_with_remainder(self):
   # A batch of 147 leaves a remainder on 5 GPUs; the helper must reject it.
   self.assertRaises(
       ValueError,
       distribution_utils.per_device_batch_size,
       147,
       num_gpus=5)
def run(flags_obj):
    """Train and evaluate ResNet-50 on ImageNet with the native Keras API.

    Args:
      flags_obj: An object containing parsed flag values.

    Raises:
      ValueError: If the resolved dtype compares equal to 'fp16', which is
        not supported by this code path.
    """
    if flags_obj.enable_eager:
        tf.enable_eager_execution()

    if flags_core.get_tf_dtype(flags_obj) == 'fp16':
        raise ValueError(
            'dtype fp16 is not supported in Keras. Use the default '
            'value(fp32).')

    batch_per_device = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, flags_core.get_num_gpus(flags_obj))

    # pylint: disable=protected-access
    if not flags_obj.use_synthetic_data:
        make_dataset = imagenet_main.input_fn
    else:
        make_dataset = keras_common.get_synth_input_fn(
            height=imagenet_main.DEFAULT_IMAGE_SIZE,
            width=imagenet_main.DEFAULT_IMAGE_SIZE,
            num_channels=imagenet_main.NUM_CHANNELS,
            num_classes=imagenet_main.NUM_CLASSES,
            dtype=flags_core.get_tf_dtype(flags_obj))

    def build_dataset(is_training):
        # Train and eval pipelines differ only in the is_training switch.
        return make_dataset(is_training=is_training,
                            data_dir=flags_obj.data_dir,
                            batch_size=batch_per_device,
                            num_epochs=flags_obj.train_epochs,
                            parse_record_fn=parse_record_keras)

    train_ds = build_dataset(True)
    eval_ds = build_dataset(False)

    strategy = distribution_utils.get_distribution_strategy(
        flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)

    # Optimizer and model are created (and compiled) inside the strategy scope.
    with keras_common.get_strategy_scope(strategy):
        optimizer = keras_common.get_optimizer()
        model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES)
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['sparse_categorical_accuracy'])

    callbacks = keras_common.get_callbacks(
        learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])
    time_callback, tensorboard_callback, lr_callback = callbacks

    steps_per_epoch = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
    epochs = flags_obj.train_epochs
    if flags_obj.train_steps:
        # An explicit step budget caps the run at a single (shortened) epoch.
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
        epochs = 1

    eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
                  flags_obj.batch_size)
    validation_data = eval_ds
    if flags_obj.skip_eval:
        eval_steps = None
        validation_data = None

    model.fit(train_ds,
              epochs=epochs,
              steps_per_epoch=steps_per_epoch,
              callbacks=[time_callback, lr_callback, tensorboard_callback],
              validation_steps=eval_steps,
              validation_data=validation_data,
              verbose=1)

    if flags_obj.skip_eval:
        return
    model.evaluate(eval_ds, steps=eval_steps, verbose=1)
Example #42
0
def run_mnist(flags_obj):
  """Train and evaluate the MNIST classifier, then score local PNG samples.

  After the train/eval loop (and optional export) the classifier is run on
  the images `1.png` ... `9.png` and `dog.png`, read from the current working
  directory, and the predicted class with its probability is printed.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  model_helpers.apply_clean(flags_obj)

  # Get number of GPUs as defined by the --num_gpus flags and the number of
  # GPUs available on the machine.
  num_gpus = flags_core.get_num_gpus(flags_obj)
  multi_gpu = num_gpus > 1

  if not multi_gpu:
    model_function = model_fn
  else:
    # Validate that the batch size can be split into devices.
    distribution_utils.per_device_batch_size(flags_obj.batch_size, num_gpus)

    # Multi-GPU needs two wrappers: the model_fn (done here) and the
    # optimizer (done inside model_fn where the optimizer is defined).
    gpu_devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
    model_function = tf.contrib.estimator.replicate_model_fn(
        model_fn, loss_reduction=tf.losses.Reduction.MEAN,
        devices=gpu_devices)

  data_format = flags_obj.data_format
  if data_format is None:
    if tf.test.is_built_with_cuda():
      data_format = 'channels_first'
    else:
      data_format = 'channels_last'

  mnist_classifier = tf.estimator.Estimator(
      model_fn=model_function,
      params=dict(data_format=data_format, multi_gpu=multi_gpu))

  def train_input_fn():
    """Prepare data for training."""
    ds = dataset.train(flags_obj.data_dir)

    def invert(image, label):
      return (image * -1.0) + 1.0, label

    def brightness(image, label):
      return tf.image.random_brightness(image, max_delta=0.2), label

    # Optional augmentation: each enabled step appends a transformed copy of
    # the dataset as it stands at that point.
    if INVERT:
      ds = ds.concatenate(ds.map(invert))

    if BRIGHTNESS:
      ds = ds.concatenate(ds.map(brightness))

    # Larger shuffle buffers give better randomness at the cost of memory.
    ds = ds.cache()
    ds = ds.shuffle(buffer_size=50000)
    ds = ds.batch(flags_obj.batch_size)

    # Iterate through the dataset a set number (`epochs_between_evals`) of
    # times during each training session.
    return ds.repeat(flags_obj.epochs_between_evals)

  def eval_input_fn():
    batched = dataset.test(flags_obj.data_dir).batch(flags_obj.batch_size)
    return batched.make_one_shot_iterator().get_next()

  # Set up the training hooks requested through the flags.
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      batch_size=flags_obj.batch_size)

  # Train and evaluate model.
  cycles = flags_obj.train_epochs // flags_obj.epochs_between_evals
  for _ in range(cycles):
    mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print('\nEvaluation results:\n\t%s\n' % eval_results)

    if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                         eval_results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    image = tf.placeholder(tf.float32, [None, 28, 28])
    input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
        {'image': image})
    mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn)

  def our_test_fn():
    # Each PNG is flattened and scaled by 1/255 before being stacked.
    images = [
        np.array(imageio.imread('{}.png'.format(name)).ravel()/255.0,
                 dtype='float32')
        for name in list(range(1, 10)) + ['dog']
    ]
    return tf.convert_to_tensor(np.array(images))

  # Check our own examples
  predictions = mnist_classifier.predict(input_fn=our_test_fn)
  table = []
  for sample in list(range(1, 10)) + ['dog']:
    prediction = next(predictions)
    guess = prediction['classes']
    probabilities = prediction['probabilities']
    if sample == 'dog':
      # The non-digit sample has no "correct" class to report against.
      print("{}. CNN thinks it's a {} ({:.1f}%)".format(
          sample, guess, probabilities[guess]*100))
    else:
      print("{} at {:.1f}. CNN thinks it's a {} ({:.1f}%)".format(
          sample, probabilities[sample]*100, guess, probabilities[guess]*100))
    table.append((sample, prediction['probabilities']))
Example #43
0
def run_mnist(flags_obj):
  """Train and evaluate the MNIST classifier, optionally exporting it.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  model_helpers.apply_clean(flags_obj)

  # Get number of GPUs as defined by the --num_gpus flags and the number of
  # GPUs available on the machine.
  num_gpus = flags_core.get_num_gpus(flags_obj)
  multi_gpu = num_gpus > 1

  if not multi_gpu:
    model_function = model_fn
  else:
    # Validate that the batch size can be split into devices.
    distribution_utils.per_device_batch_size(flags_obj.batch_size, num_gpus)

    # Multi-GPU needs two wrappers: the model_fn (done here) and the
    # optimizer (done inside model_fn where the optimizer is defined).
    gpu_devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
    model_function = tf.contrib.estimator.replicate_model_fn(
        model_fn, loss_reduction=tf.losses.Reduction.MEAN,
        devices=gpu_devices)

  data_format = flags_obj.data_format
  if data_format is None:
    if tf.test.is_built_with_cuda():
      data_format = 'channels_first'
    else:
      data_format = 'channels_last'

  mnist_classifier = tf.estimator.Estimator(
      model_fn=model_function,
      model_dir=flags_obj.model_dir,
      params=dict(data_format=data_format, multi_gpu=multi_gpu))

  def train_input_fn():
    """Prepare data for training."""
    # Larger shuffle buffers give better randomness at the cost of memory.
    ds = dataset.train(flags_obj.data_dir)
    ds = ds.cache()
    ds = ds.shuffle(buffer_size=50000)
    ds = ds.batch(flags_obj.batch_size)

    # Iterate through the dataset a set number (`epochs_between_evals`) of
    # times during each training session.
    return ds.repeat(flags_obj.epochs_between_evals)

  def eval_input_fn():
    batched = dataset.test(flags_obj.data_dir).batch(flags_obj.batch_size)
    return batched.make_one_shot_iterator().get_next()

  # Set up the training hooks requested through the flags.
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  # Train and evaluate model.
  cycles = flags_obj.train_epochs // flags_obj.epochs_between_evals
  for _ in range(cycles):
    mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print('\nEvaluation results:\n\t%s\n' % eval_results)

    if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                         eval_results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is None:
    return
  image = tf.placeholder(tf.float32, [None, 28, 28])
  input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
      {'image': image})
  mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn)
def run_deep_speech(_):
    """Run the deep speech training loop (evaluation is currently disabled).

    Builds a Keras DeepSpeech model, converts it to an Estimator and trains it
    for `train_epochs // epochs_between_evals` cycles. Configuration is read
    from the module-level `flags_obj`; the positional argument is ignored.
    The evaluation step is present only as commented-out code.
    """
    # Data preprocessing: build the train and eval datasets from the
    # directories given by the flags.
    tf.logging.info("Data preprocessing...")

    train_speech_dataset = generate_dataset(flags_obj.train_data_dir)
    eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir)

    # Number of label classes. Label string is "[a-z]' -"
    num_classes = len(train_speech_dataset.speech_labels)

    # Input shape of each data example:
    # [time_steps (T), feature_bins(F), channel(C)]
    # Channel is set as 1 by default.
    input_shape = (None, train_speech_dataset.num_feature_bins, 1)

    # Create deep speech model and convert it to Estimator
    tf.logging.info("Creating Estimator from Keras model...")
    keras_model = deep_speech_model.DeepSpeech(
        input_shape, flags_obj.rnn_hidden_layers, flags_obj.rnn_type,
        flags_obj.is_bidirectional, flags_obj.rnn_hidden_size,
        flags_obj.rnn_activation, num_classes, flags_obj.use_bias)

    # Convert to estimator
    num_gpus = flags_core.get_num_gpus(flags_obj)
    estimator = convert_keras_to_estimator(keras_model, num_gpus)

    # Benchmark logging: the flag values recorded with this run.
    run_params = {
        "batch_size": flags_obj.batch_size,
        "train_epochs": flags_obj.train_epochs,
        "rnn_hidden_size": flags_obj.rnn_hidden_size,
        "rnn_hidden_layers": flags_obj.rnn_hidden_layers,
        "rnn_activation": flags_obj.rnn_activation,
        "rnn_type": flags_obj.rnn_type,
        "is_bidirectional": flags_obj.is_bidirectional,
        "use_bias": flags_obj.use_bias
    }

    dataset_name = "LibriSpeech"
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info("deep_speech",
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               batch_size=flags_obj.batch_size)

    # The global batch size is split across the available GPUs.
    per_device_batch_size = distribution_utils.per_device_batch_size(
        flags_obj.batch_size, num_gpus)

    def input_fn_train():
        return dataset.input_fn(per_device_batch_size, train_speech_dataset)

    # Not called yet: it is referenced only by the disabled evaluation code
    # below, hence the unused-variable suppression.
    def input_fn_eval():  # #pylint: disable=unused-variable
        return dataset.input_fn(per_device_batch_size, eval_speech_dataset)

    # Integer division: a partial trailing cycle is not run.
    total_training_cycle = (flags_obj.train_epochs //
                            flags_obj.epochs_between_evals)
    for cycle_index in range(total_training_cycle):
        tf.logging.info("Starting a training cycle: %d/%d", cycle_index + 1,
                        total_training_cycle)

        estimator.train(input_fn=input_fn_train, hooks=train_hooks)

        # Evaluate (TODO)
        # tf.logging.info("Starting to evaluate.")

        # eval_results = evaluate_model(
        #     estimator, keras_model, data_set.speech_labels, [], input_fn_eval)

        # benchmark_logger.log_evaluation_result(eval_results)
        # If some evaluation threshold is met
        # Log the WER and CER results.
        # wer = eval_results[_WER_KEY]
        # cer = eval_results[_CER_KEY]
        # tf.logging.info(
        #     "Iteration {}: WER = {:.2f}, CER = {:.2f}".format(
        #         cycle_index + 1, wer, cer))
        # if model_helpers.past_stop_threshold(FLAGS.wer_threshold, wer):
        #   break

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()