Code Example #1
class MNISTTest(tf.test.TestCase):
    """Run tests for MNIST eager loop.

  MNIST eager uses contrib and will not work with TF 2.0.  All tests are
  disabled if using TF 2.0.
  """
    def setUp(self):
        if not keras_utils.is_v2_0():
            tf.compat.v1.enable_v2_behavior()
        super(MNISTTest, self).setUp()

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_train(self):
        train(defun=False)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_evaluate(self):
        evaluate(defun=False)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_train_with_defun(self):
        train(defun=True)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_evaluate_with_defun(self):
        evaluate(defun=True)
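
These examples all branch on keras_utils.is_v2_0(). As a hedged sketch only (the helper's real implementation lives in the tf-models keras_utils module and is not shown here), such a check could simply inspect the installed TensorFlow version:

import tensorflow as tf


def is_v2_0():
    """Returns True if the installed TensorFlow is a 2.x release (assumed check)."""
    return tf.version.VERSION.startswith("2.")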
Code Example #2
File: movielens_test.py  Project: qa276390/tf-models
class BaseTest(tf.test.TestCase):
    """Tests for Wide Deep model."""
    @classmethod
    def setUpClass(cls):  # pylint: disable=invalid-name
        super(BaseTest, cls).setUpClass()
        movielens_main.define_movie_flags()

    def setUp(self):
        # Create temporary CSV file
        self.temp_dir = self.get_temp_dir()
        tf.io.gfile.makedirs(os.path.join(self.temp_dir, movielens.ML_1M))

        self.ratings_csv = os.path.join(self.temp_dir, movielens.ML_1M,
                                        movielens.RATINGS_FILE)
        self.item_csv = os.path.join(self.temp_dir, movielens.ML_1M,
                                     movielens.MOVIES_FILE)

        with tf.io.gfile.GFile(self.ratings_csv, "w") as f:
            f.write(TEST_RATING_DATA)

        with tf.io.gfile.GFile(self.item_csv, "w") as f:
            f.write(TEST_ITEM_DATA)

    @unittest.skipIf(keras_utils.is_v2_0(), "TF 1.0 only test.")
    def test_input_fn(self):
        train_input_fn, _, _ = movielens_dataset.construct_input_fns(
            dataset=movielens.ML_1M,
            data_dir=self.temp_dir,
            batch_size=8,
            repeat=1)

        dataset = train_input_fn()
        features, labels = dataset.make_one_shot_iterator().get_next()

        with self.session() as sess:
            features, labels = sess.run((features, labels))

            # Compare the two features dictionaries.
            for key in TEST_INPUT_VALUES:
                self.assertTrue(key in features)
                self.assertAllClose(TEST_INPUT_VALUES[key], features[key][0])

            self.assertAllClose(labels[0], [1.0])

    @unittest.skipIf(keras_utils.is_v2_0(), "TF 1.0 only test.")
    def test_end_to_end_deep(self):
        integration.run_synthetic(main=movielens_main.main,
                                  tmp_root=self.temp_dir,
                                  extra_flags=[
                                      "--data_dir", self.temp_dir,
                                      "--download_if_missing=false",
                                      "--train_epochs", "1",
                                      "--epochs_between_evals", "1"
                                  ],
                                  synth=False,
                                  max_train=None)
Code Example #3
 def benchmark_1_gpu_no_cudnn(self):
     """Benchmark 1 gpu with CuDNN disabled."""
     self._setup()
     FLAGS.num_gpus = 1
     FLAGS.batch_size = 64
     FLAGS.cudnn = False
     FLAGS.enable_eager = keras_utils.is_v2_0()
     self._run_and_report_benchmark()
Code Example #4
 def benchmark_xla_8_gpu_no_cudnn(self):
     """Benchmark 8 gpu w/xla and CuDNN disabled."""
     self._setup()
     FLAGS.num_gpus = 8
     FLAGS.batch_size = 64 * 8
     FLAGS.log_steps = 10
     FLAGS.cudnn = False
     FLAGS.enable_eager = keras_utils.is_v2_0()
     FLAGS.enable_xla = True
     self._run_and_report_benchmark()
Code Example #5
    def test_collect_run_params(self):
        run_info = {}
        run_parameters = {
            "batch_size": 32,
            "synthetic_data": True,
            "train_epochs": 100.00,
            "dtype": "fp16",
            "resnet_size": 50,
            "random_tensor": tf.constant(2.0)
        }
        logger._collect_run_params(run_info, run_parameters)
        self.assertEqual(len(run_info["run_parameters"]), 6)
        self.assertEqual(run_info["run_parameters"][0], {
            "name": "batch_size",
            "long_value": 32
        })
        self.assertEqual(run_info["run_parameters"][1], {
            "name": "dtype",
            "string_value": "fp16"
        })
        if keras_utils.is_v2_0():
            self.assertEqual(
                run_info["run_parameters"][2], {
                    "name": "random_tensor",
                    "string_value": "tf.Tensor(2.0, shape=(), dtype=float32)"
                })
        else:
            self.assertEqual(
                run_info["run_parameters"][2], {
                    "name": "random_tensor",
                    "string_value":
                    "Tensor(\"Const:0\", shape=(), dtype=float32)"
                })

        self.assertEqual(run_info["run_parameters"][3], {
            "name": "resnet_size",
            "long_value": 50
        })
        self.assertEqual(run_info["run_parameters"][4], {
            "name": "synthetic_data",
            "bool_value": "True"
        })
        self.assertEqual(run_info["run_parameters"][5], {
            "name": "train_epochs",
            "float_value": 100.00
        })
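
The assertions above imply that logger._collect_run_params sorts parameters by name and stores each value under a type-specific key. A minimal sketch consistent with this test (an assumption, not the actual benchmark logger code) could look like:

def _collect_run_params(run_info, run_params):
    """Stores each run parameter under a type-dependent key, sorted by name."""
    def process_param(name, value):
        # bool must be checked before int, since bool is a subclass of int.
        if isinstance(value, bool):
            return {"name": name, "bool_value": str(value)}
        if isinstance(value, int):
            return {"name": name, "long_value": value}
        if isinstance(value, float):
            return {"name": name, "float_value": value}
        if isinstance(value, str):
            return {"name": name, "string_value": value}
        # Anything else (e.g. a tf.Tensor) is stringified.
        return {"name": name, "string_value": str(value)}

    run_info["run_parameters"] = [
        process_param(name, value) for name, value in sorted(run_params.items())
    ]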
Code Example #6
def build_model(vocab_size,
                embedding_dim=EMBEDDING_DIM,
                rnn_units=RNN_UNITS,
                batch_size=None,
                stateful=False,
                use_cudnn=True):
    """Builds the Shakespeare model.

  Args:
    vocab_size: The number of character classes in the input.
    embedding_dim: The dimension of the embedding space for each class.
    rnn_units: The number of RNN units in the layer.
    batch_size: When predicting, the batch size of the predictions.
    stateful: If true, the LSTM is stateful.
    use_cudnn: If true, use a CuDNN-backed LSTM when available; otherwise force
      the non-CuDNN implementation.

  Returns:
    A Keras Model.
  """
    # In V1 there is a separate class for CuDNN. In V2 the LSTM class will use
    # CuDNN automatically if applicable.
    if use_cudnn and not keras_utils.is_v2_0():
        LSTM = tf.compat.v1.CuDNNLSTM
    else:
        # The LSTM call was rewritten to be more efficient in 2.0. However, because
        # we want to compare the performance of the two runtimes, we force both
        # V1 and V2 to use the more efficient implementation.
        LSTM = functools.partial(tf.keras.layers.LSTM, implementation=2)

    # By indirecting the activation through a lambda layer, the logic to dispatch
    # to CuDNN in V2 doesn't trigger and we force the LSTM to run in non-CuDNN
    # mode.
    lstm_activation = ('tanh' if use_cudnn else lambda x: tf.math.tanh(x))

    batch_shape = [batch_size if stateful else None, None]
    return tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size,
                                  embedding_dim,
                                  batch_input_shape=batch_shape),
        LSTM(rnn_units,
             activation=lstm_activation,
             return_sequences=True,
             stateful=stateful,
             recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size, activation='softmax')
    ])
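
A typical way to instantiate this model for training, assuming the Shakespeare character vocabulary of roughly 65 symbols (the exact values are illustrative, not taken from the script), might be:

model = build_model(vocab_size=65, batch_size=64, stateful=False, use_cudnn=True)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()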
Code Example #7
    def preprocess_train_input(features, labels):
        """Pre-process the training data.

    This is needed because:
    - The label needs to be extended to be used in the loss fn
    - We need the same inputs for training and eval so adding fake inputs
      for DUPLICATE_MASK in training data.
    """
        labels = tf.expand_dims(labels, -1)
        fake_dup_mask = tf.zeros_like(features[movielens.USER_COLUMN])
        features[rconst.DUPLICATE_MASK] = fake_dup_mask
        features[rconst.TRAIN_LABEL_KEY] = labels

        if params["distribute_strategy"] or not keras_utils.is_v2_0():
            return features
        else:
            # b/134708104
            return (features, )
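
preprocess_train_input is presumably mapped over the training tf.data.Dataset elsewhere in ncf_keras_main.py; a hedged sketch of that wiring (train_dataset is a placeholder name) would be:

# Hypothetical wiring; the real pipeline construction is done by ncf_input_pipeline.
train_dataset = train_dataset.map(
    preprocess_train_input, num_parallel_calls=tf.data.experimental.AUTOTUNE)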
Code Example #8
File: ncf_keras_main.py  Project: youikim/models
  def preprocess_eval_input(features):
    """Pre-process the eval data.

    This is needed because:
    - The label needs to be extended to be used in the loss fn
    - We need the same inputs for training and eval so adding fake inputs
      for VALID_PT_MASK in eval data.
    """
    labels = tf.cast(tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
    labels = tf.expand_dims(labels, -1)
    fake_valid_pt_mask = tf.cast(
        tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
    features[rconst.VALID_POINT_MASK] = fake_valid_pt_mask
    features[rconst.TRAIN_LABEL_KEY] = labels

    if params["distribute_strategy"] or not keras_utils.is_v2_0():
      return features
    else:
      # b/134708104
      return (features,)
Code Example #9
def build_model(vocab_size,
                embedding_dim=EMBEDDING_DIM,
                rnn_units=RNN_UNITS,
                batch_size=None,
                stateful=False,
                use_cudnn=True):
    """Builds the Shakespeare model.

  Args:
    vocab_size: The number of character classes in the input.
    embedding_dim: The dimension of the embedding space for each class.
    rnn_units: The number of RNN units in the layer.
    batch_size: When predicting, the batch size of the predictions.
    stateful: If true, the LSTM is stateful.
    use_cudnn: If true, use a CuDNN-backed LSTM when available; otherwise force
      the non-CuDNN implementation.

  Returns:
    A Keras Model.
  """
    assert keras_utils.is_v2_0()
    LSTM = functools.partial(tf.keras.layers.LSTM, implementation=2)

    # By indirecting the activation through a lambda layer, the logic to dispatch
    # to CuDNN in V2 doesn't trigger and we force the LSTM to run in non-CuDNN
    # mode.
    lstm_activation = ('tanh' if use_cudnn else lambda x: tf.math.tanh(x))

    batch_shape = [batch_size if stateful else None, None]
    return tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size,
                                  embedding_dim,
                                  batch_input_shape=batch_shape),
        LSTM(rnn_units,
             activation=lstm_activation,
             return_sequences=True,
             stateful=stateful,
             recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
        tf.keras.layers.Softmax(dtype=tf.float32)
    ])
Code Example #10
def run_ncf(_):
    """Run NCF training and eval with Keras."""

    keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    params = ncf_common.parse_flags(FLAGS)
    model_helpers.apply_clean(flags.FLAGS)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        tpu_address=FLAGS.tpu)
    params["distribute_strategy"] = strategy

    if not keras_utils.is_v2_0() and strategy is not None:
        logging.error(
            "NCF Keras only works with distribution strategy in TF 2.0")
        return
    if (params["keras_use_ctl"]
            and (not keras_utils.is_v2_0() or strategy is None)):
        logging.error(
            "Custom training loop only works with tensorflow 2.0 and dist strat."
        )
        return
    if params["use_tpu"] and not params["keras_use_ctl"]:
        logging.error(
            "Custom training loop must be used when using TPUStrategy.")
        return

    batch_size = params["batch_size"]
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        # Start data producing thread.
        num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)

    use_remote_tpu = params["use_tpu"] and FLAGS.tpu
    primary_cpu_task = tpu_lib.get_primary_cpu_task(use_remote_tpu)

    with tf.device(primary_cpu_task):
        (train_input_dataset, eval_input_dataset,
         num_train_steps, num_eval_steps) = \
          (ncf_input_pipeline.create_ncf_input_data(
              params, producer, input_meta_data, strategy))
        steps_per_epoch = None if generate_input_online else num_train_steps

        with distribution_utils.get_strategy_scope(strategy):
            keras_model = _get_keras_model(params)
            optimizer = tf.keras.optimizers.Adam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
            if FLAGS.dtype == "fp16":
                optimizer = \
                  tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                      optimizer,
                      loss_scale=flags_core.get_loss_scale(FLAGS,
                                                           default_for_fp16="dynamic"))

            if params["keras_use_ctl"]:
                train_loss, eval_results = run_ncf_custom_training(
                    params,
                    strategy,
                    keras_model,
                    optimizer,
                    callbacks,
                    train_input_dataset,
                    eval_input_dataset,
                    num_train_steps,
                    num_eval_steps,
                    generate_input_online=generate_input_online)
            else:
                # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
                # a valid arg for this model. Also remove as a valid flag.
                if FLAGS.force_v2_in_keras_compile is not None:
                    keras_model.compile(optimizer=optimizer,
                                        run_eagerly=FLAGS.run_eagerly,
                                        experimental_run_tf_function=FLAGS.
                                        force_v2_in_keras_compile)
                else:
                    keras_model.compile(optimizer=optimizer,
                                        run_eagerly=FLAGS.run_eagerly)

                history = keras_model.fit(train_input_dataset,
                                          epochs=FLAGS.train_epochs,
                                          steps_per_epoch=steps_per_epoch,
                                          callbacks=callbacks,
                                          validation_data=eval_input_dataset,
                                          validation_steps=num_eval_steps,
                                          verbose=2)

                logging.info("Training done. Start evaluating")

                eval_loss_and_metrics = keras_model.evaluate(
                    eval_input_dataset, steps=num_eval_steps, verbose=2)

                logging.info("Keras evaluation is done.")

                # Keras evaluate() API returns scalar loss and metric values from
                # evaluation as a list. Here, the returned list would contain
                # [evaluation loss, hr sum, hr count].
                eval_hit_rate = eval_loss_and_metrics[
                    1] / eval_loss_and_metrics[2]

                # Format evaluation result into [eval loss, eval hit accuracy].
                eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

                if history and history.history:
                    train_history = history.history
                    train_loss = train_history["loss"][-1]

        stats = build_stats(train_loss, eval_results, time_callback)
        return stats
Code Example #11
def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using native Keras APIs.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
  keras_utils.set_session_config(
      enable_eager=flags_obj.enable_eager,
      enable_xla=flags_obj.enable_xla)

  # Execute flag override logic for better model performance
  if flags_obj.tf_gpu_thread_mode:
    common.set_gpu_thread_mode_and_count(flags_obj)
  common.set_cudnn_batchnorm_mode()

  dtype = flags_core.get_tf_dtype(flags_obj)
  if dtype == tf.float16:
    loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
    policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
        'mixed_float16', loss_scale=loss_scale)
    tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
    if not keras_utils.is_v2_0():
      raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
  elif dtype == tf.bfloat16:
    policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
        'mixed_bfloat16')
    tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)

  # Configures cluster spec for distribution strategy.
  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
                                                     flags_obj.task_index)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      num_workers=num_workers,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs,
      tpu_address=flags_obj.tpu)

  if strategy:
    # flags_obj.enable_get_next_as_optional controls whether to enable
    # get_next_as_optional behavior in DistributedIterator. If true, the last
    # partial batch can be supported.
    strategy.extended.experimental_enable_get_next_as_optional = (
        flags_obj.enable_get_next_as_optional
    )

  strategy_scope = distribution_utils.get_strategy_scope(strategy)

  # pylint: disable=protected-access
  if flags_obj.use_synthetic_data:
    distribution_utils.set_up_synthetic_data()
    input_fn = common.get_synth_input_fn(
        height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        num_channels=imagenet_preprocessing.NUM_CHANNELS,
        num_classes=imagenet_preprocessing.NUM_CLASSES,
        dtype=dtype,
        drop_remainder=True)
  else:
    distribution_utils.undo_set_up_synthetic_data()
    input_fn = imagenet_preprocessing.input_fn

  # When `enable_xla` is True, we always drop the remainder of the batches
  # in the dataset, as XLA-GPU doesn't support dynamic shapes.
  drop_remainder = flags_obj.enable_xla

  train_input_dataset = input_fn(
      is_training=True,
      data_dir=flags_obj.data_dir,
      batch_size=flags_obj.batch_size,
      num_epochs=flags_obj.train_epochs,
      parse_record_fn=imagenet_preprocessing.parse_record,
      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
      dtype=dtype,
      drop_remainder=drop_remainder,
      tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
      training_dataset_cache=flags_obj.training_dataset_cache,
  )

  eval_input_dataset = None
  if not flags_obj.skip_eval:
    eval_input_dataset = input_fn(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=flags_obj.train_epochs,
        parse_record_fn=imagenet_preprocessing.parse_record,
        dtype=dtype,
        drop_remainder=drop_remainder)

  lr_schedule = 0.1
  if flags_obj.use_tensor_lr:
    lr_schedule = common.PiecewiseConstantDecayWithWarmup(
        batch_size=flags_obj.batch_size,
        epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
        warmup_epochs=common.LR_SCHEDULE[0][1],
        boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
        multipliers=list(p[0] for p in common.LR_SCHEDULE),
        compute_lr_on_cpu=True)

  with strategy_scope:
    optimizer = common.get_optimizer(lr_schedule)
    if flags_obj.fp16_implementation == 'graph_rewrite':
      # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
      # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
      # which will ensure tf.compat.v2.keras.mixed_precision and
      # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
      # up.
      optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
          optimizer)

    # TODO(hongkuny): Remove trivial model usage and move it to benchmark.
    if flags_obj.use_trivial_model:
      model = trivial_model.trivial_model(
          imagenet_preprocessing.NUM_CLASSES)
    else:
      model = resnet_model.resnet50(
          num_classes=imagenet_preprocessing.NUM_CLASSES)

    # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
    # a valid arg for this model. Also remove as a valid flag.
    if flags_obj.force_v2_in_keras_compile is not None:
      model.compile(
          loss='sparse_categorical_crossentropy',
          optimizer=optimizer,
          metrics=(['sparse_categorical_accuracy']
                   if flags_obj.report_accuracy_metrics else None),
          run_eagerly=flags_obj.run_eagerly,
          experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)
    else:
      model.compile(
          loss='sparse_categorical_crossentropy',
          optimizer=optimizer,
          metrics=(['sparse_categorical_accuracy']
                   if flags_obj.report_accuracy_metrics else None),
          run_eagerly=flags_obj.run_eagerly)

  steps_per_epoch = (
      imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
  train_epochs = flags_obj.train_epochs

  callbacks = common.get_callbacks(steps_per_epoch,
                                   common.learning_rate_schedule)
  if flags_obj.enable_checkpoint_and_export:
    ckpt_full_path = os.path.join(flags_obj.model_dir, 'model.ckpt-{epoch:04d}')
    callbacks.append(tf.keras.callbacks.ModelCheckpoint(ckpt_full_path,
                                                        save_weights_only=True))

  # If training for multiple epochs, ignore the train_steps flag.
  if train_epochs <= 1 and flags_obj.train_steps:
    steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
    train_epochs = 1

  num_eval_steps = (
      imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size)

  validation_data = eval_input_dataset
  if flags_obj.skip_eval:
    # Only build the training graph. This reduces memory usage introduced by
    # control flow ops in layers that have different implementations for
    # training and inference (e.g., batch norm).
    if flags_obj.set_learning_phase_to_train:
      # TODO(haoyuzhang): Understand slowdown of setting learning phase when
      # not using distribution strategy.
      tf.keras.backend.set_learning_phase(1)
    num_eval_steps = None
    validation_data = None

  if not strategy and flags_obj.explicit_gpu_placement:
    # TODO(b/135607227): Add device scope automatically in Keras training loop
    # when not using distribution strategy.
    no_dist_strat_device = tf.device('/device:GPU:0')
    no_dist_strat_device.__enter__()

  history = model.fit(train_input_dataset,
                      epochs=train_epochs,
                      steps_per_epoch=steps_per_epoch,
                      callbacks=callbacks,
                      validation_steps=num_eval_steps,
                      validation_data=validation_data,
                      validation_freq=flags_obj.epochs_between_evals,
                      verbose=2)
  if flags_obj.enable_checkpoint_and_export:
    if dtype == tf.bfloat16:
      logging.warning("Keras model.save does not support bfloat16 dtype.")
    else:
      # Keras model.save assumes a float32 input signature.
      export_path = os.path.join(flags_obj.model_dir, 'saved_model')
      model.save(export_path, include_optimizer=False)

  eval_output = None
  if not flags_obj.skip_eval:
    eval_output = model.evaluate(eval_input_dataset,
                                 steps=num_eval_steps,
                                 verbose=2)

  if not strategy and flags_obj.explicit_gpu_placement:
    no_dist_strat_device.__exit__()

  stats = common.build_stats(history, eval_output, callbacks)
  return stats
Code Example #12
 def setUp(self):
     if not keras_utils.is_v2_0():
         tf.compat.v1.enable_v2_behavior()
     super(MNISTTest, self).setUp()
Code Example #13
class Tests(tf.test.TestCase):
    """Run tests for MNIST model.

  MNIST uses contrib and will not work with TF 2.0.  All tests are disabled if
  using TF 2.0.
  """
    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_mnist(self):
        classifier = make_estimator()
        classifier.train(input_fn=dummy_input_fn, steps=2)
        eval_results = classifier.evaluate(input_fn=dummy_input_fn, steps=1)

        loss = eval_results['loss']
        global_step = eval_results['global_step']
        accuracy = eval_results['accuracy']
        self.assertEqual(loss.shape, ())
        self.assertEqual(2, global_step)
        self.assertEqual(accuracy.shape, ())

        input_fn = lambda: tf.random.uniform([3, 784])
        predictions_generator = classifier.predict(input_fn)
        for _ in range(3):
            predictions = next(predictions_generator)
            self.assertEqual(predictions['probabilities'].shape, (10, ))
            self.assertEqual(predictions['classes'].shape, ())

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def mnist_model_fn_helper(self, mode, multi_gpu=False):
        features, labels = dummy_input_fn()
        image_count = features.shape[0]
        spec = mnist.model_fn(features, labels, mode, {
            'data_format': 'channels_last',
            'multi_gpu': multi_gpu
        })

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = spec.predictions
            self.assertAllEqual(predictions['probabilities'].shape,
                                (image_count, 10))
            self.assertEqual(predictions['probabilities'].dtype, tf.float32)
            self.assertAllEqual(predictions['classes'].shape, (image_count, ))
            self.assertEqual(predictions['classes'].dtype, tf.int64)

        if mode != tf.estimator.ModeKeys.PREDICT:
            loss = spec.loss
            self.assertAllEqual(loss.shape, ())
            self.assertEqual(loss.dtype, tf.float32)

        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = spec.eval_metric_ops
            self.assertAllEqual(eval_metric_ops['accuracy'][0].shape, ())
            self.assertAllEqual(eval_metric_ops['accuracy'][1].shape, ())
            self.assertEqual(eval_metric_ops['accuracy'][0].dtype, tf.float32)
            self.assertEqual(eval_metric_ops['accuracy'][1].dtype, tf.float32)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_mnist_model_fn_train_mode(self):
        self.mnist_model_fn_helper(tf.estimator.ModeKeys.TRAIN)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_mnist_model_fn_train_mode_multi_gpu(self):
        self.mnist_model_fn_helper(tf.estimator.ModeKeys.TRAIN, multi_gpu=True)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_mnist_model_fn_eval_mode(self):
        self.mnist_model_fn_helper(tf.estimator.ModeKeys.EVAL)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_mnist_model_fn_predict_mode(self):
        self.mnist_model_fn_helper(tf.estimator.ModeKeys.PREDICT)
Code Example #14
File: census_test.py  Project: luojiawei70/demo
class BaseTest(tf.test.TestCase):
    """Tests for Wide Deep model."""
    @classmethod
    def setUpClass(cls):  # pylint: disable=invalid-name
        super(BaseTest, cls).setUpClass()
        census_main.define_census_flags()

    def setUp(self):
        # Create temporary CSV file
        self.temp_dir = self.get_temp_dir()
        self.input_csv = os.path.join(self.temp_dir, 'test.csv')
        with tf.io.gfile.GFile(self.input_csv, 'w') as temp_csv:
            temp_csv.write(TEST_INPUT)

        with tf.io.gfile.GFile(TEST_CSV, 'r') as temp_csv:
            test_csv_contents = temp_csv.read()

        # Used for end-to-end tests.
        for fname in [census_dataset.TRAINING_FILE, census_dataset.EVAL_FILE]:
            with tf.io.gfile.GFile(os.path.join(self.temp_dir, fname),
                                   'w') as test_csv:
                test_csv.write(test_csv_contents)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_input_fn(self):
        dataset = census_dataset.input_fn(self.input_csv, 1, False, 1)
        features, labels = dataset.make_one_shot_iterator().get_next()

        with self.test_session() as sess:
            features, labels = sess.run((features, labels))

            # Compare the two features dictionaries.
            for key in TEST_INPUT_VALUES:
                self.assertTrue(key in features)
                self.assertEqual(len(features[key]), 1)
                feature_value = features[key][0]

                # Convert from bytes to string for Python 3.
                if isinstance(feature_value, bytes):
                    feature_value = feature_value.decode()

                self.assertEqual(TEST_INPUT_VALUES[key], feature_value)

            self.assertFalse(labels)

    def build_and_test_estimator(self, model_type):
        """Ensure that model trains and minimizes loss."""
        model = census_main.build_estimator(
            self.temp_dir,
            model_type,
            model_column_fn=census_dataset.build_model_columns,
            inter_op=0,
            intra_op=0)

        # Train for 1 step to initialize model and evaluate initial loss
        def get_input_fn(num_epochs, shuffle, batch_size):
            def input_fn():
                return census_dataset.input_fn(TEST_CSV,
                                               num_epochs=num_epochs,
                                               shuffle=shuffle,
                                               batch_size=batch_size)

            return input_fn

        model.train(input_fn=get_input_fn(1, True, 1), steps=1)
        initial_results = model.evaluate(input_fn=get_input_fn(1, False, 1))

        # Train for 100 epochs at batch size 3 and evaluate final loss
        model.train(input_fn=get_input_fn(100, True, 3))
        final_results = model.evaluate(input_fn=get_input_fn(1, False, 1))

        print('%s initial results:' % model_type, initial_results)
        print('%s final results:' % model_type, final_results)

        # Ensure loss has decreased, while accuracy and both AUCs have increased.
        self.assertLess(final_results['loss'], initial_results['loss'])
        self.assertGreater(final_results['auc'], initial_results['auc'])
        self.assertGreater(final_results['auc_precision_recall'],
                           initial_results['auc_precision_recall'])
        self.assertGreater(final_results['accuracy'],
                           initial_results['accuracy'])

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_wide_deep_estimator_training(self):
        self.build_and_test_estimator('wide_deep')

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_end_to_end_wide(self):
        integration.run_synthetic(main=census_main.main,
                                  tmp_root=self.get_temp_dir(),
                                  extra_flags=[
                                      '--data_dir',
                                      self.get_temp_dir(), '--model_type',
                                      'wide', '--download_if_missing=false'
                                  ],
                                  synth=False)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_end_to_end_deep(self):
        integration.run_synthetic(main=census_main.main,
                                  tmp_root=self.get_temp_dir(),
                                  extra_flags=[
                                      '--data_dir',
                                      self.get_temp_dir(), '--model_type',
                                      'deep', '--download_if_missing=false'
                                  ],
                                  synth=False)

    @unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
    def test_end_to_end_wide_deep(self):
        integration.run_synthetic(main=census_main.main,
                                  tmp_root=self.get_temp_dir(),
                                  extra_flags=[
                                      '--data_dir',
                                      self.get_temp_dir(), '--model_type',
                                      'wide_deep',
                                      '--download_if_missing=false'
                                  ],
                                  synth=False)
Code Example #15
class NcfTest(tf.test.TestCase):

  @classmethod
  def setUpClass(cls):  # pylint: disable=invalid-name
    super(NcfTest, cls).setUpClass()
    ncf_common.define_ncf_flags()

  def setUp(self):
    self.top_k_old = rconst.TOP_K
    self.num_eval_negatives_old = rconst.NUM_EVAL_NEGATIVES
    rconst.NUM_EVAL_NEGATIVES = 2

  def tearDown(self):
    rconst.NUM_EVAL_NEGATIVES = self.num_eval_negatives_old
    rconst.TOP_K = self.top_k_old

  @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
  def get_hit_rate_and_ndcg(self, predicted_scores_by_user, items_by_user,
                            top_k=rconst.TOP_K, match_mlperf=False):
    rconst.TOP_K = top_k
    rconst.NUM_EVAL_NEGATIVES = predicted_scores_by_user.shape[1] - 1
    batch_size = items_by_user.shape[0]

    users = np.repeat(np.arange(batch_size)[:, np.newaxis],
                      rconst.NUM_EVAL_NEGATIVES + 1, axis=1)
    users, items, duplicate_mask = \
      data_pipeline.BaseDataConstructor._assemble_eval_batch(
          users, items_by_user[:, -1:], items_by_user[:, :-1], batch_size)

    g = tf.Graph()
    with g.as_default():
      logits = tf.convert_to_tensor(
          predicted_scores_by_user.reshape((-1, 1)), tf.float32)
      softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype),
                                  logits], axis=1)
      duplicate_mask = tf.convert_to_tensor(duplicate_mask, tf.float32)

      metric_ops = neumf_model._get_estimator_spec_with_metrics(
          logits=logits, softmax_logits=softmax_logits,
          duplicate_mask=duplicate_mask, num_training_neg=NUM_TRAIN_NEG,
          match_mlperf=match_mlperf).eval_metric_ops

      hr = metric_ops[rconst.HR_KEY]
      ndcg = metric_ops[rconst.NDCG_KEY]

      init = [tf.compat.v1.global_variables_initializer(),
              tf.compat.v1.local_variables_initializer()]

    with self.session(graph=g) as sess:
      sess.run(init)
      return sess.run([hr[1], ndcg[1]])

  def test_hit_rate_and_ndcg(self):
    # Test with no duplicate items
    predictions = np.array([
        [2., 0., 1.],  # In top 2
        [1., 0., 2.],  # In top 1
        [2., 1., 0.],  # In top 3
        [3., 4., 2.]   # In top 3
    ])
    items = np.array([
        [2, 3, 1],
        [3, 1, 2],
        [2, 1, 3],
        [1, 3, 2],
    ])

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1)
    self.assertAlmostEqual(hr, 1 / 4)
    self.assertAlmostEqual(ndcg, 1 / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 2)
    self.assertAlmostEqual(hr, 2 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3)) / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 3)
    self.assertAlmostEqual(hr, 4 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) +
                                  2 * math.log(2) / math.log(4)) / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1,
                                          match_mlperf=True)
    self.assertAlmostEqual(hr, 1 / 4)
    self.assertAlmostEqual(ndcg, 1 / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 2,
                                          match_mlperf=True)
    self.assertAlmostEqual(hr, 2 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3)) / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 3,
                                          match_mlperf=True)
    self.assertAlmostEqual(hr, 4 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) +
                                  2 * math.log(2) / math.log(4)) / 4)

    # Test with duplicate items. In the MLPerf case, we treat the duplicates as
    # a single item. Otherwise, we treat the duplicates as separate items.
    predictions = np.array([
        [2., 2., 3., 1.],  # In top 4. MLPerf: In top 3
        [1., 0., 2., 3.],  # In top 1. MLPerf: In top 1
        [2., 3., 2., 0.],  # In top 4. MLPerf: In top 3
        [2., 4., 2., 3.]   # In top 2. MLPerf: In top 2
    ])
    items = np.array([
        [2, 2, 3, 1],
        [2, 3, 4, 1],
        [2, 3, 2, 1],
        [3, 2, 1, 4],
    ])
    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1)
    self.assertAlmostEqual(hr, 1 / 4)
    self.assertAlmostEqual(ndcg, 1 / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 2)
    self.assertAlmostEqual(hr, 2 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3)) / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 3)
    self.assertAlmostEqual(hr, 2 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3)) / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 4)
    self.assertAlmostEqual(hr, 4 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) +
                                  2 * math.log(2) / math.log(5)) / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1,
                                          match_mlperf=True)
    self.assertAlmostEqual(hr, 1 / 4)
    self.assertAlmostEqual(ndcg, 1 / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 2,
                                          match_mlperf=True)
    self.assertAlmostEqual(hr, 2 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3)) / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 3,
                                          match_mlperf=True)
    self.assertAlmostEqual(hr, 4 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) +
                                  2 * math.log(2) / math.log(4)) / 4)

    hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 4,
                                          match_mlperf=True)
    self.assertAlmostEqual(hr, 4 / 4)
    self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) +
                                  2 * math.log(2) / math.log(4)) / 4)

  _BASE_END_TO_END_FLAGS = ['-batch_size', '1044', '-train_epochs', '1']

  @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
  @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  def test_end_to_end_estimator(self):
    integration.run_synthetic(
        ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS)

  @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
  @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  def test_end_to_end_estimator_mlperf(self):
    integration.run_synthetic(
        ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS + ['-ml_perf', 'True'])

  @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  def test_end_to_end_keras_no_dist_strat(self):
    integration.run_synthetic(
        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS +
        ['-distribution_strategy', 'off'])

  @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  @unittest.skipUnless(keras_utils.is_v2_0(), 'TF 2.0 only test.')
  def test_end_to_end_keras_dist_strat(self):
    integration.run_synthetic(
        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '0'])

  @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  @unittest.skipUnless(keras_utils.is_v2_0(), 'TF 2.0 only test.')
  def test_end_to_end_keras_dist_strat_ctl(self):
    flags = (self._BASE_END_TO_END_FLAGS +
             ['-num_gpus', '0'] +
             ['-keras_use_ctl', 'True'])
    integration.run_synthetic(
        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=flags)

  @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  @unittest.skipUnless(keras_utils.is_v2_0(), 'TF 2.0 only test.')
  def test_end_to_end_keras_1_gpu_dist_strat(self):
    if context.num_gpus() < 1:
      self.skipTest(
          "{} GPUs are not available for this test. {} GPUs are available".
          format(1, context.num_gpus()))

    integration.run_synthetic(
        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '1'])

  @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  @unittest.skipUnless(keras_utils.is_v2_0(), 'TF 2.0 only test.')
  def test_end_to_end_keras_2_gpu(self):
    if context.num_gpus() < 2:
      self.skipTest(
          "{} GPUs are not available for this test. {} GPUs are available".
          format(2, context.num_gpus()))

    integration.run_synthetic(
        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '2'])
Code Example #16
 def setUp(self):
     if keras_utils.is_v2_0():
         tf.compat.v1.disable_eager_execution()
     super(GoldenBaseTest, self).setUp()
Code Example #17
class GoldenBaseTest(reference_data.BaseTest):
    """Class to ensure that reference data testing runs properly."""
    def setUp(self):
        if keras_utils.is_v2_0():
            tf.compat.v1.disable_eager_execution()
        super(GoldenBaseTest, self).setUp()

    @property
    def test_name(self):
        return "reference_data_test"

    def _uniform_random_ops(self,
                            test=False,
                            wrong_name=False,
                            wrong_shape=False,
                            bad_seed=False,
                            bad_function=False):
        """Tests number generation and failure modes.

    This test is of a very simple graph: the generation of a 1x1 random tensor.
    However, it is also used to confirm that the tests are actually checking
    properly by failing in predefined ways.

    Args:
      test: Whether or not to run as a test case.
      wrong_name: Whether to assign the wrong name to the tensor.
      wrong_shape: Whether to create a tensor with the wrong shape.
      bad_seed: Whether or not to perturb the random seed.
      bad_function: Whether to perturb the correctness function.
    """
        name = "uniform_random"

        g = tf.Graph()
        with g.as_default():
            seed = self.name_to_seed(name)
            seed = seed + 1 if bad_seed else seed
            tf.compat.v1.set_random_seed(seed)
            tensor_name = "wrong_tensor" if wrong_name else "input_tensor"
            tensor_shape = (1, 2) if wrong_shape else (1, 1)
            input_tensor = tf.compat.v1.get_variable(
                tensor_name,
                dtype=tf.float32,
                initializer=tf.random.uniform(tensor_shape, maxval=1))

        def correctness_function(tensor_result):
            result = float(tensor_result[0, 0])
            result = result + 0.1 if bad_function else result
            return [result]

        self._save_or_test_ops(name=name,
                               graph=g,
                               ops_to_eval=[input_tensor],
                               test=test,
                               correctness_function=correctness_function)

    def _dense_ops(self, test=False):
        name = "dense"

        g = tf.Graph()
        with g.as_default():
            tf.compat.v1.set_random_seed(self.name_to_seed(name))
            input_tensor = tf.compat.v1.get_variable(
                "input_tensor",
                dtype=tf.float32,
                initializer=tf.random.uniform((1, 2), maxval=1))
            layer = tf.compat.v1.layers.dense(inputs=input_tensor, units=4)
            layer = tf.compat.v1.layers.dense(inputs=layer, units=1)

        self._save_or_test_ops(
            name=name,
            graph=g,
            ops_to_eval=[layer],
            test=test,
            correctness_function=self.default_correctness_function)

    def test_uniform_random(self):
        self._uniform_random_ops(test=True)

    def test_tensor_name_error(self):
        with self.assertRaises(AssertionError):
            self._uniform_random_ops(test=True, wrong_name=True)

    @unittest.skipIf(keras_utils.is_v2_0(),
                     "TODO:(b/136010138) Fails on TF 2.0.")
    def test_tensor_shape_error(self):
        with self.assertRaises(AssertionError):
            self._uniform_random_ops(test=True, wrong_shape=True)

    def test_incorrectness_function(self):
        with self.assertRaises(AssertionError):
            self._uniform_random_ops(test=True, bad_function=True)

    def test_dense(self):
        self._dense_ops(test=True)

    def regenerate(self):
        self._uniform_random_ops(test=False)
        self._dense_ops(test=False)
Code Example #18
def run_ncf(_):
  """Run NCF training and eval with Keras."""

  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  if FLAGS.seed is not None:
    print("Setting tf seed")
    tf.random.set_seed(FLAGS.seed)

  model_helpers.apply_clean(FLAGS)

  if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
    policy = tf.keras.mixed_precision.experimental.Policy(
        "mixed_float16",
        loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic"))
    tf.keras.mixed_precision.experimental.set_policy(policy)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      tpu_address=FLAGS.tpu)

  params = ncf_common.parse_flags(FLAGS)
  params["distribute_strategy"] = strategy

  if not keras_utils.is_v2_0() and strategy is not None:
    logging.error("NCF Keras only works with distribution strategy in TF 2.0")
    return
  if (params["keras_use_ctl"] and (
      not keras_utils.is_v2_0() or strategy is None)):
    logging.error(
        "Custom training loop only works with tensorflow 2.0 and dist strat.")
    return
  if params["use_tpu"] and not params["keras_use_ctl"]:
    logging.error("Custom training loop must be used when using TPUStrategy.")
    return

  batch_size = params["batch_size"]
  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  callbacks = [time_callback]

  producer, input_meta_data = None, None
  generate_input_online = params["train_dataset_path"] is None

  if generate_input_online:
    # Start data producing thread.
    num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
    producer.start()
    per_epoch_callback = IncrementEpochCallback(producer)
    callbacks.append(per_epoch_callback)
  else:
    assert params["eval_dataset_path"] and params["input_meta_data_path"]
    with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
      input_meta_data = json.loads(reader.read().decode("utf-8"))
      num_users = input_meta_data["num_users"]
      num_items = input_meta_data["num_items"]

  params["num_users"], params["num_items"] = num_users, num_items

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  (train_input_dataset, eval_input_dataset,
   num_train_steps, num_eval_steps) = \
    (ncf_input_pipeline.create_ncf_input_data(
        params, producer, input_meta_data, strategy))
  steps_per_epoch = None if generate_input_online else num_train_steps

  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])
    if FLAGS.fp16_implementation == "graph_rewrite":
      optimizer = \
        tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer,
            loss_scale=flags_core.get_loss_scale(FLAGS,
                                                 default_for_fp16="dynamic"))
    elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]:
      # When keras_use_ctl is False, Model.fit() automatically applies loss
      # scaling instead, so we don't need to create a LossScaleOptimizer.
      optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
          optimizer,
          tf.keras.mixed_precision.experimental.global_policy().loss_scale)

    if params["keras_use_ctl"]:
      train_loss, eval_results = run_ncf_custom_training(
          params,
          strategy,
          keras_model,
          optimizer,
          callbacks,
          train_input_dataset,
          eval_input_dataset,
          num_train_steps,
          num_eval_steps,
          generate_input_online=generate_input_online)
    else:
      keras_model.compile(optimizer=optimizer, run_eagerly=FLAGS.run_eagerly)

      if not FLAGS.ml_perf:
        # Create Tensorboard summary and checkpoint callbacks.
        summary_dir = os.path.join(FLAGS.model_dir, "summaries")
        summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
        checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint")
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            checkpoint_path, save_weights_only=True)

        callbacks += [summary_callback, checkpoint_callback]

      history = keras_model.fit(
          train_input_dataset,
          epochs=FLAGS.train_epochs,
          steps_per_epoch=steps_per_epoch,
          callbacks=callbacks,
          validation_data=eval_input_dataset,
          validation_steps=num_eval_steps,
          verbose=2)

      logging.info("Training done. Start evaluating")

      eval_loss_and_metrics = keras_model.evaluate(
          eval_input_dataset, steps=num_eval_steps, verbose=2)

      logging.info("Keras evaluation is done.")

      # Keras evaluate() API returns scalar loss and metric values from
      # evaluation as a list. Here, the returned list would contain
      # [evaluation loss, hr sum, hr count].
      eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2]

      # Format evaluation result into [eval loss, eval hit accuracy].
      eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

      if history and history.history:
        train_history = history.history
        train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats
Code Example #19
def run_ncf(_):
    """Run NCF training and eval with Keras."""

    keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    # TODO(seemuch): Support different train and eval batch sizes
    if FLAGS.eval_batch_size != FLAGS.batch_size:
        logging.warning(
            "The Keras implementation of NCF currently does not support batch_size "
            "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
            "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
        FLAGS.eval_batch_size = FLAGS.batch_size

    params = ncf_common.parse_flags(FLAGS)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus)
    params["distribute_strategy"] = strategy

    if (params["keras_use_ctl"]
            and (not keras_utils.is_v2_0() or strategy is None)):
        logging.error(
            "Custom training loop only works with tensorflow 2.0 and dist strat."
        )
        return

    # ncf_common rounds eval_batch_size (this is needed due to a reshape during
    # eval). This carries that rounding over to batch_size as well. This is the
    # per-device batch size.
    params["batch_size"] = params["eval_batch_size"]
    batch_size = params["batch_size"]

    num_users, num_items, num_train_steps, num_eval_steps, producer = (
        ncf_common.get_inputs(params))

    params["num_users"], params["num_items"] = num_users, num_items
    producer.start()
    model_helpers.apply_clean(flags.FLAGS)

    batches_per_step = params["batches_per_step"]
    train_input_dataset, eval_input_dataset = _get_train_and_eval_data(
        producer, params)
    # For distributed training, the dataset must call batch(). The parameter of
    # batch() here is the number of replicas involved, so that each replica
    # evenly gets a slice of data.
    # drop_remainder=True makes the batch call return a fixed shape instead of
    # None, which prevents an expensive broadcast during weighted_loss.
    train_input_dataset = train_input_dataset.batch(batches_per_step,
                                                    drop_remainder=True)
    eval_input_dataset = eval_input_dataset.batch(batches_per_step,
                                                  drop_remainder=True)

    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    per_epoch_callback = IncrementEpochCallback(producer)
    callbacks = [per_epoch_callback, time_callback]

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)

    with distribution_utils.get_strategy_scope(strategy):
        keras_model = _get_keras_model(params)
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=params["learning_rate"],
            beta_1=params["beta1"],
            beta_2=params["beta2"],
            epsilon=params["epsilon"])

    if params["keras_use_ctl"]:
        loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
        train_input_iterator = strategy.make_dataset_iterator(
            train_input_dataset)
        eval_input_iterator = strategy.make_dataset_iterator(
            eval_input_dataset)

        @tf.function
        def train_step():
            """Called once per step to train the model."""
            def step_fn(features):
                """Computes the loss and applies gradients per replica."""
                with tf.GradientTape() as tape:
                    softmax_logits = keras_model(features)
                    labels = features[rconst.TRAIN_LABEL_KEY]
                    loss = loss_object(
                        labels,
                        softmax_logits,
                        sample_weight=features[rconst.VALID_POINT_MASK])
                    loss *= (1.0 /
                             (batch_size * strategy.num_replicas_in_sync))

                grads = tape.gradient(loss, keras_model.trainable_variables)
                # Converting gradients to dense form helps performance on GPU for NCF.
                grads = neumf_model.sparse_to_dense_grads(
                    list(zip(grads, keras_model.trainable_variables)))
                optimizer.apply_gradients(grads)
                return loss

            per_replica_losses = strategy.experimental_run(
                step_fn, train_input_iterator)
            mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses,
                                        axis=None)
            return mean_loss

        @tf.function
        def eval_step():
            """Called once per eval step to compute eval metrics."""
            def step_fn(features):
                """Computes eval metrics per replica."""
                softmax_logits = keras_model(features)
                in_top_k, metric_weights = metric_fn(
                    softmax_logits, features[rconst.DUPLICATE_MASK], params)
                hr_sum = tf.reduce_sum(in_top_k * metric_weights)
                hr_count = tf.reduce_sum(metric_weights)
                return hr_sum, hr_count

            per_replica_hr_sum, per_replica_hr_count = (
                strategy.experimental_run(step_fn, eval_input_iterator))
            hr_sum = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                     per_replica_hr_sum,
                                     axis=None)
            hr_count = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                       per_replica_hr_count,
                                       axis=None)
            return hr_sum, hr_count

        time_callback.on_train_begin()
        for epoch in range(FLAGS.train_epochs):
            per_epoch_callback.on_epoch_begin(epoch)
            train_input_iterator.initialize()
            train_loss = 0
            for step in range(num_train_steps):
                time_callback.on_batch_begin(step + epoch * num_train_steps)
                train_loss += train_step()
                time_callback.on_batch_end(step + epoch * num_train_steps)
            train_loss /= num_train_steps
            logging.info("Done training epoch %s, epoch loss=%s.", epoch + 1,
                         train_loss)
            eval_input_iterator.initialize()
            hr_sum = 0
            hr_count = 0
            for _ in range(num_eval_steps):
                step_hr_sum, step_hr_count = eval_step()
                hr_sum += step_hr_sum
                hr_count += step_hr_count
            logging.info("Done eval epoch %s, hr=%s.", epoch + 1,
                         hr_sum / hr_count)

            if (FLAGS.early_stopping
                    and float(hr_sum / hr_count) > params["hr_threshold"]):
                break

        time_callback.on_train_end()
        eval_results = [None, hr_sum / hr_count]

    else:
        with distribution_utils.get_strategy_scope(strategy):

            keras_model.compile(optimizer=optimizer,
                                run_eagerly=FLAGS.run_eagerly)

            history = keras_model.fit(train_input_dataset,
                                      epochs=FLAGS.train_epochs,
                                      callbacks=callbacks,
                                      validation_data=eval_input_dataset,
                                      validation_steps=num_eval_steps,
                                      verbose=2)

            logging.info("Training done. Start evaluating")

            eval_results = keras_model.evaluate(eval_input_dataset,
                                                steps=num_eval_steps,
                                                verbose=2)

            logging.info("Keras evaluation is done.")

        if history and history.history:
            train_history = history.history
            train_loss = train_history["loss"][-1]

    stats = build_stats(train_loss, eval_results, time_callback)
    return stats
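
The custom-training-loop branch above scales each replica's SUM-reduced loss by the global batch size and then SUM-reduces the per-replica losses. Below is a minimal, self-contained sketch of the same pattern, assuming a generic dense model and a MirroredStrategy, and using the current strategy.run API rather than the experimental_run/iterator API shown above:

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
GLOBAL_BATCH_SIZE = 64 * strategy.num_replicas_in_sync

with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.MeanSquaredError(
        reduction=tf.keras.losses.Reduction.SUM)

@tf.function
def distributed_train_step(dist_inputs):
    def step_fn(inputs):
        features, labels = inputs
        with tf.GradientTape() as tape:
            predictions = model(features, training=True)
            # Dividing the summed loss by the global batch size makes the
            # cross-replica SUM equal to the mean loss over the full batch.
            loss = loss_object(labels, predictions) / GLOBAL_BATCH_SIZE
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss

    per_replica_losses = strategy.run(step_fn, args=(dist_inputs,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM,
                           per_replica_losses, axis=None)

Feeding this step function with a dataset distributed via strategy.experimental_distribute_dataset reproduces the train_step/strategy.reduce flow used by the NCF example.
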
コード例 #20
0
def setUp(self):
    super(CtlImagenetTest, self).setUp()
    if not keras_utils.is_v2_0():
        tf.compat.v1.enable_v2_behavior()
    imagenet_preprocessing.NUM_IMAGES['validation'] = 4
コード例 #21
0
class BaseTest(tf.test.TestCase):
    """Tests for Wide Deep model."""
    @classmethod
    def setUpClass(cls):  # pylint: disable=invalid-name
        super(BaseTest, cls).setUpClass()
        train_higgs.define_train_higgs_flags()

    def setUp(self):
        # Create temporary CSV file
        self.data_dir = self.get_temp_dir()
        data = pd.read_csv(TEST_CSV,
                           dtype=np.float32,
                           names=["c%02d" % i for i in range(29)]).as_matrix()
        self.input_npz = os.path.join(self.data_dir, train_higgs.NPZ_FILE)
        # numpy.savez doesn't accept a gfile.GFile, so write to a local temp
        # file first and copy it over.
        tmpfile = tempfile.NamedTemporaryFile()
        np.savez_compressed(tmpfile, data=data)
        tf.io.gfile.copy(tmpfile.name, self.input_npz)

    @unittest.skipIf(keras_utils.is_v2_0(), "TF 1.0 only test.")
    def test_read_higgs_data(self):
        """Tests read_higgs_data() function."""
        # Error when a wrong data_dir is given.
        with self.assertRaisesRegexp(RuntimeError, "Error loading data.*"):
            train_data, eval_data = train_higgs.read_higgs_data(
                self.data_dir + "non-existing-path",
                train_start=0,
                train_count=15,
                eval_start=15,
                eval_count=5)

        # Loading fine with the correct data_dir.
        train_data, eval_data = train_higgs.read_higgs_data(self.data_dir,
                                                            train_start=0,
                                                            train_count=15,
                                                            eval_start=15,
                                                            eval_count=5)
        self.assertEqual((15, 29), train_data.shape)
        self.assertEqual((5, 29), eval_data.shape)

    @unittest.skipIf(keras_utils.is_v2_0(), "TF 1.0 only test.")
    def test_make_inputs_from_np_arrays(self):
        """Tests make_inputs_from_np_arrays() function."""
        train_data, _ = train_higgs.read_higgs_data(self.data_dir,
                                                    train_start=0,
                                                    train_count=15,
                                                    eval_start=15,
                                                    eval_count=5)
        (input_fn, feature_names,
         feature_columns) = train_higgs.make_inputs_from_np_arrays(
             features_np=train_data[:, 1:], label_np=train_data[:, 0:1])

        # Check feature_names.
        self.assertAllEqual(feature_names,
                            ["feature_%02d" % (i + 1) for i in range(28)])

        # Check feature columns.
        self.assertEqual(28, len(feature_columns))
        bucketized_column_type = type(
            tf.feature_column.bucketized_column(
                tf.feature_column.numeric_column("feature_01"),
                boundaries=[0, 1, 2]))  # dummy boundaries.
        for feature_column in feature_columns:
            self.assertIsInstance(feature_column, bucketized_column_type)
            # At least 2 boundaries.
            self.assertGreaterEqual(len(feature_column.boundaries), 2)
        # Tests that the source column names of the bucketized columns match.
        self.assertAllEqual(
            feature_names, [col.source_column.name for col in feature_columns])

        # Check features.
        features, labels = input_fn().make_one_shot_iterator().get_next()
        with tf.Session() as sess:
            features, labels = sess.run((features, labels))
        self.assertIsInstance(features, dict)
        self.assertAllEqual(feature_names, sorted(features.keys()))
        self.assertAllEqual([[15, 1]] * 28,
                            [features[name].shape for name in feature_names])
        # Validate actual values of some features.
        self.assertAllClose([
            0.869293, 0.907542, 0.798834, 1.344384, 1.105009, 1.595839,
            0.409391, 0.933895, 1.405143, 1.176565, 0.945974, 0.739356,
            1.384097, 1.383548, 1.343652
        ], np.squeeze(features[feature_names[0]], 1))
        self.assertAllClose([
            -0.653674, -0.213641, 1.540659, -0.676015, 1.020974, 0.643109,
            -1.038338, -2.653732, 0.567342, 0.534315, 0.720819, -0.481741,
            1.409523, -0.307865, 1.474605
        ], np.squeeze(features[feature_names[10]], 1))

    @unittest.skipIf(keras_utils.is_v2_0(), "TF 1.0 only test.")
    def test_end_to_end(self):
        """Tests end-to-end running."""
        model_dir = os.path.join(self.get_temp_dir(), "model")
        integration.run_synthetic(main=train_higgs.main,
                                  tmp_root=self.get_temp_dir(),
                                  extra_flags=[
                                      "--data_dir",
                                      self.data_dir,
                                      "--model_dir",
                                      model_dir,
                                      "--n_trees",
                                      "5",
                                      "--train_start",
                                      "0",
                                      "--train_count",
                                      "12",
                                      "--eval_start",
                                      "12",
                                      "--eval_count",
                                      "8",
                                  ],
                                  synth=False,
                                  train_epochs=None,
                                  epochs_between_evals=None)
        self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))

    @unittest.skipIf(keras_utils.is_v2_0(), "TF 1.0 only test.")
    def test_end_to_end_with_export(self):
        """Tests end-to-end running."""
        model_dir = os.path.join(self.get_temp_dir(), "model")
        export_dir = os.path.join(self.get_temp_dir(), "export")
        integration.run_synthetic(main=train_higgs.main,
                                  tmp_root=self.get_temp_dir(),
                                  extra_flags=[
                                      "--data_dir",
                                      self.data_dir,
                                      "--model_dir",
                                      model_dir,
                                      "--export_dir",
                                      export_dir,
                                      "--n_trees",
                                      "5",
                                      "--train_start",
                                      "0",
                                      "--train_count",
                                      "12",
                                      "--eval_start",
                                      "12",
                                      "--eval_count",
                                      "8",
                                  ],
                                  synth=False,
                                  train_epochs=None,
                                  epochs_between_evals=None)
        self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))
        self.assertTrue(tf.gfile.Exists(os.path.join(export_dir)))
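
make_inputs_from_np_arrays itself is not shown in this listing. As a rough sketch of what the test above asserts on, bucketized feature columns can be built from a NumPy feature matrix using quantile boundaries; the quantile logic below is an assumption for illustration, not the models-repo implementation:

import numpy as np
import tensorflow as tf

def make_bucketized_columns(features_np, num_buckets=16):
    """Builds one bucketized_column per feature from quantile boundaries."""
    num_features = features_np.shape[1]
    feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]
    columns = []
    for i, name in enumerate(feature_names):
        # Interior quantiles of the column serve as bucket boundaries.
        percentiles = np.linspace(0, 100, num_buckets + 1)[1:-1]
        boundaries = np.unique(
            np.percentile(features_np[:, i], percentiles)).tolist()
        columns.append(
            tf.feature_column.bucketized_column(
                tf.feature_column.numeric_column(name, dtype=tf.float32),
                boundaries=boundaries))
    return feature_names, columns
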
コード例 #22
0
ファイル: ncf_keras_main.py プロジェクト: znznznznzn/models
def run_ncf(_):
    """Run NCF training and eval with Keras."""

    keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    # TODO(seemuch): Support different train and eval batch sizes
    if FLAGS.eval_batch_size != FLAGS.batch_size:
        logging.warning(
            "The Keras implementation of NCF currently does not support batch_size "
            "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
            "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
        FLAGS.eval_batch_size = FLAGS.batch_size

    params = ncf_common.parse_flags(FLAGS)
    model_helpers.apply_clean(flags.FLAGS)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus)
    params["distribute_strategy"] = strategy

    if not keras_utils.is_v2_0() and strategy is not None:
        logging.error(
            "NCF Keras only works with distribution strategy in TF 2.0")
        return

    if (params["keras_use_ctl"]
            and (not keras_utils.is_v2_0() or strategy is None)):
        logging.error(
            "Custom training loop only works with tensorflow 2.0 and dist strat."
        )
        return

    # ncf_common rounds eval_batch_size (this is needed due to a reshape during
    # eval). This carries that rounding over to batch_size as well; it is the
    # per-device batch size.
    params["batch_size"] = params["eval_batch_size"]
    batch_size = params["batch_size"]

    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        # Start data producing thread.
        num_users, num_items, num_train_steps, num_eval_steps, producer = (
            ncf_common.get_inputs(params))
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items
    (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = \
        (ncf_input_pipeline.create_ncf_input_data(
            params, producer, input_meta_data))
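    # With online input generation each epoch's dataset holds exactly one
    # epoch of examples, so Keras can infer the epoch length; pre-generated
    # TFRecord input spans multiple epochs, so the step count must be given.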
    steps_per_epoch = None if generate_input_online else num_train_steps

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)
    with distribution_utils.get_strategy_scope(strategy):
        keras_model = _get_keras_model(params)
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=params["learning_rate"],
            beta_1=params["beta1"],
            beta_2=params["beta2"],
            epsilon=params["epsilon"])

    if params["keras_use_ctl"]:
        loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            reduction="sum", from_logits=True)
        train_input_iterator = strategy.make_dataset_iterator(
            train_input_dataset)
        eval_input_iterator = strategy.make_dataset_iterator(
            eval_input_dataset)

        @tf.function
        def train_step():
            """Called once per step to train the model."""
            def step_fn(features):
                """Computes loss and applied gradient per replica."""
                with tf.GradientTape() as tape:
                    softmax_logits = keras_model(features)
                    labels = features[rconst.TRAIN_LABEL_KEY]
                    loss = loss_object(
                        labels,
                        softmax_logits,
                        sample_weight=features[rconst.VALID_POINT_MASK])
                    loss *= (1.0 /
                             (batch_size * strategy.num_replicas_in_sync))

                grads = tape.gradient(loss, keras_model.trainable_variables)
                # Converting gradients to dense form improves GPU performance
                # for NCF.
                grads = neumf_model.sparse_to_dense_grads(
                    list(zip(grads, keras_model.trainable_variables)))
                optimizer.apply_gradients(grads)
                return loss

            per_replica_losses = strategy.experimental_run(
                step_fn, train_input_iterator)
            mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses,
                                        axis=None)
            return mean_loss

        @tf.function
        def eval_step():
            """Called once per eval step to compute eval metrics."""
            def step_fn(features):
                """Computes eval metrics per replica."""
                softmax_logits = keras_model(features)
                in_top_k, metric_weights = metric_fn(
                    softmax_logits, features[rconst.DUPLICATE_MASK], params)
                hr_sum = tf.reduce_sum(in_top_k * metric_weights)
                hr_count = tf.reduce_sum(metric_weights)
                return hr_sum, hr_count

            per_replica_hr_sum, per_replica_hr_count = (
                strategy.experimental_run(step_fn, eval_input_iterator))
            hr_sum = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                     per_replica_hr_sum,
                                     axis=None)
            hr_count = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                       per_replica_hr_count,
                                       axis=None)
            return hr_sum, hr_count

        time_callback.on_train_begin()
        for epoch in range(FLAGS.train_epochs):
            for cb in callbacks:
                cb.on_epoch_begin(epoch)

            # Because the NCF dataset is sampled with randomness, not
            # repeating data elements across epochs has a significant impact
            # on convergence, so the offline-generated TFRecord files contain
            # data for all epochs. We therefore do not need to re-initialize
            # the dataset iterator when reading from TFRecord files.
            if generate_input_online:
                train_input_iterator.initialize()

            train_loss = 0
            for step in range(num_train_steps):
                time_callback.on_batch_begin(step + epoch * num_train_steps)
                train_loss += train_step()
                time_callback.on_batch_end(step + epoch * num_train_steps)
            train_loss /= num_train_steps
            logging.info("Done training epoch %s, epoch loss=%s.", epoch + 1,
                         train_loss)
            eval_input_iterator.initialize()
            hr_sum = 0
            hr_count = 0
            for _ in range(num_eval_steps):
                step_hr_sum, step_hr_count = eval_step()
                hr_sum += step_hr_sum
                hr_count += step_hr_count
            logging.info("Done eval epoch %s, hr=%s.", epoch + 1,
                         hr_sum / hr_count)

            if (FLAGS.early_stopping
                    and float(hr_sum / hr_count) > params["hr_threshold"]):
                break

        time_callback.on_train_end()
        eval_results = [None, hr_sum / hr_count]

    else:
        with distribution_utils.get_strategy_scope(strategy):

            keras_model.compile(
                optimizer=optimizer,
                run_eagerly=FLAGS.run_eagerly,
                run_distributed=FLAGS.force_v2_in_keras_compile)

            history = keras_model.fit(train_input_dataset,
                                      epochs=FLAGS.train_epochs,
                                      steps_per_epoch=steps_per_epoch,
                                      callbacks=callbacks,
                                      validation_data=eval_input_dataset,
                                      validation_steps=num_eval_steps,
                                      verbose=2)

            logging.info("Training done. Start evaluating")

            eval_results = keras_model.evaluate(eval_input_dataset,
                                                steps=num_eval_steps,
                                                verbose=2)

            logging.info("Keras evaluation is done.")

        if history and history.history:
            train_history = history.history
            train_loss = train_history["loss"][-1]

    stats = build_stats(train_loss, eval_results, time_callback)
    return stats
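
CustomEarlyStopping is defined elsewhere in ncf_keras_main.py. A minimal sketch of such a callback, which stops training once a monitored validation metric reaches a desired value (the exact behaviour of the original is assumed here):

import tensorflow as tf

class ThresholdEarlyStopping(tf.keras.callbacks.Callback):
    """Stops training once `monitor` reaches `desired_value`."""

    def __init__(self, monitor, desired_value):
        super(ThresholdEarlyStopping, self).__init__()
        self.monitor = monitor
        self.desired_value = desired_value

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get(self.monitor)
        if current is not None and current >= self.desired_value:
            self.model.stop_training = True
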
コード例 #23
0
class TransformerTaskTest(tf.test.TestCase):
    local_flags = None

    def setUp(self):
        temp_dir = self.get_temp_dir()
        if TransformerTaskTest.local_flags is None:
            misc.define_transformer_flags()
            # Loads flags; the argv list cannot be empty.
            flags.FLAGS(['foo'])
            TransformerTaskTest.local_flags = flagsaver.save_flag_values()
        else:
            flagsaver.restore_flag_values(TransformerTaskTest.local_flags)
        FLAGS.model_dir = os.path.join(temp_dir, FIXED_TIMESTAMP)
        FLAGS.param_set = 'tiny'
        FLAGS.use_synthetic_data = True
        FLAGS.steps_between_evals = 1
        FLAGS.train_steps = 2
        FLAGS.validation_steps = 1
        FLAGS.batch_size = 8
        FLAGS.num_gpus = 1
        FLAGS.distribution_strategy = 'off'
        FLAGS.dtype = 'fp32'
        self.model_dir = FLAGS.model_dir
        self.temp_dir = temp_dir
        self.vocab_file = os.path.join(temp_dir, 'vocab')
        self.vocab_size = misc.get_model_params(FLAGS.param_set,
                                                0)['vocab_size']
        self.bleu_source = os.path.join(temp_dir, 'bleu_source')
        self.bleu_ref = os.path.join(temp_dir, 'bleu_ref')
        self.orig_policy = tf.keras.mixed_precision.experimental.global_policy(
        )

    def tearDown(self):
        tf.keras.mixed_precision.experimental.set_policy(self.orig_policy)

    def _assert_exists(self, filepath):
        self.assertTrue(os.path.exists(filepath))

    def test_train(self):
        t = tm.TransformerTask(FLAGS)
        t.train()

    @unittest.skipUnless(keras_utils.is_v2_0(), 'TF 2.0 only test.')
    def test_train_static_batch(self):
        FLAGS.static_batch = True
        t = tm.TransformerTask(FLAGS)
        t.train()

    @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
    @unittest.skipUnless(keras_utils.is_v2_0(), 'TF 2.0 only test.')
    def test_train_1_gpu_with_dist_strat(self):
        FLAGS.distribution_strategy = 'one_device'
        t = tm.TransformerTask(FLAGS)
        t.train()

    @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
    @unittest.skipUnless(keras_utils.is_v2_0(), 'TF 2.0 only test.')
    def test_train_2_gpu(self):
        if context.num_gpus() < 2:
            self.skipTest(
                '{} GPUs are not available for this test. {} GPUs are available'
                .format(2, context.num_gpus()))
        FLAGS.distribution_strategy = 'mirrored'
        FLAGS.num_gpus = 2
        FLAGS.param_set = 'base'
        t = tm.TransformerTask(FLAGS)
        t.train()

    @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
    @unittest.skipUnless(keras_utils.is_v2_0(), 'TF 2.0 only test.')
    def test_train_2_gpu_fp16(self):
        if context.num_gpus() < 2:
            self.skipTest(
                '{} GPUs are not available for this test. {} GPUs are available'
                .format(2, context.num_gpus()))
        FLAGS.distribution_strategy = 'mirrored'
        FLAGS.num_gpus = 2
        FLAGS.param_set = 'base'
        FLAGS.dtype = 'fp16'
        t = tm.TransformerTask(FLAGS)
        t.train()

    def _prepare_files_and_flags(self, *extra_flags):
        # Make log dir.
        if not os.path.exists(self.temp_dir):
            os.makedirs(self.temp_dir)

        # Fake vocab, bleu_source and bleu_ref.
        tokens = [
            "'<pad>'", "'<EOS>'", "'_'", "'a'", "'b'", "'c'", "'d'", "'a_'",
            "'b_'", "'c_'", "'d_'"
        ]
        tokens += [
            "'{}'".format(i) for i in range(self.vocab_size - len(tokens))
        ]
        _generate_file(self.vocab_file, tokens)
        _generate_file(self.bleu_source, ['a b', 'c d'])
        _generate_file(self.bleu_ref, ['a b', 'd c'])

        # Update flags.
        update_flags = [
            'ignored_program_name',
            '--vocab_file={}'.format(self.vocab_file),
            '--bleu_source={}'.format(self.bleu_source),
            '--bleu_ref={}'.format(self.bleu_ref),
        ]
        if extra_flags:
            update_flags.extend(extra_flags)
        FLAGS(update_flags)

    def test_predict(self):
        self._prepare_files_and_flags()
        t = tm.TransformerTask(FLAGS)
        t.predict()

    def test_predict_fp16(self):
        self._prepare_files_and_flags('--dtype=fp16')
        t = tm.TransformerTask(FLAGS)
        t.predict()

    def test_eval(self):
        self._prepare_files_and_flags()
        t = tm.TransformerTask(FLAGS)
        t.eval()
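
The setUp above parses flags once and then saves/restores them with flagsaver so that per-test FLAGS mutations do not leak between tests. A standalone sketch of the pattern, with a hypothetical batch_size flag defined purely for illustration:

from absl import flags
from absl.testing import flagsaver
import tensorflow as tf

FLAGS = flags.FLAGS
flags.DEFINE_integer('batch_size', 32, 'Batch size (illustrative flag).')

class FlagIsolationTest(tf.test.TestCase):
    saved_flags = None

    def setUp(self):
        super(FlagIsolationTest, self).setUp()
        if FlagIsolationTest.saved_flags is None:
            FLAGS(['prog'])  # Parse once; the argv list cannot be empty.
            FlagIsolationTest.saved_flags = flagsaver.save_flag_values()
        else:
            flagsaver.restore_flag_values(FlagIsolationTest.saved_flags)

    def test_override_is_isolated(self):
        FLAGS.batch_size = 8  # Restored to 32 before the next test runs.
        self.assertEqual(8, FLAGS.batch_size)
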
コード例 #24
0
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using native Keras APIs.

    Args:
        flags_obj: An object containing parsed flag values.

    Raises:
        ValueError: If fp16 is requested under TensorFlow 1, which does not
            support it.

    Returns:
        Dictionary of training and eval stats.
    """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    # Execute flag override logic for better model performance
    if flags_obj.tf_gpu_thread_mode:
        keras_utils.set_gpu_thread_mode_and_count(
            per_gpu_thread_count=flags_obj.per_gpu_thread_count,
            gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
            num_gpus=flags_obj.num_gpus,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads
        )
    common.set_cudnn_batchnorm_mode()

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale=loss_scale)
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
        if not keras_utils.is_v2_0():
            raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
    elif dtype == tf.bfloat16:
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    preprocessing_seed = 12345

    # pylint: disable=protected-access
    if flags_obj.use_synthetic_data:
        distribution_utils.set_up_synthetic_data()
        input_fn = common.get_synth_input_fn(
            height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
            width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
            num_channels=imagenet_preprocessing.NUM_CHANNELS,
            num_classes=imagenet_preprocessing.NUM_CLASSES,
            dtype=dtype,
            drop_remainder=True)
    else:
        distribution_utils.undo_set_up_synthetic_data()
        input_fn = imagenet_preprocessing.input_fn

    # When `enable_xla` is True, we always drop the remainder of the batches
    # in the dataset, as XLA-GPU doesn't support dynamic shapes.
    drop_remainder = flags_obj.enable_xla

    train_input_dataset = input_fn(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=flags_obj.train_epochs,
        parse_record_fn=imagenet_preprocessing.parse_record,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        dtype=dtype,
        drop_remainder=drop_remainder,
        random_seed=preprocessing_seed,  #addition
        num_workers=current_cluster_size(),  #addition
        worker_ID=current_rank(),  #addition
        tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
        training_dataset_cache=flags_obj.training_dataset_cache,
    )

    eval_input_dataset = None
    if not flags_obj.skip_eval:
        eval_input_dataset = input_fn(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=imagenet_preprocessing.parse_record,
            dtype=dtype,
            drop_remainder=drop_remainder)

    lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)

    # Build the KungFu optimizer by wrapping the base optimizer in
    # synchronous SGD.
    opt = common.get_optimizer(lr_schedule)
    # logging.info(opt.__dict__)
    optimizer = SynchronousSGDOptimizer(opt, reshape=False, use_locking=True)
    # Expose the wrapped optimizer's hyperparameters (e.g. learning_rate) on
    # the wrapper so Keras callbacks can read and update them.
    optimizer._hyper = opt._hyper
    # logging.info(optimizer.__dict__)

    if flags_obj.fp16_implementation == 'graph_rewrite':
        # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
        # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
        # which will ensure tf.compat.v2.keras.mixed_precision and
        # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
        # up.
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer)

    # TODO(hongkuny): Remove trivial model usage and move it to benchmark.
    if flags_obj.use_trivial_model:
        model = trivial_model.trivial_model(imagenet_preprocessing.NUM_CLASSES)
    else:
        model = resnet_model.resnet50(
            num_classes=imagenet_preprocessing.NUM_CLASSES)

    # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
    # a valid arg for this model. Also remove as a valid flag.

    metrics = ['sparse_categorical_accuracy',
               'sparse_top_k_categorical_accuracy']

    if flags_obj.force_v2_in_keras_compile is not None:
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=optimizer,
            metrics=metrics,
            run_eagerly=flags_obj.run_eagerly,
            experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)
    else:
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=metrics,
                      run_eagerly=flags_obj.run_eagerly)

    # adjust number of steps
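    # Each KungFu worker reads 1/cluster_size of the training data per epoch,
    # so the per-worker step count shrinks by the same factor.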
    cluster_size = current_cluster_size()
    steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] //
                       flags_obj.batch_size)
    steps_per_epoch = steps_per_epoch // cluster_size

    train_epochs = flags_obj.train_epochs
    callbacks = common.get_callbacks(steps_per_epoch, current_rank(),
                                     cluster_size,
                                     common.learning_rate_schedule)

    # Broadcast variables for KungFu
    callbacks.append(BroadcastGlobalVariablesCallback())

    # Checkpoint callback only on worker 0
    if flags_obj.enable_checkpoint_and_export and current_rank() == 0:
        ckpt_full_path = os.path.join(flags_obj.model_dir,
                                      'model.ckpt-{epoch:04d}')
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint(ckpt_full_path,
                                               save_weights_only=True))

    if flags_obj.train_steps:
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)

    num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)

    validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        # Only build the training graph. This reduces memory usage introduced by
        # control flow ops in layers that have different implementations for
        # training and inference (e.g., batch norm).
        if flags_obj.set_learning_phase_to_train:
            # TODO(haoyuzhang): Understand slowdown of setting learning phase when
            # not using distribution strategy.
            tf.keras.backend.set_learning_phase(1)
            num_eval_steps = None
            validation_data = None

    history = model.fit(train_input_dataset,
                        epochs=train_epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_steps=num_eval_steps,
                        validation_data=validation_data,
                        validation_freq=flags_obj.epochs_between_evals,
                        verbose=2)

    # Export the SavedModel only on worker 0.
    if flags_obj.enable_checkpoint_and_export and current_rank() == 0:
        if dtype == tf.bfloat16:
            logging.warning(
                "Keras model.save does not support bfloat16 dtype.")
        else:
            # Keras model.save assumes a float32 input signature.
            export_path = os.path.join(flags_obj.model_dir, 'saved_model')
            model.save(export_path, include_optimizer=False)

    eval_output = None
    if not flags_obj.skip_eval:
        eval_output = model.evaluate(eval_input_dataset,
                                     steps=num_eval_steps,
                                     verbose=2)

    stats = common.build_stats(history, eval_output, callbacks)
    return stats
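
This example uses the experimental mixed-precision API that shipped around TF 2.0/2.1. On recent TensorFlow releases the same intent is expressed through the stable API, with loss scaling handled by wrapping the optimizer; the following is a sketch under that assumption, not a drop-in replacement for the code above:

import tensorflow as tf

# Compute in float16 while keeping variables in float32.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(10),
    # Keep the final softmax in float32 for numerical stability.
    tf.keras.layers.Activation('softmax', dtype='float32'),
])

optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
# Dynamic loss scaling guards against float16 gradient underflow.
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizer,
              metrics=['sparse_categorical_accuracy'])
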