Example #1
def construct_estimator(model_dir, params):
    """Construct either an Estimator or TPUEstimator for NCF.

  Args:
    model_dir: The model directory for the estimator
    params: The params dict for the estimator

  Returns:
    An Estimator or TPUEstimator.
  """
    distribution = ncf_common.get_distribution_strategy(params)
    run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                        eval_distribute=distribution)

    model_fn = neumf_model.neumf_model_fn
    if params["use_xla_for_gpu"]:
        # TODO(seemuch): remove the contrib import
        from tensorflow.contrib.compiler import xla
        LOGGING.info("Using XLA for GPU for training and evaluation.")
        model_fn = xla.estimator_model_fn(model_fn)
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=model_dir,
                                       config=run_config,
                                       params=params)
    return estimator
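The example above only constructs the estimator. A minimal usage sketch follows; the driver function, the input functions, and the "train_steps"/"eval_steps" keys are assumptions for illustration, not part of the original code.

def train_and_eval_sketch(model_dir, params, train_input_fn, eval_input_fn):
    # Hypothetical driver: train_input_fn and eval_input_fn are assumed to
    # return tf.data.Dataset objects with the NCF feature layout.
    estimator = construct_estimator(model_dir, params)
    estimator.train(input_fn=train_input_fn,
                    steps=params.get("train_steps", 1000))
    return estimator.evaluate(input_fn=eval_input_fn,
                              steps=params.get("eval_steps", 100))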
Example #2
def run_ncf(_):
    """Run NCF training and eval with Keras."""
    # TODO(seemuch): Support different train and eval batch sizes
    if FLAGS.eval_batch_size != FLAGS.batch_size:
        tf.logging.warning(
            "The Keras implementation of NCF currently does not support batch_size "
            "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
            "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
        FLAGS.eval_batch_size = FLAGS.batch_size

    params = ncf_common.parse_flags(FLAGS)
    batch_size = params["batch_size"]

    # ncf_common rounds eval_batch_size (this is needed due to a reshape during
    # eval). This carries over that rounding to batch_size as well.
    params["batch_size"] = params["eval_batch_size"]

    num_users, num_items, num_train_steps, num_eval_steps, producer = (
        ncf_common.get_inputs(params))

    params["num_users"], params["num_items"] = num_users, num_items
    producer.start()
    model_helpers.apply_clean(flags.FLAGS)

    batches_per_step = params["batches_per_step"]
    train_input_dataset, eval_input_dataset = _get_train_and_eval_data(
        producer, params)
    # For distributed training, the dataset must be batched at this outer
    # level. The argument to batch() is the number of replicas involved, so
    # that each replica gets an even slice of the data.
    train_input_dataset = train_input_dataset.batch(batches_per_step)
    eval_input_dataset = eval_input_dataset.batch(batches_per_step)

    strategy = ncf_common.get_distribution_strategy(params)
    with distribution_utils.get_strategy_scope(strategy):
        keras_model = _get_keras_model(params)
        optimizer = ncf_common.get_optimizer(params)
        time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)

        keras_model.compile(loss=_keras_loss,
                            metrics=[_get_metric_fn(params)],
                            optimizer=optimizer)

        history = keras_model.fit(
            train_input_dataset,
            epochs=FLAGS.train_epochs,
            callbacks=[IncrementEpochCallback(producer), time_callback],
            verbose=2)

        tf.logging.info("Training done. Start evaluating")

        eval_results = keras_model.evaluate(eval_input_dataset,
                                            steps=num_eval_steps,
                                            verbose=2)

    tf.logging.info("Keras evaluation is done.")

    stats = build_stats(history, eval_results, time_callback)
    return stats
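The double batching above can be easier to see in isolation. The standalone sketch below reproduces only the outer-batch pattern with made-up sizes (TF 2.x eager execution assumed); the inner batch is what one replica consumes, and the outer batch delivers one slice per replica per step.

import tensorflow as tf

per_replica_batch = 4   # assumed per-replica batch size
num_replicas = 2        # plays the role of batches_per_step above

dataset = tf.data.Dataset.range(64)
dataset = dataset.batch(per_replica_batch)   # inner batch: one per replica
dataset = dataset.batch(num_replicas)        # outer batch: one element per step

for outer in dataset.take(1):
    print(outer.shape)  # (2, 4): num_replicas x per_replica_batch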
Example #3
def construct_estimator(model_dir, params):
  """Construct either an Estimator or TPUEstimator for NCF.

  Args:
    model_dir: The model directory for the estimator
    params: The params dict for the estimator

  Returns:
    An Estimator or TPUEstimator.
  """
  distribution = ncf_common.get_distribution_strategy(params)
  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      eval_distribute=distribution)

  model_fn = neumf_model.neumf_model_fn
  if params["use_xla_for_gpu"]:
    tf.logging.info("Using XLA for GPU for training and evaluation.")
    model_fn = xla.estimator_model_fn(model_fn)
  estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir,
                                     config=run_config, params=params)
  return estimator
Example #4
def construct_estimator(model_dir, params):
    """Construct either an Estimator or TPUEstimator for NCF.

    Args:
        model_dir: The model directory for the estimator
        params: The params dict for the estimator

    Returns:
        An Estimator or TPUEstimator.
    """
    distribution = ncf_common.get_distribution_strategy(params)
    run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                        eval_distribute=distribution)

    model_fn = neumf_model.neumf_model_fn
    if params["use_xla_for_gpu"]:
        tf.logging.info("Using XLA for GPU for training and evaluation.")
        model_fn = xla.estimator_model_fn(model_fn)
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=model_dir,
                                       config=run_config,
                                       params=params)
    return estimator
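ncf_common.get_distribution_strategy is not shown in these examples. As an illustration only, a MirroredStrategy is one plausible return value for a multi-GPU run, and it plugs into RunConfig exactly as above:

import tensorflow as tf

# Illustrative only: the real helper picks a strategy based on params.
distribution = tf.distribute.MirroredStrategy()
run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                    eval_distribute=distribution)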
Example #5
def run_ncf(_):
  """Run NCF training and eval with Keras."""
  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size)
        )
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)

  if params["keras_use_ctl"] and int(tf.__version__.split(".")[0]) == 1:
    logging.error(
        "Custom training loop only works with tensorflow 2.0 and above.")
    return

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well, which is the
  # per-device batch size.
  params["batch_size"] = params["eval_batch_size"]
  batch_size = params["batch_size"]

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()
  model_helpers.apply_clean(flags.FLAGS)

  batches_per_step = params["batches_per_step"]
  train_input_dataset, eval_input_dataset = _get_train_and_eval_data(producer,
                                                                     params)
  # For distributed training, the dataset must be batched at this outer
  # level. The argument to batch() is the number of replicas involved, so
  # that each replica gets an even slice of the data.
  train_input_dataset = train_input_dataset.batch(batches_per_step)
  eval_input_dataset = eval_input_dataset.batch(batches_per_step)

  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  per_epoch_callback = IncrementEpochCallback(producer)
  callbacks = [per_epoch_callback, time_callback]

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_metric_fn", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  strategy = ncf_common.get_distribution_strategy(params)
  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])

  if params["keras_use_ctl"]:
    loss_object = tf.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.SUM,
        from_logits=True)
    train_input_iterator = strategy.make_dataset_iterator(train_input_dataset)
    eval_input_iterator = strategy.make_dataset_iterator(eval_input_dataset)

    @tf.function
    def train_step():
      """Called once per step to train the model."""
      def step_fn(inputs):
        """Computes loss and applied gradient per replica."""
        features, labels = inputs
        with tf.GradientTape() as tape:
          softmax_logits = keras_model(features)
          loss = loss_object(labels, softmax_logits,
                             sample_weight=features[rconst.VALID_POINT_MASK])
          loss *= (1.0 / (batch_size*strategy.num_replicas_in_sync))

        grads = tape.gradient(loss, keras_model.trainable_variables)
        optimizer.apply_gradients(list(zip(grads,
                                           keras_model.trainable_variables)))
        return loss

      per_replica_losses = strategy.experimental_run(step_fn,
                                                     train_input_iterator)
      mean_loss = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
      return mean_loss

    @tf.function
    def eval_step():
      """Called once per eval step to compute eval metrics."""
      def step_fn(inputs):
        """Computes eval metrics per replica."""
        features, _ = inputs
        softmax_logits = keras_model(features)
        in_top_k, metric_weights = metric_fn(
            softmax_logits, features[rconst.DUPLICATE_MASK], params)
        hr_sum = tf.reduce_sum(in_top_k*metric_weights)
        hr_count = tf.reduce_sum(metric_weights)
        return hr_sum, hr_count

      per_replica_hr_sum, per_replica_hr_count = (
          strategy.experimental_run(step_fn, eval_input_iterator))
      hr_sum = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_sum, axis=None)
      hr_count = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
      return hr_sum, hr_count

    time_callback.on_train_begin()
    for epoch in range(FLAGS.train_epochs):
      per_epoch_callback.on_epoch_begin(epoch)
      train_input_iterator.initialize()
      train_loss = 0
      for step in range(num_train_steps):
        time_callback.on_batch_begin(step+epoch*num_train_steps)
        train_loss += train_step()
        time_callback.on_batch_end(step+epoch*num_train_steps)
      train_loss /= num_train_steps
      logging.info("Done training epoch %s, epoch loss=%s.",
                   epoch+1, train_loss)
      eval_input_iterator.initialize()
      hr_sum = 0
      hr_count = 0
      for _ in range(num_eval_steps):
        step_hr_sum, step_hr_count = eval_step()
        hr_sum += step_hr_sum
        hr_count += step_hr_count
      logging.info("Done eval epoch %s, hr=%s.", epoch+1, hr_sum/hr_count)

      if (FLAGS.early_stopping and
          float(hr_sum/hr_count) > params["hr_threshold"]):
        break

    time_callback.on_train_end()
    eval_results = [None, hr_sum/hr_count]

  else:
    with distribution_utils.get_strategy_scope(strategy):

      keras_model.compile(optimizer=optimizer)

      history = keras_model.fit(train_input_dataset,
                                steps_per_epoch=num_train_steps,
                                epochs=FLAGS.train_epochs,
                                callbacks=callbacks,
                                validation_data=eval_input_dataset,
                                validation_steps=num_eval_steps,
                                verbose=2)

      logging.info("Training done. Start evaluating")

      eval_results = keras_model.evaluate(
          eval_input_dataset,
          steps=num_eval_steps,
          verbose=2)

      logging.info("Keras evaluation is done.")

    if history and history.history:
      train_history = history.history
      train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats
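CustomEarlyStopping, used with --early_stopping above, is a project-specific callback that is not reproduced here. A minimal sketch in the same spirit is shown below; the constructor arguments mirror the call site, but the body is an assumption, not the original implementation.

import tensorflow as tf

class CustomEarlyStopping(tf.keras.callbacks.Callback):
    """Stops training once `monitor` reaches `desired_value` (sketch only)."""

    def __init__(self, monitor, desired_value):
        super(CustomEarlyStopping, self).__init__()
        self.monitor = monitor
        self.desired_value = desired_value

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get(self.monitor)
        if current is not None and current >= self.desired_value:
            # Ask Keras to stop at the end of this epoch.
            self.model.stop_training = True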
Example #6
def run_ncf(_):
  """Run NCF training and eval with Keras."""
  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size)
        )
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)
  batch_size = params["batch_size"]

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well.
  params["batch_size"] = params["eval_batch_size"]

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()
  model_helpers.apply_clean(flags.FLAGS)

  batches_per_step = params["batches_per_step"]
  train_input_dataset, eval_input_dataset = _get_train_and_eval_data(producer,
                                                                     params)
  # For distributed training, the dataset must be batched at this outer
  # level. The argument to batch() is the number of replicas involved, so
  # that each replica gets an even slice of the data.
  train_input_dataset = train_input_dataset.batch(batches_per_step)
  eval_input_dataset = eval_input_dataset.batch(batches_per_step)

  strategy = ncf_common.get_distribution_strategy(params)
  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)

    keras_model.compile(
        loss=_keras_loss,
        metrics=[_get_metric_fn(params)],
        optimizer=optimizer)

    history = keras_model.fit(train_input_dataset,
                              epochs=FLAGS.train_epochs,
                              callbacks=[
                                  IncrementEpochCallback(producer),
                                  time_callback],
                              verbose=2)

    logging.info("Training done. Start evaluating")

    eval_results = keras_model.evaluate(
        eval_input_dataset,
        steps=num_eval_steps,
        verbose=2)

  logging.info("Keras evaluation is done.")

  stats = build_stats(history, eval_results, time_callback)
  return stats
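keras_utils.TimeHistory is another external helper; it records per-step timing that later feeds build_stats. The sketch below is a simplified stand-in with the same (batch_size, log_steps) constructor, not the library implementation.

import time
import tensorflow as tf

class TimeHistorySketch(tf.keras.callbacks.Callback):
    """Simplified stand-in for keras_utils.TimeHistory (illustrative only)."""

    def __init__(self, batch_size, log_steps):
        super(TimeHistorySketch, self).__init__()
        self.batch_size = batch_size
        self.log_steps = log_steps
        self.batch_times = []

    def on_train_begin(self, logs=None):
        self.train_start = time.time()

    def on_batch_begin(self, batch, logs=None):
        self.batch_start = time.time()

    def on_batch_end(self, batch, logs=None):
        self.batch_times.append(time.time() - self.batch_start)
        if self.log_steps and (batch + 1) % self.log_steps == 0:
            examples_per_sec = self.batch_size / self.batch_times[-1]
            print("step %d: %.1f examples/sec" % (batch + 1, examples_per_sec))

    def on_train_end(self, logs=None):
        self.total_time = time.time() - self.train_start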