def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  data_path = FLAGS.data_path
  if not data_path:
    raise ValueError('Please specify the location of the data file.')
  env = movielens_py_environment.MovieLensPyEnvironment(
      data_path, RANK_K, BATCH_SIZE, num_movies=20)
  environment = tf_py_environment.TFPyEnvironment(env)

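  # Oracle helpers used only by the evaluation metrics below: given the current
  # observations, they return the best achievable reward / action in the
  # environment.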
  optimal_reward_fn = functools.partial(
      environment_utilities.compute_optimal_reward_with_movielens_environment,
      environment=environment)

  optimal_action_fn = functools.partial(
      environment_utilities.compute_optimal_action_with_movielens_environment,
      environment=environment)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        tikhonov_weight=0.001,
        alpha=AGENT_ALPHA,
        dtype=tf.float32)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dtype=tf.float32)
  elif FLAGS.agent == 'epsGreedy':
    network = q_network.QNetwork(
        input_tensor_spec=environment.time_step_spec().observation,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)
    agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON)
  elif FLAGS.agent == 'DropoutTS':
    agent = dropout_ts_agent.DropoutThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dropout_rate=DROPOUT_RATE,
        network_layers=LAYERS,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR))

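  # Regret and suboptimal-arms metrics compare the agent's choices against the
  # oracle functions defined above.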
  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric])
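
The main() above is an excerpt: it assumes module-level imports, command-line flags and hyperparameter constants defined elsewhere in the script. Below is a minimal sketch of that scaffolding. The flag and constant names are taken from the code above, the import paths follow the TF-Agents layout these examples appear to come from, and the default values are illustrative placeholders rather than the originals.

import functools

from absl import flags
import tensorflow as tf

from tf_agents.bandits.agents import dropout_thompson_sampling_agent as dropout_ts_agent
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent as eps_greedy_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import environment_utilities
from tf_agents.bandits.environments import movielens_py_environment
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network

flags.DEFINE_string('root_dir', '/tmp/movielens/', 'Root directory for the trainer output.')
flags.DEFINE_string('data_path', '', 'Location of the MovieLens data file.')
flags.DEFINE_string('agent', 'LinUCB',
                    'Which agent to train: LinUCB, LinTS, epsGreedy or DropoutTS.')
FLAGS = flags.FLAGS

# Illustrative hyperparameters; the original script defines its own values.
BATCH_SIZE = 8
TRAINING_LOOPS = 100
STEPS_PER_LOOP = 2
RANK_K = 20
AGENT_ALPHA = 10.0
EPSILON = 0.05
LAYERS = (50, 50, 50)
LR = 0.002
DROPOUT_RATE = 0.2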

  def testCreateAgent(self):
    agent = dropout_thompson_sampling_agent.DropoutThompsonSamplingAgent(
        self._time_step_spec,
        self._action_spec,
        optimizer=None,
        dropout_rate=0.1,
        network_layers=(20, 20, 20))
    self.assertIsNotNone(agent.policy)

  def testTrainAgent(self):
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.1)
    agent = dropout_thompson_sampling_agent.DropoutThompsonSamplingAgent(
        self._time_step_spec,
        self._action_spec,
        optimizer=optimizer,
        dropout_rate=0.1,
        network_layers=(20, 20, 20),
        dropout_only_top_layer=False)
    observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
    actions = np.array([0, 1], dtype=np.int32)
    rewards = np.array([0.5, 3.0], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps(
        observations, rewards)
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)
    loss_before, _ = agent.train(experience, None)
    loss_after, _ = agent.train(experience, None)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllGreater(self.evaluate(loss_before), 0)
    self.assertAllGreater(self.evaluate(loss_after), 0)

  def testAgentWithMask(self):
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.1)
    obs_spec = (tensor_spec.TensorSpec([2], tf.float32),
                tensor_spec.TensorSpec([3], tf.int32))
    agent = dropout_thompson_sampling_agent.DropoutThompsonSamplingAgent(
        ts.time_step_spec(obs_spec),
        self._action_spec,
        optimizer=optimizer,
        observation_and_action_constraint_splitter=lambda x: (x[0], x[1]),
        dropout_rate=0.1,
        network_layers=(20, 20, 20),
        dropout_only_top_layer=False)
    actions = np.array([0, 1], dtype=np.int32)
    initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
        2, 2, 3)
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)
    loss_before, _ = agent.train(experience, None)
    loss_after, _ = agent.train(experience, None)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllGreater(self.evaluate(loss_before), 0)
    self.assertAllGreater(self.evaluate(loss_after), 0)
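
The three tests above are excerpts from a test class: they rely on the fixtures self._time_step_spec and self._action_spec, and on helper functions (_get_initial_and_final_steps, _get_initial_and_final_steps_with_action_mask, _get_action_step, _get_experience) defined elsewhere in the original test module and not reproduced here. A minimal sketch of the fixture they assume, with spec shapes chosen to match the 2-feature observations and 2 actions used above:

import numpy as np
import tensorflow as tf

from tf_agents.bandits.agents import dropout_thompson_sampling_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts


class DropoutThompsonSamplingAgentTest(tf.test.TestCase):

  def setUp(self):
    super(DropoutThompsonSamplingAgentTest, self).setUp()
    self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
    self._time_step_spec = ts.time_step_spec(self._obs_spec)
    self._action_spec = tensor_spec.BoundedTensorSpec(
        dtype=tf.int32, shape=(), minimum=0, maximum=1)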

Example #5

def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  data_path = FLAGS.data_path
  if not data_path:
    raise ValueError('Please specify the location of the data file.')
  if FLAGS.per_arm:
    env = movielens_per_arm_py_environment.MovieLensPerArmPyEnvironment(
        data_path,
        RANK_K,
        BATCH_SIZE,
        num_actions=NUM_ACTIONS,
        csv_delimiter='\t')
  else:
    env = movielens_py_environment.MovieLensPyEnvironment(
        data_path,
        RANK_K,
        BATCH_SIZE,
        num_movies=NUM_ACTIONS,
        csv_delimiter='\t')
  environment = tf_py_environment.TFPyEnvironment(env)

  optimal_reward_fn = functools.partial(
      environment_utilities.compute_optimal_reward_with_movielens_environment,
      environment=environment)

  optimal_action_fn = functools.partial(
      environment_utilities.compute_optimal_action_with_movielens_environment,
      environment=environment)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        tikhonov_weight=0.001,
        alpha=AGENT_ALPHA,
        dtype=tf.float32,
        accepts_per_arm_features=FLAGS.per_arm)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dtype=tf.float32,
        accepts_per_arm_features=FLAGS.per_arm)
  elif FLAGS.agent == 'epsGreedy':
    if FLAGS.per_arm:
      network = (
          global_and_arm_feature_network
          .create_feed_forward_dot_product_network(
              environment.time_step_spec().observation,
              global_layers=LAYERS,
              arm_layers=LAYERS))
    else:
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=LAYERS)
    agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        emit_policy_info='predicted_rewards_mean',
        info_fields_to_inherit_from_greedy=['predicted_rewards_mean'])
  elif FLAGS.agent == 'DropoutTS':
    train_step_counter = tf.compat.v1.train.get_or_create_global_step()

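    # Dropout rate decays roughly as 1 / (train step + 1), floored at 3e-4.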
    def dropout_fn():
      return tf.math.maximum(
          tf.math.reciprocal_no_nan(1.01 +
                                    tf.cast(train_step_counter, tf.float32)),
          0.0003)

    agent = dropout_ts_agent.DropoutThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dropout_rate=dropout_fn,
        network_layers=LAYERS,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR))

  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric])
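
Like the first excerpt, this main() depends on module-level scaffolding. Relative to the sketch after the first excerpt it additionally reads FLAGS.per_arm, the NUM_ACTIONS constant, and the per-arm environment and network modules, and it would normally be launched through the usual absl entry point. A minimal sketch of those extra pieces, with illustrative defaults:

from absl import app
from absl import flags

from tf_agents.bandits.environments import movielens_per_arm_py_environment
from tf_agents.bandits.networks import global_and_arm_feature_network

flags.DEFINE_bool('per_arm', False,
                  'Whether to use the per-arm variant of the MovieLens environment.')
FLAGS = flags.FLAGS

NUM_ACTIONS = 20  # Illustrative value; the original script defines its own.


if __name__ == '__main__':
  app.run(main)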