def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  data_path = FLAGS.data_path
  if not data_path:
    raise ValueError('Please specify the location of the data file.')

  # Load the MovieLens data and wrap it as a batched TF environment.
  env = movielens_py_environment.MovieLensPyEnvironment(
      data_path, RANK_K, BATCH_SIZE, num_movies=20)
  environment = tf_py_environment.TFPyEnvironment(env)

  # Oracle functions used by the regret and suboptimal-arms metrics below.
  optimal_reward_fn = functools.partial(
      environment_utilities.compute_optimal_reward_with_movielens_environment,
      environment=environment)
  optimal_action_fn = functools.partial(
      environment_utilities.compute_optimal_action_with_movielens_environment,
      environment=environment)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        tikhonov_weight=0.001,
        alpha=AGENT_ALPHA,
        dtype=tf.float32)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dtype=tf.float32)
  elif FLAGS.agent == 'epsGreedy':
    # Reward network driving the epsilon-greedy agent's exploitation arm.
    network = q_network.QNetwork(
        input_tensor_spec=environment.time_step_spec().observation,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)
    agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON)
  elif FLAGS.agent == 'DropoutTS':
    agent = dropout_ts_agent.DropoutThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dropout_rate=DROPOUT_RATE,
        network_layers=LAYERS,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR))

  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric])
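# Example invocation of the trainer above (illustrative; the script name is
# an assumption, but the flags match the FLAGS referenced in main()):
#
#   python train_eval_movielens.py \
#     --data_path=/path/to/movielens.data \
#     --root_dir=/tmp/movielens_training \
#     --agent=LinUCB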
def testCreateAgent(self):
  agent = dropout_thompson_sampling_agent.DropoutThompsonSamplingAgent(
      self._time_step_spec,
      self._action_spec,
      optimizer=None,
      dropout_rate=0.1,
      network_layers=(20, 20, 20))
  self.assertIsNotNone(agent.policy)
def testTrainAgent(self):
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
  agent = dropout_thompson_sampling_agent.DropoutThompsonSamplingAgent(
      self._time_step_spec,
      self._action_spec,
      optimizer=optimizer,
      dropout_rate=0.1,
      network_layers=(20, 20, 20),
      dropout_only_top_layer=False)
  observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
  # Actions index into the action spec, so they must be integer-valued.
  actions = np.array([0, 1], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, rewards)
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  # Build both train ops first; variables are initialized before evaluation.
  loss_before, _ = agent.train(experience, None)
  loss_after, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllGreater(self.evaluate(loss_before), 0)
  self.assertAllGreater(self.evaluate(loss_after), 0)
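# The tests here rely on module-level helpers not shown in this excerpt.
# Below is a minimal sketch of what they could look like, assuming the usual
# TF-Agents trajectory layout (exact shapes and the +100.0 offset for the
# "next" observation are illustrative, not the canonical implementations):
#
# from tf_agents.trajectories import policy_step
# from tf_agents.trajectories import time_step as ts
# from tf_agents.trajectories import trajectory

def _get_initial_and_final_steps(observations, rewards):
  batch_size = observations.shape[0]
  initial_step = ts.TimeStep(
      tf.constant(ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size]),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size]),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size]),
      tf.constant(observations, dtype=tf.float32))
  final_step = ts.TimeStep(
      tf.constant(ts.StepType.LAST, dtype=tf.int32, shape=[batch_size]),
      tf.constant(rewards, dtype=tf.float32),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size]),
      tf.constant(observations + 100.0, dtype=tf.float32))
  return initial_step, final_step


def _get_action_step(action):
  return policy_step.PolicyStep(action=tf.convert_to_tensor(action))


def _get_experience(initial_step, action_step, final_step):
  single_experience = trajectory.from_transition(
      initial_step, action_step, final_step)
  # Adds a time dimension of size one so the agent sees [batch, time, ...].
  return tf.nest.map_structure(
      lambda x: tf.expand_dims(tf.convert_to_tensor(x), 1),
      single_experience)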
def testAgentWithMask(self):
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
  # Observations are a (context, mask) tuple; the splitter passed to the
  # agent separates the context features from the per-action mask.
  obs_spec = (tensor_spec.TensorSpec([2], tf.float32),
              tensor_spec.TensorSpec([3], tf.int32))
  agent = dropout_thompson_sampling_agent.DropoutThompsonSamplingAgent(
      ts.time_step_spec(obs_spec),
      self._action_spec,
      optimizer=optimizer,
      observation_and_action_constraint_splitter=lambda x: (x[0], x[1]),
      dropout_rate=0.1,
      network_layers=(20, 20, 20),
      dropout_only_top_layer=False)
  # Actions index into the action spec, so they must be integer-valued.
  actions = np.array([0, 1], dtype=np.int32)
  initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
      2, 2, 3)
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  loss_before, _ = agent.train(experience, None)
  loss_after, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllGreater(self.evaluate(loss_before), 0)
  self.assertAllGreater(self.evaluate(loss_after), 0)
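# A sketch of the mask helper assumed above: observations become a
# (context, mask) tuple with an int32 mask of shape [batch_size, num_actions].
# The random context and all-ones mask are illustrative stand-ins:
def _get_initial_and_final_steps_with_action_mask(batch_size, context_dim,
                                                  num_actions):
  context = np.random.rand(batch_size, context_dim).astype(np.float32)
  mask = np.ones([batch_size, num_actions], dtype=np.int32)
  initial_step = ts.TimeStep(
      tf.constant(ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size]),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size]),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size]),
      (tf.constant(context), tf.constant(mask)))
  final_step = ts.TimeStep(
      tf.constant(ts.StepType.LAST, dtype=tf.int32, shape=[batch_size]),
      tf.constant(np.random.rand(batch_size).astype(np.float32)),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size]),
      (tf.constant(context + 100.0), tf.constant(mask)))
  return initial_step, final_step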
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  data_path = FLAGS.data_path
  if not data_path:
    raise ValueError('Please specify the location of the data file.')

  # The per-arm environment exposes per-action (movie) features; otherwise
  # each movie is a plain categorical action.
  if FLAGS.per_arm:
    env = movielens_per_arm_py_environment.MovieLensPerArmPyEnvironment(
        data_path, RANK_K, BATCH_SIZE,
        num_actions=NUM_ACTIONS,
        csv_delimiter='\t')
  else:
    env = movielens_py_environment.MovieLensPyEnvironment(
        data_path, RANK_K, BATCH_SIZE,
        num_movies=NUM_ACTIONS,
        csv_delimiter='\t')
  environment = tf_py_environment.TFPyEnvironment(env)

  # Oracle functions used by the regret and suboptimal-arms metrics below.
  optimal_reward_fn = functools.partial(
      environment_utilities.compute_optimal_reward_with_movielens_environment,
      environment=environment)
  optimal_action_fn = functools.partial(
      environment_utilities.compute_optimal_action_with_movielens_environment,
      environment=environment)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        tikhonov_weight=0.001,
        alpha=AGENT_ALPHA,
        dtype=tf.float32,
        accepts_per_arm_features=FLAGS.per_arm)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dtype=tf.float32,
        accepts_per_arm_features=FLAGS.per_arm)
  elif FLAGS.agent == 'epsGreedy':
    if FLAGS.per_arm:
      # In the per-arm setting the reward network combines global and
      # per-arm feature towers with a dot product.
      network = (
          global_and_arm_feature_network
          .create_feed_forward_dot_product_network(
              environment.time_step_spec().observation,
              global_layers=LAYERS,
              arm_layers=LAYERS))
    else:
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=LAYERS)
    agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        emit_policy_info='predicted_rewards_mean',
        info_fields_to_inherit_from_greedy=['predicted_rewards_mean'])
  elif FLAGS.agent == 'DropoutTS':
    train_step_counter = tf.compat.v1.train.get_or_create_global_step()

    def dropout_fn():
      # Decay the dropout rate as 1 / (1.01 + step), floored at 0.0003.
      return tf.math.maximum(
          tf.math.reciprocal_no_nan(
              1.01 + tf.cast(train_step_counter, tf.float32)),
          0.0003)

    agent = dropout_ts_agent.DropoutThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dropout_rate=dropout_fn,
        network_layers=LAYERS,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR))

  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric])
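# Sanity check of the decaying dropout schedule in dropout_fn above (plain
# arithmetic, no new APIs): the rate starts near 1.0 and is floored at 0.0003.
#   step 0:    1 / 1.01   ~= 0.990
#   step 99:   1 / 100.01 ~= 0.010
#   step 3333+:            clipped to 0.0003, since 1 / (1.01 + step) drops
#                          below the floor around step 3332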