def testActionShape(self, observation_shape, batch_size, weights,
                    inverse_temperature):
    """Check that a sampled action has static shape `[batch_size]`.

    Args:
      observation_shape: Shape used to build the observation `TensorSpec`.
      batch_size: Batch size of the dummy observation step; also the expected
        length of the action's leading dimension.
      weights: Initial value for the float32 weights variable. Its first
        dimension determines the number of actions.
      inverse_temperature: Initial value for the float32 inverse-temperature
        variable handed to the policy.
    """
    obs_spec = tensor_spec.TensorSpec(
        shape=observation_shape, dtype=tf.float32, name='observation_spec')
    ts_spec = time_step.time_step_spec(obs_spec)

    # Both policy parameters are wrapped in variables, which is why the
    # global initializer has to run before the final assertion.
    weights_var = tf.compat.v2.Variable(weights, dtype=tf.float32)
    inverse_temperature_var = tf.compat.v2.Variable(
        inverse_temperature, dtype=tf.float32)

    num_actions = tf.compat.dimension_value(weights_var.shape[0])
    discrete_action_spec = tensor_spec.BoundedTensorSpec(
        shape=(),
        dtype=tf.int32,
        minimum=0,
        maximum=num_actions - 1,
        name='action')

    policy_under_test = categorical_policy.CategoricalPolicy(
        weights=weights_var,
        time_step_spec=ts_spec,
        action_spec=discrete_action_spec,
        inverse_temperature=inverse_temperature_var)

    dummy_step = _get_dummy_observation_step(observation_shape, batch_size)
    sampled_step = policy_under_test.action(dummy_step)
    self.evaluate(tf.compat.v1.global_variables_initializer())

    static_action_shape = sampled_step.action.shape.as_list()
    self.assertAllEqual(static_action_shape, [batch_size])
def __init__(self, time_step_spec, action_spec, learning_rate, name=None):
    """Construct an `Exp3Agent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      learning_rate: A float valued scalar. Larger values make the agent
        converge on a single action more quickly; smaller values encourage
        more exploration. It is forwarded to `CategoricalPolicy` as its
        `inverse_temperature` argument.
      name: A name for this instance of `Exp3Agent`.
    """
    tf.Module.__init__(self, name=name)
    common.tf_agents_gauge.get_cell('TFABandit').set(True)

    self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
        action_spec)

    # One weight per action, all starting at zero.
    initial_weights = tf.zeros(self._num_actions)
    self._weights = tf.compat.v2.Variable(initial_weights, name='weights')
    self._learning_rate = tf.compat.v2.Variable(
        learning_rate, name='learning_rate')

    # The same categorical policy serves as both the acting and the
    # collection policy.
    exp3_policy = categorical_policy.CategoricalPolicy(
        self._weights, time_step_spec, action_spec, self._learning_rate)

    # TODO(b/127462472): consider policy=GreedyPolicy(collect_policy).
    super(Exp3Agent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=exp3_policy.action_spec,
        policy=exp3_policy,
        collect_policy=exp3_policy,
        train_sequence_length=None)
def testActionProbabilities(self, observation_shape, batch_size, weights,
                            inverse_temperature, seed):
    """Compare the policy's reported action probabilities with a softmax.

    The expected probability of each sampled action is computed here as
    `exp(logits - logsumexp(logits))` with
    `logits = inverse_temperature * weights`, and is checked against the
    log-probability the policy stores in the policy step's `info`.

    Args:
      observation_shape: Shape used to build the observation `TensorSpec`.
      batch_size: Batch size of the dummy observation step.
      weights: Per-action weights passed directly to the policy. Its first
        dimension determines the number of actions.
      inverse_temperature: Multiplier applied to `weights` to form the logits.
      seed: Seed forwarded to `policy.action`.
    """
    obs_spec = tensor_spec.TensorSpec(
        shape=observation_shape, dtype=tf.float32, name='observation_spec')
    ts_spec = time_step.time_step_spec(obs_spec)

    num_actions = tf.compat.dimension_value(weights.shape[0])
    discrete_action_spec = tensor_spec.BoundedTensorSpec(
        shape=(),
        dtype=tf.int32,
        minimum=0,
        maximum=num_actions - 1,
        name='action')

    policy_under_test = categorical_policy.CategoricalPolicy(
        weights=weights,
        time_step_spec=ts_spec,
        action_spec=discrete_action_spec,
        inverse_temperature=inverse_temperature)

    dummy_step = _get_dummy_observation_step(observation_shape, batch_size)
    sampled_step = policy_under_test.action(dummy_step, seed=seed)

    # Reference log-softmax over the scaled weights.
    logits = inverse_temperature * weights
    log_normalizer = tf.reduce_logsumexp(logits)
    reference_logprob = logits - log_normalizer

    # Pick out the reference log-probability of each action actually sampled.
    sampled_reference_logprob = tf.gather(reference_logprob,
                                          sampled_step.action)
    expected_prob = tf.exp(sampled_reference_logprob)

    reported_logprob = policy_step.get_log_probability(sampled_step.info)
    actual_prob = tf.exp(reported_logprob)

    expected_prob_val, actual_prob_val = self.evaluate(
        [expected_prob, actual_prob])
    self.assertAllClose(expected_prob_val, actual_prob_val)
def testInverseTempUpdate(self, observation_shape, weights, seed):
    """Check that raising the inverse temperature yields the argmax action.

    The policy is built with the inverse temperature variable set to
    `TEMP_UPDATE_TEST_INITIAL_INVERSE_TEMP`. The variable is then assigned
    `TEMP_UPDATE_TEST_FINAL_INVERSE_TEMP`, and every action in the sampled
    batch is expected to equal `np.argmax(weights)`.

    Args:
      observation_shape: Shape used to build the observation `TensorSpec`.
      weights: Initial value for the float32 weights variable. Its first
        dimension determines the number of actions.
      seed: Seed used both for the graph-level random seed and for
        `policy.action`.
    """
    obs_spec = tensor_spec.TensorSpec(
        shape=observation_shape, dtype=tf.float32, name='observation_spec')
    ts_spec = time_step.time_step_spec(obs_spec)

    weights_variable = tf.compat.v2.Variable(weights, dtype=tf.float32)
    inv_temp_variable = tf.compat.v2.Variable(
        TEMP_UPDATE_TEST_INITIAL_INVERSE_TEMP, dtype=tf.float32)

    num_actions = tf.compat.dimension_value(weights_variable.shape[0])
    discrete_action_spec = tensor_spec.BoundedTensorSpec(
        shape=(),
        dtype=tf.int64,
        minimum=0,
        maximum=num_actions - 1,
        name='action')

    policy_under_test = categorical_policy.CategoricalPolicy(
        weights=weights_variable,
        time_step_spec=ts_spec,
        action_spec=discrete_action_spec,
        inverse_temperature=inv_temp_variable)

    dummy_step = _get_dummy_observation_step(observation_shape,
                                             TEMP_UPDATE_TEST_BATCH_SIZE)
    tf.compat.v1.set_random_seed(seed)
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Set the inverse temperature to a large value.
    raise_inverse_temp = tf.compat.v1.assign(
        inv_temp_variable, TEMP_UPDATE_TEST_FINAL_INVERSE_TEMP)
    self.evaluate(raise_inverse_temp)

    final_step = self.evaluate(policy_under_test.action(dummy_step, seed=seed))

    # Every element of the batch should be the index of the largest weight.
    expected_actions = np.full([TEMP_UPDATE_TEST_BATCH_SIZE],
                               np.argmax(weights))
    self.assertAllEqual(final_step.action, expected_actions)