Example #1
    def testActionShape(self, observation_shape, batch_size, weights,
                        inverse_temperature):
        observation_spec = tensor_spec.TensorSpec(shape=observation_shape,
                                                  dtype=tf.float32,
                                                  name='observation_spec')
        time_step_spec = time_step.time_step_spec(observation_spec)

        weights = tf.compat.v2.Variable(weights, dtype=tf.float32)
        inverse_temperature = tf.compat.v2.Variable(inverse_temperature,
                                                    dtype=tf.float32)

        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(),
            dtype=tf.int32,
            minimum=0,
            maximum=tf.compat.dimension_value(weights.shape[0]) - 1,
            name='action')

        policy = categorical_policy.CategoricalPolicy(weights, time_step_spec,
                                                      action_spec,
                                                      inverse_temperature)
        observation_step = _get_dummy_observation_step(observation_shape,
                                                       batch_size)
        action_time_step = policy.action(observation_step)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllEqual(action_time_step.action.shape.as_list(),
                            [batch_size])
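
These test snippets assume the usual TF-Agents test-module imports and a helper, `_get_dummy_observation_step`, defined elsewhere in the test file. Below is a minimal sketch, assuming the standard `tf_agents` module layout (paths vary slightly across versions); the helper body is a hypothetical reconstruction for illustration, not the library's own definition:

import numpy as np
import tensorflow as tf

from tf_agents.bandits.policies import categorical_policy
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import time_step


def _get_dummy_observation_step(observation_shape, batch_size):
    # Hypothetical reconstruction: a batched restart TimeStep whose
    # observation is all zeros with the requested per-example shape.
    observations = tf.zeros([batch_size] + list(observation_shape),
                            dtype=tf.float32)
    return time_step.restart(observations, batch_size=batch_size)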
Example #2
    def __init__(self, time_step_spec, action_spec, learning_rate, name=None):
        """Initialize an instance of `Exp3Agent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      learning_rate: A float valued scalar. A higher value will force the agent
        to converge on a single action more quickly. A lower value will
        encourage more exploration. This value corresponds to the
        `inverse_temperature` argument passed to `CategoricalPolicy`.
      name: a name for this instance of `Exp3Agent`.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
            action_spec)
        self._weights = tf.compat.v2.Variable(tf.zeros(self._num_actions),
                                              name='weights')
        self._learning_rate = tf.compat.v2.Variable(learning_rate,
                                                    name='learning_rate')
        policy = categorical_policy.CategoricalPolicy(
            weights=self._weights,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            inverse_temperature=self._learning_rate)
        # TODO(b/127462472): consider policy=GreedyPolicy(collect_policy).
        super(Exp3Agent, self).__init__(time_step_spec=time_step_spec,
                                        action_spec=policy.action_spec,
                                        policy=policy,
                                        collect_policy=policy,
                                        train_sequence_length=None)
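
For orientation, here is a minimal sketch of driving this constructor; the import path `tf_agents.bandits.agents.exp3_agent` and the three-armed spec values are illustrative assumptions, not taken from the snippet above:

import tensorflow as tf

from tf_agents.bandits.agents import exp3_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step

observation_spec = tensor_spec.TensorSpec(shape=(4,), dtype=tf.float32)
time_step_spec = time_step.time_step_spec(observation_spec)
# A scalar int32 action spec with maximum=2 yields num_actions == 3.
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=2, name='action')

# Higher learning_rate -> faster convergence on a single action;
# lower learning_rate -> more exploration.
agent = exp3_agent.Exp3Agent(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             learning_rate=1.0)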
Example #3
    def testActionProbabilities(self, observation_shape, batch_size, weights,
                                inverse_temperature, seed):
        observation_spec = tensor_spec.TensorSpec(shape=observation_shape,
                                                  dtype=tf.float32,
                                                  name='observation_spec')
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(),
            dtype=tf.int32,
            minimum=0,
            maximum=tf.compat.dimension_value(weights.shape[0]) - 1,
            name='action')
        policy = categorical_policy.CategoricalPolicy(weights, time_step_spec,
                                                      action_spec,
                                                      inverse_temperature)
        observation_step = _get_dummy_observation_step(observation_shape,
                                                       batch_size)
        action_time_step = policy.action(observation_step, seed=seed)

        # The policy's action distribution is softmax(inverse_temperature * weights);
        # reconstruct its log-probabilities as logits - logsumexp(logits).
        logits = inverse_temperature * weights
        z = tf.reduce_logsumexp(logits)
        expected_logprob = logits - z
        expected_action_prob = tf.exp(
            tf.gather(expected_logprob, action_time_step.action))
        actual_action_prob = tf.exp(
            policy_step.get_log_probability(action_time_step.info))
        expected_action_prob_val, actual_action_prob_val = self.evaluate(
            [expected_action_prob, actual_action_prob])
        self.assertAllClose(expected_action_prob_val, actual_action_prob_val)

    def testInverseTempUpdate(self, observation_shape, weights, seed):
        """Test that the policy becomes greedy as the inverse temperature increases."""
        observation_spec = tensor_spec.TensorSpec(shape=observation_shape,
                                                  dtype=tf.float32,
                                                  name='observation_spec')
        time_step_spec = time_step.time_step_spec(observation_spec)

        weight_var = tf.compat.v2.Variable(weights, dtype=tf.float32)
        inverse_temperature_var = tf.compat.v2.Variable(
            TEMP_UPDATE_TEST_INITIAL_INVERSE_TEMP, dtype=tf.float32)
        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(),
            dtype=tf.int64,
            minimum=0,
            maximum=tf.compat.dimension_value(weight_var.shape[0]) - 1,
            name='action')
        policy = categorical_policy.CategoricalPolicy(weight_var, time_step_spec,
                                                      action_spec,
                                                      inverse_temperature_var)
        observation_step = _get_dummy_observation_step(
            observation_shape, TEMP_UPDATE_TEST_BATCH_SIZE)
        tf.compat.v1.set_random_seed(seed)
        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Set the inverse temperature to a large value; the softmax then puts
        # almost all probability mass on the highest-weight action.
        self.evaluate(
            tf.compat.v1.assign(inverse_temperature_var,
                                TEMP_UPDATE_TEST_FINAL_INVERSE_TEMP))

        final_action_time_step = self.evaluate(
            policy.action(observation_step, seed=seed))
        self.assertAllEqual(
            final_action_time_step.action,
            np.full([TEMP_UPDATE_TEST_BATCH_SIZE], np.argmax(weights)))
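
The `TEMP_UPDATE_TEST_*` constants above are module-level values not shown in the snippet; the block below uses hypothetical stand-ins. It also double-checks the log-softmax identity that `testActionProbabilities` relies on:

import numpy as np
import tensorflow as tf

# Hypothetical stand-ins; the real constants are defined in the test module.
TEMP_UPDATE_TEST_INITIAL_INVERSE_TEMP = 1.0
TEMP_UPDATE_TEST_FINAL_INVERSE_TEMP = 100.0
TEMP_UPDATE_TEST_BATCH_SIZE = 8

# logits - logsumexp(logits) is exactly log_softmax(logits), so
# exponentiating it recovers the categorical action probabilities.
logits = 10.0 * tf.constant([0.5, 1.5, 0.25])  # inverse_temperature * weights
log_probs = logits - tf.reduce_logsumexp(logits)
np.testing.assert_allclose(log_probs.numpy(),
                           tf.nn.log_softmax(logits).numpy(),
                           rtol=1e-6)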