# Example #1 (score: 0)
    def testInfoSpec(self):
        """Checks EpsilonGreedyPolicy preserves the wrapped policy's info spec.

        Wraps a FixedPolicy that carries a custom PolicyInfo (including a
        bandit_policy_type field) and verifies that the epsilon-greedy wrapper
        keeps the time_step/action specs and emits info matching the spec.
        """
        PolicyInfo = collections.namedtuple(  # pylint: disable=invalid-name
            'PolicyInfo',
            ('log_probability', 'predicted_rewards', 'bandit_policy_type'))
        # Set default empty tuple for all fields so unset fields are valid.
        PolicyInfo.__new__.__defaults__ = ((),) * len(PolicyInfo._fields)

        info_spec = PolicyInfo(
            bandit_policy_type=self._bandit_policy_type_spec,
            log_probability=tensor_spec.BoundedTensorSpec(
                shape=(),
                dtype=tf.float32,
                maximum=0,
                minimum=-float('inf'),
                name='log_probability'))

        # Wrap the greedy action in a list so the array has shape (1,),
        # matching self._action_spec and the construction in setUp().
        policy_with_info_spec = fixed_policy.FixedPolicy(
            np.asarray([self._greedy_action], dtype=np.int32),
            self._time_step_spec,
            self._action_spec,
            policy_info=PolicyInfo(
                bandit_policy_type=self._bandit_policy_type),
            info_spec=info_spec)

        epsilon = 0.2
        policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            policy_with_info_spec, epsilon=epsilon)
        self.assertEqual(policy.time_step_spec, self._time_step_spec)
        self.assertEqual(policy.action_spec, self._action_spec)

        time_step = tf.nest.map_structure(tf.convert_to_tensor,
                                          self._time_step)

        @common.function
        def action_step_fn(time_step=time_step):
            return policy.action(time_step, policy_state=(), seed=54)

        # The emitted action must have the same nest structure as the spec.
        tf.nest.assert_same_structure(
            self._action_spec,
            self.evaluate(action_step_fn(time_step)).action)

        # In graph mode the tf.function must be called once to build the op;
        # in eager mode the callable itself is handed to evaluate().
        if tf.executing_eagerly():
            action_step = action_step_fn
        else:
            action_step = action_step_fn()

        step = self.evaluate(action_step)
        tf.nest.assert_same_structure(info_spec, step.info)

        self.checkBanditPolicyTypeShape(step.info.bandit_policy_type,
                                        batch_size=2)
 def setUp(self):
   """Builds a 3-action bandit fixture wrapping a fixed greedy policy."""
   super(EpsilonGreedyPolicyTest, self).setUp()
   # Observations are float vectors of dimension 2.
   self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
   self._time_step_spec = ts.time_step_spec(self._obs_spec)
   self._num_actions = 3
   self._greedy_action = 1
   # Integer action of shape (1,) in [0, num_actions - 1].
   self._action_spec = tensor_spec.BoundedTensorSpec(
       (1,), tf.int32, 0, self._num_actions - 1)
   # The wrapped policy always emits the greedy action.
   self._policy = fixed_policy.FixedPolicy(
       np.asarray([self._greedy_action], dtype=np.int32),
       self._time_step_spec, self._action_spec)
   # A batch of two observations produces a batched initial time step.
   batched_obs = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
   self._time_step = ts.restart(batched_obs, batch_size=2)
# Example #3 (score: 0)
  def setUp(self):
    """Sets up a small MDP fixture and a policy pinned to one action.

    The MDP has:
    - dim(observation) = 2
    - number of actions = 4
    """
    super(FixedPolicyTest, self).setUp()
    self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
    self._time_step_spec = ts.time_step_spec(self._obs_spec)
    self._num_actions = 4
    # Integer action of shape (1,) bounded to [0, num_actions - 1].
    self._action_spec = tensor_spec.BoundedTensorSpec(
        shape=(1,), dtype=tf.int32,
        minimum=0, maximum=self._num_actions - 1)

    # The policy under test always outputs this one action.
    self._fixed_action = 1
    fixed_action_array = np.asarray([self._fixed_action], dtype=np.int32)
    self._policy = fixed_policy.FixedPolicy(
        fixed_action_array, self._time_step_spec, self._action_spec)
# Example #4 (score: 0)
    def testInfoSpec(self):
        """Verifies EpsilonGreedyPolicy passes through an empty info spec.

        Wraps a FixedPolicy whose PolicyInfo has only default (empty-tuple)
        fields and checks that specs and emitted info survive the wrapping.
        """
        PolicyInfo = collections.namedtuple(  # pylint: disable=invalid-name
            'PolicyInfo', ('log_probability', 'predicted_rewards'))
        # Every field defaults to an empty tuple.
        PolicyInfo.__new__.__defaults__ = ((),) * len(PolicyInfo._fields)

        info_spec = PolicyInfo()
        wrapped = fixed_policy.FixedPolicy(
            np.asarray([self._greedy_action], dtype=np.int32),
            self._time_step_spec,
            self._action_spec,
            policy_info=PolicyInfo(),
            info_spec=info_spec)

        policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            wrapped, epsilon=0.2)
        self.assertEqual(policy.time_step_spec, self._time_step_spec)
        self.assertEqual(policy.action_spec, self._action_spec)

        batched_time_step = tf.nest.map_structure(
            tf.convert_to_tensor, self._time_step)

        @common.function
        def action_step_fn(time_step=batched_time_step):
            return policy.action(time_step, policy_state=(), seed=54)

        # The emitted action must mirror the action spec's nest structure.
        tf.nest.assert_same_structure(
            self._action_spec,
            self.evaluate(action_step_fn(batched_time_step)).action)

        # Graph mode needs the tf.function called once to produce an op;
        # eager mode hands the callable itself to evaluate().
        action_step = (
            action_step_fn if tf.executing_eagerly() else action_step_fn())

        step = self.evaluate(action_step)
        tf.nest.assert_same_structure(info_spec, step.info)