Example #1
    def _setup_policy(self, time_step_spec, action_spec, boltzmann_temperature,
                      emit_log_probability):
        policy = categorical_q_policy.CategoricalQPolicy(
            time_step_spec,
            action_spec,
            self._q_network,
            self._min_q_value,
            self._max_q_value,
            observation_and_action_constraint_splitter=(
                self._observation_and_action_constraint_splitter))

        if boltzmann_temperature is not None:
            collect_policy = boltzmann_policy.BoltzmannPolicy(
                policy, temperature=boltzmann_temperature)
        else:
            collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
                policy, epsilon=self._epsilon_greedy)
        policy = greedy_policy.GreedyPolicy(policy)

        target_policy = categorical_q_policy.CategoricalQPolicy(
            time_step_spec,
            action_spec,
            self._target_q_network,
            self._min_q_value,
            self._max_q_value,
            observation_and_action_constraint_splitter=(
                self._observation_and_action_constraint_splitter))
        self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)

        return policy, collect_policy
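
Here the greedy-wrapped policy is what the agent exposes for evaluation, while collect_policy (Boltzmann or epsilon-greedy) drives exploration during data collection. A minimal usage sketch follows; `agent` and `tf_env` are assumed, pre-built objects and are not part of the example above.

# Hedged usage sketch; `agent` and `tf_env` are assumed to already exist.
time_step = tf_env.reset()

# Greedy policy for evaluation: always picks the argmax action.
eval_action_step = agent.policy.action(time_step)

# Collect policy (epsilon-greedy or Boltzmann) adds exploration for
# data collection.
collect_action_step = agent.collect_policy.action(time_step)
time_step = tf_env.step(collect_action_step.action)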
Example #2
  def testMasking(self):
    batch_size = 1000
    num_state_dims = 5
    num_actions = 8
    observations = tf.random.uniform([batch_size, num_state_dims])
    time_step = ts.restart(observations, batch_size=batch_size)
    input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
    action_spec = tensor_spec.BoundedTensorSpec(
        [1], tf.int32, 0, num_actions - 1)

    # We create a fixed mask here for testing purposes. Normally the mask would
    # be part of the observation.
    mask = [0, 1, 0, 1, 0, 0, 1, 0]
    np_mask = np.array(mask)
    tf_mask = tf.constant([mask for _ in range(batch_size)])
    q_network = categorical_q_network.CategoricalQNetwork(
        input_tensor_spec=input_tensor_spec,
        action_spec=action_spec,
        num_atoms=3,
        fc_layer_params=[4])
    policy = categorical_q_policy.CategoricalQPolicy(
        self._time_step_spec, action_spec, q_network,
        self._min_q_value, self._max_q_value,
        observation_and_action_constraint_splitter=(
            lambda observation: (observation, tf_mask)))

    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Sample from the policy 1000 times, and ensure that actions considered
    # invalid according to the mask are never chosen.
    action_step = policy.action(time_step)
    action = self.evaluate(action_step.action)
    self.assertEqual(action.shape, (batch_size,))
    self.assertAllEqual(np_mask[action], np.ones([batch_size]))
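
The comment in the test notes that the mask would normally be part of the observation rather than captured in a closure. Below is a short sketch of what such a splitter typically looks like; the dict layout and the key names ('observations', 'valid_actions') are assumptions for illustration.

# Hedged sketch; the observation structure and key names are assumed.
def observation_and_action_constraint_splitter(observation):
  # Returns (network_input, mask): the features feed the Q-network, and the
  # 0/1 mask marks which actions are currently allowed.
  return observation['observations'], observation['valid_actions']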
Example #3
  def testMasking(self):
    batch_size = 1000
    num_state_dims = 5
    num_actions = 8
    observations = tf.random.uniform([batch_size, num_state_dims])
    time_step = ts.restart(observations, batch_size=batch_size)
    input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
    action_spec = tensor_spec.BoundedTensorSpec(
        [1], tf.int32, 0, num_actions - 1)

    mask = [0, 1, 0, 1, 0, 0, 1, 0]
    np_mask = np.array(mask)
    tf_mask = tf.constant([mask for _ in range(batch_size)])
    q_network = categorical_q_network.CategoricalQNetwork(
        input_tensor_spec=input_tensor_spec,
        action_spec=action_spec,
        num_atoms=3,
        mask_split_fn=lambda observation: (observation, tf_mask),
        fc_layer_params=[4])
    policy = categorical_q_policy.CategoricalQPolicy(self._min_q_value,
                                                     self._max_q_value,
                                                     q_network,
                                                     action_spec)

    # Force creation of variables before global_variables_initializer.
    policy.variables()
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Sample from the policy 1000 times and ensure that invalid actions are
    # never chosen.
    action_step = policy.action(time_step)
    action = self.evaluate(action_step.action)
    self.assertEqual(action.shape, (batch_size,))
    self.assertAllEqual(np_mask[action], np.ones([batch_size]))
Example #4
  def testUpdate(self):
    policy = categorical_q_policy.CategoricalQPolicy(self._min_q_value,
                                                     self._max_q_value,
                                                     self._q_network,
                                                     self._action_spec)

    new_policy = categorical_q_policy.CategoricalQPolicy(self._min_q_value,
                                                         self._max_q_value,
                                                         self._q_network,
                                                         self._action_spec)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_step = ts.restart(observations)

    # There should be two variables in our networks for the fc_layer we
    # specified (one kernel and one bias).
    self.assertLen(policy.variables(), 2)
    self.assertLen(new_policy.variables(), 2)

    actions, _, _ = policy.action(time_step)
    new_actions, _, _ = new_policy.action(time_step)

    self.assertEqual(actions.shape, new_actions.shape)
    self.assertEqual(actions.dtype, new_actions.dtype)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    actions = self.evaluate(actions)

    # actions should be a list of two elements; e.g., [0, 1]
    self.assertLen(actions, 2)

    for action in actions:
      self.assertGreaterEqual(action, self._action_spec.minimum)
      self.assertLessEqual(action, self._action_spec.maximum)

    self.assertEqual(self.evaluate(new_policy.update(policy)), None)
    new_actions = self.evaluate(new_actions)

    # new_actions should also be a list of two elements; e.g., [0, 1]
    self.assertLen(new_actions, 2)

    for action in new_actions:
      self.assertGreaterEqual(action, self._action_spec.minimum)
      self.assertLessEqual(action, self._action_spec.maximum)
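
The `new_policy.update(policy)` call above performs a hard copy of variables from one policy into another with the same structure; agents use the same mechanism to keep target policies in sync. A one-line sketch of a soft update, reusing the two policies from the test above (the tau value is illustrative):

# Hedged sketch: tau=1.0 is a hard copy; smaller values blend variables as
# target_var = tau * source_var + (1 - tau) * target_var.
update_op = new_policy.update(policy, tau=0.05)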
Example #5
    def testBuild(self):
        policy = categorical_q_policy.CategoricalQPolicy(
            self._time_step_spec, self._action_spec, self._q_network,
            self._min_q_value, self._max_q_value)

        self.assertEqual(policy.time_step_spec, self._time_step_spec)
        self.assertEqual(policy.action_spec, self._action_spec)

        # There should be two variables in our network for the fc_layer we specified
        # (one kernel and one bias).
        self.assertLen(policy.variables(), 2)
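
The tests in these examples rely on fixtures (self._obs_spec, self._time_step_spec, self._action_spec, self._q_network, self._min_q_value, self._max_q_value) created in the test class's setUp, which the snippets do not show. The sketch below reconstructs a plausible fixture; the shapes and values are assumptions inferred from the examples (2-dim observations, two actions, three atoms), and the real tests appear to use an even smaller dummy network, which is why Example #5 expects exactly one kernel and one bias variable.

import tensorflow as tf
from tf_agents.networks import categorical_q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts


class CategoricalQPolicyTest(tf.test.TestCase):

  def setUp(self):
    super(CategoricalQPolicyTest, self).setUp()
    self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
    self._time_step_spec = ts.time_step_spec(self._obs_spec)
    self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
    self._min_q_value = -10.0
    self._max_q_value = 10.0
    # Assumed stand-in; the actual tests likely build a tiny custom
    # categorical network with a single dense layer.
    self._q_network = categorical_q_network.CategoricalQNetwork(
        input_tensor_spec=self._obs_spec,
        action_spec=self._action_spec,
        num_atoms=3,
        fc_layer_params=[4])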
Example #6
  def testSaver(self):
    policy = categorical_q_policy.CategoricalQPolicy(
        self._time_step_spec, self._action_spec, self._q_network,
        self._min_q_value, self._max_q_value)

    saver = policy_saver.PolicySaver(policy)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(tf.compat.v1.local_variables_initializer())

    save_path = os.path.join(flags.FLAGS.test_tmpdir,
                             'saved_categorical_q_policy')
    saver.save(save_path)
Example #7
  def testMultipleActionsRaiseError(self):
    with self.assertRaisesRegexp(
        TypeError, '.*action_spec must be a BoundedTensorSpec.*'):
      # Replace the action_spec for this test.
      action_spec = [tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)] * 2
      q_network = categorical_q_network.CategoricalQNetwork(
          input_tensor_spec=self._obs_spec,
          action_spec=action_spec,
          num_atoms=3,
          fc_layer_params=[4])
      categorical_q_policy.CategoricalQPolicy(
          self._time_step_spec, action_spec, q_network,
          self._min_q_value, self._max_q_value)
Example #8
    def testSaver(self):
        policy = categorical_q_policy.CategoricalQPolicy(
            self._time_step_spec, self._action_spec, self._q_network,
            self._min_q_value, self._max_q_value)

        train_step = tf.compat.v1.train.get_or_create_global_step()
        saver = policy_saver.PolicySaver(policy, train_step=train_step)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(tf.compat.v1.local_variables_initializer())

        save_path = os.path.join(flags.FLAGS.test_tmpdir,
                                 'saved_categorical_q_policy')

        # For TF1 Compatibility we set the cached session as default. This is a
        # no-op in TF2.
        with self.cached_session():
            saver.save(save_path)
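
Once saved, the policy can be restored as a SavedModel and queried directly. A brief sketch; `time_step` is assumed to be a batched TimeStep matching the policy's time_step_spec.

# Hedged sketch; works for stateless policies such as this one.
loaded_policy = tf.compat.v2.saved_model.load(save_path)
action_step = loaded_policy.action(time_step)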
Example #9
    def testSample(self):
        policy = categorical_q_policy.CategoricalQPolicy(
            self._time_step_spec, self._action_spec, self._q_network,
            self._min_q_value, self._max_q_value)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_step = ts.restart(observations)
        actions = policy.action(time_step).action
        self.assertEqual(actions.shape.as_list(), [2])
        self.assertEqual(actions.dtype, tf.int32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        actions = self.evaluate(actions)

        # actions should be a list of two elements; e.g., [0, 1]
        self.assertLen(actions, 2)

        for action in actions:
            self.assertGreaterEqual(action, self._action_spec.minimum)
            self.assertLessEqual(action, self._action_spec.maximum)
Example #10
    def testMultiSample(self):
        policy = categorical_q_policy.CategoricalQPolicy(
            self._min_q_value, self._max_q_value, self._q_network,
            self._action_spec)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_step = ts.restart(observations)
        actions, _ = policy.step(time_step, num_samples=2)
        self.assertEqual(actions.shape.as_list(), [2, 2])
        self.assertEqual(actions.dtype, tf.int32)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        actions = self.evaluate(actions)

        # actions should be a nested list of the form [[0, 1], [1, 0]]
        self.assertLen(actions, 2)

        for inner_list in actions:
            self.assertLen(inner_list, 2)

            for action in inner_list:
                self.assertGreaterEqual(action, self._action_spec.minimum)
                self.assertLessEqual(action, self._action_spec.maximum)
Example #11
    def __init__(
            self,
            time_step_spec,
            action_spec,
            categorical_q_network,
            optimizer,
            min_q_value=-10.0,
            max_q_value=10.0,
            epsilon_greedy=0.1,
            n_step_update=1,
            boltzmann_temperature=None,
            # Params for target network updates
            target_update_tau=1.0,
            target_update_period=1,
            # Params for training.
            td_errors_loss_fn=None,
            gamma=1.0,
            reward_scale_factor=1.0,
            gradient_clipping=None,
            # Params for debugging
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            name=None):
        """Creates a Categorical DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A `BoundedTensorSpec` representing the actions.
      categorical_q_network: A categorical_q_network.CategoricalQNetwork that
        returns the q_distribution for each action.
      optimizer: The optimizer to use for training.
      min_q_value: A float specifying the minimum Q-value, used for setting up
        the support.
      max_q_value: A float specifying the maximum Q-value, used for setting up
        the support.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method).
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Defaults to single-step updates. Note that this requires the
        user to call train on Trajectory objects with a time dimension of
        `n_step_update + 1`. However, note that we do not yet support
        `n_step_update > 1` in the case of RNNs (i.e., non-empty
        `q_network.state_spec`).
      boltzmann_temperature: Temperature value to use for Boltzmann sampling of
        the actions during data collection. The closer to 0.0, the higher the
        probability of choosing the best action.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If None, a
        default value of element_wise_huber_loss is used. This function takes as
        input the target and the estimated Q values and returns the loss for
        each element of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      TypeError: If the action spec contains more than one action.
    """
        num_atoms = getattr(categorical_q_network, 'num_atoms', None)
        if num_atoms is None:
            raise TypeError(
                'Expected categorical_q_network to have property '
                '`num_atoms`, but it doesn\'t (note: you likely want to '
                'use a CategoricalQNetwork). Network is: %s' %
                (categorical_q_network, ))

        self._num_atoms = num_atoms
        self._min_q_value = min_q_value
        self._max_q_value = max_q_value
        self._support = tf.linspace(min_q_value, max_q_value, num_atoms)

        super(CategoricalDqnAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             categorical_q_network,
                             optimizer,
                             epsilon_greedy=epsilon_greedy,
                             n_step_update=n_step_update,
                             boltzmann_temperature=boltzmann_temperature,
                             target_update_tau=target_update_tau,
                             target_update_period=target_update_period,
                             td_errors_loss_fn=td_errors_loss_fn,
                             gamma=gamma,
                             reward_scale_factor=reward_scale_factor,
                             gradient_clipping=gradient_clipping,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter,
                             name=name)

        policy = categorical_q_policy.CategoricalQPolicy(
            min_q_value, max_q_value, self._q_network, self._action_spec)
        if boltzmann_temperature is not None:
            self._collect_policy = boltzmann_policy.BoltzmannPolicy(
                policy, temperature=self._boltzmann_temperature)
        else:
            self._collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
                policy, epsilon=self._epsilon_greedy)
        self._policy = greedy_policy.GreedyPolicy(policy)
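
The docstring above notes that train() expects Trajectory batches with a time dimension of n_step_update + 1. With a TFUniformReplayBuffer this usually maps onto the num_steps argument of as_dataset, as in the sketch below; replay_buffer, batch_size, n_step_update, and agent are assumed, pre-existing objects.

# Hedged sketch: each sampled item spans n_step_update + 1 consecutive steps.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(3)

for experience, _ in dataset.take(1):
  loss_info = agent.train(experience)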
Example #12
    def __init__(
            self,
            time_step_spec,
            action_spec,
            categorical_q_network,
            optimizer,
            min_q_value=-10.0,
            max_q_value=10.0,
            epsilon_greedy=0.1,
            n_step_update=1,
            boltzmann_temperature=None,
            # Params for target network updates
            target_categorical_q_network=None,
            target_update_tau=1.0,
            target_update_period=1,
            # Params for training.
            td_errors_loss_fn=None,
            gamma=1.0,
            reward_scale_factor=1.0,
            gradient_clipping=None,
            # Params for debugging
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            name=None):
        """Creates a Categorical DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A `BoundedTensorSpec` representing the actions.
      categorical_q_network: A categorical_q_network.CategoricalQNetwork that
        returns the q_distribution for each action.
      optimizer: The optimizer to use for training.
      min_q_value: A float specifying the minimum Q-value, used for setting up
        the support.
      max_q_value: A float specifying the maximum Q-value, used for setting up
        the support.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method).
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Defaults to single-step updates. Note that this requires the
        user to call train on Trajectory objects with a time dimension of
        `n_step_update + 1`. However, note that we do not yet support
        `n_step_update > 1` in the case of RNNs (i.e., non-empty
        `q_network.state_spec`).
      boltzmann_temperature: Temperature value to use for Boltzmann sampling of
        the actions during data collection. The closer to 0.0, the higher the
        probability of choosing the best action.
      target_categorical_q_network: (Optional.)  A `tf_agents.network.Network`
        to be used as the target network during Q learning.  Every
        `target_update_period` train steps, the weights from
        `categorical_q_network` are copied (possibly with smoothing via
        `target_update_tau`) to `target_categorical_q_network`.

        If `target_categorical_q_network` is not provided, it is created by
        making a copy of `categorical_q_network`, which initializes a new
        network with the same structure and its own layers and weights.

        Network copying is performed via the `Network.copy` superclass method,
        and may inadvertently lead to the resulting network sharing weights
        with the original.  This can happen if, for example, the original
        network accepted a pre-built Keras layer in its `__init__`, or
        accepted a Keras layer that wasn't built, but neglected to create
        a new copy.

        In these cases, it is up to you to provide a target Network having
        weights that are not shared with the original `categorical_q_network`.
        If you provide a `target_categorical_q_network` that shares any
        weights with `categorical_q_network`, a warning will be logged but
        no exception is thrown.

        Note: shallow copies of Keras layers may be built via the code:

        ```python
        new_layer = type(layer).from_config(layer.get_config())
        ```
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If None, a
        default value of huber_loss is used. This function takes as input the
        target and the estimated Q values and returns the loss for each element
        of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      TypeError: If the action spec contains more than one action.
    """
        super(CategoricalDqnAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             categorical_q_network,
                             optimizer,
                             epsilon_greedy=epsilon_greedy,
                             n_step_update=n_step_update,
                             boltzmann_temperature=boltzmann_temperature,
                             target_q_network=target_categorical_q_network,
                             target_update_tau=target_update_tau,
                             target_update_period=target_update_period,
                             td_errors_loss_fn=td_errors_loss_fn,
                             gamma=gamma,
                             reward_scale_factor=reward_scale_factor,
                             gradient_clipping=gradient_clipping,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter,
                             name=name)

        def check_atoms(net, label):
            num_atoms = getattr(net, 'num_atoms', None)
            if num_atoms is None:
                raise TypeError(
                    'Expected {} to have property `num_atoms`, but it '
                    'doesn\'t (note: you likely want to use a '
                    'CategoricalQNetwork). Network is: {}'.format(label, net))
            return num_atoms

        num_atoms = check_atoms(self._q_network, 'categorical_q_network')
        target_num_atoms = check_atoms(self._target_q_network,
                                       'target_categorical_q_network')
        if num_atoms != target_num_atoms:
            raise ValueError(
                'categorical_q_network and target_categorical_q_network have '
                'different numbers of atoms: {} vs. {}'.format(
                    num_atoms, target_num_atoms))
        self._num_atoms = num_atoms
        self._min_q_value = min_q_value
        self._max_q_value = max_q_value
        self._support = tf.linspace(min_q_value, max_q_value, num_atoms)

        policy = categorical_q_policy.CategoricalQPolicy(
            min_q_value, max_q_value, self._q_network, self._action_spec)
        if boltzmann_temperature is not None:
            self._collect_policy = boltzmann_policy.BoltzmannPolicy(
                policy, temperature=self._boltzmann_temperature)
        else:
            self._collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
                policy, epsilon=self._epsilon_greedy)
        self._policy = greedy_policy.GreedyPolicy(policy)

        target_policy = categorical_q_policy.CategoricalQPolicy(
            min_q_value, max_q_value, self._target_q_network,
            self._action_spec)
        self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)
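
To tie the pieces together, here is a minimal construction sketch for the agent whose __init__ is shown above. The specs, layer sizes, atom count, and optimizer choice are illustrative assumptions rather than values taken from the examples.

import tensorflow as tf
from tf_agents.agents.categorical_dqn import categorical_dqn_agent
from tf_agents.networks import categorical_q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# Assumed toy specs: a 4-dim float observation and two discrete actions.
obs_spec = tensor_spec.TensorSpec([4], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)
time_step_spec = ts.time_step_spec(obs_spec)

cat_q_net = categorical_q_network.CategoricalQNetwork(
    obs_spec,
    action_spec,
    num_atoms=51,
    fc_layer_params=(100,))

agent = categorical_dqn_agent.CategoricalDqnAgent(
    time_step_spec,
    action_spec,
    categorical_q_network=cat_q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    min_q_value=-10.0,
    max_q_value=10.0,
    n_step_update=1,
    gamma=0.99)
agent.initialize()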