Example #1
def dpg(q_max, a_max, dqda_clipping=None, clip_norm=False, name="DpgLearning"):
  """Implements the Deterministic Policy Gradient (DPG) loss as a TensorFlow Op.

  This op implements the loss for the `actor`, the `critic` can instead be
  updated by minimizing the `value_ops.td_learning` loss.

  See "Deterministic Policy Gradient Algorithms" by Silver, Lever, Heess,
  Degris, Wierstra, Riedmiller (http://proceedings.mlr.press/v32/silver14.pdf).

  Args:
    q_max: Tensor holding Q-values generated by Q network with the input of
      (state, a_max) pair, shape `[B]`.
    a_max: Tensor holding the optimal action, shape `[B, action_dimension]`.
    dqda_clipping: `int` or `float`, clips the gradient dqda element-wise
      between `[-dqda_clipping, dqda_clipping]`.
    clip_norm: Whether to perform dqda clipping on the vector norm of the last
      dimension, or component wise (default).
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `q_max`: Tensor holding the optimal Q values, `[B]`.
        * `a_max`: Tensor holding the optimal action, `[B, action_dimension]`.
        * `dqda`: Tensor holding the derivative dq/da, `[B, action_dimension]`.

  Raises:
    ValueError: If `q_max` doesn't depend on `a_max` or if `dqda_clipping <= 0`.
  """

  # DPG op.
  with tf.name_scope(name, values=[q_max, a_max]):

    # Calculate the gradient dq/da.
    dqda = tf.gradients([q_max], [a_max])[0]

    # Check that `q_max` depends on `a_max`.
    if dqda is None:
      raise ValueError("q_max needs to be a function of a_max")

    # Clipping the gradient dq/da.
    if dqda_clipping is not None:
      if dqda_clipping <= 0:
        raise ValueError("dqda_clipping should be bigger than 0, {} found"
                         .format(dqda_clipping))
      if clip_norm:
        dqda = tf.clip_by_norm(dqda, dqda_clipping, axes=-1)
      else:
        dqda = tf.clip_by_value(dqda, -1. * dqda_clipping, dqda_clipping)

    # target_a ensures the correct gradient is calculated during backprop.
    target_a = dqda + a_max
    # Stop the gradient from flowing through the Q network during backprop.
    target_a = tf.stop_gradient(target_a)
    # The gradient only flows through the actor network.
    loss = 0.5 * tf.reduce_sum(tf.square(target_a - a_max), axis=-1)
    return base_ops.LossOutput(
        loss, DPGExtra(q_max=q_max, a_max=a_max, dqda=dqda))
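A minimal usage sketch for this op, assuming TensorFlow 1.x and that it is exposed as `trfl.dpg` (as in DeepMind's trfl package); the tiny actor and critic below, and all names and sizes, are illustrative placeholders rather than anything from the source.

import tensorflow as tf
import trfl

# Hypothetical batch of 4 states of size 8 and 2-D continuous actions.
states = tf.random_normal([4, 8])
actor_w = tf.Variable(tf.random_normal([8, 2]))    # actor parameters.
critic_w = tf.Variable(tf.random_normal([10, 1]))  # critic parameters.

a_max = tf.matmul(states, actor_w)  # deterministic action, shape [B, 2].
q_max = tf.squeeze(
    tf.matmul(tf.concat([states, a_max], axis=1), critic_w), axis=1)  # [B].

loss, extra = trfl.dpg(q_max, a_max, dqda_clipping=1.0)
# Only the actor is trained from this loss; the critic would be updated
# separately, e.g. with the `value_ops.td_learning` loss mentioned above.
actor_train_op = tf.train.GradientDescentOptimizer(0.01).minimize(
    tf.reduce_mean(loss), var_list=[actor_w])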
Example #2
def double_qlearning(q_tm1,
                     a_tm1,
                     r_t,
                     pcont_t,
                     q_t_value,
                     q_t_selector,
                     name="DoubleQLearning"):
    """Implements the double Q-learning loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  the target `r_t + pcont_t * q_t_value[argmax q_t_selector]`.

  See "Double Q-learning" by van Hasselt.
  (https://papers.nips.cc/paper/3964-double-q-learning.pdf).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape [B x num_actions].
    a_tm1: Tensor holding action indices, shape [B].
    r_t: Tensor holding rewards, shape [B].
    pcont_t: Tensor holding pcontinue values, shape [B].
    q_t_value: Tensor of Q-values for second timestep in a batch of transitions,
      used to estimate the value of the best action, shape [B x num_actions].
    q_t_selector: Tensor of Q-values for second timestep in a batch of
      transitions used to estimate the best action, shape [B x num_actions].
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape [B].
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B]
        * `td_error`: batch of temporal difference errors, shape [B]
        * `best_action`: batch of greedy actions wrt `q_t_selector`, shape [B]
  """
    # Rank and compatibility checks.
    base_ops.wrap_rank_shape_assert(
        [[q_tm1, q_t_value, q_t_selector], [a_tm1, r_t, pcont_t]], [2, 1],
        name)

    # double Q-learning op.
    with tf.name_scope(
            name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t_value,
                          q_t_selector]):

        # Build target and select head to update.
        best_action = tf.argmax(q_t_selector, 1, output_type=tf.int32)
        double_q_bootstrapped = indexing_ops.batched_index(
            q_t_value, best_action)
        target = tf.stop_gradient(r_t + pcont_t * double_q_bootstrapped)
        qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

        # Temporal difference error and loss.
        # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
        td_error = target - qa_tm1
        loss = 0.5 * tf.square(td_error)
        return base_ops.LossOutput(loss,
                                   DoubleQExtra(target, td_error, best_action))
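A toy invocation of `double_qlearning`, assuming TF 1.x and a top-level `trfl.double_qlearning` export; the numbers are made up purely to illustrate the expected shapes.

import tensorflow as tf
import trfl

q_tm1 = tf.constant([[1.0, 2.0], [3.0, 1.0]])         # online net at t-1, [B, A].
a_tm1 = tf.constant([0, 1])                           # actions taken, [B].
r_t = tf.constant([1.0, 0.0])                         # rewards, [B].
pcont_t = tf.constant([0.9, 0.0])                     # discounts, [B].
q_t_value = tf.constant([[1.5, 0.5], [2.0, 1.0]])     # target net at t, [B, A].
q_t_selector = tf.constant([[2.0, 1.0], [0.5, 2.5]])  # online net at t, [B, A].

loss, extra = trfl.double_qlearning(
    q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector)
with tf.Session() as sess:
  # extra.best_action is the argmax of q_t_selector; its Q-value is looked up
  # in q_t_value to form the bootstrap target.
  print(sess.run({"loss": loss, "best_action": extra.best_action}))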
Example #3
def persistent_qlearning(q_tm1,
                         a_tm1,
                         r_t,
                         pcont_t,
                         q_t,
                         action_gap_scale=0.5,
                         name="PersistentQLearning"):
    """Implements the persistent Q-learning loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  `r_t + pcont_t * [(1 - action_gap_scale) * max q_t + action_gap_scale * qa_t]`.

  See "Increasing the Action Gap: New Operators for Reinforcement Learning"
  by Bellemare, Ostrovski, Guez et al. (https://arxiv.org/abs/1512.04860).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape [B x num_actions].
    a_tm1: Tensor holding action indices, shape [B].
    r_t: Tensor holding rewards, shape [B].
    pcont_t: Tensor holding pcontinue values, shape [B].
    q_t: Tensor holding Q-values for second timestep in a batch of
      transitions, shape [B x num_actions].
      These values are used for estimating the value of the best action. In
      DQN they come from the target network.
    action_gap_scale: coefficient in [0, 1] for scaling the action gap term.
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape [B].
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B].
        * `td_error`: batch of temporal difference errors, shape [B].
  """
    # Rank and compatibility checks.
    base_ops.wrap_rank_shape_assert([[q_tm1, q_t], [a_tm1, r_t, pcont_t]],
                                    [2, 1], name)
    base_ops.assert_arg_bounded(action_gap_scale, 0, 1, name,
                                "action_gap_scale")

    # persistent Q-learning op.
    with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

        # Build target and select head to update.
        with tf.name_scope("target"):
            max_q_t = tf.reduce_max(q_t, axis=1)
            qa_t = indexing_ops.batched_index(q_t, a_tm1)
            corrected_q_t = (
                1 - action_gap_scale) * max_q_t + action_gap_scale * qa_t
            target = tf.stop_gradient(r_t + pcont_t * corrected_q_t)
        qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

        # Temporal difference error and loss.
        # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
        td_error = target - qa_tm1
        loss = 0.5 * tf.square(td_error)
        return base_ops.LossOutput(loss, QExtra(target, td_error))
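To see the role of `action_gap_scale`, the op can be compared with plain Q-learning: the corrected target mixes `max q_t` with `q_t[a_tm1]`, and with `action_gap_scale=0` it collapses to the standard Q-learning target. A small sketch, assuming TF 1.x and top-level trfl exports, with made-up numbers:

import tensorflow as tf
import trfl

q_tm1 = tf.constant([[1.0, 2.0], [0.5, 1.5]])
a_tm1 = tf.constant([1, 0])
r_t = tf.constant([0.5, 1.0])
pcont_t = tf.constant([0.9, 0.9])
q_t = tf.constant([[2.0, 1.0], [1.0, 3.0]])

persistent_loss, _ = trfl.persistent_qlearning(
    q_tm1, a_tm1, r_t, pcont_t, q_t, action_gap_scale=0.5)
plain_loss, _ = trfl.qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t)
with tf.Session() as sess:
  # The persistent target is never larger than the Q-learning target; it is
  # smaller whenever a_tm1 is not the greedy action at time t.
  print(sess.run([persistent_loss, plain_loss]))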
Example #4
def dpg(q_max, a_max, dqda_clipping=None, clip_norm=False, name="DpgLearning"):
    """Implements the Deterministic Policy Gradient (DPG) loss as a TensorFlow Op.

  See "Continuous control with deep reinforcement learning" by Lillicrap, Hunt,
  Pritzel, Heess et al. (http://arxiv.org/pdf/1509.02971v5.pdf).

  Args:
    q_max: Tensor holding Q-values generated by Q network with the input of
      (state, a_max) pair, shape [B].
    a_max: Tensor holding the optimal action, shape [B, action_dimension].
    dqda_clipping: `int` or `float`, clips the gradient dqda element-wise
      between [-dqda_clipping, dqda_clipping].
    clip_norm: Whether to perform dqda clipping on the vector norm of the last
      dimension, or component wise (default).
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

      * `loss`: a tensor containing the batch of losses, shape [B].
      * `extra`: a namedtuple with fields:
          * `q_max`: Tensor holding the optimal Q values, [B].
          * `a_max`: Tensor holding the optimal action, [B, action_dimension].
          * `dqda`: Tensor holding the derivative dq/da, [B, action_dimension].

  Raises:
    ValueError: If q_max doesn't depend on a_max or if dqda_clipping <= 0.
  """

    # DPG op.
    with tf.name_scope(name, values=[q_max, a_max]):

        # Calculate the gradient dq/da.
        dqda = tf.gradients([q_max], [a_max])[0]

        # Check that `q_max` depends on `a_max`.
        if dqda is None:
            raise ValueError("q_max needs to be a function of a_max")

        # Clipping the gradient dq/da.
        if dqda_clipping is not None:
            if dqda_clipping <= 0:
                raise ValueError(
                    "dqda_clipping should be bigger than 0, {} found".format(
                        dqda_clipping))
            if clip_norm:
                dqda = tf.clip_by_norm(dqda, dqda_clipping, axes=-1)
            else:
                dqda = tf.clip_by_value(dqda, -1. * dqda_clipping,
                                        dqda_clipping)

        # target_a ensures the correct gradient is calculated during backprop.
        target_a = dqda + a_max
        # Stop the gradient from flowing through the Q network during backprop.
        target_a = tf.stop_gradient(target_a)
        # The gradient only flows through the actor network.
        loss = 0.5 * tf.reduce_sum(tf.square(target_a - a_max), axis=-1)
        return base_ops.LossOutput(
            loss, DPGExtra(q_max=q_max, a_max=a_max, dqda=dqda))
def discrete_policy_entropy_loss(policy_logits,
                                 normalise=False,
                                 name="discrete_policy_entropy_loss"):
    """Computes the entropy 'loss' for a batch of policy logits.

  Given a batch of policy logits, calculates the entropy and corrects the sign
  so that minimizing the resulting loss op is equivalent to increasing entropy
  in the batch. This loss is optionally normalised to the range `[-1, 0]` by
  dividing by the log number of actions. This makes it more invariant to the
  size of the action space.

  This function accepts a nested array of `policy_logits` in order
  to allow for multiple discrete actions. In this case, the loss is given by
  `-sum_i(H(p_i))` where `p_i` are members of the `policy_logits` nest and
  H is the Shannon entropy.

  Args:
    policy_logits: A (possibly nested structure of) (N+1)-D Tensor(s) with
        shape `[..., A]`,  representing the log-probabilities of a set of
        Categorical distributions, where `...` represents at least one
        dimension (e.g., batch, sequence), and `A` is the number of discrete
        actions (which need not be identical across all tensors).
        Does not need to be centered.
    normalise: If True, divide the loss by the `sum_i(log(A_i))` where `A_i`
        is the number of actions for the i'th tensor in the `policy_logits`
        nest. Default is False.
    name: Optional, name of this op.

  Returns:
    A namedtuple with fields:

    * `loss`: Entropy 'loss', shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: Entropy of the policy, shape `[B]`.
  """
    policy_logits = nest.flatten(policy_logits)

    with tf.name_scope(name, values=policy_logits):
        entropy = tf.add_n([
            tf.reduce_sum(-tf.nn.softmax(scalar_policy_logits) *
                          tf.nn.log_softmax(scalar_policy_logits),
                          axis=-1) for scalar_policy_logits in policy_logits
        ],
                           name="entropy")
        # We want a value that we can minimize along with other losses, and where
        # minimizing means driving the policy towards a uniform distribution over
        # the actions. We thus scale it by negative one so that it can be simply
        # added to other losses.
        scale = tf.constant(-1.0, dtype=tf.float32)
        if normalise:
            num_actions = [
                tf.to_float(tf.shape(scalar_policy_logits)[-1])
                for scalar_policy_logits in policy_logits
            ]
            scale /= tf.reduce_sum(tf.log(tf.stack(num_actions)))
        loss = tf.multiply(scale, entropy, name="entropy_loss")

    return base_ops.LossOutput(loss, DiscretePolicyEntropyExtra(entropy))
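A small sketch of the nested-logits case, assuming TF 1.x and that this function is exported as `trfl.discrete_policy_entropy_loss`; the two action heads (3 and 5 actions) are invented for illustration.

import tensorflow as tf
import trfl

# Two independent discrete action heads over a batch of 4 examples.
logits_move = tf.random_normal([4, 3])   # 3 possible actions.
logits_fire = tf.random_normal([4, 5])   # 5 possible actions.

loss, extra = trfl.discrete_policy_entropy_loss(
    {"move": logits_move, "fire": logits_fire}, normalise=True)
# loss is -(H(move) + H(fire)) / (log 3 + log 5), shape [4]; adding it to a
# policy-gradient loss pushes both heads towards higher entropy.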
Example #6
def sarsa_lambda(q_tm1,
                 a_tm1,
                 r_t,
                 pcont_t,
                 q_t,
                 a_t,
                 lambda_,
                 name="SarsaLambda"):
  """Implements SARSA(lambda) loss as a TensorFlow op.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node77.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`
    r_t: Tensor holding a sequence of rewards, shape `[T, B]`
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`
    q_t: `Tensor` holding a sequence of Q-values for second timestep;
      shape `[T, B, num_actions]`.
    a_t: `Tensor` holding a sequence of action indices for second timestep;
      shape `[T, B]`
    lambda_: a scalar specifying the ratio of mixing between bootstrapped and
      MC returns.
    name: a name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t], [a_tm1, r_t, pcont_t, a_t]], [3, 2], name)

  # SARSALambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, a_t]):

    # Select head to update and build target.
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
    qa_t = indexing_ops.batched_index(q_t, a_t)
    target = sequence_ops.multistep_forward_view(
        r_t, pcont_t, qa_t, lambda_, back_prop=False)
    target = tf.stop_gradient(target)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
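A time-major usage sketch for `sarsa_lambda` (sequence length T=5, batch B=2), assuming TF 1.x and a top-level trfl export; the placeholders only illustrate the shapes described in the docstring.

import tensorflow as tf
import trfl

T, B, num_actions = 5, 2, 3
q_tm1 = tf.placeholder(tf.float32, [T, B, num_actions])
a_tm1 = tf.placeholder(tf.int32, [T, B])
r_t = tf.placeholder(tf.float32, [T, B])
pcont_t = tf.placeholder(tf.float32, [T, B])
q_t = tf.placeholder(tf.float32, [T, B, num_actions])
a_t = tf.placeholder(tf.int32, [T, B])

loss, extra = trfl.sarsa_lambda(
    q_tm1, a_tm1, r_t, pcont_t, q_t, a_t, lambda_=0.9)
# loss has shape [T, B]; a scalar objective is typically obtained by summing
# over time and averaging over the batch.
train_loss = tf.reduce_mean(tf.reduce_sum(loss, axis=0))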
Example #7
def qv_learning(q_tm1, a_tm1, r_t, pcont_t, v_t, name="QVLearning"):
  """Implements the QV loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  the target `r_t + pcont_t * v_t`, where `v_t` is separately learned through
  temporal difference learning (c.f. `value_ops.td_learning`).

  See "Two Novel On-policy Reinforcement Learning Algorithms based on
  TD(lambda)-methods" by Wiering and van Hasselt
  (https://ieeexplore.ieee.org/abstract/document/4220845).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape `[B x num_actions]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    v_t: Tensor holding state-values for second timestep in a batch of
      transitions, shape `[B]`.
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1], [a_tm1, r_t, pcont_t, v_t]], [2, 1], name)

  # QV op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, v_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      target = tf.stop_gradient(r_t + pcont_t * v_t)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
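As the docstring notes, `v_t` is itself learned with `value_ops.td_learning`; a sketch of the paired updates, assuming TF 1.x, top-level trfl exports, and toy constants:

import tensorflow as tf
import trfl

q_tm1 = tf.constant([[1.0, 2.0], [0.5, 1.5]])  # [B, num_actions].
a_tm1 = tf.constant([0, 1])
r_t = tf.constant([1.0, 0.0])
pcont_t = tf.constant([0.9, 0.9])
v_tm1 = tf.constant([1.2, 0.8])                # state values at t-1, [B].
v_t = tf.constant([1.0, 0.5])                  # state values at t, [B].

q_loss, _ = trfl.qv_learning(q_tm1, a_tm1, r_t, pcont_t, v_t)
v_loss, _ = trfl.td_learning(v_tm1, r_t, pcont_t, v_t)
# In a QV agent both losses are minimized jointly, each against its own
# network's parameters.
total_loss = tf.reduce_mean(q_loss + v_loss)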
Example #8
def qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t, name="QLearning"):
  """Implements the Q-learning loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  the target `r_t + pcont_t * max q_t`.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node65.html).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape `[B x num_actions]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t: Tensor holding Q-values for second timestep in a batch of
      transitions, shape `[B x num_actions]`.
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t], [a_tm1, r_t, pcont_t]], [2, 1], name)

  # Q-learning op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      target = tf.stop_gradient(
          r_t + pcont_t * tf.reduce_max(q_t, axis=1))
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
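A DQN-flavoured sketch with an explicit target network, assuming TF 1.x and `trfl.qlearning`; the linear Q functions, variable names and sync op are illustrative assumptions, not part of the source.

import tensorflow as tf
import trfl

obs_tm1 = tf.random_normal([32, 4])
obs_t = tf.random_normal([32, 4])
online_w = tf.Variable(tf.random_normal([4, 3]))   # online Q network.
target_w = tf.Variable(tf.random_normal([4, 3]))   # target Q network.

q_tm1 = tf.matmul(obs_tm1, online_w)               # [B, num_actions].
q_t = tf.matmul(obs_t, target_w)                   # bootstrap values.
a_tm1 = tf.random_uniform([32], maxval=3, dtype=tf.int32)
r_t = tf.random_normal([32])
pcont_t = 0.99 * tf.ones([32])

loss, extra = trfl.qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t)
# The target is wrapped in stop_gradient inside the op, so only online_w
# receives gradients; the target network is refreshed by an explicit copy.
train_op = tf.train.AdamOptimizer(1e-3).minimize(
    tf.reduce_mean(loss), var_list=[online_w])
sync_target_op = tf.assign(target_w, online_w)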
Example #9
def qv_max(v_tm1, r_t, pcont_t, q_t, name="QVMAX"):
    """Implements the QVMAX learning loss as a TensorFlow op.

  The QVMAX loss is `0.5` times the squared difference between `v_tm1` and
  the target `r_t + pcont_t * max q_t`, where `q_t` is separately learned
  through QV learning (c.f. `action_value_ops.qv_learning`).

  See "The QV Family Compared to Other Reinforcement Learning Algorithms" by
  Wiering and van Hasselt (2009).
  (http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.713.1931)

  Args:
    v_tm1: Tensor holding values at previous timestep, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t: Tensor of action values at current timestep, shape `[B, num_actions]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `v_tm1`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
    # Rank and compatibility checks.
    base_ops.wrap_rank_shape_assert([[v_tm1, r_t, pcont_t], [q_t]], [1, 2],
                                    name)

    # The QVMAX op.
    with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, q_t]):

        # Build target.
        target = tf.stop_gradient(r_t + pcont_t * tf.reduce_max(q_t, axis=1))

        # Temporal difference error and loss.
        # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
        td_error = target - v_tm1
        loss = 0.5 * tf.square(td_error)
        return base_ops.LossOutput(loss, TDExtra(target, td_error))
Example #10
def td_learning(v_tm1, r_t, pcont_t, v_t, name="TDLearning"):
    """Implements the TD(0)-learning loss as a TensorFlow op.

  The TD loss is `0.5` times the squared difference between `v_tm1` and
  the target `r_t + pcont_t * v_t`.

  See "Learning to Predict by the Methods of Temporal Differences" by Sutton.
  (https://link.springer.com/article/10.1023/A:1022633531479).

  Args:
    v_tm1: Tensor holding values at previous timestep, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    v_t: Tensor holding values at current timestep, shape `[B]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `v_tm1`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
    # Rank and compatibility checks.
    base_ops.wrap_rank_shape_assert([[v_tm1, v_t, r_t, pcont_t]], [1], name)

    # TD(0)-learning op.
    with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, v_t]):

        # Build target.
        target = tf.stop_gradient(r_t + pcont_t * v_t)

        # Temporal difference error and loss.
        # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
        td_error = target - v_tm1
        loss = 0.5 * tf.square(td_error)
        return base_ops.LossOutput(loss, TDExtra(target, td_error))
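A numeric sketch of `td_learning`, assuming TF 1.x and a top-level trfl export; the two transitions are invented, with the second one terminal (pcontinue 0).

import tensorflow as tf
import trfl

v_tm1 = tf.constant([1.0, 2.0])
r_t = tf.constant([0.0, 1.0])
pcont_t = tf.constant([0.9, 0.0])   # second transition is terminal.
v_t = tf.constant([2.0, 3.0])

loss, extra = trfl.td_learning(v_tm1, r_t, pcont_t, v_t)
with tf.Session() as sess:
  # Targets are [0 + 0.9 * 2, 1 + 0 * 3] = [1.8, 1.0], TD errors are
  # target - v_tm1 = [0.8, -1.0], and losses are 0.5 * td_error ** 2.
  print(sess.run({"target": extra.target, "td_error": extra.td_error,
                  "loss": loss}))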
Example #11
def retrace(lambda_,
            qs,
            targnet_qs,
            actions,
            rewards,
            pcontinues,
            target_policy_probs,
            behaviour_policy_probs,
            stop_targnet_gradients=True,
            name=None):
    """Retrace algorithm loss calculation op.

  Given a minibatch of temporally-contiguous sequences of Q values, policy
  probabilities, and various other typical RL algorithm inputs, this
  Op creates a subgraph that computes a loss according to the
  Retrace multi-step off-policy value learning algorithm. This Op supports the
  use of target networks, but does not require them.

  For more details of Retrace, refer to
  [the arXiv paper](http://arxiv.org/abs/1606.02647).

  In argument descriptions, `T` counts the number of transitions over which
  the Retrace loss is computed, and `B` is the minibatch size. Note that all
  tensor arguments list a first-dimension (time dimension) size of T+1;
  this is because in order to compute the loss over T timesteps, the
  algorithm must be aware of the values of many of its inputs at timesteps
  before and after each transition.

  All tensor arguments are indexed first by transition, with specific
  details of this indexing in the argument descriptions.

  Args:
    lambda_: Positive scalar value or 0-D `Tensor` controlling the degree to
      which future timesteps contribute to the loss computed at each
      transition.
    qs: 3-D tensor holding per-action Q-values for the states encountered
      just before taking the transitions that correspond to each major index.
      Since these values are the predicted values we wish to update (in other
      words, the values we intend to change as we learn), in a target network
      setting, these nearly always come from the "non-target" network, which
      we usually call the "learning network".
      Shape is `[(T+1), B, num_actions]`.
    targnet_qs: Like `qs`, but in the target network setting, these values
      should be computed by the target network. We use these values to
      compute multi-step error values for timesteps that follow the first
      timesteps in each sequence and sequence fragment we consider.
      Shape is `[(T+1), B, num_actions]`.
    actions: 2-D tensor holding the indices of actions executed during the
      transition that corresponds to each major index.
      Shape is `[(T+1), B]`.
    rewards: 2-D tensor holding rewards received during the transition
      that corresponds to each major index.
      Shape is `[(T+1), B]`.
    pcontinues: 2-D tensor holding pcontinue values received during the
      transition that corresponds to each major index.
      Shape is `[(T+1), B]`.
    target_policy_probs: 3-D tensor holding per-action policy probabilities
      for the states encountered just before taking the transitions that
      correspond to each major index, according to the target policy (i.e.
      the policy we wish to learn). These probabilities usually derive from
      the learning net.
      Shape is `[(T+1), B, num_actions]`.
    behaviour_policy_probs: 2-D tensor holding the *behaviour* policy's
      probabilities of having taken actions `action` during the transitions
      that correspond to each major index. These probabilities derive from
      whatever policy you used to generate the data.
      Shape is `[(T+1), B]`.
    stop_targnet_gradients: `bool` that enables a sensible default way of
      handling gradients through the Retrace op (essentially, gradients
      are not permitted to involve the `targnet_qs` inputs). Can be disabled
      if you require a different arrangement, but you'll probably want to
      block some gradients somewhere.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: Tensor containing the batch of losses, shape `[B]`.
    * `extra`: None
  """
    all_args = [
        lambda_, qs, targnet_qs, actions, rewards, pcontinues,
        target_policy_probs, behaviour_policy_probs
    ]
    with tf.name_scope(name, 'Retrace', values=all_args):
        # Mainly to simplify testing:
        (lambda_, qs, targnet_qs, actions, rewards, pcontinues,
         target_policy_probs,
         behaviour_policy_probs) = (tf.convert_to_tensor(arg)
                                    for arg in all_args)

        # Require correct tensor ranks---as long as we have shape information
        # available to check. If there isn't any, we print a warning.
        def check_rank(tensors, ranks):
            for i, (tensor, rank) in enumerate(zip(tensors, ranks)):
                if tensor.get_shape():
                    base_ops.assert_rank_and_shape_compatibility([tensor],
                                                                 rank)
                else:
                    tf.logging.error(
                        'Tensor "%s", which was offered as Retrace parameter %d, has '
                        'no rank at construction time, so Retrace can\'t verify that '
                        'it has the necessary rank of %d', tensor.name, i + 1,
                        rank)

        check_rank([
            lambda_, qs, targnet_qs, actions, rewards, pcontinues,
            target_policy_probs, behaviour_policy_probs
        ], [0, 3, 3, 2, 2, 2, 3, 2])

        # Deduce the shapes of the arguments we'll create for retrace_core.
        qs_shape = tf.shape(qs)
        timesteps = qs_shape[0]  # Batch size is qs_shape[1].

        # Deduce the time indices for the arguments we'll create for retrace_core.
        timestep_indices_tm1 = tf.range(0, timesteps - 1)
        timestep_indices_t = tf.range(1, timesteps)

        # Construct arguments for retrace_core and call.
        q_tm1 = tf.gather(qs, timestep_indices_tm1)
        a_tm1 = tf.gather(actions, timestep_indices_tm1)

        r_t = tf.gather(rewards, timestep_indices_tm1)
        pcont_t = tf.gather(pcontinues, timestep_indices_tm1)

        target_policy_t = tf.gather(target_policy_probs, timestep_indices_t)
        behaviour_policy_t = tf.gather(behaviour_policy_probs,
                                       timestep_indices_t)
        targnet_q_t = tf.gather(targnet_qs, timestep_indices_t)
        a_t = tf.gather(actions, timestep_indices_t)

        core = retrace_core(lambda_, q_tm1, a_tm1, r_t, pcont_t,
                            target_policy_t, behaviour_policy_t, targnet_q_t,
                            a_t, stop_targnet_gradients)

        return base_ops.LossOutput(core.loss, None)
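A shape-oriented sketch for `retrace`, assuming TF 1.x and a top-level trfl export; the time dimension is T+1 = 4 here, so the loss covers T = 3 transitions, and all values are random placeholders.

import tensorflow as tf
import trfl

T, B, num_actions = 3, 2, 4
qs = tf.random_normal([T + 1, B, num_actions])           # learning network.
targnet_qs = tf.random_normal([T + 1, B, num_actions])   # target network.
actions = tf.random_uniform([T + 1, B], maxval=num_actions, dtype=tf.int32)
rewards = tf.random_normal([T + 1, B])
pcontinues = 0.99 * tf.ones([T + 1, B])
target_policy_probs = tf.nn.softmax(
    tf.random_normal([T + 1, B, num_actions]))           # policy being learnt.
behaviour_policy_probs = tf.random_uniform(
    [T + 1, B], minval=0.1, maxval=1.0)                  # must be > 0.

loss, _ = trfl.retrace(0.9, qs, targnet_qs, actions, rewards, pcontinues,
                       target_policy_probs, behaviour_policy_probs)
# `extra` is None for this op; a scalar objective is typically formed with
# tf.reduce_mean(loss).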
Example #12
def retrace_core(lambda_,
                 q_tm1,
                 a_tm1,
                 r_t,
                 pcont_t,
                 target_policy_t,
                 behaviour_policy_t,
                 targnet_q_t,
                 a_t,
                 stop_targnet_gradients=True,
                 name=None):
    """Retrace algorithm core loss calculation op.

  Given a minibatch of temporally-contiguous sequences of Q values, policy
  probabilities, and various other typical RL algorithm inputs, this
  Op creates a subgraph that computes a loss according to the
  Retrace multi-step off-policy value learning algorithm. This Op supports the
  use of target networks, but does not require them.

  This function is the "core" Retrace op only because its arguments are less
  user-friendly and more implementation-convenient. For a more user-friendly
  operator, consider using `retrace`. For more details of Retrace, refer to
  [the arXiv paper](http://arxiv.org/abs/1606.02647).

  Construct the "core" retrace loss subgraph for a batch of sequences.

  Note that two pairs of arguments (one holding target network values; the
  other, actions) are temporally-offset versions of each other and will share
  many values in common (nb: a good setting for using `IndexedSlices`). *This
  op does not include any checks that these pairs of arguments are
  consistent*---that is, it does not ensure that temporally-offset
  arguments really do share the values they are supposed to share.

  In argument descriptions, `T` counts the number of transitions over which
  the Retrace loss is computed, and `B` is the minibatch size. All tensor
  arguments are indexed first by transition, with specific details of this
  indexing in the argument descriptions (pay close attention to "subscripts"
  in variable names).

  Args:
    lambda_: Positive scalar value or 0-D `Tensor` controlling the degree to
      which future timesteps contribute to the loss computed at each
      transition.
    q_tm1: 3-D tensor holding per-action Q-values for the states encountered
      just before taking the transitions that correspond to each major index.
      Since these values are the predicted values we wish to update (in other
      words, the values we intend to change as we learn), in a target network
      setting, these nearly always come from the "non-target" network, which
      we usually call the "learning network".
      Shape is `[T, B, num_actions]`.
    a_tm1: 2-D tensor holding the indices of actions executed during the
      transition that corresponds to each major index.
      Shape is `[T, B]`.
    r_t: 2-D tensor holding rewards received during the transition
      that corresponds to each major index.
      Shape is `[T, B]`.
    pcont_t: 2-D tensor holding pcontinue values received during the
      transition that corresponds to each major index.
      Shape is `[T, B]`.
    target_policy_t: 3-D tensor holding per-action policy probabilities for
      the states encountered just AFTER the transitions that correspond to
      each major index, according to the target policy (i.e. the policy we
      wish to learn). These usually derive from the learning net.
      Shape is `[T, B, num_actions]`.
    behaviour_policy_t: 2-D tensor holding the *behaviour* policy's
      probabilities of having taken action `a_t` at the states encountered
      just AFTER the transitions that correspond to each major index. Derived
      from whatever policy you used to generate the data. All values MUST be
      greater than 0. Shape is `[T, B]`.
    targnet_q_t: 3-D tensor holding per-action Q-values for the states
      encountered just AFTER taking the transitions that correspond to each
      major index. Since these values are used to calculate target values for
      the network, in a target network setting, these should
      probably come from the target network.
      Shape is `[T, B, num_actions]`.
    a_t: 2-D tensor holding the indices of actions executed during the
      transition AFTER the transition that corresponds to each major index.
      Shape is `[T, B]`.
    stop_targnet_gradients: `bool` that enables a sensible default way of
      handling gradients through the Retrace op (essentially, gradients
      are not permitted to involve the `targnet_q_t` input).
      Can be disabled if you require a different arrangement, but
      you'll probably want to block some gradients somewhere.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: Tensor containing the batch of losses, shape `[B]`.
    * `extra`: A namedtuple with fields:
        * `retrace_weights`: Tensor containing batch of retrace weights,
        shape `[T, B]`.
        * `target`: Tensor containing target action values, shape `[T, B]`.
  """
    all_args = [
        lambda_, q_tm1, a_tm1, r_t, pcont_t, target_policy_t,
        behaviour_policy_t, targnet_q_t, a_t
    ]

    with tf.name_scope(name, 'RetraceCore', all_args):
        (lambda_, q_tm1, a_tm1, r_t, pcont_t, target_policy_t,
         behaviour_policy_t, targnet_q_t, a_t) = (tf.convert_to_tensor(arg)
                                                  for arg in all_args)

        # Evaluate importance weights.
        c_t = _retrace_weights(
            indexing_ops.batched_index(target_policy_t, a_t),
            behaviour_policy_t) * lambda_
        # Targets are evaluated by using only Q values from the target network.
        # This provides fixed regression targets until the next target network
        # update.
        target = _general_off_policy_corrected_multistep_target(
            r_t, pcont_t, target_policy_t, c_t, targnet_q_t, a_t,
            not stop_targnet_gradients)

        if stop_targnet_gradients:
            target = tf.stop_gradient(target)
        # Regress Q values of the learning network towards the targets evaluated
        # by using the target network.
        qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
        delta = target - qa_tm1
        loss = 0.5 * tf.square(delta)

        return base_ops.LossOutput(
            loss, RetraceCoreExtra(retrace_weights=c_t, target=target))
Example #13
def categorical_dist_qlearning(atoms_tm1,
                               logits_q_tm1,
                               a_tm1,
                               r_t,
                               pcont_t,
                               atoms_t,
                               logits_q_t,
                               name="CategoricalDistQLearning"):
    """Implements Distributional Q-learning as TensorFlow ops.

  The function assumes categorical value distributions parameterized by logits.

  See "A Distributional Perspective on Reinforcement Learning" by Bellemare,
  Dabney and Munos. (https://arxiv.org/abs/1707.06887).

  Args:
    atoms_tm1: 1-D tensor containing atom values for first timestep,
      shape `[num_atoms]`.
    logits_q_tm1: Tensor holding logits for first timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    atoms_t: 1-D tensor containing atom values for second timestep,
      shape `[num_atoms]`.
    logits_q_t: Tensor holding logits for second timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: a tensor containing the values that `q_tm1` at actions
        `a_tm1` are regressed towards, shape `[B, num_atoms]`.

  Raises:
    ValueError: If the tensors do not have the correct rank or compatibility.
  """
    # Rank and compatibility checks.
    assertion_lists = [[logits_q_tm1, logits_q_t], [a_tm1, r_t, pcont_t],
                       [atoms_tm1, atoms_t]]
    base_ops.wrap_rank_shape_assert(assertion_lists, [3, 1, 1], name)

    # Categorical distributional Q-learning op.
    with tf.name_scope(name,
                       values=[
                           atoms_tm1, logits_q_tm1, a_tm1, r_t, pcont_t,
                           atoms_t, logits_q_t
                       ]):

        with tf.name_scope("target"):
            # Scale and shift time-t distribution atoms by discount and reward.
            target_z = r_t[:, None] + pcont_t[:, None] * atoms_t[None, :]

            # Convert logits to distribution, then find greedy action in state s_t.
            q_t_probs = tf.nn.softmax(logits_q_t)
            q_t_mean = tf.reduce_sum(q_t_probs * atoms_t, 2)
            pi_t = tf.argmax(q_t_mean, 1, output_type=tf.int32)

            # Compute distribution for greedy action.
            p_target_z = _slice_with_actions(q_t_probs, pi_t)

            # Project using the Cramer distance
            target = tf.stop_gradient(
                _l2_project(target_z, p_target_z, atoms_tm1))

        logit_qa_tm1 = _slice_with_actions(logits_q_tm1, a_tm1)

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logit_qa_tm1,
                                                       labels=target)

        return base_ops.LossOutput(loss, Extra(target))
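A C51-style shape sketch for `categorical_dist_qlearning`, assuming TF 1.x and a top-level trfl export; the atom support and logits are invented for illustration.

import tensorflow as tf
import trfl

B, num_actions, num_atoms = 4, 3, 51
atoms = tf.linspace(-10.0, 10.0, num_atoms)     # shared categorical support.
logits_q_tm1 = tf.random_normal([B, num_actions, num_atoms])
logits_q_t = tf.random_normal([B, num_actions, num_atoms])
a_tm1 = tf.random_uniform([B], maxval=num_actions, dtype=tf.int32)
r_t = tf.random_normal([B])
pcont_t = 0.99 * tf.ones([B])

loss, extra = trfl.categorical_dist_qlearning(
    atoms, logits_q_tm1, a_tm1, r_t, pcont_t, atoms, logits_q_t)
# extra.target is the projected distribution, shape [B, num_atoms], that the
# chosen action's logits are regressed towards via the cross-entropy loss.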
def sequence_advantage_actor_critic_loss(
    policy_logits, baseline_values, actions, rewards,
    pcontinues, bootstrap_value, lambda_=1, entropy_cost=None,
    baseline_cost=1, normalise_entropy=False,
    name="SequenceAdvantageActorCriticLoss"):
  """Calculates the loss for an A2C update along a batch of trajectories.

  Technically A2C is the special case where lambda=1; for general lambda
  this is the loss for Generalized Advantage Estimation (GAE), modulo chunking
  behaviour if passing chunks of episodes (see `generalized_lambda_returns` for
  more detail).

  Note: This function takes policy _logits_ as input, not the log-policy like
  `learning.deepmind.lua.rl.learners.Reinforce` does.

  This loss jointly learns the policy and the baseline. Therefore, gradients
  for this loss flow through each tensor in `policy_logits` and
  `baseline_values`, but no other input tensors. The policy is learnt with the
  advantage actor-critic loss, plus an optional entropy term. The baseline is
  regressed towards the n-step bootstrapped returns given by the
  reward/pcontinue sequence. The `baseline_cost` parameter scales the
  gradients w.r.t. the baseline relative to the policy gradient, i.e.
  `d(loss) / d(baseline) = baseline_cost * (n_step_return - baseline)`.

  `rewards` and `pcontinues` are the sequences of data taken directly from the
  environment, possibly modulated by a discount. `baseline_values` are the
  sequences of (typically learnt) estimates of the values of the states
  visited along a batch of trajectories as observed by the agent given the
  sequences of one or more actions sampled from the `policy_logits`.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` that takes an action `a` transitions into another state
  with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `a`, `r`
  and `p` are all at the same index `i` in the corresponding tensors. `V'` is
  at index `i+1`, or in the `bootstrap_value` tensor if `i == T`.

  This function accepts a nested array of `policy_logits` and `actions` in order
  to allow for multidimensional discrete action spaces. In this case, the loss
  is given by `sum_i(loss(p_i, a_i))` where `p_i` are members of the
  `policy_logits` nest, and `a_i` are members of the `actions` nest.
  We assume that a single baseline is used across all action dimensions for
  each timestep.

  Args:
    policy_logits: A (possibly nested structure of) 3-D Tensor(s) with shape
        `[T, B, num_actions]` and possibly different dimension `num_actions`.
    baseline_values: 2-D Tensor containing an estimate of state values `[T, B]`.
    actions: A (possibly nested structure of) 2-D Tensor(s) with shape
        `[T, B]` and integer type.
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    bootstrap_value: 1-D Tensor with shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]` for
        Generalised Advantage Estimation as per
        https://arxiv.org/abs/1506.02438.
    entropy_cost: optional scalar cost that pushes the policy to have high
        entropy, larger values cause higher entropies.
    baseline_cost: scalar cost that scales the derivatives of the baseline
        relative to the policy gradient.
    normalise_entropy: if True, the entropy loss is normalised to the range
        `[-1, 0]` by dividing by the log number of actions. This makes it more
        invariant to the size of the action space. Default is False.
    name: Customises the name_scope for this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the total loss, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: total policy entropy per sequence, shape `[B]`.
        * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
        * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
        * `policy_gradient_loss`: policy gradient loss per sequence,
            shape `[B]`.
        * `advantages`: advantage estimates per timestep, shape `[T, B]`.
        * `discounted_returns`: discounted returns per timestep,
            shape `[T, B]`.
  """
  scoped_values = (nest.flatten(policy_logits) + nest.flatten(actions) +
                   [baseline_values, rewards, pcontinues, bootstrap_value])
  with tf.name_scope(name, values=scoped_values):
    # Loss for the baseline, summed over the time dimension.
    baseline_loss_td, td_lambda = value_ops.td_lambda(
        baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

    # The TD error provides an estimate of the advantages of the actions.
    advantages = td_lambda.temporal_differences
    baseline_loss = tf.multiply(
        tf.convert_to_tensor(baseline_cost, dtype=tf.float32),
        baseline_loss_td,
        name="baseline_loss")

    # Loss for the policy. Doesn't push additional gradients through
    # the advantages.
    policy_gradient_loss = discrete_policy_gradient_loss(
        policy_logits, actions, advantages, name="policy_gradient_loss")

    total_loss = tf.add(policy_gradient_loss, baseline_loss, name="total_loss")

    if entropy_cost is not None:
      entropy_loss_op, policy_entropy = discrete_policy_entropy_loss(
          policy_logits, normalise=normalise_entropy)  # [T,B].
      entropy = tf.reduce_sum(
          policy_entropy.entropy, axis=0, name="entropy")  # [B].
      entropy_loss = tf.multiply(
          tf.convert_to_tensor(entropy_cost, dtype=tf.float32),
          tf.reduce_sum(entropy_loss_op, axis=0),
          name="scaled_entropy_loss")  # [B].
      total_loss = tf.add(total_loss, entropy_loss,
                          name="total_loss_with_entropy")
    else:
      entropy = None
      entropy_loss = None

    extra = SequenceAdvantageActorCriticExtra(
        entropy=entropy, entropy_loss=entropy_loss,
        baseline_loss=baseline_loss,
        policy_gradient_loss=policy_gradient_loss,
        advantages=advantages,
        discounted_returns=td_lambda.discounted_returns)

    return base_ops.LossOutput(total_loss, extra)
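A sequence-level sketch for `sequence_advantage_actor_critic_loss`, assuming TF 1.x and a top-level trfl export; T, B and the action count are placeholder values.

import tensorflow as tf
import trfl

T, B, num_actions = 6, 2, 5
policy_logits = tf.random_normal([T, B, num_actions])
baseline_values = tf.random_normal([T, B])
actions = tf.random_uniform([T, B], maxval=num_actions, dtype=tf.int32)
rewards = tf.random_normal([T, B])
pcontinues = 0.99 * tf.ones([T, B])
bootstrap_value = tf.random_normal([B])

loss, extra = trfl.sequence_advantage_actor_critic_loss(
    policy_logits, baseline_values, actions, rewards, pcontinues,
    bootstrap_value, lambda_=0.95, entropy_cost=0.01, baseline_cost=0.5,
    normalise_entropy=True)
# The per-sequence components (policy gradient, baseline and entropy losses)
# remain available in `extra` for separate logging.
train_loss = tf.reduce_mean(loss)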
Example #15
def pixel_control_loss(
    observations, actions, action_values, cell_size, discount_factor,
    scale, crop_height_dim=(None, None), crop_width_dim=(None, None)):
  """Calculate n-step Q-learning loss for pixel control auxiliary task.

  For each pixel-based pseudo reward signal, the corresponding action-value
  function is trained off-policy, using Q(lambda). A discount of 0.9 is
  commonly used for learning the value functions.

  Note that, since pseudo rewards have a spatial structure, with neighbouring
  cells exhibiting strong correlations, it is convenient to predict the action
  values for all the cells through a deconvolutional head.

  See "Reinforcement Learning with Unsupervised Auxiliary Tasks" by Jaderberg,
  Mnih, Czarnecki et al. (https://arxiv.org/abs/1611.05397).

  Args:
    observations: A tensor of shape `[T+1,B, ...]`; `...` is the observation
      shape, `T` the sequence length, and `B` the batch size. `T` and `B` can
      be statically unknown for `observations`, `actions` and `action_values`.
    actions: A tensor, shape `[T,B]`, of the actions across each sequence.
    action_values: A tensor, shape `[T+1,B,H,W,N]` of pixel control action
      values, where `H`, `W` are the number of pixel control cells/tasks, and
      `N` is the number of actions.
    cell_size: size of the cells used to derive the pixel based pseudo-rewards.
    discount_factor: discount used for learning the value function associated
      to the pseudo rewards; must be a scalar or a Tensor of shape [T,B].
    scale: scale factor for pixels in `observations`.
    crop_height_dim: tuple (min_height, max_height) specifying how
      to crop the input observations before computing the pseudo-rewards.
    crop_width_dim: tuple (min_width, max_width) specifying how
      to crop the input observations before computing the pseudo-rewards.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape [B].
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B].
        * `td_error`: batch of temporal difference errors, shape [B].

  Raises:
    ValueError: if the shape of `action_values` is not compatible with that of
      the pseudo-rewards derived from the observations.
  """
  # Useful shapes.
  sequence_length, batch_size = base_ops.best_effort_shape(actions)
  num_actions = action_values.get_shape().as_list()[-1]
  height_width_q = action_values.get_shape().as_list()[2:-1]
  # Calculate rewards using the observations. Crop observations if appropriate.
  if crop_height_dim[0] is not None:
    h_low, h_high = crop_height_dim
    observations = observations[:, :, h_low:h_high, :]
  if crop_width_dim[0] is not None:
    w_low, w_high = crop_width_dim
    observations = observations[:, :, :, w_low:w_high]
  # Rescale observations by a constant factor.
  observations *= tf.constant(scale)
  # Compute pseudo-rewards and get their shape.
  pseudo_rewards = pixel_control_rewards(observations, cell_size)
  height_width = pseudo_rewards.get_shape().as_list()[2:]
  # Check that pseudo-rewards and Q-values are compatible in shape.
  if height_width != height_width_q:
    raise ValueError(
        "Pixel Control values are not compatible with the shape of the"
        "pseudo-rewards derived from the observation. Pseudo-rewards have shape"
        "{}, while Pixel Control values have shape {}".format(
            height_width, height_width_q))
  # We now have Q(s,a) and rewards, so can calculate the n-step loss. The
  # QLambda loss op expects inputs of shape [T,B,N] and [T,B], but our tensors
  # are in a variety of incompatible shapes. The state-action values have
  # shape [T,B,H,W,N] and rewards have shape [T,B,H,W]. We can think of the
  # [H,W] dimensions as extra batch dimensions for the purposes of the loss
  # calculation, so we first collapse [B,H,W] into a single dimension.
  q_tm1 = tf.reshape(
      action_values[:-1],  # [T,B,H,W,N].
      [sequence_length, -1, num_actions],
      name="q_tm1")  # [T,BHW,N].
  r_t = tf.reshape(
      pseudo_rewards,  # [T,B,H,W].
      [sequence_length, -1],
      name="r_t")  # [T,BHW].
  q_t = tf.reshape(
      action_values[1:],  # [T,B,H,W,N].
      [sequence_length, -1, num_actions],
      name="q_t")  # [T,BHW,N].
  # The actions tensor is of shape [T,B], and is the same for each H and W.
  # We thus expand it to the same shape as the reward tensor, [T,BHW].
  expanded_actions = tf.expand_dims(tf.expand_dims(actions, -1), -1)
  a_tm1 = tf.tile(
      expanded_actions, multiples=[1, 1] + height_width)  # [T,B,H,W].
  a_tm1 = tf.reshape(a_tm1, [sequence_length, -1])  # [T,BHW].
  # We similarly expand-and-tile the discount to [T,BHW].
  discount_factor = tf.convert_to_tensor(discount_factor)
  if discount_factor.shape.ndims == 0:
    pcont_t = tf.reshape(discount_factor, [1, 1])  # [1,1].
    pcont_t = tf.tile(pcont_t, tf.shape(a_tm1))  # [T,BHW].
  elif discount_factor.shape.ndims == 2:
    tiled_pcont = tf.tile(
        tf.expand_dims(tf.expand_dims(discount_factor, -1), -1),
        [1, 1] + height_width)
    pcont_t = tf.reshape(tiled_pcont, [sequence_length, -1])
  else:
    raise ValueError(
        "The discount_factor must be a scalar or a tensor of rank 2."
        "instead is a tensor of shape {}".format(
            discount_factor.shape.as_list()))
  # Compute a QLambda loss of shape [T,BHW]
  loss, _ = action_value_ops.qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=1)
  # Take sum over sequence, sum over cells.
  expanded_shape = [sequence_length, batch_size] + height_width
  spatial_loss = tf.reshape(loss, expanded_shape)  # [T,B,H,W].
  # Return.
  extra = PixelControlExtra(
      spatial_loss=spatial_loss, pseudo_rewards=pseudo_rewards)
  return base_ops.LossOutput(
      tf.reduce_sum(spatial_loss, axis=[0, 2, 3]), extra)  # [B]
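A shape-compatibility sketch for `pixel_control_loss`, assuming TF 1.x and a top-level trfl export; with 8x8 observations and `cell_size=4` the pseudo-rewards form a 2x2 grid, so the action values must have spatial shape 2x2. All inputs are random placeholders.

import tensorflow as tf
import trfl

T, B, num_actions = 4, 2, 6
observations = tf.random_uniform([T + 1, B, 8, 8, 3], maxval=255.0)
actions = tf.random_uniform([T, B], maxval=num_actions, dtype=tf.int32)
# One Q-value map per pixel-control cell: [T+1, B, H=2, W=2, num_actions].
action_values = tf.random_normal([T + 1, B, 2, 2, num_actions])

loss, extra = trfl.pixel_control_loss(
    observations, actions, action_values,
    cell_size=4, discount_factor=0.9, scale=1.0 / 255.0)
# extra.spatial_loss has shape [T, B, 2, 2]; loss sums it over time and cells,
# giving shape [B].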
Example #16
def td_lambda(state_values,
              rewards,
              pcontinues,
              bootstrap_value,
              lambda_=1,
              name="BaselineLoss"):
    """Constructs a TensorFlow graph computing the L2 loss for sequences.

  This loss learns the baseline for advantage actor-critic models. Gradients
  for this loss flow through each tensor in `state_values`, but no other
  input tensors. The baseline is regressed towards the n-step bootstrapped
  returns given by the reward/pcontinue sequence.

  This function is designed for batches of sequences of data. Tensors are
  assumed to be time major (i.e. the outermost dimension is time, the second
  outermost dimension is the batch dimension). We denote the sequence length
  in the shapes of the arguments with the variable `T`, the batch size with
  the variable `B`, neither of which needs to be known at construction time.
  Index `0` of the time dimension is assumed to be the start of the sequence.

  `rewards` and `pcontinues` are the sequences of data taken directly from the
  environment, possibly modulated by a discount. `state_values` are the
  sequences of (typically learnt) estimates of the values of the states
  visited along a batch of trajectories.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` that takes an action transitions into another state
  with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `r`
  and `p` are all at the same index `i` in the corresponding tensors. `V'` is
  at index `i+1`, or in the `bootstrap_value` tensor if `i == T`.

  See "High-dimensional continuous control using generalized advantage
  estimation" by Schulman, Moritz, Levine et al.
  (https://arxiv.org/abs/1506.02438).

  Args:
    state_values: 2-D Tensor of state-value estimates with shape `[T, B]`.
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    bootstrap_value: 1-D Tensor with shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `temporal_differences`: Tensor of shape `[T, B]`.
        * `discounted_returns`: Tensor of shape `[T, B]`.
  """
    scoped_values = [state_values, rewards, pcontinues, bootstrap_value]
    with tf.name_scope(name, values=scoped_values):
        discounted_returns = generalized_lambda_returns(
            rewards, pcontinues, state_values, bootstrap_value, lambda_)
        temporal_differences = discounted_returns - state_values
        loss = 0.5 * tf.reduce_sum(
            tf.square(temporal_differences), axis=0, name="l2_loss")

        return base_ops.LossOutput(
            loss,
            TDLambdaExtra(temporal_differences=temporal_differences,
                          discounted_returns=discounted_returns))
def sequence_a2c_loss(policies,
                      baseline_values,
                      actions,
                      rewards,
                      pcontinues,
                      bootstrap_value,
                      policy_vars=None,
                      lambda_=1,
                      entropy_cost=None,
                      baseline_cost=1,
                      entropy_scale_op=None,
                      name="SequenceA2CLoss"):
    """Constructs a TensorFlow graph computing the A2C/GAE loss for sequences.

  This loss jointly learns the policy and the baseline. Therefore, gradients
  for this loss flow through each tensor in `policies` and through each tensor
  in `baseline_values`, but no other input tensors. The policy is learnt with
  the advantage actor-critic loss, plus an optional entropy term. The baseline
  is regressed towards the n-step bootstrapped returns given by the
  reward/pcontinue sequence. The `baseline_cost` parameter scales the
  gradients w.r.t. the baseline relative to the policy gradient, i.e.
  `d(loss) / d(baseline) = baseline_cost * (n_step_return - baseline)`.

  This function is designed for batches of sequences of data. Tensors are
  assumed to be time major (i.e. the outermost dimension is time, the second
  outermost dimension is the batch dimension). We denote the sequence length in
  the shapes of the arguments with the variable `T`, the batch size with the
  variable `B`, neither of which needs to be known at construction time. Index
  `0` of the time dimension is assumed to be the start of the sequence.

  `rewards` and `pcontinues` are the sequences of data taken directly from the
  environment, possibly modulated by a discount. `baseline_values` are the
  sequences of (typically learnt) estimates of the values of the states
  visited along a batch of trajectories as observed by the agent given the
  sequences of one or more actions sampled from `policies`.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` that takes an action `a` transitions into another state
  with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `a`, `r`
  and `p` are all at the same index `i` in the corresponding tensors. `V'` is
  at index `i+1`, or in the `bootstrap_value` tensor if `i == T`.

  For n-dimensional action vectors, a multivariate distribution must be used
  for `policies`. In case there is no multivariate version for the desired
  univariate distribution, or in case the `actions` object is a nested
  structure (e.g. for multiple action types), this function also accepts a
  nested structure  of `policies`. In this case, the loss is given by
  `sum_i(loss(p_i, a_i))` where `p_i` are members of the `policies` nest, and
  `a_i` are members of the `actions` nest. We assume that a single baseline is
  used across all action dimensions for each timestep.

  Args:
    policies: A (possibly nested structure of) distribution(s) supporting
        `batch_shape` and `event_shape` properties & `log_prob` and `entropy`
        methods (e.g. an instance of `tfp.distributions.Distribution`),
        with `batch_shape` equal to `[T, B]`. E.g. for a (non-nested) diagonal
        multivariate gaussian with dimension `A` this would be:
        `policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)`
        where `mus` and `sigmas` have shape `[T, B, A]`.
    baseline_values: 2-D Tensor containing an estimate of the state value with
        shape `[T, B]`.
    actions: A (possibly nested structure of) N-D Tensor(s) with shape
        `[T, B, ...]` where the final dimensions are the `event_shape` of the
        corresponding distribution in the nested structure (the shape can be
        just `[T, B]` if the `event_shape` is scalar).
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    bootstrap_value: 1-D Tensor with shape `[B]`.
    policy_vars: An optional (possibly nested structure of) iterables of
        Tensors used by `policies`. If provided, these are used in scope
        checks. For the multivariate normal example above this would be
        `[mus, sigmas]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]` for
        Generalised Advantage Estimation as per
        https://arxiv.org/abs/1506.02438.
    entropy_cost: optional scalar cost that pushes the policy to have high
        entropy; larger values cause higher entropies.
    baseline_cost: scalar cost that scales the derivatives of the baseline
        relative to the policy gradient.
    entropy_scale_op: An optional op that takes `policies` as its only
        argument and returns a scalar Tensor that is used to scale the entropy
        loss. E.g. for Diag(sigma) Gaussian policies dividing by the number of
        dimensions makes entropy loss invariant to the action space dimension.
        See `policy_entropy_loss` for more info.
    name: Customises the name_scope for this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the total loss, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: total policy entropy per sequence, shape `[B]`.
        * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
        * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
        * `policy_gradient_loss`: policy gradient loss per sequence,
            shape `[B]`.
        * `advantages`: advantage estimates per timestep, shape `[T, B]`.
        * `discounted_returns`: discounted returns per timestep,
            shape `[T, B]`.
  """
    flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
    scoped_values = (flat_policy_vars + nest.flatten(actions) +
                     [baseline_values, rewards, pcontinues, bootstrap_value])
    with tf.name_scope(name, values=scoped_values):
        # Loss for the baseline, summed over the time dimension.
        baseline_loss_td, td_lambda = value_ops.td_lambda(
            baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

        # The TD error provides an estimate of the advantages of the actions.
        advantages = td_lambda.temporal_differences
        baseline_loss = tf.multiply(tf.convert_to_tensor(baseline_cost,
                                                         dtype=tf.float32),
                                    baseline_loss_td,
                                    name="baseline_loss")

        # Loss for the policy. Doesn't push additional gradients through
        # the advantages.
        pg_loss = policy_gradient_loss(policies,
                                       actions,
                                       advantages,
                                       policy_vars,
                                       name="policy_gradient_loss")

        total_loss = tf.add(pg_loss, baseline_loss, name="total_loss")

        if entropy_cost is not None:
            loss, extra = policy_entropy_loss(policies, policy_vars,
                                              entropy_scale_op)
            entropy = tf.reduce_sum(extra.entropy, axis=0,
                                    name="entropy")  # [B].
            entropy_loss = tf.multiply(tf.convert_to_tensor(entropy_cost,
                                                            dtype=tf.float32),
                                       tf.reduce_sum(loss, axis=0),
                                       name="scaled_entropy_loss")  # [B].
            total_loss = tf.add(total_loss,
                                entropy_loss,
                                name="total_loss_with_entropy")
        else:
            entropy = None
            entropy_loss = None

        extra = SequenceA2CExtra(
            entropy=entropy,
            entropy_loss=entropy_loss,
            baseline_loss=baseline_loss,
            policy_gradient_loss=pg_loss,
            advantages=advantages,
            discounted_returns=td_lambda.discounted_returns)
        return base_ops.LossOutput(total_loss, extra)
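
# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal sketch of how the sequence A2C/GAE loss above might be wired up.
# The enclosing function's name is not visible in this listing; it is assumed
# here to be `sequence_a2c_loss`. TensorFlow 1.x graph mode and
# tensorflow_probability are assumed; all tensor values are placeholders.
import tensorflow as tf
import tensorflow_probability as tfp

T, B, A = 5, 2, 3                      # sequence length, batch size, action dim.
mus = tf.zeros([T, B, A])              # e.g. outputs of a policy network.
sigmas = tf.ones([T, B, A])
policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)
actions = tf.stop_gradient(policies.sample())   # sampled behaviour actions.
baseline_values = tf.zeros([T, B])     # e.g. outputs of a value head.
rewards = tf.ones([T, B])
pcontinues = 0.99 * tf.ones([T, B])
bootstrap_value = tf.zeros([B])

# `sequence_a2c_loss` is an assumed name for the function defined above.
a2c_loss, a2c_extra = sequence_a2c_loss(
    policies, baseline_values, actions, rewards, pcontinues, bootstrap_value,
    policy_vars=[mus, sigmas], lambda_=0.95, entropy_cost=0.01)
# `a2c_loss` has shape [B]; reduce it before handing it to an optimizer, e.g.
# train_op = tf.train.AdamOptimizer(1e-4).minimize(tf.reduce_mean(a2c_loss))
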
def policy_entropy_loss(policies,
                        policy_vars=None,
                        scale_op=None,
                        name="policy_entropy_loss"):
    """Calculates entropy 'loss' for policies represented by a distributions.

  Given a (possibly nested structure of) batch(es) of policies, this
  calculates the total entropy and corrects the sign so that minimizing the
  resulting loss op is equivalent to increasing entropy in the batch.

  This function accepts a nested structure of `policies` in order to allow for
  multiple distribution types or for multiple action dimensions in the case
  where there is no corresponding multivariate form available for a given
  univariate distribution. In this case, the loss is `sum_i(H(p_i, p_i))`
  where `p_i` are members of the `policies` nest. It can be shown that this is
  equivalent to calculating the entropy loss on the Cartesian product space
  over all the action dimensions, if the sampled actions are independent.

  The entropy loss is optionally scaled by some function of the policies.
  E.g. for Categorical distributions there exists such a scaling which maps
  the entropy loss into the range `[-1, 0]` in order to make it invariant to
  the size of the action space - specifically one can divide the loss by
  `sum_i(log(A_i))` where `A_i` is the number of categories in the i'th
  Categorical distribution in the `policies` nest.

  Args:
    policies: A (possibly nested structure of) batch distribution(s)
        supporting an `entropy` method that returns an N-D Tensor with shape
        equal to the `batch_shape` of the distribution, e.g. an instance of
        `tfp.distributions.Distribution`.
    policy_vars: An optional (possibly nested structure of) iterable(s) of
        Tensors used by `policies`. If provided, these are used in scope
        checks.
    scale_op: An optional op that takes `policies` as its only argument and
        returns a scalar Tensor that is used to scale the entropy loss.
        E.g. for Diag(sigma) Gaussian policies dividing by the number of
        dimensions makes entropy loss invariant to the action space dimension.
    name: Optional, name of this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B1, B2, ...]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: entropy of the policy, shape `[B1, B2, ...]`.
    where `[B1, B2, ...]` is the `batch_shape` of the policies.
  """
    flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
    with tf.name_scope(name, values=flat_policy_vars):
        # We want a value that we can minimize along with other losses, and where
        # minimizing means driving the policy towards a uniform distribution over
        # the actions. We thus scale it by negative one so that it can be simply
        # added to other losses.
        scale = tf.constant(-1.0, dtype=tf.float32)
        if scale_op:
            scale *= scale_op(policies)

        policies = nest.flatten(policies)
        entropy = tf.add_n([policy.entropy() for policy in policies],
                           name="entropy")
        loss = tf.multiply(scale, entropy, name="entropy_loss")
        return base_ops.LossOutput(loss, PolicyEntropyExtra(entropy))
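
# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal sketch of `policy_entropy_loss` with a single Categorical policy,
# assuming TensorFlow 1.x and tensorflow_probability. The `example_scale_op`
# below is a hypothetical instance of the Categorical normalisation mentioned
# in the docstring: dividing by log(num_actions) maps the loss into [-1, 0].
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

num_actions = 5
logits = tf.zeros([6, 4, num_actions])   # batch_shape [T=6, B=4].
categorical_policies = tfp.distributions.Categorical(logits=logits)

def example_scale_op(unused_policies):
  # Hypothetical scaling for a single Categorical head.
  return tf.constant(1.0 / np.log(num_actions), dtype=tf.float32)

entropy_loss_out, entropy_extra = policy_entropy_loss(
    categorical_policies, scale_op=example_scale_op)
# Both `entropy_loss_out` and `entropy_extra.entropy` have shape [6, 4],
# i.e. the `batch_shape` of the policy.
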
Example #19
0
def categorical_dist_double_qlearning(atoms_tm1,
                                      logits_q_tm1,
                                      a_tm1,
                                      r_t,
                                      pcont_t,
                                      atoms_t,
                                      logits_q_t,
                                      q_t_selector,
                                      name="CategoricalDistDoubleQLearning"):
    """Implements Distributional Double Q-learning as TensorFlow ops.

  The function assumes categorical value distributions parameterized by logits,
  and combines distributional RL with double Q-learning.

  See "Rainbow: Combining Improvements in Deep Reinforcement Learning" by
  Hessel, Modayil, van Hasselt, Schaul et al.
  (https://arxiv.org/abs/1710.02298).

  Args:
    atoms_tm1: 1-D tensor containing atom values for first timestep,
      shape `[num_atoms]`.
    logits_q_tm1: Tensor holding logits for first timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    atoms_t: 1-D tensor containing atom values for second timestep,
      shape `[num_atoms]`.
    logits_q_t: Tensor holding logits for second timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    q_t_selector: Tensor holding another set of Q-values for second timestep
      in a batch of transitions, shape `[B, num_actions]`.
      These values are used for estimating the best action. In Double DQN they
      come from the online network.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: Tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: Tensor containing the probability distribution that the
            value distribution of `q_tm1` at actions `a_tm1` is regressed
            towards, shape `[B, num_atoms]`.

  Raises:
    ValueError: If the tensors do not have the correct rank or compatibility.
  """
    # Rank and compatibility checks.
    assertion_lists = [[logits_q_tm1, logits_q_t], [a_tm1, r_t, pcont_t],
                       [atoms_tm1, atoms_t], [q_t_selector]]
    base_ops.wrap_rank_shape_assert(assertion_lists, [3, 1, 1, 2], name)

    # Categorical distributional double Q-learning op.
    with tf.name_scope(name,
                       values=[
                           atoms_tm1, logits_q_tm1, a_tm1, r_t, pcont_t,
                           atoms_t, logits_q_t, q_t_selector
                       ]):

        with tf.name_scope("target"):
            # Scale and shift time-t distribution atoms by discount and reward.
            target_z = r_t[:, None] + pcont_t[:, None] * atoms_t[None, :]

            # Convert logits to distribution, then find greedy policy action in
            # state s_t.
            q_t_probs = tf.nn.softmax(logits_q_t)
            pi_t = tf.argmax(q_t_selector, 1, output_type=tf.int32)
            # Compute distribution for greedy action.
            p_target_z = _slice_with_actions(q_t_probs, pi_t)

            # Project using the Cramer distance
            target = tf.stop_gradient(
                _l2_project(target_z, p_target_z, atoms_tm1))

        logit_qa_tm1 = _slice_with_actions(logits_q_tm1, a_tm1)

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logit_qa_tm1,
                                                       labels=target)

        return base_ops.LossOutput(loss, Extra(target))
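
# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal sketch of `categorical_dist_double_qlearning`, assuming TensorFlow
# 1.x graph mode. The atom support and network outputs are placeholders; in
# practice `logits_q_t` comes from the target network and `q_t_selector` from
# the online network.
import tensorflow as tf

B, num_actions, num_atoms = 4, 3, 51
atoms = tf.linspace(-10.0, 10.0, num_atoms)           # shared value support.
logits_q_tm1 = tf.zeros([B, num_actions, num_atoms])  # online net, time t-1.
logits_q_t = tf.zeros([B, num_actions, num_atoms])    # target net, time t.
q_t_selector = tf.zeros([B, num_actions])             # online net Q-values, time t.
a_tm1 = tf.zeros([B], dtype=tf.int32)
r_t = tf.ones([B])
pcont_t = 0.99 * tf.ones([B])

dist_loss, dist_extra = categorical_dist_double_qlearning(
    atoms, logits_q_tm1, a_tm1, r_t, pcont_t, atoms, logits_q_t, q_t_selector)
# `dist_loss` has shape [B]; `dist_extra.target` has shape [B, num_atoms].
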
Example #20
0
def sarse(
    q_tm1, a_tm1, r_t, pcont_t, q_t, probs_a_t, debug=False, name="Sarse"):
  """Implements the SARSE (Expected SARSA) loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  the target `r_t + pcont_t * (sum_a probs_a_t[a] * q_t[a])`.

  See "A Theoretical and Empirical Analysis of Expected Sarsa" by Seijen,
  van Hasselt, Whiteson et al.
  (http://www.cs.ox.ac.uk/people/shimon.whiteson/pubs/vanseijenadprl09.pdf).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape `[B x num_actions]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t: Tensor holding Q-values for second timestep in a batch of
      transitions, shape `[B x num_actions]`.
    probs_a_t: Tensor holding action probabilities for second timestep,
      shape `[B x num_actions]`.
    debug: Boolean flag; when set to `True`, adds ops that check whether
      `probs_a_t` is a batch of (approximately) valid probability
      distributions.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t, probs_a_t], [a_tm1, r_t, pcont_t]], [2, 1], name)

  # SARSE (Expected SARSA) op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, probs_a_t]):

    # Debug ops.
    deps = []
    if debug:
      cumulative_prob = tf.reduce_sum(probs_a_t, axis=1)
      almost_prob = tf.less(tf.abs(tf.subtract(cumulative_prob, 1.0)), 1e-6)
      deps.append(tf.Assert(
          tf.reduce_all(almost_prob),
          ["probs_a_t tensor does not sum to 1", probs_a_t]))

    # With dependency on possible debug ops.
    with tf.control_dependencies(deps):

      # Select head to update and build target.
      qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
      target = tf.stop_gradient(
          r_t + pcont_t * tf.reduce_sum(tf.multiply(q_t, probs_a_t), axis=1))

      # Temporal difference error and loss.
      # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
      td_error = target - qa_tm1
      loss = 0.5 * tf.square(td_error)
      return base_ops.LossOutput(loss, QExtra(target, td_error))
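
# --- Usage sketch (illustrative only) ---------------------------------------
# A tiny worked example of `sarse`, assuming TensorFlow 1.x graph mode. With a
# single transition, the target is r_t + pcont_t * sum_a probs_a_t[a] * q_t[a];
# for the numbers below that is 1.0 + 0.9 * (0.5 * 2.0 + 0.5 * 4.0) = 3.7.
import tensorflow as tf

q_tm1 = tf.constant([[1.0, 2.0]])        # [B=1, num_actions=2].
a_tm1 = tf.constant([0])                 # action taken at the first timestep.
r_t = tf.constant([1.0])
pcont_t = tf.constant([0.9])
q_t = tf.constant([[2.0, 4.0]])
probs_a_t = tf.constant([[0.5, 0.5]])    # policy probabilities at time t.

sarse_loss, (sarse_target, sarse_td_error) = sarse(
    q_tm1, a_tm1, r_t, pcont_t, q_t, probs_a_t, debug=True)
# sarse_target == [3.7], sarse_td_error == [2.7], sarse_loss == [0.5 * 2.7**2].
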
Example #21
0
def qlambda(
    q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_, name="GeneralizedQLambda"):
  """Implements Peng's and Watkins' Q(lambda) loss as a TensorFlow op.

  This function is general enough to implement both Peng's and Watkins'
  Q-lambda algorithms.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node78.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`.
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`.
    r_t: `Tensor` holding a sequence of rewards, shape `[T, B]`.
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`.
    q_t: `Tensor` holding a sequence of Q-values for second timestep;
      shape `[T, B, num_actions]`. In a target network setting,
      this quantity is often supplied by the target network.
    lambda_: a scalar or a `Tensor` of shape `[T, B]` specifying the ratio of
      mixing between bootstrapped and Monte Carlo returns. If `lambda_` is the
      same for all time steps, the function implements Peng's Q(lambda);
      if `lambda_` is zero at every time step where a sub-optimal action was
      taken and a constant otherwise, it implements Watkins' Q(lambda).
      In general `lambda_` may be any `Tensor` of values in the range
      `[0, 1]` supplied by the user.
    name: name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert([[q_tm1, q_t]], [3], name)
  if isinstance(
      lambda_, tf.Tensor
  ) and lambda_.get_shape().ndims is not None and lambda_.get_shape().ndims > 0:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t, lambda_]], [2], name)
  else:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t]], [2], name)

  # QLambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      state_values = tf.reduce_max(q_t, axis=2)
      target = sequence_ops.multistep_forward_view(
          r_t, pcont_t, state_values, lambda_, back_prop=False)
      target = tf.stop_gradient(target)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
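
# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal sketch of `qlambda`, assuming TensorFlow 1.x graph mode. A scalar
# `lambda_` gives Peng's Q(lambda); a per-step [T, B] tensor that is zero
# wherever the taken action was not greedy w.r.t. the current Q-values gives a
# Watkins'-style variant (the exact trace alignment is left to the user).
import tensorflow as tf

T, B, num_actions = 5, 2, 3
q_tm1 = tf.zeros([T, B, num_actions])
q_t = tf.zeros([T, B, num_actions])   # often produced by a target network.
a_tm1 = tf.zeros([T, B], dtype=tf.int32)
r_t = tf.ones([T, B])
pcont_t = 0.99 * tf.ones([T, B])

# Peng's Q(lambda): the same mixing coefficient at every time step.
peng_loss, _ = qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=0.8)

# Watkins'-style Q(lambda): cut the trace where the taken action was not greedy.
greedy = tf.equal(a_tm1, tf.cast(tf.argmax(q_tm1, axis=2), tf.int32))
watkins_lambda = 0.8 * tf.cast(greedy, tf.float32)    # shape [T, B].
watkins_loss, _ = qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=watkins_lambda)
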