Example 1
def categorical_dist_td_learning(atoms_tm1,
                                 logits_v_tm1,
                                 r_t,
                                 pcont_t,
                                 atoms_t,
                                 logits_v_t,
                                 name="CategoricalDistTDLearning"):
  """Implements Distributional TD-learning as TensorFlow ops.

  The function assumes categorical value distributions parameterized by logits.

  See "A Distributional Perspective on Reinforcement Learning" by Bellemare,
  Dabney and Munos. (https://arxiv.org/abs/1707.06887).

  Args:
    atoms_tm1: 1-D tensor containing atom values for first timestep,
      shape `[num_atoms]`.
    logits_v_tm1: Tensor holding logits for first timestep in a batch of
      transitions, shape `[B, num_atoms]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    atoms_t: 1-D tensor containing atom values for second timestep,
      shape `[num_atoms]`.
    logits_v_t: Tensor holding logits for second timestep in a batch of
      transitions, shape `[B, num_atoms]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: Tensor containing the batch of losses, shape `[B]`.
    * `extra`: A namedtuple with fields:
        * `target`: Tensor containing the values that `v_tm1` are
        regressed towards, shape `[B, num_atoms]`.

  Raises:
    ValueError: If the tensors do not have the correct rank or compatibility.
  """
  # Rank and compatibility checks.
  assertion_lists = [[logits_v_tm1, logits_v_t], [r_t, pcont_t],
                     [atoms_tm1, atoms_t]]
  base_ops.wrap_rank_shape_assert(assertion_lists, [2, 1, 1], name)

  # Categorical distributional TD-learning op.
  with tf.name_scope(
      name, values=[atoms_tm1, logits_v_tm1, r_t, pcont_t, atoms_t,
                    logits_v_t]):

    with tf.name_scope("target"):
      # Scale and shift time-t distribution atoms by discount and reward.
      target_z = r_t[:, None] + pcont_t[:, None] * atoms_t[None, :]
      v_t_probs = tf.nn.softmax(logits_v_t)

      # Project using the Cramer distance
      target = tf.stop_gradient(_l2_project(target_z, v_t_probs, atoms_tm1))

    loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits_v_tm1, labels=target)

    return base_ops.LossOutput(loss, Extra(target))
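
A minimal usage sketch, added here for illustration and not part of the original listing: it evaluates the distributional TD loss on a toy batch of two transitions over a three-atom support, assuming TensorFlow 1.x graph mode and that the op and its helpers (base_ops, _l2_project, Extra) are importable.

import tensorflow as tf  # TensorFlow 1.x, as assumed by the op above

# Toy inputs for illustration only (not from the source).
atoms = tf.constant([-1.0, 0.0, 1.0])                  # [num_atoms]
logits_v_tm1 = tf.constant([[0.1, 0.2, 0.7],
                            [0.5, 0.4, 0.1]])          # [B, num_atoms]
logits_v_t = tf.constant([[0.3, 0.3, 0.4],
                          [0.2, 0.5, 0.3]])            # [B, num_atoms]
r_t = tf.constant([0.5, -0.5])                         # [B]
pcont_t = tf.constant([0.9, 0.0])                      # [B]; 0 marks episode end

loss, extra = categorical_dist_td_learning(
    atoms, logits_v_tm1, r_t, pcont_t, atoms, logits_v_t)

with tf.Session() as sess:
  batch_loss, target = sess.run([loss, extra.target])  # shapes [B], [B, num_atoms]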
Example 2
def double_qlearning(q_tm1,
                     a_tm1,
                     r_t,
                     pcont_t,
                     q_t_value,
                     q_t_selector,
                     name="DoubleQLearning"):
    """Implements the double Q-learning loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  the target `r_t + pcont_t * q_t_value[argmax q_t_selector]`.

  See "Double Q-learning" by van Hasselt.
  (https://papers.nips.cc/paper/3964-double-q-learning.pdf).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape [B x num_actions].
    a_tm1: Tensor holding action indices, shape [B].
    r_t: Tensor holding rewards, shape [B].
    pcont_t: Tensor holding pcontinue values, shape [B].
    q_t_value: Tensor of Q-values for second timestep in a batch of transitions,
      used to estimate the value of the best action, shape [B x num_actions].
    q_t_selector: Tensor of Q-values for second timestep in a batch of
      transitions used to estimate the best action, shape [B x num_actions].
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape [B].
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B]
        * `td_error`: batch of temporal difference errors, shape [B]
        * `best_action`: batch of greedy actions wrt `q_t_selector`, shape [B]
  """
    # Rank and compatibility checks.
    base_ops.wrap_rank_shape_assert(
        [[q_tm1, q_t_value, q_t_selector], [a_tm1, r_t, pcont_t]], [2, 1],
        name)

    # double Q-learning op.
    with tf.name_scope(
            name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t_value,
                          q_t_selector]):

        # Build target and select head to update.
        best_action = tf.argmax(q_t_selector, 1, output_type=tf.int32)
        double_q_bootstrapped = indexing_ops.batched_index(
            q_t_value, best_action)
        target = tf.stop_gradient(r_t + pcont_t * double_q_bootstrapped)
        qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

        # Temporal difference error and loss.
        # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
        td_error = target - qa_tm1
        loss = 0.5 * tf.square(td_error)
        return base_ops.LossOutput(loss,
                                   DoubleQExtra(target, td_error, best_action))
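
A usage sketch added for illustration (not from the original source): toy Q-values stand in for the target network (q_t_value) and the online network (q_t_selector), assuming TensorFlow 1.x graph mode.

import tensorflow as tf  # TensorFlow 1.x

# Illustrative toy values only.
q_tm1 = tf.constant([[1.0, 2.0], [3.0, 0.5]])           # [B=2, num_actions=2]
a_tm1 = tf.constant([0, 1], dtype=tf.int32)             # [B]
r_t = tf.constant([1.0, 0.0])                           # [B]
pcont_t = tf.constant([0.99, 0.0])                      # [B]; 0 marks episode end
q_t_value = tf.constant([[1.2, 0.3], [0.1, 0.4]])       # e.g. target-network Q-values
q_t_selector = tf.constant([[0.9, 1.1], [0.2, 0.3]])    # e.g. online-network Q-values

loss, extra = double_qlearning(
    q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector)

with tf.Session() as sess:
  print(sess.run([loss, extra.td_error, extra.best_action]))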
Example 3
def persistent_qlearning(q_tm1,
                         a_tm1,
                         r_t,
                         pcont_t,
                         q_t,
                         action_gap_scale=0.5,
                         name="PersistentQLearning"):
    """Implements the persistent Q-learning loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  `r_t + pcont_t * [(1-action_gap_scale) max q_t + action_gap_scale qa_t]`

  See "Increasing the Action Gap: New Operators for Reinforcement Learning"
  by Bellemare, Ostrovski, Guez et al. (https://arxiv.org/abs/1512.04860).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape [B x num_actions].
    a_tm1: Tensor holding action indices, shape [B].
    r_t: Tensor holding rewards, shape [B].
    pcont_t: Tensor holding pcontinue values, shape [B].
    q_t: Tensor holding Q-values for second timestep in a batch of
      transitions, shape [B x num_actions].
      These values are used for estimating the value of the best action. In
      DQN they come from the target network.
    action_gap_scale: coefficient in [0, 1] for scaling the action gap term.
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape [B].
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B].
        * `td_error`: batch of temporal difference errors, shape [B].
  """
    # Rank and compatibility checks.
    base_ops.wrap_rank_shape_assert([[q_tm1, q_t], [a_tm1, r_t, pcont_t]],
                                    [2, 1], name)
    base_ops.assert_arg_bounded(action_gap_scale, 0, 1, name,
                                "action_gap_scale")

    # persistent Q-learning op.
    with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

        # Build target and select head to update.
        with tf.name_scope("target"):
            max_q_t = tf.reduce_max(q_t, axis=1)
            qa_t = indexing_ops.batched_index(q_t, a_tm1)
            corrected_q_t = (
                1 - action_gap_scale) * max_q_t + action_gap_scale * qa_t
            target = tf.stop_gradient(r_t + pcont_t * corrected_q_t)
        qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

        # Temporal difference error and loss.
        # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
        td_error = target - qa_tm1
        loss = 0.5 * tf.square(td_error)
        return base_ops.LossOutput(loss, QExtra(target, td_error))
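
An illustrative sketch (added, not from the source) showing how action_gap_scale is passed; the values are toy placeholders and TensorFlow 1.x graph mode is assumed.

import tensorflow as tf  # TensorFlow 1.x

# Toy values for illustration only.
q_tm1 = tf.constant([[1.0, 2.0, 0.5], [0.0, 1.0, 2.0]])   # [B=2, num_actions=3]
a_tm1 = tf.constant([1, 2], dtype=tf.int32)               # [B]
r_t = tf.constant([0.5, 1.0])                             # [B]
pcont_t = tf.constant([0.9, 0.0])                         # [B]
q_t = tf.constant([[1.5, 0.5, 0.0], [0.2, 0.8, 0.3]])     # [B, num_actions]

# action_gap_scale interpolates between max_a q_t (at 0.0) and q_t[a_tm1] (at 1.0).
loss, extra = persistent_qlearning(
    q_tm1, a_tm1, r_t, pcont_t, q_t, action_gap_scale=0.3)

with tf.Session() as sess:
  print(sess.run([loss, extra.td_error]))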
Example 4
def sarsa_lambda(q_tm1,
                 a_tm1,
                 r_t,
                 pcont_t,
                 q_t,
                 a_t,
                 lambda_,
                 name="SarsaLambda"):
  """Implements SARSA(lambda) loss as a TensorFlow op.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node77.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`
    r_t: Tensor holding a sequence of rewards, shape `[T, B]`
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`
    q_t: `Tensor` holding a sequence of Q-values for second timestep;
      shape `[T, B, num_actions]`.
    a_t: `Tensor` holding a sequence of action indices for second timestep;
      shape `[T, B]`
    lambda_: a scalar specifying the ratio of mixing between bootstrapped and
      MC returns.
    name: a name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t], [a_tm1, r_t, pcont_t, a_t]], [3, 2], name)

  # SARSALambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, a_t]):

    # Select head to update and build target.
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
    qa_t = indexing_ops.batched_index(q_t, a_t)
    target = sequence_ops.multistep_forward_view(
        r_t, pcont_t, qa_t, lambda_, back_prop=False)
    target = tf.stop_gradient(target)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
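
An added sketch of the sequence-shaped inputs this op expects; the random [T, B, ...] tensors are placeholders (not from the source), and TensorFlow 1.x graph mode is assumed.

import tensorflow as tf  # TensorFlow 1.x

T, B, num_actions = 3, 2, 4                        # sequence length, batch, actions
q_tm1 = tf.random_uniform([T, B, num_actions])     # placeholder Q-values
q_t = tf.random_uniform([T, B, num_actions])
a_tm1 = tf.random_uniform([T, B], maxval=num_actions, dtype=tf.int32)
a_t = tf.random_uniform([T, B], maxval=num_actions, dtype=tf.int32)
r_t = tf.ones([T, B])
pcont_t = 0.99 * tf.ones([T, B])

loss, extra = sarsa_lambda(q_tm1, a_tm1, r_t, pcont_t, q_t, a_t, lambda_=0.9)

with tf.Session() as sess:
  print(sess.run(loss).shape)                      # (T, B)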
Example 5
def persistent_qlearning(
    q_tm1, a_tm1, r_t, pcont_t, q_t, action_gap_scale=0.5,
    name="PersistentQLearning"):
  """Implements the persistent Q-learning loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  `r_t + pcont_t * [(1-action_gap_scale) max q_t + action_gap_scale qa_t]`

  See "Increasing the Action Gap: New Operators for Reinforcement Learning"
  by Bellemare, Ostrovski, Guez et al. (https://arxiv.org/abs/1512.04860).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape `[B x num_actions]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t: Tensor holding Q-values for second timestep in a batch of
      transitions, shape `[B x num_actions]`.
      These values are used for estimating the value of the best action. In
      DQN they come from the target network.
    action_gap_scale: coefficient in [0, 1] for scaling the action gap term.
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t], [a_tm1, r_t, pcont_t]], [2, 1], name)
  base_ops.assert_arg_bounded(action_gap_scale, 0, 1, name, "action_gap_scale")

  # persistent Q-learning op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      max_q_t = tf.reduce_max(q_t, axis=1)
      qa_t = indexing_ops.batched_index(q_t, a_tm1)
      corrected_q_t = (1 - action_gap_scale) * max_q_t + action_gap_scale * qa_t
      target = tf.stop_gradient(r_t + pcont_t * corrected_q_t)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
Example 6
def double_qlearning(
    q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector,
    name="DoubleQLearning"):
  """Implements the double Q-learning loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  the target `r_t + pcont_t * q_t_value[argmax q_t_selector]`.

  See "Double Q-learning" by van Hasselt.
  (https://papers.nips.cc/paper/3964-double-q-learning.pdf).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape `[B x num_actions]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t_value: Tensor of Q-values for second timestep in a batch of transitions,
      used to estimate the value of the best action, shape `[B x num_actions]`.
    q_t_selector: Tensor of Q-values for second timestep in a batch of
      transitions used to estimate the best action, shape `[B x num_actions]`.
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`
        * `td_error`: batch of temporal difference errors, shape `[B]`
        * `best_action`: batch of greedy actions wrt `q_t_selector`, shape `[B]`
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t_value, q_t_selector], [a_tm1, r_t, pcont_t]], [2, 1], name)

  # double Q-learning op.
  with tf.name_scope(
      name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector]):

    # Build target and select head to update.
    best_action = tf.argmax(q_t_selector, 1, output_type=tf.int32)
    double_q_bootstrapped = indexing_ops.batched_index(q_t_value, best_action)
    target = tf.stop_gradient(r_t + pcont_t * double_q_bootstrapped)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(
        loss, DoubleQExtra(target, td_error, best_action))
Example 7
def qv_learning(q_tm1, a_tm1, r_t, pcont_t, v_t, name="QVLearning"):
  """Implements the QV loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  the target `r_t + pcont_t * v_t`, where `v_t` is separately learned through
  temporal difference learning (c.f. `value_ops.td_learning`).

  See "Two Novel On-policy Reinforcement Learning Algorithms based on
  TD(lambda)-methods" by Wiering and van Hasselt
  (https://ieeexplore.ieee.org/abstract/document/4220845.)

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape `[B x num_actions]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    v_t: Tensor holding state-values for second timestep in a batch of
      transitions, shape `[B]`.
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1], [a_tm1, r_t, pcont_t, v_t]], [2, 1], name)

  # QV op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, v_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      target = tf.stop_gradient(r_t + pcont_t * v_t)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
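
An added illustration with toy inputs (not from the source): v_t stands in for the output of the separately learned state-value network; TensorFlow 1.x graph mode is assumed.

import tensorflow as tf  # TensorFlow 1.x

q_tm1 = tf.constant([[1.0, 2.0, 0.5], [0.0, 1.0, 2.0]])   # [B=2, num_actions=3]
a_tm1 = tf.constant([1, 2], dtype=tf.int32)               # [B]
r_t = tf.constant([0.5, 1.0])                             # [B]
pcont_t = tf.constant([0.9, 0.0])                         # [B]
v_t = tf.constant([1.5, 0.2])                             # [B], from the separate V-network

loss, extra = qv_learning(q_tm1, a_tm1, r_t, pcont_t, v_t)

with tf.Session() as sess:
  print(sess.run([loss, extra.td_error]))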
Example 8
def qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t, name="QLearning"):
  """Implements the Q-learning loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  the target `r_t + pcont_t * max q_t`.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node65.html).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape `[B x num_actions]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t: Tensor holding Q-values for second timestep in a batch of
      transitions, shape `[B x num_actions]`.
    name: name to prefix ops created within this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t], [a_tm1, r_t, pcont_t]], [2, 1], name)

  # Q-learning op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      target = tf.stop_gradient(
          r_t + pcont_t * tf.reduce_max(q_t, axis=1))
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
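
A minimal worked example, added for illustration with toy values (not from the source), assuming TensorFlow 1.x graph mode and that qlearning and its helpers (base_ops, indexing_ops) are importable from this module.

import tensorflow as tf  # TensorFlow 1.x

q_tm1 = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])    # [B=2, num_actions=3]
a_tm1 = tf.constant([0, 2], dtype=tf.int32)                # [B]
r_t = tf.constant([1.0, -1.0])                             # [B]
pcont_t = tf.constant([0.99, 0.0])                         # [B]; 0 marks episode end
q_t = tf.constant([[1.5, 0.5, 0.0], [-0.5, 1.0, 2.0]])     # [B, num_actions]

loss, extra = qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t)

with tf.Session() as sess:
  # The second transition is terminal (pcont_t = 0), so its target is just r_t = -1.
  print(sess.run([extra.target, extra.td_error, loss]))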
Example 9
def qv_max(v_tm1, r_t, pcont_t, q_t, name="QVMAX"):
    """Implements the QVMAX learning loss as a TensorFlow op.

  The QVMAX loss is `0.5` times the squared difference between `v_tm1` and
  the target `r_t + pcont_t * max q_t`, where `q_t` is separately learned
  through QV learning (c.f. `action_value_ops.qv_learning`).

  See "The QV Family Compared to Other Reinforcement Learning Algorithms" by
  Wiering and van Hasselt (2009).
  (http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.713.1931)

  Args:
    v_tm1: Tensor holding values at previous timestep, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t: Tensor of action values at current timestep, shape `[B, num_actions]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `v_tm1`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
    # Rank and compatibility checks.
    base_ops.wrap_rank_shape_assert([[v_tm1, r_t, pcont_t], [q_t]], [1, 2],
                                    name)

    # The QVMAX op.
    with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, q_t]):

        # Build target.
        target = tf.stop_gradient(r_t + pcont_t * tf.reduce_max(q_t, axis=1))

        # Temporal difference error and loss.
        # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
        td_error = target - v_tm1
        loss = 0.5 * tf.square(td_error)
        return base_ops.LossOutput(loss, TDExtra(target, td_error))
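
An added toy example (not from the source): q_t stands in for the action values learned separately through QV learning; TensorFlow 1.x graph mode is assumed.

import tensorflow as tf  # TensorFlow 1.x

v_tm1 = tf.constant([1.0, 0.0])                   # [B=2]
r_t = tf.constant([0.5, 1.0])                     # [B]
pcont_t = tf.constant([0.9, 0.0])                 # [B]
q_t = tf.constant([[1.0, 2.0], [0.5, 0.1]])       # [B, num_actions], from QV learning

loss, extra = qv_max(v_tm1, r_t, pcont_t, q_t)

with tf.Session() as sess:
  # target = [0.5 + 0.9 * 2.0, 1.0] = [2.3, 1.0]
  print(sess.run([extra.target, extra.td_error, loss]))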
Example 10
def qlambda(
    q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_, name="GeneralizedQLambda"):
  """Implements Peng's and Watkins' Q(lambda) loss as a TensorFlow op.

  This function is general enough to implement both Peng's and Watkins'
  Q-lambda algorithms.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node78.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`
    r_t: Tensor holding a sequence of rewards, shape `[T, B]`
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`
    q_t: `Tensor` holding a sequence of Q-values for second timestep;
      shape `[T, B, num_actions]`. In a target network setting,
      this quantity is often supplied by the target network.
    lambda_: a scalar or `Tensor` of shape `[T, B]`
      specifying the ratio of mixing between bootstrapped and MC returns;
      if lambda_ is the same for all time steps then the function implements
      Peng's Q-learning algorithm; if lambda_ = 0 at every sub-optimal action
      and a constant otherwise, then the function implements Watkins'
      Q-learning algorithm. Generally lambda_ can be a Tensor of any values
      in the range [0, 1] supplied by the user.
    name: a name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert([[q_tm1, q_t]], [3], name)
  if isinstance(lambda_, tf.Tensor) and lambda_.get_shape().ndims > 0:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t, lambda_]], [2], name)
  else:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t]], [2], name)

  # QLambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      state_values = tf.reduce_max(q_t, axis=2)
      target = sequence_ops.multistep_forward_view(
          r_t, pcont_t, state_values, lambda_, back_prop=False)
      target = tf.stop_gradient(target)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
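
An added sketch with placeholder sequence tensors (not from the source), assuming TensorFlow 1.x graph mode; the inline comment notes how the choice of lambda_ selects Peng's versus Watkins' variant, as described in the docstring above.

import tensorflow as tf  # TensorFlow 1.x

T, B, num_actions = 4, 2, 3
q_tm1 = tf.random_uniform([T, B, num_actions])    # placeholder Q-values
q_t = tf.random_uniform([T, B, num_actions])
a_tm1 = tf.random_uniform([T, B], maxval=num_actions, dtype=tf.int32)
r_t = tf.ones([T, B])
pcont_t = 0.99 * tf.ones([T, B])

# A constant lambda_ gives Peng's Q(lambda); passing a [T, B] tensor that is
# zero wherever the behaviour action was sub-optimal gives Watkins' Q(lambda).
loss, extra = qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=0.8)

with tf.Session() as sess:
  print(sess.run(loss).shape)   # (T, B)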
Example 11
def td_learning(v_tm1, r_t, pcont_t, v_t, name="TDLearning"):
  """Implements the TD(0)-learning loss as a TensorFlow op.

  The TD loss is `0.5` times the squared difference between `v_tm1` and
  the target `r_t + pcont_t * v_t`.

  See "Learning to Predict by the Methods of Temporal Differences" by Sutton.
  (https://link.springer.com/article/10.1023/A:1022633531479).

  Args:
    v_tm1: Tensor holding values at previous timestep, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    v_t: Tensor holding values at current timestep, shape `[B]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `v_tm1`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert([[v_tm1, v_t, r_t, pcont_t]], [1], name)

  # TD(0)-learning op.
  with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, v_t]):

    # Build target.
    target = tf.stop_gradient(r_t + pcont_t * v_t)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - v_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, TDExtra(target, td_error))
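
A small worked example added for illustration (toy numbers, not from the source), assuming TensorFlow 1.x graph mode.

import tensorflow as tf  # TensorFlow 1.x

v_tm1 = tf.constant([1.0])     # [B=1]
r_t = tf.constant([1.0])
pcont_t = tf.constant([0.5])
v_t = tf.constant([2.0])

loss, extra = td_learning(v_tm1, r_t, pcont_t, v_t)

with tf.Session() as sess:
  # target = 1.0 + 0.5 * 2.0 = 2.0, td_error = 1.0, loss = 0.5
  print(sess.run([extra.target, extra.td_error, loss]))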
Example 12
def td_learning(v_tm1, r_t, pcont_t, v_t, name="TDLearning"):
    """Implements the TD(0)-learning loss as a TensorFlow op.

  The TD loss is `0.5` times the squared difference between `v_tm1` and
  the target `r_t + pcont_t * v_t`.

  See "Learning to Predict by the Methods of Temporal Differences" by Sutton.
  (https://link.springer.com/article/10.1023/A:1022633531479).

  Args:
    v_tm1: Tensor holding values at previous timestep, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    v_t: Tensor holding values at current timestep, shape `[B]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `v_tm1`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
    # Rank and compatibility checks.
    base_ops.wrap_rank_shape_assert([[v_tm1, v_t, r_t, pcont_t]], [1], name)

    # TD(0)-learning op.
    with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, v_t]):

        # Build target.
        target = tf.stop_gradient(r_t + pcont_t * v_t)

        # Temporal difference error and loss.
        # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
        td_error = target - v_tm1
        loss = 0.5 * tf.square(td_error)
        return base_ops.LossOutput(loss, TDExtra(target, td_error))
Example 13
def qlambda(
    q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_, name="GeneralizedQLambda"):
  """Implements Peng's and Watkins' Q(lambda) loss as a TensorFlow op.

  This function is general enough to implement both Peng's and Watkins'
  Q-lambda algorithms.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node78.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`
    r_t: Tensor holding a sequence of rewards, shape `[T, B]`
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`
    q_t: `Tensor` holding a sequence of Q-values for second timestep;
      shape `[T, B, num_actions]`. In a target network setting,
      this quantity is often supplied by the target network.
    lambda_: a scalar or `Tensor` of shape `[T, B]`
      specifying the ratio of mixing between bootstrapped and MC returns;
      if lambda_ is the same for all time steps then the function implements
      Peng's Q-learning algorithm; if lambda_ = 0 at every sub-optimal action
      and a constant otherwise, then the function implements Watkins'
      Q-learning algorithm. Generally lambda_ can be a Tensor of any values
      in the range [0, 1] supplied by the user.
    name: a name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert([[q_tm1, q_t]], [3], name)
  if isinstance(
      lambda_, tf.Tensor
  ) and lambda_.get_shape().ndims is not None and lambda_.get_shape().ndims > 0:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t, lambda_]], [2], name)
  else:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t]], [2], name)

  # QLambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      state_values = tf.reduce_max(q_t, axis=2)
      target = sequence_ops.multistep_forward_view(
          r_t, pcont_t, state_values, lambda_, back_prop=False)
      target = tf.stop_gradient(target)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
Example 14
def categorical_dist_qlearning(atoms_tm1,
                               logits_q_tm1,
                               a_tm1,
                               r_t,
                               pcont_t,
                               atoms_t,
                               logits_q_t,
                               name="CategoricalDistQLearning"):
    """Implements Distributional Q-learning as TensorFlow ops.

  The function assumes categorical value distributions parameterized by logits.

  See "A Distributional Perspective on Reinforcement Learning" by Bellemare,
  Dabney and Munos. (https://arxiv.org/abs/1707.06887).

  Args:
    atoms_tm1: 1-D tensor containing atom values for first timestep,
      shape `[num_atoms]`.
    logits_q_tm1: Tensor holding logits for first timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    atoms_t: 1-D tensor containing atom values for second timestep,
      shape `[num_atoms]`.
    logits_q_t: Tensor holding logits for second timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: a tensor containing the values that `q_tm1` at actions
        `a_tm1` are regressed towards, shape `[B, num_atoms]`.

  Raises:
    ValueError: If the tensors do not have the correct rank or compatibility.
  """
    # Rank and compatibility checks.
    assertion_lists = [[logits_q_tm1, logits_q_t], [a_tm1, r_t, pcont_t],
                       [atoms_tm1, atoms_t]]
    base_ops.wrap_rank_shape_assert(assertion_lists, [3, 1, 1], name)

    # Categorical distributional Q-learning op.
    with tf.name_scope(name,
                       values=[
                           atoms_tm1, logits_q_tm1, a_tm1, r_t, pcont_t,
                           atoms_t, logits_q_t
                       ]):

        with tf.name_scope("target"):
            # Scale and shift time-t distribution atoms by discount and reward.
            target_z = r_t[:, None] + pcont_t[:, None] * atoms_t[None, :]

            # Convert logits to distribution, then find greedy action in state s_t.
            q_t_probs = tf.nn.softmax(logits_q_t)
            q_t_mean = tf.reduce_sum(q_t_probs * atoms_t, 2)
            pi_t = tf.argmax(q_t_mean, 1, output_type=tf.int32)

            # Compute distribution for greedy action.
            p_target_z = _slice_with_actions(q_t_probs, pi_t)

            # Project using the Cramer distance
            target = tf.stop_gradient(
                _l2_project(target_z, p_target_z, atoms_tm1))

        logit_qa_tm1 = _slice_with_actions(logits_q_tm1, a_tm1)

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logit_qa_tm1,
                                                       labels=target)

        return base_ops.LossOutput(loss, Extra(target))
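
An added sketch of the expected shapes with placeholder logits (not from the source), assuming TensorFlow 1.x graph mode and a shared atom support for both timesteps.

import tensorflow as tf  # TensorFlow 1.x

B, num_actions, num_atoms = 2, 3, 5
atoms = tf.linspace(-1.0, 1.0, num_atoms)                     # [num_atoms]
logits_q_tm1 = tf.random_uniform([B, num_actions, num_atoms]) # placeholder logits
logits_q_t = tf.random_uniform([B, num_actions, num_atoms])
a_tm1 = tf.constant([0, 2], dtype=tf.int32)                   # [B]
r_t = tf.constant([0.5, -0.5])                                # [B]
pcont_t = tf.constant([0.9, 0.0])                             # [B]

loss, extra = categorical_dist_qlearning(
    atoms, logits_q_tm1, a_tm1, r_t, pcont_t, atoms, logits_q_t)

with tf.Session() as sess:
  print(sess.run(extra.target).shape)   # (B, num_atoms)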
Example 15
def categorical_dist_double_qlearning(atoms_tm1,
                                      logits_q_tm1,
                                      a_tm1,
                                      r_t,
                                      pcont_t,
                                      atoms_t,
                                      logits_q_t,
                                      q_t_selector,
                                      name="CategoricalDistDoubleQLearning"):
  """Implements Distributional Double Q-learning as TensorFlow ops.

  The function assumes categorical value distributions parameterized by logits,
  and combines distributional RL with double Q-learning.

  See "Rainbow: Combining Improvements in Deep Reinforcement Learning" by
  Hessel, Modayil, van Hasselt, Schaul et al.
  (https://arxiv.org/abs/1710.02298).

  Args:
    atoms_tm1: 1-D tensor containing atom values for first timestep,
      shape `[num_atoms]`.
    logits_q_tm1: Tensor holding logits for first timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    atoms_t: 1-D tensor containing atom values for second timestep,
      shape `[num_atoms]`.
    logits_q_t: Tensor holding logits for second timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    q_t_selector: Tensor holding another set of Q-values for second timestep
      in a batch of transitions, shape `[B, num_actions]`.
      These values are used for estimating the best action. In Double DQN they
      come from the online network.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: Tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: Tensor containing the values that `q_tm1` at actions
        `a_tm1` are regressed towards, shape `[B, num_atoms]`.

  Raises:
    ValueError: If the tensors do not have the correct rank or compatibility.
  """
  # Rank and compatibility checks.
  assertion_lists = [[logits_q_tm1, logits_q_t], [a_tm1, r_t, pcont_t],
                     [atoms_tm1, atoms_t], [q_t_selector]]
  base_ops.wrap_rank_shape_assert(assertion_lists, [3, 1, 1, 2], name)

  # Categorical distributional double Q-learning op.
  with tf.name_scope(
      name,
      values=[
          atoms_tm1, logits_q_tm1, a_tm1, r_t, pcont_t, atoms_t, logits_q_t,
          q_t_selector
      ]):

    with tf.name_scope("target"):
      # Scale and shift time-t distribution atoms by discount and reward.
      target_z = r_t[:, None] + pcont_t[:, None] * atoms_t[None, :]

      # Convert logits to distribution, then find greedy policy action in
      # state s_t.
      q_t_probs = tf.nn.softmax(logits_q_t)
      pi_t = tf.argmax(q_t_selector, 1, output_type=tf.int32)
      # Compute distribution for greedy action.
      p_target_z = _slice_with_actions(q_t_probs, pi_t)

      # Project using the Cramer distance
      target = tf.stop_gradient(_l2_project(target_z, p_target_z, atoms_tm1))

    logit_qa_tm1 = _slice_with_actions(logits_q_tm1, a_tm1)

    loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=logit_qa_tm1, labels=target)

    return base_ops.LossOutput(loss, Extra(target))
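
An added sketch with placeholder inputs (not from the source): logits_q_t stands in for the target network and q_t_selector for the online network, as in Double DQN; TensorFlow 1.x graph mode is assumed.

import tensorflow as tf  # TensorFlow 1.x

B, num_actions, num_atoms = 2, 3, 5
atoms = tf.linspace(-1.0, 1.0, num_atoms)                     # shared support
logits_q_tm1 = tf.random_uniform([B, num_actions, num_atoms])
logits_q_t = tf.random_uniform([B, num_actions, num_atoms])   # e.g. target network
q_t_selector = tf.random_uniform([B, num_actions])            # e.g. online network
a_tm1 = tf.constant([1, 0], dtype=tf.int32)                   # [B]
r_t = tf.constant([0.5, -0.5])                                # [B]
pcont_t = tf.constant([0.9, 0.0])                             # [B]

loss, extra = categorical_dist_double_qlearning(
    atoms, logits_q_tm1, a_tm1, r_t, pcont_t, atoms, logits_q_t, q_t_selector)

with tf.Session() as sess:
  print(sess.run(loss).shape)   # (B,)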
Example 16
def sarse(
    q_tm1, a_tm1, r_t, pcont_t, q_t, probs_a_t, debug=False, name="Sarse"):
  """Implements the SARSE (Expected SARSA) loss as a TensorFlow op.

  The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and
  the target `r_t + pcont_t * (sum_a probs_a_t[a] * q_t[a])`.

  See "A Theoretical and Empirical Analysis of Expected Sarsa" by Seijen,
  van Hasselt, Whiteson et al.
  (http://www.cs.ox.ac.uk/people/shimon.whiteson/pubs/vanseijenadprl09.pdf).

  Args:
    q_tm1: Tensor holding Q-values for first timestep in a batch of
      transitions, shape `[B x num_actions]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t: Tensor holding Q-values for second timestep in a batch of
      transitions, shape `[B x num_actions]`.
    probs_a_t: Tensor holding action probabilities for second timestep,
      shape `[B x num_actions]`.
    debug: Boolean flag, when set to True adds ops to check whether probs_a_t
      is a batch of (approximately) valid probability distributions.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t, probs_a_t], [a_tm1, r_t, pcont_t]], [2, 1], name)

  # SARSE (Expected SARSA) op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, probs_a_t]):

    # Debug ops.
    deps = []
    if debug:
      cumulative_prob = tf.reduce_sum(probs_a_t, axis=1)
      almost_prob = tf.less(tf.abs(tf.subtract(cumulative_prob, 1.0)), 1e-6)
      deps.append(tf.Assert(
          tf.reduce_all(almost_prob),
          ["probs_a_t tensor does not sum to 1", probs_a_t]))

    # With dependency on possible debug ops.
    with tf.control_dependencies(deps):

      # Select head to update and build target.
      qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
      target = tf.stop_gradient(
          r_t + pcont_t * tf.reduce_sum(tf.multiply(q_t, probs_a_t), axis=1))

      # Temporal difference error and loss.
      # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
      td_error = target - qa_tm1
      loss = 0.5 * tf.square(td_error)
      return base_ops.LossOutput(loss, QExtra(target, td_error))
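
An added toy example (not from the source) showing a valid probs_a_t and the debug assertion; TensorFlow 1.x graph mode is assumed.

import tensorflow as tf  # TensorFlow 1.x

q_tm1 = tf.constant([[1.0, 2.0], [0.5, 0.0]])        # [B=2, num_actions=2]
a_tm1 = tf.constant([0, 1], dtype=tf.int32)          # [B]
r_t = tf.constant([1.0, 0.0])                        # [B]
pcont_t = tf.constant([0.9, 0.0])                    # [B]
q_t = tf.constant([[1.5, 0.5], [0.2, 0.8]])          # [B, num_actions]
probs_a_t = tf.constant([[0.25, 0.75], [0.5, 0.5]])  # each row sums to 1

# debug=True adds an assertion that each row of probs_a_t sums to ~1.
loss, extra = sarse(q_tm1, a_tm1, r_t, pcont_t, q_t, probs_a_t, debug=True)

with tf.Session() as sess:
  print(sess.run([loss, extra.td_error]))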
Example 17
def categorical_dist_double_qlearning(atoms_tm1,
                                      logits_q_tm1,
                                      a_tm1,
                                      r_t,
                                      pcont_t,
                                      atoms_t,
                                      logits_q_t,
                                      q_t_selector,
                                      name="CategoricalDistDoubleQLearning"):
    """Implements Distributional Double Q-learning as TensorFlow ops.

  The function assumes categorical value distributions parameterized by logits,
  and combines distributional RL with double Q-learning.

  See "Rainbow: Combining Improvements in Deep Reinforcement Learning" by
  Hessel, Modayil, van Hasselt, Schaul et al.
  (https://arxiv.org/abs/1710.02298).

  Args:
    atoms_tm1: 1-D tensor containing atom values for first timestep,
      shape `[num_atoms]`.
    logits_q_tm1: Tensor holding logits for first timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    a_tm1: Tensor holding action indices, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    atoms_t: 1-D tensor containing atom values for second timestep,
      shape `[num_atoms]`.
    logits_q_t: Tensor holding logits for second timestep in a batch of
      transitions, shape `[B, num_actions, num_atoms]`.
    q_t_selector: Tensor holding another set of Q-values for second timestep
      in a batch of transitions, shape `[B, num_actions]`.
      These values are used for estimating the best action. In Double DQN they
      come from the online network.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: Tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: Tensor containing the values that `q_tm1` at actions
        `a_tm1` are regressed towards, shape `[B, num_atoms]`.

  Raises:
    ValueError: If the tensors do not have the correct rank or compatibility.
  """
    # Rank and compatibility checks.
    assertion_lists = [[logits_q_tm1, logits_q_t], [a_tm1, r_t, pcont_t],
                       [atoms_tm1, atoms_t], [q_t_selector]]
    base_ops.wrap_rank_shape_assert(assertion_lists, [3, 1, 1, 2], name)

    # Categorical distributional double Q-learning op.
    with tf.name_scope(name,
                       values=[
                           atoms_tm1, logits_q_tm1, a_tm1, r_t, pcont_t,
                           atoms_t, logits_q_t, q_t_selector
                       ]):

        with tf.name_scope("target"):
            # Scale and shift time-t distribution atoms by discount and reward.
            target_z = r_t[:, None] + pcont_t[:, None] * atoms_t[None, :]

            # Convert logits to distribution, then find greedy policy action in
            # state s_t.
            q_t_probs = tf.nn.softmax(logits_q_t)
            pi_t = tf.argmax(q_t_selector, 1, output_type=tf.int32)
            # Compute distribution for greedy action.
            p_target_z = _slice_with_actions(q_t_probs, pi_t)

            # Project using the Cramer distance
            target = tf.stop_gradient(
                _l2_project(target_z, p_target_z, atoms_tm1))

        logit_qa_tm1 = _slice_with_actions(logits_q_tm1, a_tm1)

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logit_qa_tm1,
                                                       labels=target)

        return base_ops.LossOutput(loss, Extra(target))