def dpg(q_max, a_max, dqda_clipping=None, clip_norm=False, name="DpgLearning"): """Implements the Deterministic Policy Gradient (DPG) loss as a TensorFlow Op. This op implements the loss for the `actor`; the `critic` can instead be updated by minimizing the `value_ops.td_learning` loss. See "Deterministic Policy Gradient Algorithms" by Silver, Lever, Heess, Degris, Wierstra, Riedmiller (http://proceedings.mlr.press/v32/silver14.pdf). Args: q_max: Tensor holding Q-values generated by the Q network for the (state, a_max) input pair, shape `[B]`. a_max: Tensor holding the optimal action, shape `[B, action_dimension]`. dqda_clipping: `int` or `float`, clips the gradient dqda element-wise between `[-dqda_clipping, dqda_clipping]`. clip_norm: Whether to perform dqda clipping on the vector norm of the last dimension, or component wise (default). name: name to prefix ops created within this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * `q_max`: Tensor holding the optimal Q values, `[B]`. * `a_max`: Tensor holding the optimal action, `[B, action_dimension]`. * `dqda`: Tensor holding the derivative dq/da, `[B, action_dimension]`. Raises: ValueError: If `q_max` doesn't depend on `a_max` or if `dqda_clipping <= 0`. """ # DPG op. with tf.name_scope(name, values=[q_max, a_max]): # Calculate the gradient dq/da. dqda = tf.gradients([q_max], [a_max])[0] # Check that `q_max` depends on `a_max`. if dqda is None: raise ValueError("q_max needs to be a function of a_max") # Clipping the gradient dq/da. if dqda_clipping is not None: if dqda_clipping <= 0: raise ValueError("dqda_clipping should be bigger than 0, {} found" .format(dqda_clipping)) if clip_norm: dqda = tf.clip_by_norm(dqda, dqda_clipping, axes=-1) else: dqda = tf.clip_by_value(dqda, -1. * dqda_clipping, dqda_clipping) # target_a ensures the correct gradient is calculated during backprop. target_a = dqda + a_max # Stop the gradient from flowing through the Q network during backprop. target_a = tf.stop_gradient(target_a) # Gradients only flow through the actor network. loss = 0.5 * tf.reduce_sum(tf.square(target_a - a_max), axis=-1) return base_ops.LossOutput( loss, DPGExtra(q_max=q_max, a_max=a_max, dqda=dqda))
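# Illustrative usage sketch (not part of the library): a DDPG-style actor update
# built on `dpg`. `actor_network`, `critic_network` and `actor_variables` are
# assumed user-supplied; all names here are hypothetical.
def _example_dpg_actor_update(actor_network, critic_network, actor_variables,
                              s_t):
  a_max = actor_network(s_t)          # [B, action_dimension].
  q_max = critic_network(s_t, a_max)  # [B], must be a function of `a_max`.
  loss, dpg_extra = dpg(q_max, a_max, dqda_clipping=1.0)
  # Only the actor's variables should be updated by this loss.
  train_op = tf.train.AdamOptimizer(1e-4).minimize(
      tf.reduce_mean(loss), var_list=actor_variables)
  return train_op, dpg_extra.dqda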
def double_qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector, name="DoubleQLearning"): """Implements the double Q-learning loss as a TensorFlow op. The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and the target `r_t + pcont_t * q_t_value[argmax q_t_selector]`. See "Double Q-learning" by van Hasselt. (https://papers.nips.cc/paper/3964-double-q-learning.pdf). Args: q_tm1: Tensor holding Q-values for first timestep in a batch of transitions, shape [B x num_actions]. a_tm1: Tensor holding action indices, shape [B]. r_t: Tensor holding rewards, shape [B]. pcont_t: Tensor holding pcontinue values, shape [B]. q_t_value: Tensor of Q-values for second timestep in a batch of transitions, used to estimate the value of the best action, shape [B x num_actions]. q_t_selector: Tensor of Q-values for second timestep in a batch of transitions used to estimate the best action, shape [B x num_actions]. name: name to prefix ops created within this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape [B]. * `extra`: a namedtuple with fields: * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B] * `td_error`: batch of temporal difference errors, shape [B] * `best_action`: batch of greedy actions wrt `q_t_selector`, shape [B] """ # Rank and compatibility checks. base_ops.wrap_rank_shape_assert( [[q_tm1, q_t_value, q_t_selector], [a_tm1, r_t, pcont_t]], [2, 1], name) # double Q-learning op. with tf.name_scope( name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector]): # Build target and select head to update. best_action = tf.argmax(q_t_selector, 1, output_type=tf.int32) double_q_bootstrapped = indexing_ops.batched_index( q_t_value, best_action) target = tf.stop_gradient(r_t + pcont_t * double_q_bootstrapped) qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1) # Temporal difference error and loss. # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error. td_error = target - qa_tm1 loss = 0.5 * tf.square(td_error) return base_ops.LossOutput(loss, DoubleQExtra(target, td_error, best_action))
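# Illustrative usage sketch (not part of the library): the online network
# selects the greedy action and the target network evaluates it, as in Double
# DQN. `online_net`/`target_net` are assumed callables returning
# [B, num_actions] Q-values.
def _example_double_qlearning(online_net, target_net, s_tm1, a_tm1, r_t,
                              pcont_t, s_t):
  loss, extra = double_qlearning(online_net(s_tm1), a_tm1, r_t, pcont_t,
                                 q_t_value=target_net(s_t),
                                 q_t_selector=online_net(s_t))
  return tf.reduce_mean(loss), extra.best_action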
def persistent_qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t, action_gap_scale=0.5, name="PersistentQLearning"): """Implements the persistent Q-learning loss as a TensorFlow op. The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and `r_t + pcont_t * [(1-action_gap_scale) max q_t + action_gap_scale qa_t]` See "Increasing the Action Gap: New Operators for Reinforcement Learning" by Bellemare, Ostrovski, Guez et al. (https://arxiv.org/abs/1512.04860). Args: q_tm1: Tensor holding Q-values for first timestep in a batch of transitions, shape [B x num_actions]. a_tm1: Tensor holding action indices, shape [B]. r_t: Tensor holding rewards, shape [B]. pcont_t: Tensor holding pcontinue values, shape [B]. q_t: Tensor holding Q-values for second timestep in a batch of transitions, shape [B x num_actions]. These values are used for estimating the value of the best action. In DQN they come from the target network. action_gap_scale: coefficient in [0, 1] for scaling the action gap term. name: name to prefix ops created within this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape [B]. * `extra`: a namedtuple with fields: * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B]. * `td_error`: batch of temporal difference errors, shape [B]. """ # Rank and compatibility checks. base_ops.wrap_rank_shape_assert([[q_tm1, q_t], [a_tm1, r_t, pcont_t]], [2, 1], name) base_ops.assert_arg_bounded(action_gap_scale, 0, 1, name, "action_gap_scale") # persistent Q-learning op. with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]): # Build target and select head to update. with tf.name_scope("target"): max_q_t = tf.reduce_max(q_t, axis=1) qa_t = indexing_ops.batched_index(q_t, a_tm1) corrected_q_t = ( 1 - action_gap_scale) * max_q_t + action_gap_scale * qa_t target = tf.stop_gradient(r_t + pcont_t * corrected_q_t) qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1) # Temporal difference error and loss. # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error. td_error = target - qa_tm1 loss = 0.5 * tf.square(td_error) return base_ops.LossOutput(loss, QExtra(target, td_error))
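# Illustrative sketch (hypothetical names): persistent Q-learning is wired up
# exactly like `qlearning`, with `action_gap_scale` mixing the max and the
# on-policy bootstrap; 0.5 below is the function's default, not a tuned value.
def _example_persistent_qlearning(online_net, target_net, s_tm1, a_tm1, r_t,
                                  pcont_t, s_t):
  loss, extra = persistent_qlearning(online_net(s_tm1), a_tm1, r_t, pcont_t,
                                     target_net(s_t), action_gap_scale=0.5)
  return tf.reduce_mean(loss), extra.td_error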
def discrete_policy_entropy_loss(policy_logits, normalise=False, name="discrete_policy_entropy_loss"): """Computes the entropy 'loss' for a batch of policy logits. Given a batch of policy logits, calculates the entropy and corrects the sign so that minimizing the resulting loss op is equivalent to increasing entropy in the batch. This loss is optionally normalised to the range `[-1, 0]` by dividing by the log number of actions. This makes it more invariant to the size of the action space. This function accepts a nested array of `policy_logits` in order to allow for multiple discrete actions. In this case, the loss is given by `-sum_i(H(p_i))` where `p_i` are members of the `policy_logits` nest and H is the Shannon entropy. Args: policy_logits: A (possibly nested structure of) (N+1)-D Tensor(s) with shape `[..., A]`, representing the log-probabilities of a set of Categorical distributions, where `...` represents at least one dimension (e.g., batch, sequence), and `A` is the number of discrete actions (which need not be identical across all tensors). Does not need to be centered. normalise: If True, divide the loss by the `sum_i(log(A_i))` where `A_i` is the number of actions for the i'th tensor in the `policy_logits` nest. Default is False. name: Optional, name of this op. Returns: A namedtuple with fields: * `loss`: Entropy 'loss', shape `[B]`. * `extra`: a namedtuple with fields: * `entropy`: Entropy of the policy, shape `[B]`. """ policy_logits = nest.flatten(policy_logits) with tf.name_scope(name, values=policy_logits): entropy = tf.add_n([ tf.reduce_sum(-tf.nn.softmax(scalar_policy_logits) * tf.nn.log_softmax(scalar_policy_logits), axis=-1) for scalar_policy_logits in policy_logits ], name="entropy") # We want a value that we can minimize along with other losses, and where # minimizing means driving the policy towards a uniform distribution over # the actions. We thus scale it by negative one so that it can be simply # added to other losses. scale = tf.constant(-1.0, dtype=tf.float32) if normalise: num_actions = [ tf.to_float(tf.shape(scalar_policy_logits)[-1]) for scalar_policy_logits in policy_logits ] scale /= tf.reduce_sum(tf.log(tf.stack(num_actions))) loss = tf.multiply(scale, entropy, name="entropy_loss") return base_ops.LossOutput(loss, DiscretePolicyEntropyExtra(entropy))
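# Illustrative sketch (assumed names): adding a scaled entropy bonus to some
# other policy loss. With `normalise=True` the per-batch-element loss lies in
# [-1, 0] regardless of the number of actions.
def _example_entropy_bonus(policy_logits, policy_loss, entropy_cost=0.01):
  entropy_loss, extra = discrete_policy_entropy_loss(policy_logits,
                                                     normalise=True)
  total_loss = policy_loss + entropy_cost * entropy_loss
  return total_loss, extra.entropy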
def sarsa_lambda(q_tm1, a_tm1, r_t, pcont_t, q_t, a_t, lambda_, name="SarsaLambda"): """Implements SARSA(lambda) loss as a TensorFlow op. See "Reinforcement Learning: An Introduction" by Sutton and Barto. (http://incompleteideas.net/book/ebook/node77.html). Args: q_tm1: `Tensor` holding a sequence of Q-values starting at the first timestep; shape `[T, B, num_actions]` a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]` r_t: Tensor holding a sequence of rewards, shape `[T, B]` pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]` q_t: `Tensor` holding a sequence of Q-values for second timestep; shape `[T, B, num_actions]`. a_t: `Tensor` holding a sequence of action indices for second timestep; shape `[T, B]` lambda_: a scalar specifying the ratio of mixing between bootstrapped and MC returns. name: a name of the op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[T, B]`. * `extra`: a namedtuple with fields: * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`. * `td_error`: batch of temporal difference errors, shape `[T, B]`. """ # Rank and compatibility checks. base_ops.wrap_rank_shape_assert( [[q_tm1, q_t], [a_tm1, r_t, pcont_t, a_t]], [3, 2], name) # SARSALambda op. with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, a_t]): # Select head to update and build target. qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1) qa_t = indexing_ops.batched_index(q_t, a_t) target = sequence_ops.multistep_forward_view( r_t, pcont_t, qa_t, lambda_, back_prop=False) target = tf.stop_gradient(target) # Temporal difference error and loss. # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error. td_error = target - qa_tm1 loss = 0.5 * tf.square(td_error) return base_ops.LossOutput(loss, QExtra(target, td_error))
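# Illustrative sketch over time-major sequences (shapes [T, B, ...]); the
# `*_seq` names are hypothetical stand-ins for Q-values, actions, rewards and
# pcontinues gathered along a trajectory.
def _example_sarsa_lambda(q_seq_tm1, a_seq_tm1, r_seq, pcont_seq, q_seq_t,
                          a_seq_t):
  loss, extra = sarsa_lambda(q_seq_tm1, a_seq_tm1, r_seq, pcont_seq, q_seq_t,
                             a_seq_t, lambda_=0.9)
  # Sum the per-step losses over time, then average over the batch.
  return tf.reduce_mean(tf.reduce_sum(loss, axis=0)), extra.td_error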
def qv_learning(q_tm1, a_tm1, r_t, pcont_t, v_t, name="QVLearning"): """Implements the QV loss as a TensorFlow op. The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and the target `r_t + pcont_t * v_t`, where `v_t` is separately learned through temporal difference learning (c.f. `value_ops.td_learning`). See "Two Novel On-policy Reinforcement Learning Algorithms based on TD(lambda)-methods" by Wiering and van Hasselt (https://ieeexplore.ieee.org/abstract/document/4220845.) Args: q_tm1: Tensor holding Q-values for first timestep in a batch of transitions, shape `[B x num_actions]`. a_tm1: Tensor holding action indices, shape `[B]`. r_t: Tensor holding rewards, shape `[B]`. pcont_t: Tensor holding pcontinue values, shape `[B]`. v_t: Tensor holding state-values for second timestep in a batch of transitions, shape `[B]`. name: name to prefix ops created within this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`. * `td_error`: batch of temporal difference errors, shape `[B]`. """ # Rank and compatibility checks. base_ops.wrap_rank_shape_assert( [[q_tm1], [a_tm1, r_t, pcont_t, v_t]], [2, 1], name) # QV op. with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, v_t]): # Build target and select head to update. with tf.name_scope("target"): target = tf.stop_gradient(r_t + pcont_t * v_t) qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1) # Temporal difference error and loss. # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error. td_error = target - qa_tm1 loss = 0.5 * tf.square(td_error) return base_ops.LossOutput(loss, QExtra(target, td_error))
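# Illustrative sketch (assumed names): QV-learning regresses Q towards a
# separately learned state-value head; the V head itself would be trained with
# `value_ops.td_learning`.
def _example_qv_learning(q_net, v_net, s_tm1, a_tm1, r_t, pcont_t, s_t):
  q_loss, _ = qv_learning(q_net(s_tm1), a_tm1, r_t, pcont_t, v_net(s_t))
  return tf.reduce_mean(q_loss)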
def qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t, name="QLearning"): """Implements the Q-learning loss as a TensorFlow op. The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and the target `r_t + pcont_t * max q_t`. See "Reinforcement Learning: An Introduction" by Sutton and Barto. (http://incompleteideas.net/book/ebook/node65.html). Args: q_tm1: Tensor holding Q-values for first timestep in a batch of transitions, shape `[B x num_actions]`. a_tm1: Tensor holding action indices, shape `[B]`. r_t: Tensor holding rewards, shape `[B]`. pcont_t: Tensor holding pcontinue values, shape `[B]`. q_t: Tensor holding Q-values for second timestep in a batch of transitions, shape `[B x num_actions]`. name: name to prefix ops created within this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`. * `td_error`: batch of temporal difference errors, shape `[B]`. """ # Rank and compatibility checks. base_ops.wrap_rank_shape_assert( [[q_tm1, q_t], [a_tm1, r_t, pcont_t]], [2, 1], name) # Q-learning op. with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]): # Build target and select head to update. with tf.name_scope("target"): target = tf.stop_gradient( r_t + pcont_t * tf.reduce_max(q_t, axis=1)) qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1) # Temporal difference error and loss. # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error. td_error = target - qa_tm1 loss = 0.5 * tf.square(td_error) return base_ops.LossOutput(loss, QExtra(target, td_error))
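# Illustrative usage sketch (not part of the library): one-step Q-learning with
# a target network. `online_net`/`target_net` are assumed callables returning
# [B, num_actions] Q-values, `online_variables` their trainable variables.
def _example_qlearning(online_net, target_net, online_variables, s_tm1, a_tm1,
                       r_t, pcont_t, s_t):
  loss, extra = qlearning(online_net(s_tm1), a_tm1, r_t, pcont_t,
                          target_net(s_t))
  # Only the online network is updated; the target network is refreshed by
  # periodically copying the online variables.
  train_op = tf.train.AdamOptimizer(1e-4).minimize(
      tf.reduce_mean(loss), var_list=online_variables)
  return train_op, extra.td_error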
def qv_max(v_tm1, r_t, pcont_t, q_t, name="QVMAX"): """Implements the QVMAX learning loss as a TensorFlow op. The QVMAX loss is `0.5` times the squared difference between `v_tm1` and the target `r_t + pcont_t * max q_t`, where `q_t` is separately learned through QV learning (c.f. `action_value_ops.qv_learning`). See "The QV Family Compared to Other Reinforcement Learning Algorithms" by Wiering and van Hasselt (2009). (http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.713.1931) Args: v_tm1: Tensor holding values at previous timestep, shape `[B]`. r_t: Tensor holding rewards, shape `[B]`. pcont_t: Tensor holding pcontinue values, shape `[B]`. q_t: Tensor of action values at current timestep, shape `[B, num_actions]`. name: name to prefix ops created by this function. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * `target`: batch of target values for `v_tm1`, shape `[B]`. * `td_error`: batch of temporal difference errors, shape `[B]`. """ # Rank and compatibility checks. base_ops.wrap_rank_shape_assert([[v_tm1, r_t, pcont_t], [q_t]], [1, 2], name) # The QVMAX op. with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, q_t]): # Build target. target = tf.stop_gradient(r_t + pcont_t * tf.reduce_max(q_t, axis=1)) # Temporal difference error and loss. # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error. td_error = target - v_tm1 loss = 0.5 * tf.square(td_error) return base_ops.LossOutput(loss, TDExtra(target, td_error))
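# Illustrative sketch (assumed names): QVMAX trains the state-value head
# against the max over a separately learned Q function, the counterpart of
# `qv_learning` above.
def _example_qv_max(v_net, q_net, s_tm1, r_t, pcont_t, s_t):
  v_loss, _ = qv_max(v_net(s_tm1), r_t, pcont_t, q_net(s_t))
  return tf.reduce_mean(v_loss)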
def td_learning(v_tm1, r_t, pcont_t, v_t, name="TDLearning"): """Implements the TD(0)-learning loss as a TensorFlow op. The TD loss is `0.5` times the squared difference between `v_tm1` and the target `r_t + pcont_t * v_t`. See "Learning to Predict by the Methods of Temporal Differences" by Sutton. (https://link.springer.com/article/10.1023/A:1022633531479). Args: v_tm1: Tensor holding values at previous timestep, shape `[B]`. r_t: Tensor holding rewards, shape `[B]`. pcont_t: Tensor holding pcontinue values, shape `[B]`. v_t: Tensor holding values at current timestep, shape `[B]`. name: name to prefix ops created by this function. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * `target`: batch of target values for `v_tm1`, shape `[B]`. * `td_error`: batch of temporal difference errors, shape `[B]`. """ # Rank and compatibility checks. base_ops.wrap_rank_shape_assert([[v_tm1, v_t, r_t, pcont_t]], [1], name) # TD(0)-learning op. with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, v_t]): # Build target. target = tf.stop_gradient(r_t + pcont_t * v_t) # Temporal difference error and loss. # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error. td_error = target - v_tm1 loss = 0.5 * tf.square(td_error) return base_ops.LossOutput(loss, TDExtra(target, td_error))
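# Illustrative sketch (assumed names): plain TD(0) on a batch of transitions;
# the bootstrap value may come from the same value network or a target copy.
def _example_td_learning(value_net, s_tm1, r_t, pcont_t, s_t):
  loss, extra = td_learning(value_net(s_tm1), r_t, pcont_t, value_net(s_t))
  return tf.reduce_mean(loss), extra.td_error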
def retrace(lambda_, qs, targnet_qs, actions, rewards, pcontinues, target_policy_probs, behaviour_policy_probs, stop_targnet_gradients=True, name=None): """Retrace algorithm loss calculation op. Given a minibatch of temporally-contiguous sequences of Q values, policy probabilities, and various other typical RL algorithm inputs, this Op creates a subgraph that computes a loss according to the Retrace multi-step off-policy value learning algorithm. This Op supports the use of target networks, but does not require them. For more details of Retrace, refer to [the arXiv paper](http://arxiv.org/abs/1606.02647). In argument descriptions, `T` counts the number of transitions over which the Retrace loss is computed, and `B` is the minibatch size. Note that all tensor arguments list a first-dimension (time dimension) size of T+1; this is because in order to compute the loss over T timesteps, the algorithm must be aware of the values of many of its inputs at timesteps before and after each transition. All tensor arguments are indexed first by transition, with specific details of this indexing in the argument descriptions. Args: lambda_: Positive scalar value or 0-D `Tensor` controlling the degree to which future timesteps contribute to the loss computed at each transition. qs: 3-D tensor holding per-action Q-values for the states encountered just before taking the transitions that correspond to each major index. Since these values are the predicted values we wish to update (in other words, the values we intend to change as we learn), in a target network setting, these nearly always come from the "non-target" network, which we usually call the "learning network". Shape is `[(T+1), B, num_actions]`. targnet_qs: Like `qs`, but in the target network setting, these values should be computed by the target network. We use these values to compute multi-step error values for timesteps that follow the first timesteps in each sequence and sequence fragment we consider. Shape is `[(T+1), B, num_actions]`. actions: 2-D tensor holding the indices of actions executed during the transition that corresponds to each major index. Shape is `[(T+1), B]`. rewards: 2-D tensor holding rewards received during the transition that corresponds to each major index. Shape is `[(T+1), B]`. pcontinues: 2-D tensor holding pcontinue values received during the transition that corresponds to each major index. Shape is `[(T+1), B]`. target_policy_probs: 3-D tensor holding per-action policy probabilities for the states encountered just before taking the transitions that correspond to each major index, according to the target policy (i.e. the policy we wish to learn). These probabilities usually derive from the learning net. Shape is `[(T+1), B, num_actions]`. behaviour_policy_probs: 2-D tensor holding the *behaviour* policy's probabilities of having taken actions `action` during the transitions that correspond to each major index. These probabilities derive from whatever policy you used to generate the data. Shape is `[(T+1), B]`. stop_targnet_gradients: `bool` that enables a sensible default way of handling gradients through the Retrace op (essentially, gradients are not permitted to involve the `targnet_qs` inputs). Can be disabled if you require a different arrangement, but you'll probably want to block some gradients somewhere. name: name to prefix ops created by this function. Returns: A namedtuple with fields: * `loss`: Tensor containing the batch of losses, shape `[B]`. 
* `extra`: None """ all_args = [ lambda_, qs, targnet_qs, actions, rewards, pcontinues, target_policy_probs, behaviour_policy_probs ] with tf.name_scope(name, 'Retrace', values=all_args): # Mainly to simplify testing: (lambda_, qs, targnet_qs, actions, rewards, pcontinues, target_policy_probs, behaviour_policy_probs) = (tf.convert_to_tensor(arg) for arg in all_args) # Require correct tensor ranks---as long as we have shape information # available to check. If there isn't any, we print a warning. def check_rank(tensors, ranks): for i, (tensor, rank) in enumerate(zip(tensors, ranks)): if tensor.get_shape(): base_ops.assert_rank_and_shape_compatibility([tensor], rank) else: tf.logging.error( 'Tensor "%s", which was offered as Retrace parameter %d, has ' 'no rank at construction time, so Retrace can\'t verify that ' 'it has the necessary rank of %d', tensor.name, i + 1, rank) check_rank([ lambda_, qs, targnet_qs, actions, rewards, pcontinues, target_policy_probs, behaviour_policy_probs ], [0, 3, 3, 2, 2, 2, 3, 2]) # Deduce the shapes of the arguments we'll create for retrace_core. qs_shape = tf.shape(qs) timesteps = qs_shape[0] # Batch size is qs_shape[1]. # Deduce the time indices for the arguments we'll create for retrace_core. timestep_indices_tm1 = tf.range(0, timesteps - 1) timestep_indices_t = tf.range(1, timesteps) # Construct arguments for retrace_core and call. q_tm1 = tf.gather(qs, timestep_indices_tm1) a_tm1 = tf.gather(actions, timestep_indices_tm1) r_t = tf.gather(rewards, timestep_indices_tm1) pcont_t = tf.gather(pcontinues, timestep_indices_tm1) target_policy_t = tf.gather(target_policy_probs, timestep_indices_t) behaviour_policy_t = tf.gather(behaviour_policy_probs, timestep_indices_t) targnet_q_t = tf.gather(targnet_qs, timestep_indices_t) a_t = tf.gather(actions, timestep_indices_t) core = retrace_core(lambda_, q_tm1, a_tm1, r_t, pcont_t, target_policy_t, behaviour_policy_t, targnet_q_t, a_t, stop_targnet_gradients) return base_ops.LossOutput(core.loss, None)
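# Illustrative usage sketch (not part of the library): Retrace over time-major
# sequences of length T+1, shaped as documented above. The behaviour
# probabilities must be strictly positive; 0.95 is an arbitrary example lambda.
def _example_retrace(qs, targnet_qs, actions, rewards, pcontinues,
                     target_policy_probs, behaviour_policy_probs):
  loss, _ = retrace(0.95, qs, targnet_qs, actions, rewards, pcontinues,
                    target_policy_probs, behaviour_policy_probs)
  return tf.reduce_mean(loss)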
def retrace_core(lambda_, q_tm1, a_tm1, r_t, pcont_t, target_policy_t, behaviour_policy_t, targnet_q_t, a_t, stop_targnet_gradients=True, name=None): """Retrace algorithm core loss calculation op. Given a minibatch of temporally-contiguous sequences of Q values, policy probabilities, and various other typical RL algorithm inputs, this Op creates a subgraph that computes a loss according to the Retrace multi-step off-policy value learning algorithm. This Op supports the use of target networks, but does not require them. This function is the "core" Retrace op only because its arguments are less user-friendly and more implementation-convenient. For a more user-friendly operator, consider using `retrace`. For more details of Retrace, refer to [the arXiv paper](http://arxiv.org/abs/1606.02647). Construct the "core" retrace loss subgraph for a batch of sequences. Note that two pairs of arguments (one holding target network values; the other, actions) are temporally-offset versions of each other and will share many values in common (nb: a good setting for using `IndexedSlices`). *This op does not include any checks that these pairs of arguments are consistent*---that is, it does not ensure that temporally-offset arguments really do share the values they are supposed to share. In argument descriptions, `T` counts the number of transitions over which the Retrace loss is computed, and `B` is the minibatch size. All tensor arguments are indexed first by transition, with specific details of this indexing in the argument descriptions (pay close attention to "subscripts" in variable names). Args: lambda_: Positive scalar value or 0-D `Tensor` controlling the degree to which future timesteps contribute to the loss computed at each transition. q_tm1: 3-D tensor holding per-action Q-values for the states encountered just before taking the transitions that correspond to each major index. Since these values are the predicted values we wish to update (in other words, the values we intend to change as we learn), in a target network setting, these nearly always come from the "non-target" network, which we usually call the "learning network". Shape is `[T, B, num_actions]`. a_tm1: 2-D tensor holding the indices of actions executed during the transition that corresponds to each major index. Shape is `[T, B]`. r_t: 2-D tensor holding rewards received during the transition that corresponds to each major index. Shape is `[T, B]`. pcont_t: 2-D tensor holding pcontinue values received during the transition that corresponds to each major index. Shape is `[T, B]`. target_policy_t: 3-D tensor holding per-action policy probabilities for the states encountered just AFTER the transitions that correspond to each major index, according to the target policy (i.e. the policy we wish to learn). These usually derive from the learning net. Shape is `[T, B, num_actions]`. behaviour_policy_t: 2-D tensor holding the *behaviour* policy's probabilities of having taken action `a_t` at the states encountered just AFTER the transitions that correspond to each major index. Derived from whatever policy you used to generate the data. All values MUST be greater than 0. Shape is `[T, B]`. targnet_q_t: 3-D tensor holding per-action Q-values for the states encountered just AFTER taking the transitions that correspond to each major index. Since these values are used to calculate target values for the network, in a target network setting, these should probably come from the target network. Shape is `[T, B, num_actions]`.
a_t: 2-D tensor holding the indices of actions executed during the transition AFTER the transition that corresponds to each major index. Shape is `[T, B]`. stop_targnet_gradients: `bool` that enables a sensible default way of handling gradients through the Retrace op (essentially, gradients are not permitted to involve the `targnet_q_t` input). Can be disabled if you require a different arrangement, but you'll probably want to block some gradients somewhere. name: name to prefix ops created by this function. Returns: A namedtuple with fields: * `loss`: Tensor containing the batch of losses, shape `[B]`. * `extra`: A namedtuple with fields: * `retrace_weights`: Tensor containing batch of retrace weights, shape `[T, B]`. * `target`: Tensor containing target action values, shape `[T, B]`. """ all_args = [ lambda_, q_tm1, a_tm1, r_t, pcont_t, target_policy_t, behaviour_policy_t, targnet_q_t, a_t ] with tf.name_scope(name, 'RetraceCore', all_args): (lambda_, q_tm1, a_tm1, r_t, pcont_t, target_policy_t, behaviour_policy_t, targnet_q_t, a_t) = (tf.convert_to_tensor(arg) for arg in all_args) # Evaluate importance weights. c_t = _retrace_weights( indexing_ops.batched_index(target_policy_t, a_t), behaviour_policy_t) * lambda_ # Targets are evaluated by using only Q values from the target network. # This provides fixed regression targets until the next target network # update. target = _general_off_policy_corrected_multistep_target( r_t, pcont_t, target_policy_t, c_t, targnet_q_t, a_t, not stop_targnet_gradients) if stop_targnet_gradients: target = tf.stop_gradient(target) # Regress Q values of the learning network towards the targets evaluated # by using the target network. qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1) delta = target - qa_tm1 loss = 0.5 * tf.square(delta) return base_ops.LossOutput( loss, RetraceCoreExtra(retrace_weights=c_t, target=target))
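# Illustrative sketch: `retrace_core` is normally reached through `retrace`,
# which slices the (T+1)-length inputs into the temporally-offset arguments
# below; call it directly only if you already maintain those offset tensors.
def _example_retrace_core(lambda_, q_tm1, a_tm1, r_t, pcont_t, target_policy_t,
                          behaviour_policy_t, targnet_q_t, a_t):
  loss, extra = retrace_core(lambda_, q_tm1, a_tm1, r_t, pcont_t,
                             target_policy_t, behaviour_policy_t, targnet_q_t,
                             a_t)
  return tf.reduce_mean(loss), extra.retrace_weights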
def categorical_dist_qlearning(atoms_tm1, logits_q_tm1, a_tm1, r_t, pcont_t, atoms_t, logits_q_t, name="CategoricalDistQLearning"): """Implements Distributional Q-learning as TensorFlow ops. The function assumes categorical value distributions parameterized by logits. See "A Distributional Perspective on Reinforcement Learning" by Bellemare, Dabney and Munos. (https://arxiv.org/abs/1707.06887). Args: atoms_tm1: 1-D tensor containing atom values for first timestep, shape `[num_atoms]`. logits_q_tm1: Tensor holding logits for first timestep in a batch of transitions, shape `[B, num_actions, num_atoms]`. a_tm1: Tensor holding action indices, shape `[B]`. r_t: Tensor holding rewards, shape `[B]`. pcont_t: Tensor holding pcontinue values, shape `[B]`. atoms_t: 1-D tensor containing atom values for second timestep, shape `[num_atoms]`. logits_q_t: Tensor holding logits for second timestep in a batch of transitions, shape `[B, num_actions, num_atoms]`. name: name to prefix ops created by this function. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * `target`: a tensor containing the values that `q_tm1` at actions `a_tm1` are regressed towards, shape `[B, num_atoms]`. Raises: ValueError: If the tensors do not have the correct rank or compatibility. """ # Rank and compatibility checks. assertion_lists = [[logits_q_tm1, logits_q_t], [a_tm1, r_t, pcont_t], [atoms_tm1, atoms_t]] base_ops.wrap_rank_shape_assert(assertion_lists, [3, 1, 1], name) # Categorical distributional Q-learning op. with tf.name_scope(name, values=[ atoms_tm1, logits_q_tm1, a_tm1, r_t, pcont_t, atoms_t, logits_q_t ]): with tf.name_scope("target"): # Scale and shift time-t distribution atoms by discount and reward. target_z = r_t[:, None] + pcont_t[:, None] * atoms_t[None, :] # Convert logits to distribution, then find greedy action in state s_t. q_t_probs = tf.nn.softmax(logits_q_t) q_t_mean = tf.reduce_sum(q_t_probs * atoms_t, 2) pi_t = tf.argmax(q_t_mean, 1, output_type=tf.int32) # Compute distribution for greedy action. p_target_z = _slice_with_actions(q_t_probs, pi_t) # Project using the Cramer distance target = tf.stop_gradient( _l2_project(target_z, p_target_z, atoms_tm1)) logit_qa_tm1 = _slice_with_actions(logits_q_tm1, a_tm1) loss = tf.nn.softmax_cross_entropy_with_logits(logits=logit_qa_tm1, labels=target) return base_ops.LossOutput(loss, Extra(target))
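# Illustrative sketch (assumed names and support): C51-style distributional
# Q-learning with a fixed support of 51 atoms shared across both timesteps.
# `logits_net`/`target_logits_net` return [B, num_actions, num_atoms] logits.
def _example_categorical_dist_qlearning(logits_net, target_logits_net, s_tm1,
                                        a_tm1, r_t, pcont_t, s_t):
  atoms = tf.linspace(-10., 10., 51)  # Value support; range is an assumption.
  loss, _ = categorical_dist_qlearning(atoms, logits_net(s_tm1), a_tm1, r_t,
                                       pcont_t, atoms, target_logits_net(s_t))
  return tf.reduce_mean(loss)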
def sequence_advantage_actor_critic_loss( policy_logits, baseline_values, actions, rewards, pcontinues, bootstrap_value, lambda_=1, entropy_cost=None, baseline_cost=1, normalise_entropy=False, name="SequenceAdvantageActorCriticLoss"): """Calculates the loss for an A2C update along a batch of trajectories. Technically A2C is the special case where lambda=1; for general lambda this is the loss for Generalized Advantage Estimation (GAE), modulo chunking behaviour if passing chunks of episodes (see `generalized_lambda_returns` for more detail). Note: This function takes policy _logits_ as input, not the log-policy like `learning.deepmind.lua.rl.learners.Reinforce` does. This loss jointly learns the policy and the baseline. Therefore, gradients for this loss flow through each tensor in `policy_logits` and `baseline_values`, but no other input tensors. The policy is learnt with the advantage actor-critic loss, plus an optional entropy term. The baseline is regressed towards the n-step bootstrapped returns given by the reward/pcontinue sequence. The `baseline_cost` parameter scales the gradients w.r.t the baseline relative to the policy gradient. i.e: `d(loss) / d(baseline) = baseline_cost * (n_step_return - baseline)`. `rewards` and `pcontinues` are the sequences of data taken directly from the environment, possibly modulated by a discount. `baseline_values` are the sequences of (typically learnt) estimates of the values of the states visited along a batch of trajectories as observed by the agent given the sequences of one or more actions sampled from the `policy_logits`. The sequences in the tensors should be aligned such that an agent in a state with value `V` that takes an action `a` transitions into another state with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `a`, `r` and `p` are all at the same index `i` in the corresponding tensors. `V'` is at index `i+1`, or in the `bootstrap_value` tensor if `i == T`. This function accepts a nested array of `policy_logits` and `actions` in order to allow for multidimensional discrete action spaces. In this case, the loss is given by `sum_i(loss(p_i, a_i))` where `p_i` are members of the `policy_logits` nest, and `a_i` are members of the `actions` nest. We assume that a single baseline is used across all action dimensions for each timestep. Args: policy_logits: A (possibly nested structure of) 3-D Tensor(s) with shape `[T, B, num_actions]` and possibly different dimension `num_actions`. baseline_values: 2-D Tensor containing an estimate of state values `[T, B]`. actions: A (possibly nested structure of) 2-D Tensor(s) with shape `[T, B]` and integer type. rewards: 2-D Tensor with shape `[T, B]`. pcontinues: 2-D Tensor with shape `[T, B]`. bootstrap_value: 1-D Tensor with shape `[B]`. lambda_: an optional scalar or 2-D Tensor with shape `[T, B]` for Generalised Advantage Estimation as per https://arxiv.org/abs/1506.02438. entropy_cost: optional scalar cost that pushes the policy to have high entropy, larger values cause higher entropies. baseline_cost: scalar cost that scales the derivatives of the baseline relative to the policy gradient. normalise_entropy: if True, the entropy loss is normalised to the range `[-1, 0]` by dividing by the log number of actions. This makes it more invariant to the size of the action space. Default is False. name: Customises the name_scope for this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the total loss, shape `[B]`. 
* `extra`: a namedtuple with fields: * `entropy`: total policy entropy per sequence, shape `[B]`. * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`. * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`. * `policy_gradient_loss`: policy gradient loss per sequence, shape `[B]`. * `advantages`: advantage estimates per timestep, shape `[T, B]`. * `discounted_returns`: discounted returns per timestep, shape `[T, B]`. """ scoped_values = (nest.flatten(policy_logits) + nest.flatten(actions) + [baseline_values, rewards, pcontinues, bootstrap_value]) with tf.name_scope(name, values=scoped_values): # Loss for the baseline, summed over the time dimension. baseline_loss_td, td_lambda = value_ops.td_lambda( baseline_values, rewards, pcontinues, bootstrap_value, lambda_) # The TD error provides an estimate of the advantages of the actions. advantages = td_lambda.temporal_differences baseline_loss = tf.multiply( tf.convert_to_tensor(baseline_cost, dtype=tf.float32), baseline_loss_td, name="baseline_loss") # Loss for the policy. Doesn't push additional gradients through # the advantages. policy_gradient_loss = discrete_policy_gradient_loss( policy_logits, actions, advantages, name="policy_gradient_loss") total_loss = tf.add(policy_gradient_loss, baseline_loss, name="total_loss") if entropy_cost is not None: entropy_loss_op, policy_entropy = discrete_policy_entropy_loss( policy_logits, normalise=normalise_entropy) # [T,B]. entropy = tf.reduce_sum( policy_entropy.entropy, axis=0, name="entropy") # [B]. entropy_loss = tf.multiply( tf.convert_to_tensor(entropy_cost, dtype=tf.float32), tf.reduce_sum(entropy_loss_op, axis=0), name="scaled_entropy_loss") # [B]. total_loss = tf.add(total_loss, entropy_loss, name="total_loss_with_entropy") else: entropy = None entropy_loss = None extra = SequenceAdvantageActorCriticExtra( entropy=entropy, entropy_loss=entropy_loss, baseline_loss=baseline_loss, policy_gradient_loss=policy_gradient_loss, advantages=advantages, discounted_returns=td_lambda.discounted_returns) return base_ops.LossOutput(total_loss, extra)
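# Illustrative usage sketch (not part of the library): an A2C update on
# time-major rollouts, with `policy_logits` of shape [T, B, num_actions] and
# `baseline_values` of shape [T, B]; the cost coefficients are assumptions.
def _example_a2c_update(policy_logits, baseline_values, actions, rewards,
                        pcontinues, bootstrap_value):
  loss, extra = sequence_advantage_actor_critic_loss(
      policy_logits, baseline_values, actions, rewards, pcontinues,
      bootstrap_value, lambda_=1, entropy_cost=0.01, baseline_cost=0.5,
      normalise_entropy=True)
  return tf.reduce_mean(loss), extra.advantages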
def pixel_control_loss( observations, actions, action_values, cell_size, discount_factor, scale, crop_height_dim=(None, None), crop_width_dim=(None, None)): """Calculate n-step Q-learning loss for pixel control auxiliary task. For each pixel-based pseudo reward signal, the corresponding action-value function is trained off-policy, using Q(lambda). A discount of 0.9 is commonly used for learning the value functions. Note that, since pseudo rewards have a spatial structure, with neighbouring cells exhibiting strong correlations, it is convenient to predict the action values for all the cells through a deconvolutional head. See "Reinforcement Learning with Unsupervised Auxiliary Tasks" by Jaderberg, Mnih, Czarnecki et al. (https://arxiv.org/abs/1611.05397). Args: observations: A tensor of shape `[T+1,B, ...]`; `...` is the observation shape, `T` the sequence length, and `B` the batch size. `T` and `B` can be statically unknown for `observations`, `actions` and `action_values`. actions: A tensor, shape `[T,B]`, of the actions across each sequence. action_values: A tensor, shape `[T+1,B,H,W,N]` of pixel control action values, where `H`, `W` are the number of pixel control cells/tasks, and `N` is the number of actions. cell_size: size of the cells used to derive the pixel based pseudo-rewards. discount_factor: discount used for learning the value function associated to the pseudo rewards; must be a scalar or a Tensor of shape [T,B]. scale: scale factor for pixels in `observations`. crop_height_dim: tuple (min_height, max_height) specifying how to crop the input observations before computing the pseudo-rewards. crop_width_dim: tuple (min_width, max_width) specifying how to crop the input observations before computing the pseudo-rewards. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape [B]. * `extra`: a namedtuple with fields: * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B]. * `td_error`: batch of temporal difference errors, shape [B]. Raises: ValueError: if the shape of `action_values` is not compatible with that of the pseudo-rewards derived from the observations. """ # Useful shapes. sequence_length, batch_size = base_ops.best_effort_shape(actions) num_actions = action_values.get_shape().as_list()[-1] height_width_q = action_values.get_shape().as_list()[2:-1] # Calculate rewards using the observations. Crop observations if appropriate. if crop_height_dim[0] is not None: h_low, h_high = crop_height_dim observations = observations[:, :, h_low:h_high, :] if crop_width_dim[0] is not None: w_low, w_high = crop_width_dim observations = observations[:, :, :, w_low:w_high] # Rescale observations by a constant factor. observations *= tf.constant(scale) # Compute pseudo-rewards and get their shape. pseudo_rewards = pixel_control_rewards(observations, cell_size) height_width = pseudo_rewards.get_shape().as_list()[2:] # Check that pseudo-rewards and Q-values are compatible in shape. if height_width != height_width_q: raise ValueError( "Pixel Control values are not compatible with the shape of the" "pseudo-rewards derived from the observation. Pseudo-rewards have shape" "{}, while Pixel Control values have shape {}".format( height_width, height_width_q)) # We now have Q(s,a) and rewards, so can calculate the n-step loss. The # QLambda loss op expects inputs of shape [T,B,N] and [T,B], but our tensors # are in a variety of incompatible shapes. The state-action values have # shape [T,B,H,W,N] and rewards have shape [T,B,H,W]. 
We can think of the # [H,W] dimensions as extra batch dimensions for the purposes of the loss # calculation, so we first collapse [B,H,W] into a single dimension. q_tm1 = tf.reshape( action_values[:-1], # [T,B,H,W,N]. [sequence_length, -1, num_actions], name="q_tm1") # [T,BHW,N]. r_t = tf.reshape( pseudo_rewards, # [T,B,H,W]. [sequence_length, -1], name="r_t") # [T,BHW]. q_t = tf.reshape( action_values[1:], # [T,B,H,W,N]. [sequence_length, -1, num_actions], name="q_t") # [T,BHW,N]. # The actions tensor is of shape [T,B], and is the same for each H and W. # We thus expand it to be same shape as the reward tensor, [T,BHW]. expanded_actions = tf.expand_dims(tf.expand_dims(actions, -1), -1) a_tm1 = tf.tile( expanded_actions, multiples=[1, 1] + height_width) # [T,B,H,W]. a_tm1 = tf.reshape(a_tm1, [sequence_length, -1]) # [T,BHW]. # We similarly expand-and-tile the discount to [T,BHW]. discount_factor = tf.convert_to_tensor(discount_factor) if discount_factor.shape.ndims == 0: pcont_t = tf.reshape(discount_factor, [1, 1]) # [1,1]. pcont_t = tf.tile(pcont_t, tf.shape(a_tm1)) # [T,BHW]. elif discount_factor.shape.ndims == 2: tiled_pcont = tf.tile( tf.expand_dims(tf.expand_dims(discount_factor, -1), -1), [1, 1] + height_width) pcont_t = tf.reshape(tiled_pcont, [sequence_length, -1]) else: raise ValueError( "The discount_factor must be a scalar or a tensor of rank 2." "instead is a tensor of shape {}".format( discount_factor.shape.as_list())) # Compute a QLambda loss of shape [T,BHW] loss, _ = action_value_ops.qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=1) # Take sum over sequence, sum over cells. expanded_shape = [sequence_length, batch_size] + height_width spatial_loss = tf.reshape(loss, expanded_shape) # [T,B,H,W]. # Return. extra = PixelControlExtra( spatial_loss=spatial_loss, pseudo_rewards=pseudo_rewards) return base_ops.LossOutput( tf.reduce_sum(spatial_loss, axis=[0, 2, 3]), extra) # [B]
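# Illustrative sketch (assumed hyperparameters): a pixel-control auxiliary loss
# for 84x84 observations, cropped to a central region and split into 4x4 cells.
def _example_pixel_control(observations, actions, action_values):
  loss, extra = pixel_control_loss(
      observations, actions, action_values, cell_size=4, discount_factor=0.9,
      scale=1. / 255, crop_height_dim=(2, 82), crop_width_dim=(2, 82))
  return tf.reduce_mean(loss), extra.pseudo_rewards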
def td_lambda(state_values, rewards, pcontinues, bootstrap_value, lambda_=1, name="BaselineLoss"): """Constructs a TensorFlow graph computing the L2 loss for sequences. This loss learns the baseline for advantage actor-critic models. Gradients for this loss flow through each tensor in `state_values`, but no other input tensors. The baseline is regressed towards the n-step bootstrapped returns given by the reward/pcontinue sequence. This function is designed for batches of sequences of data. Tensors are assumed to be time major (i.e. the outermost dimension is time, the second outermost dimension is the batch dimension). We denote the sequence length in the shapes of the arguments with the variable `T`, the batch size with the variable `B`, neither of which needs to be known at construction time. Index `0` of the time dimension is assumed to be the start of the sequence. `rewards` and `pcontinues` are the sequences of data taken directly from the environment, possibly modulated by a discount. `state_values` are the sequences of (typically learnt) estimates of the values of the states visited along a batch of trajectories. The sequences in the tensors should be aligned such that an agent in a state with value `V` that takes an action transitions into another state with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `r` and `p` are all at the same index `i` in the corresponding tensors. `V'` is at index `i+1`, or in the `bootstrap_value` tensor if `i == T`. See "High-dimensional continuous control using generalized advantage estimation" by Schulman, Moritz, Levine et al. (https://arxiv.org/abs/1506.02438). Args: state_values: 2-D Tensor of state-value estimates with shape `[T, B]`. rewards: 2-D Tensor with shape `[T, B]`. pcontinues: 2-D Tensor with shape `[T, B]`. bootstrap_value: 1-D Tensor with shape `[B]`. lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`. name: Customises the name_scope for this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * temporal_differences, Tensor of shape `[T, B]` * discounted_returns, Tensor of shape `[T, B]` """ scoped_values = [state_values, rewards, pcontinues, bootstrap_value] with tf.name_scope(name, values=scoped_values): discounted_returns = generalized_lambda_returns( rewards, pcontinues, state_values, bootstrap_value, lambda_) temporal_differences = discounted_returns - state_values loss = 0.5 * tf.reduce_sum( tf.square(temporal_differences), axis=0, name="l2_loss") return base_ops.LossOutput( loss, TDLambdaExtra(temporal_differences=temporal_differences, discounted_returns=discounted_returns))
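# Illustrative sketch (assumed names): the baseline loss over a rollout, using
# the same lambda as the advantage estimator so returns and advantages match.
def _example_td_lambda(state_values, rewards, pcontinues, bootstrap_value):
  loss, extra = td_lambda(state_values, rewards, pcontinues, bootstrap_value,
                          lambda_=0.95)
  return tf.reduce_mean(loss), extra.discounted_returns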
def sequence_a2c_loss(policies, baseline_values, actions, rewards, pcontinues, bootstrap_value, policy_vars=None, lambda_=1, entropy_cost=None, baseline_cost=1, entropy_scale_op=None, name="SequenceA2CLoss"): """Constructs a TensorFlow graph computing the A2C/GAE loss for sequences. This loss jointly learns the policy and the baseline. Therefore, gradients for this loss flow through each tensor in `policies` and through each tensor in `baseline_values`, but no other input tensors. The policy is learnt with the advantage actor-critic loss, plus an optional entropy term. The baseline is regressed towards the n-step bootstrapped returns given by the reward/pcontinue sequence. The `baseline_cost` parameter scales the gradients w.r.t the baseline relative to the policy gradient, i.e. d(loss) / d(baseline) = baseline_cost * (n_step_return - baseline)`. This function is designed for batches of sequences of data. Tensors are assumed to be time major (i.e. the outermost dimension is time, the second outermost dimension is the batch dimension). We denote the sequence length in the shapes of the arguments with the variable `T`, the batch size with the variable `B`, neither of which needs to be known at construction time. Index `0` of the time dimension is assumed to be the start of the sequence. `rewards` and `pcontinues` are the sequences of data taken directly from the environment, possibly modulated by a discount. `baseline_values` are the sequences of (typically learnt) estimates of the values of the states visited along a batch of trajectories as observed by the agent given the sequences of one or more actions sampled from `policies`. The sequences in the tensors should be aligned such that an agent in a state with value `V` that takes an action `a` transitions into another state with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `a`, `r` and `p` are all at the same index `i` in the corresponding tensors. `V'` is at index `i+1`, or in the `bootstrap_value` tensor if `i == T`. For n-dimensional action vectors, a multivariate distribution must be used for `policies`. In case there is no multivariate version for the desired univariate distribution, or in case the `actions` object is a nested structure (e.g. for multiple action types), this function also accepts a nested structure of `policies`. In this case, the loss is given by `sum_i(loss(p_i, a_i))` where `p_i` are members of the `policies` nest, and `a_i` are members of the `actions` nest. We assume that a single baseline is used across all action dimensions for each timestep. Args: policies: A (possibly nested structure of) distribution(s) supporting `batch_shape` and `event_shape` properties & `log_prob` and `entropy` methods (e.g. an instance of `tfp.distributions.Distribution`), with `batch_shape` equal to `[T, B]`. E.g. for a (non-nested) diagonal multivariate gaussian with dimension `A` this would be: `policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)` where `mus` and `sigmas` have shape `[T, B, A]`. baseline_values: 2-D Tensor containing an estimate of the state value with shape `[T, B]`. actions: A (possibly nested structure of) N-D Tensor(s) with shape `[T, B, ...]` where the final dimensions are the `event_shape` of the corresponding distribution in the nested structure (the shape can be just `[T, B]` if the `event_shape` is scalar). rewards: 2-D Tensor with shape `[T, B]`. pcontinues: 2-D Tensor with shape `[T, B]`. bootstrap_value: 1-D Tensor with shape `[B]`. 
policy_vars: An optional (possibly nested structure of) iterables of Tensors used by `policies`. If provided, these are used in scope checks. For the multivariate normal example above this would be `[mus, sigmas]`. lambda_: an optional scalar or 2-D Tensor with shape `[T, B]` for Generalised Advantage Estimation as per https://arxiv.org/abs/1506.02438. entropy_cost: optional scalar cost that pushes the policy to have high entropy, larger values cause higher entropies. baseline_cost: scalar cost that scales the derivatives of the baseline relative to the policy gradient. entropy_scale_op: An optional op that takes `policies` as its only argument and returns a scalar Tensor that is used to scale the entropy loss. E.g. for Diag(sigma) Gaussian policies dividing by the number of dimensions makes entropy loss invariant to the action space dimension. See `policy_entropy_loss` for more info. name: Customises the name_scope for this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the total loss, shape `[B]`. * `extra`: a namedtuple with fields: * `entropy`: total policy entropy per sequence, shape `[B]`. * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`. * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`. * `policy_gradient_loss`: policy gradient loss per sequence, shape `[B]`. * `advantages`: advantage estimates per timestep, shape `[T, B]`. * `discounted_returns`: discounted returns per timestep, shape `[T, B]`. """ flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list() scoped_values = (flat_policy_vars + nest.flatten(actions) + [baseline_values, rewards, pcontinues, bootstrap_value]) with tf.name_scope(name, values=scoped_values): # Loss for the baseline, summed over the time dimension. baseline_loss_td, td_lambda = value_ops.td_lambda( baseline_values, rewards, pcontinues, bootstrap_value, lambda_) # The TD error provides an estimate of the advantages of the actions. advantages = td_lambda.temporal_differences baseline_loss = tf.multiply(tf.convert_to_tensor(baseline_cost, dtype=tf.float32), baseline_loss_td, name="baseline_loss") # Loss for the policy. Doesn't push additional gradients through # the advantages. pg_loss = policy_gradient_loss(policies, actions, advantages, policy_vars, name="policy_gradient_loss") total_loss = tf.add(pg_loss, baseline_loss, name="total_loss") if entropy_cost is not None: loss, extra = policy_entropy_loss(policies, policy_vars, entropy_scale_op) entropy = tf.reduce_sum(extra.entropy, axis=0, name="entropy") # [B]. entropy_loss = tf.multiply(tf.convert_to_tensor(entropy_cost, dtype=tf.float32), tf.reduce_sum(loss, axis=0), name="scaled_entropy_loss") # [B]. total_loss = tf.add(total_loss, entropy_loss, name="total_loss_with_entropy") else: entropy = None entropy_loss = None extra = SequenceA2CExtra( entropy=entropy, entropy_loss=entropy_loss, baseline_loss=baseline_loss, policy_gradient_loss=pg_loss, advantages=advantages, discounted_returns=td_lambda.discounted_returns) return base_ops.LossOutput(total_loss, extra)
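# Illustrative usage sketch (not part of the library): continuous-control A2C
# with a diagonal Gaussian policy. Assumes `import tensorflow_probability as
# tfp` and that `mus`/`sigmas` have shape [T, B, A]; the costs are assumptions.
def _example_sequence_a2c(mus, sigmas, baseline_values, actions, rewards,
                          pcontinues, bootstrap_value):
  policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)
  loss, extra = sequence_a2c_loss(
      policies, baseline_values, actions, rewards, pcontinues, bootstrap_value,
      policy_vars=[mus, sigmas], entropy_cost=0.001)
  return tf.reduce_mean(loss), extra.policy_gradient_loss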
def policy_entropy_loss(policies, policy_vars=None, scale_op=None, name="policy_entropy_loss"): """Calculates entropy 'loss' for policies represented by distributions. Given a (possibly nested structure of) batch(es) of policies, this calculates the total entropy and corrects the sign so that minimizing the resulting loss op is equivalent to increasing entropy in the batch. This function accepts a nested structure of `policies` in order to allow for multiple distribution types or for multiple action dimensions in the case where there is no corresponding multivariate form available for a given univariate distribution. In this case, the loss is given by `-sum_i(H(p_i))` where `p_i` are members of the `policies` nest. It can be shown that this is equivalent to calculating the entropy loss on the Cartesian product space over all the action dimensions, if the sampled actions are independent. The entropy loss is optionally scaled by some function of the policies. E.g. for Categorical distributions there exists such a scaling which maps the entropy loss into the range `[-1, 0]` in order to make it invariant to the size of the action space: specifically, one can divide the loss by `sum_i(log(A_i))` where `A_i` is the number of categories in the i'th Categorical distribution in the `policies` nest. Args: policies: A (possibly nested structure of) batch distribution(s) supporting an `entropy` method that returns an N-D Tensor with shape equal to the `batch_shape` of the distribution, e.g. an instance of `tfp.distributions.Distribution`. policy_vars: An optional (possibly nested structure of) iterable(s) of Tensors used by `policies`. If provided, these are used in scope checks. scale_op: An optional op that takes `policies` as its only argument and returns a scalar Tensor that is used to scale the entropy loss. E.g. for Diag(sigma) Gaussian policies dividing by the number of dimensions makes entropy loss invariant to the action space dimension. name: Optional, name of this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B1, B2, ...]`. * `extra`: a namedtuple with fields: * `entropy`: entropy of the policy, shape `[B1, B2, ...]`. where [B1, B2, ... ] == policy.batch_shape """ flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list() with tf.name_scope(name, values=flat_policy_vars): # We want a value that we can minimize along with other losses, and where # minimizing means driving the policy towards a uniform distribution over # the actions. We thus scale it by negative one so that it can be simply # added to other losses. scale = tf.constant(-1.0, dtype=tf.float32) if scale_op: scale *= scale_op(policies) policies = nest.flatten(policies) entropy = tf.add_n([policy.entropy() for policy in policies], name="entropy") loss = tf.multiply(scale, entropy, name="entropy_loss") return base_ops.LossOutput(loss, PolicyEntropyExtra(entropy))
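# Illustrative sketch (assumed names): an entropy bonus for a diagonal Gaussian
# policy, scaled by 1/action_dimension so it is roughly invariant to the size
# of the action space. Assumes `import tensorflow_probability as tfp`.
def _example_policy_entropy(mus, sigmas, entropy_cost=0.001):
  policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)
  scale_op = lambda unused_policies: 1. / tf.to_float(tf.shape(mus)[-1])
  loss, extra = policy_entropy_loss(policies, policy_vars=[mus, sigmas],
                                    scale_op=scale_op)
  return entropy_cost * loss, extra.entropy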
def categorical_dist_double_qlearning(atoms_tm1, logits_q_tm1, a_tm1, r_t, pcont_t, atoms_t, logits_q_t, q_t_selector, name="CategoricalDistDoubleQLearning"): """Implements Distributional Double Q-learning as TensorFlow ops. The function assumes categorical value distributions parameterized by logits, and combines distributional RL with double Q-learning. See "Rainbow: Combining Improvements in Deep Reinforcement Learning" by Hessel, Modayil, van Hasselt, Schaul et al. (https://arxiv.org/abs/1710.02298). Args: atoms_tm1: 1-D tensor containing atom values for first timestep, shape `[num_atoms]`. logits_q_tm1: Tensor holding logits for first timestep in a batch of transitions, shape `[B, num_actions, num_atoms]`. a_tm1: Tensor holding action indices, shape `[B]`. r_t: Tensor holding rewards, shape `[B]`. pcont_t: Tensor holding pcontinue values, shape `[B]`. atoms_t: 1-D tensor containing atom values for second timestep, shape `[num_atoms]`. logits_q_t: Tensor holding logits for second timestep in a batch of transitions, shape `[B, num_actions, num_atoms]`. q_t_selector: Tensor holding another set of Q-values for second timestep in a batch of transitions, shape `[B, num_actions]`. These values are used for estimating the best action. In Double DQN they come from the online network. name: name to prefix ops created by this function. Returns: A namedtuple with fields: * `loss`: Tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * `target`: Tensor containing the values that `q_tm1` at actions `a_tm1` are regressed towards, shape `[B, num_atoms]` . Raises: ValueError: If the tensors do not have the correct rank or compatibility. """ # Rank and compatibility checks. assertion_lists = [[logits_q_tm1, logits_q_t], [a_tm1, r_t, pcont_t], [atoms_tm1, atoms_t], [q_t_selector]] base_ops.wrap_rank_shape_assert(assertion_lists, [3, 1, 1, 2], name) # Categorical distributional double Q-learning op. with tf.name_scope(name, values=[ atoms_tm1, logits_q_tm1, a_tm1, r_t, pcont_t, atoms_t, logits_q_t, q_t_selector ]): with tf.name_scope("target"): # Scale and shift time-t distribution atoms by discount and reward. target_z = r_t[:, None] + pcont_t[:, None] * atoms_t[None, :] # Convert logits to distribution, then find greedy policy action in # state s_t. q_t_probs = tf.nn.softmax(logits_q_t) pi_t = tf.argmax(q_t_selector, 1, output_type=tf.int32) # Compute distribution for greedy action. p_target_z = _slice_with_actions(q_t_probs, pi_t) # Project using the Cramer distance target = tf.stop_gradient( _l2_project(target_z, p_target_z, atoms_tm1)) logit_qa_tm1 = _slice_with_actions(logits_q_tm1, a_tm1) loss = tf.nn.softmax_cross_entropy_with_logits(logits=logit_qa_tm1, labels=target) return base_ops.LossOutput(loss, Extra(target))
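# Illustrative sketch (assumed names and support): the Rainbow-style pairing of
# distributional and double Q-learning, where the online network's mean
# Q-values select the greedy action and the target network's logits evaluate it.
def _example_categorical_dist_double_qlearning(online_logits_net,
                                               target_logits_net, s_tm1, a_tm1,
                                               r_t, pcont_t, s_t):
  atoms = tf.linspace(-10., 10., 51)        # Assumed fixed value support.
  online_logits_t = online_logits_net(s_t)  # [B, num_actions, num_atoms].
  q_t_selector = tf.reduce_sum(tf.nn.softmax(online_logits_t) * atoms, 2)
  loss, _ = categorical_dist_double_qlearning(
      atoms, online_logits_net(s_tm1), a_tm1, r_t, pcont_t, atoms,
      target_logits_net(s_t), q_t_selector)
  return tf.reduce_mean(loss)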
def sarse( q_tm1, a_tm1, r_t, pcont_t, q_t, probs_a_t, debug=False, name="Sarse"): """Implements the SARSE (Expected SARSA) loss as a TensorFlow op. The loss is `0.5` times the squared difference between `q_tm1[a_tm1]` and the target `r_t + pcont_t * (sum_a probs_a_t[a] * q_t[a])`. See "A Theoretical and Empirical Analysis of Expected Sarsa" by Seijen, van Hasselt, Whiteson et al. (http://www.cs.ox.ac.uk/people/shimon.whiteson/pubs/vanseijenadprl09.pdf). Args: q_tm1: Tensor holding Q-values for first timestep in a batch of transitions, shape `[B x num_actions]`. a_tm1: Tensor holding action indices, shape `[B]`. r_t: Tensor holding rewards, shape `[B]`. pcont_t: Tensor holding pcontinue values, shape `[B]`. q_t: Tensor holding Q-values for second timestep in a batch of transitions, shape `[B x num_actions]`. probs_a_t: Tensor holding action probabilities for second timestep, shape `[B x num_actions]`. debug: Boolean flag, when set to True adds ops to check whether probs_a_t is a batch of (approximately) valid probability distributions. name: name to prefix ops created by this function. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[B]`. * `td_error`: batch of temporal difference errors, shape `[B]`. """ # Rank and compatibility checks. base_ops.wrap_rank_shape_assert( [[q_tm1, q_t, probs_a_t], [a_tm1, r_t, pcont_t]], [2, 1], name) # SARSE (Expected SARSA) op. with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, probs_a_t]): # Debug ops. deps = [] if debug: cumulative_prob = tf.reduce_sum(probs_a_t, axis=1) almost_prob = tf.less(tf.abs(tf.subtract(cumulative_prob, 1.0)), 1e-6) deps.append(tf.Assert( tf.reduce_all(almost_prob), ["probs_a_t tensor does not sum to 1", probs_a_t])) # With dependency on possible debug ops. with tf.control_dependencies(deps): # Select head to update and build target. qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1) target = tf.stop_gradient( r_t + pcont_t * tf.reduce_sum(tf.multiply(q_t, probs_a_t), axis=1)) # Temporal difference error and loss. # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error. td_error = target - qa_tm1 loss = 0.5 * tf.square(td_error) return base_ops.LossOutput(loss, QExtra(target, td_error))
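# Illustrative sketch (assumed names): Expected SARSA, where the caller
# supplies the probabilities of the second-timestep actions (e.g. from an
# epsilon-greedy policy over `target_net(s_t)`); `debug=True` asserts that each
# row of `probs_a_t` sums to one.
def _example_sarse(online_net, target_net, probs_a_t, s_tm1, a_tm1, r_t,
                   pcont_t, s_t):
  loss, extra = sarse(online_net(s_tm1), a_tm1, r_t, pcont_t, target_net(s_t),
                      probs_a_t, debug=True)
  return tf.reduce_mean(loss), extra.td_error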
def qlambda( q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_, name="GeneralizedQLambda"): """Implements Peng's and Watkins' Q(lambda) loss as a TensorFlow op. This function is general enough to implement both Peng's and Watkins' Q-lambda algorithms. See "Reinforcement Learning: An Introduction" by Sutton and Barto. (http://incompleteideas.net/book/ebook/node78.html). Args: q_tm1: `Tensor` holding a sequence of Q-values starting at the first timestep; shape `[T, B, num_actions]` a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]` r_t: Tensor holding a sequence of rewards, shape `[T, B]` pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]` q_t: `Tensor` holding a sequence of Q-values for second timestep; shape `[T, B, num_actions]`. In a target network setting, this quantity is often supplied by the target network. lambda_: a scalar or `Tensor` of shape `[T, B]` specifying the ratio of mixing between bootstrapped and MC returns; if lambda_ is the same for all time steps then the function implements Peng's Q-learning algorithm; if lambda_ = 0 at every sub-optimal action and a constant otherwise, then the function implements Watkins' Q-learning algorithm. Generally lambda_ can be a Tensor of any values in the range [0, 1] supplied by the user. name: a name of the op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[T, B]`. * `extra`: a namedtuple with fields: * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`. * `td_error`: batch of temporal difference errors, shape `[T, B]`. """ # Rank and compatibility checks. base_ops.wrap_rank_shape_assert([[q_tm1, q_t]], [3], name) if isinstance( lambda_, tf.Tensor ) and lambda_.get_shape().ndims is not None and lambda_.get_shape().ndims > 0: base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t, lambda_]], [2], name) else: base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t]], [2], name) # QLambda op. with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]): # Build target and select head to update. with tf.name_scope("target"): state_values = tf.reduce_max(q_t, axis=2) target = sequence_ops.multistep_forward_view( r_t, pcont_t, state_values, lambda_, back_prop=False) target = tf.stop_gradient(target) qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1) # Temporal difference error and loss. # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error. td_error = target - qa_tm1 loss = 0.5 * tf.square(td_error) return base_ops.LossOutput(loss, QExtra(target, td_error))
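# Illustrative sketch (assumed names): Peng's Q(lambda) over a time-major
# rollout with a constant lambda. Passing a [T, B] tensor of per-step lambdas
# that are zero at sub-optimal actions would give Watkins' Q(lambda) instead.
def _example_peng_qlambda(q_seq_tm1, a_seq_tm1, r_seq, pcont_seq, q_seq_t):
  loss, extra = qlambda(q_seq_tm1, a_seq_tm1, r_seq, pcont_seq, q_seq_t,
                        lambda_=0.8)
  return tf.reduce_mean(tf.reduce_sum(loss, axis=0)), extra.target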