Code Example #1
File: value_ops_test.py  Project: zhuanglineu/trfl
  def _setUp_td_loss(self, gae_lambda=1, sequence_length=4, batch_size=2):
    t, b = sequence_length, batch_size
    self._state_values = tf.placeholder(tf.float32, shape=(t, b))
    self._rewards = tf.placeholder(tf.float32, shape=(t, b))
    self._pcontinues = tf.placeholder(tf.float32, shape=(t, b))
    self._bootstrap_value = tf.placeholder(tf.float32, shape=(b,))
    loss, (td, discounted_returns) = value_ops.td_lambda(
        state_values=self._state_values,
        rewards=self._rewards,
        pcontinues=self._pcontinues,
        bootstrap_value=self._bootstrap_value,
        lambda_=gae_lambda)
    self._loss = loss
    self._temporal_differences = td
    self._discounted_returns = discounted_returns
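For orientation, the following is a minimal, self-contained sketch of how a graph like the one this test helper builds could be evaluated. It assumes a TF1-compatible setup (`tensorflow.compat.v1` with eager execution disabled) and that `td_lambda` is importable from `trfl.value_ops`; the feed values are illustrative only and are not part of the test above.

import numpy as np
import tensorflow.compat.v1 as tf
from trfl import value_ops

tf.disable_eager_execution()  # td_lambda is used here in graph mode.

t, b = 4, 2  # sequence length and batch size, as in the test helper.
state_values = tf.placeholder(tf.float32, shape=(t, b))
rewards = tf.placeholder(tf.float32, shape=(t, b))
pcontinues = tf.placeholder(tf.float32, shape=(t, b))
bootstrap_value = tf.placeholder(tf.float32, shape=(b,))

# td_lambda returns LossOutput(loss, (temporal_differences, discounted_returns)).
loss, (td, discounted_returns) = value_ops.td_lambda(
    state_values=state_values,
    rewards=rewards,
    pcontinues=pcontinues,
    bootstrap_value=bootstrap_value,
    lambda_=1.0)

with tf.Session() as sess:
  feed = {
      state_values: np.zeros((t, b), np.float32),
      rewards: np.ones((t, b), np.float32),
      pcontinues: np.full((t, b), 0.9, np.float32),
      bootstrap_value: np.zeros((b,), np.float32),
  }
  print(sess.run([loss, td, discounted_returns], feed_dict=feed))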
Code Example #2
def sequence_advantage_actor_critic_loss(
    policy_logits, baseline_values, actions, rewards,
    pcontinues, bootstrap_value, lambda_=1, entropy_cost=None,
    baseline_cost=1, normalise_entropy=False,
    name="SequenceAdvantageActorCriticLoss"):
  """Calculates the loss for an A2C update along a batch of trajectories.

  Technically A2C is the special case where lambda=1; for general lambda
  this is the loss for Generalized Advantage Estimation (GAE), modulo chunking
  behaviour if passing chunks of episodes (see `generalized_lambda_returns` for
  more detail).

  Note: This function takes policy _logits_ as input, not the log-policy like
  `learning.deepmind.lua.rl.learners.Reinforce` does.

  This loss jointly learns the policy and the baseline. Therefore, gradients
  for this loss flow through each tensor in `policy_logits` and
  `baseline_values`, but no other input tensors. The policy is learnt with the
  advantage actor-critic loss, plus an optional entropy term. The baseline is
  regressed towards the n-step bootstrapped returns given by the
  reward/pcontinue sequence. The `baseline_cost` parameter scales the
  gradients w.r.t. the baseline relative to the policy gradient, i.e.
  `d(loss) / d(baseline) = baseline_cost * (n_step_return - baseline)`.

  `rewards` and `pcontinues` are the sequences of data taken directly from the
  environment, possibly modulated by a discount. `baseline_values` are the
  sequences of (typically learnt) estimates of the values of the states
  visited along a batch of trajectories as observed by the agent given the
  sequences of one or more actions sampled from the `policy_logits`.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` that takes an action `a` transitions into another state
  with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `a`, `r`
  and `p` are all at the same index `i` in the corresponding tensors. `V'` is
  at index `i+1`, or in the `bootstrap_value` tensor if `i == T`.

  This function accepts a nested array of `policy_logits` and `actions` in order
  to allow for multidimensional discrete action spaces. In this case, the loss
  is given by `sum_i(loss(p_i, a_i))` where `p_i` are members of the
  `policy_logits` nest, and `a_i` are members of the `actions` nest.
  We assume that a single baseline is used across all action dimensions for
  each timestep.

  Args:
    policy_logits: A (possibly nested structure of) 3-D Tensor(s) with shape
        `[T, B, num_actions]` and possibly non-identical values
        of `num_actions`.
    baseline_values: 2-D Tensor containing an estimate of the state value
        `[T, B]`.
    actions: A (possibly nested structure of) 2-D Tensor(s) with shape
        `[T, B]` and integer type.
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    bootstrap_value: 1-D Tensor with shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]` for
        Generalised Advantage Estimation as per
        https://arxiv.org/abs/1506.02438.
    entropy_cost: optional scalar cost that pushes the policy to have high
        entropy, larger values cause higher entropies.
    baseline_cost: scalar cost that scales the derivatives of the baseline
        relative to the policy gradient.
    normalise_entropy: if True, the entropy loss is normalised to the range
        `[-1, 0]` by dividing by the log number of actions. This makes it more
        invariant to the size of the action space. Default is False.
    name: Customises the name_scope for this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the total loss, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: total policy entropy per sequence, shape `[B]`.
        * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
        * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
        * `policy_gradient_loss`: policy gradient loss per sequence,
            shape `[B]`.
        * `advantages`: advantage estimates per timestep, shape `[T, B]`.
        * `discounted_returns`: discounted returns per timestep,
            shape `[T, B]`.
  """
  scoped_values = (nest.flatten(policy_logits) + nest.flatten(actions) +
                   [baseline_values, rewards, pcontinues, bootstrap_value])
  with tf.name_scope(name, values=scoped_values):
    # Loss for the baseline, summed over the time dimension.
    baseline_loss_td, td_lambda = value_ops.td_lambda(
        baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

    # The TD error provides an estimate of the advantages of the actions.
    advantages = td_lambda.temporal_differences
    baseline_loss = tf.multiply(
        tf.convert_to_tensor(baseline_cost, dtype=tf.float32),
        baseline_loss_td,
        name="baseline_loss")

    # Loss for the policy. Doesn't push additional gradients through
    # the advantages.
    policy_gradient_loss = discrete_policy_gradient_loss(
        policy_logits, actions, advantages, name="policy_gradient_loss")

    total_loss = tf.add(policy_gradient_loss, baseline_loss, name="total_loss")

    if entropy_cost is not None:
      entropy_loss_op, policy_entropy = discrete_policy_entropy_loss(
          policy_logits, normalise=normalise_entropy)  # [T,B].
      entropy = tf.reduce_sum(
          policy_entropy.entropy, axis=0, name="entropy")  # [B].
      entropy_loss = tf.multiply(
          tf.convert_to_tensor(entropy_cost, dtype=tf.float32),
          tf.reduce_sum(entropy_loss_op, axis=0),
          name="scaled_entropy_loss")  # [B].
      total_loss = tf.add(total_loss, entropy_loss,
                          name="total_loss_with_entropy")
    else:
      entropy = None
      entropy_loss = None

    extra = SequenceAdvantageActorCriticExtra(
        entropy=entropy, entropy_loss=entropy_loss,
        baseline_loss=baseline_loss,
        policy_gradient_loss=policy_gradient_loss,
        advantages=advantages,
        discounted_returns=td_lambda.discounted_returns)

    return base_ops.LossOutput(total_loss, extra)
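As a usage illustration for the function above, here is a hedged sketch with dummy time-major tensors standing in for network outputs. It assumes a TF1-compatible graph-mode setup and that `sequence_advantage_actor_critic_loss` is exported at the top level of the `trfl` package; the hyperparameter values are arbitrary.

import tensorflow.compat.v1 as tf
import trfl  # assumed to export sequence_advantage_actor_critic_loss

tf.disable_eager_execution()

T, B, num_actions = 5, 3, 4
# In practice these come from a policy/value network; random tensors are used
# here only to make the example self-contained.
policy_logits = tf.random.normal([T, B, num_actions])
baseline_values = tf.random.normal([T, B])
actions = tf.random.uniform([T, B], maxval=num_actions, dtype=tf.int32)
rewards = tf.random.normal([T, B])
pcontinues = 0.99 * tf.ones([T, B])
bootstrap_value = tf.zeros([B])

loss_output = trfl.sequence_advantage_actor_critic_loss(
    policy_logits, baseline_values, actions, rewards,
    pcontinues, bootstrap_value,
    lambda_=1.0, entropy_cost=0.01, baseline_cost=0.5,
    normalise_entropy=True)

# loss_output.loss has shape [B]; average over the batch to obtain a scalar
# training objective, and inspect loss_output.extra for diagnostics.
train_loss = tf.reduce_mean(loss_output.loss)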
Code Example #3
def sequence_a2c_loss(policies,
                      baseline_values,
                      actions,
                      rewards,
                      pcontinues,
                      bootstrap_value,
                      policy_vars=None,
                      lambda_=1,
                      entropy_cost=None,
                      baseline_cost=1,
                      entropy_scale_op=None,
                      name="SequenceA2CLoss"):
    """Constructs a TensorFlow graph computing the A2C/GAE loss for sequences.

  This loss jointly learns the policy and the baseline. Therefore, gradients
  for this loss flow through each tensor in `policies` and through each tensor
  in `baseline_values`, but no other input tensors. The policy is learnt with
  the advantage actor-critic loss, plus an optional entropy term. The baseline
  is regressed towards the n-step bootstrapped returns given by the
  reward/pcontinue sequence. The `baseline_cost` parameter scales the
  gradients w.r.t. the baseline relative to the policy gradient, i.e.
  `d(loss) / d(baseline) = baseline_cost * (n_step_return - baseline)`.

  This function is designed for batches of sequences of data. Tensors are
  assumed to be time major (i.e. the outermost dimension is time, the second
  outermost dimension is the batch dimension). We denote the sequence length in
  the shapes of the arguments with the variable `T`, the batch size with the
  variable `B`, neither of which needs to be known at construction time. Index
  `0` of the time dimension is assumed to be the start of the sequence.

  `rewards` and `pcontinues` are the sequences of data taken directly from the
  environment, possibly modulated by a discount. `baseline_values` are the
  sequences of (typically learnt) estimates of the values of the states
  visited along a batch of trajectories as observed by the agent given the
  sequences of one or more actions sampled from `policies`.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` that takes an action `a` transitions into another state
  with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `a`, `r`
  and `p` are all at the same index `i` in the corresponding tensors. `V'` is
  at index `i+1`, or in the `bootstrap_value` tensor if `i == T`.

  For n-dimensional action vectors, a multivariate distribution must be used
  for `policies`. In case there is no multivariate version for the desired
  univariate distribution, or in case the `actions` object is a nested
  structure (e.g. for multiple action types), this function also accepts a
  nested structure of `policies`. In this case, the loss is given by
  `sum_i(loss(p_i, a_i))` where `p_i` are members of the `policies` nest, and
  `a_i` are members of the `actions` nest. We assume that a single baseline is
  used across all action dimensions for each timestep.

  Args:
    policies: A (possibly nested structure of) distribution(s) supporting
        `batch_shape` and `event_shape` properties & `log_prob` and `entropy`
        methods (e.g. an instance of `tfp.distributions.Distribution`),
        with `batch_shape` equal to `[T, B]`. E.g. for a (non-nested) diagonal
        multivariate gaussian with dimension `A` this would be:
        `policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)`
        where `mus` and `sigmas` have shape `[T, B, A]`.
    baseline_values: 2-D Tensor containing an estimate of the state value with
        shape `[T, B]`.
    actions: A (possibly nested structure of) N-D Tensor(s) with shape
        `[T, B, ...]` where the final dimensions are the `event_shape` of the
        corresponding distribution in the nested structure (the shape can be
        just `[T, B]` if the `event_shape` is scalar).
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    bootstrap_value: 1-D Tensor with shape `[B]`.
    policy_vars: An optional (possibly nested structure of) iterables of
        Tensors used by `policies`. If provided, it is used in scope
        checks. For the multivariate normal example above this would be
        `[mus, sigmas]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]` for
        Generalised Advantage Estimation as per
        https://arxiv.org/abs/1506.02438.
    entropy_cost: optional scalar cost that pushes the policy to have high
        entropy, larger values cause higher entropies.
    baseline_cost: scalar cost that scales the derivatives of the baseline
        relative to the policy gradient.
    entropy_scale_op: An optional op that takes `policies` as its only
        argument and returns a scalar Tensor that is used to scale the entropy
        loss. E.g. for Diag(sigma) Gaussian policies dividing by the number of
        dimensions makes entropy loss invariant to the action space dimension.
        See `policy_entropy_loss` for more info.
    name: Customises the name_scope for this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the total loss, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: total policy entropy per sequence, shape `[B]`.
        * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
        * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
        * `policy_gradient_loss`: policy gradient loss per sequence,
            shape `[B]`.
        * `advantages`: advantage estimates per timestep, shape `[T, B]`.
        * `discounted_returns`: discounted returns per timestep,
            shape `[T, B]`.
  """
    flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
    scoped_values = (flat_policy_vars + nest.flatten(actions) +
                     [baseline_values, rewards, pcontinues, bootstrap_value])
    with tf.name_scope(name, values=scoped_values):
        # Loss for the baseline, summed over the time dimension.
        baseline_loss_td, td_lambda = value_ops.td_lambda(
            baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

        # The TD error provides an estimate of the advantages of the actions.
        advantages = td_lambda.temporal_differences
        baseline_loss = tf.multiply(tf.convert_to_tensor(baseline_cost,
                                                         dtype=tf.float32),
                                    baseline_loss_td,
                                    name="baseline_loss")

        # Loss for the policy. Doesn't push additional gradients through
        # the advantages.
        pg_loss = policy_gradient_loss(policies,
                                       actions,
                                       advantages,
                                       policy_vars,
                                       name="policy_gradient_loss")

        total_loss = tf.add(pg_loss, baseline_loss, name="total_loss")

        if entropy_cost is not None:
            loss, extra = policy_entropy_loss(policies, policy_vars,
                                              entropy_scale_op)
            entropy = tf.reduce_sum(extra.entropy, axis=0,
                                    name="entropy")  # [B].
            entropy_loss = tf.multiply(tf.convert_to_tensor(entropy_cost,
                                                            dtype=tf.float32),
                                       tf.reduce_sum(loss, axis=0),
                                       name="scaled_entropy_loss")  # [B].
            total_loss = tf.add(total_loss,
                                entropy_loss,
                                name="total_loss_with_entropy")
        else:
            entropy = None
            entropy_loss = None

        extra = SequenceA2CExtra(
            entropy=entropy,
            entropy_loss=entropy_loss,
            baseline_loss=baseline_loss,
            policy_gradient_loss=pg_loss,
            advantages=advantages,
            discounted_returns=td_lambda.discounted_returns)
        return base_ops.LossOutput(total_loss, extra)
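To make the alignment convention in the docstring concrete, the short NumPy sketch below (illustrative pseudocode for the `lambda_=1` case, not trfl library code) computes the bootstrapped returns backwards from `bootstrap_value` and the resulting advantages for a single trajectory.

import numpy as np

# One trajectory (batch dimension omitted), lambda_ = 1:
# r_i and p_i are received on the transition out of state i, whose value
# estimate is baseline_values[i]; bootstrap_value estimates the value of the
# state reached after the final transition.
rewards = np.array([1.0, 0.0, 2.0])
pcontinues = np.array([0.9, 0.9, 0.9])
baseline_values = np.array([0.5, 0.4, 0.3])
bootstrap_value = 0.2

returns = np.zeros_like(rewards)
next_return = bootstrap_value
for i in reversed(range(len(rewards))):
  # G_i = r_i + p_i * G_{i+1}, with the final G given by bootstrap_value.
  returns[i] = rewards[i] + pcontinues[i] * next_return
  next_return = returns[i]

advantages = returns - baseline_values  # what the TD errors estimate here
print(returns, advantages)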
Code Example #5
File: policy_gradient_ops.py  Project: wmiao1769/trfl
def sequence_a2c_loss(policies,
                      baseline_values,
                      actions,
                      rewards,
                      pcontinues,
                      bootstrap_value,
                      policy_vars=None,
                      lambda_=1,
                      entropy_cost=None,
                      baseline_cost=1,
                      entropy_scale_op=None,
                      name="SequenceA2CLoss"):
  """Constructs a TensorFlow graph computing the A2C/GAE loss for sequences.

  This loss jointly learns the policy and the baseline. Therefore, gradients
  for this loss flow through each tensor in `policies` and through each tensor
  in `baseline_values`, but no other input tensors. The policy is learnt with
  the advantage actor-critic loss, plus an optional entropy term. The baseline
  is regressed towards the n-step bootstrapped returns given by the
  reward/pcontinue sequence. The `baseline_cost` parameter scales the
  gradients w.r.t. the baseline relative to the policy gradient, i.e.
  `d(loss) / d(baseline) = baseline_cost * (n_step_return - baseline)`.

  This function is designed for batches of sequences of data. Tensors are
  assumed to be time major (i.e. the outermost dimension is time, the second
  outermost dimension is the batch dimension). We denote the sequence length in
  the shapes of the arguments with the variable `T`, the batch size with the
  variable `B`, neither of which needs to be known at construction time. Index
  `0` of the time dimension is assumed to be the start of the sequence.

  `rewards` and `pcontinues` are the sequences of data taken directly from the
  environment, possibly modulated by a discount. `baseline_values` are the
  sequences of (typically learnt) estimates of the values of the states
  visited along a batch of trajectories as observed by the agent given the
  sequences of one or more actions sampled from `policies`.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` that takes an action `a` transitions into another state
  with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `a`, `r`
  and `p` are all at the same index `i` in the corresponding tensors. `V'` is
  at index `i+1`, or in the `bootstrap_value` tensor if `i == T`.

  For n-dimensional action vectors, a multivariate distribution must be used
  for `policies`. In case there is no multivariate version for the desired
  univariate distribution, or in case the `actions` object is a nested
  structure (e.g. for multiple action types), this function also accepts a
  nested structure of `policies`. In this case, the loss is given by
  `sum_i(loss(p_i, a_i))` where `p_i` are members of the `policies` nest, and
  `a_i` are members of the `actions` nest. We assume that a single baseline is
  used across all action dimensions for each timestep.

  Args:
    policies: A (possibly nested structure of) distribution(s) supporting
        `batch_shape` and `event_shape` properties & `log_prob` and `entropy`
        methods (e.g. an instance of `tfp.distributions.Distribution`),
        with `batch_shape` equal to `[T, B]`. E.g. for a (non-nested) diagonal
        multivariate gaussian with dimension `A` this would be:
        `policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)`
        where `mus` and `sigmas` have shape `[T, B, A]`.
    baseline_values: 2-D Tensor containing an estimate of the state value with
        shape `[T, B]`.
    actions: A (possibly nested structure of) N-D Tensor(s) with shape
        `[T, B, ...]` where the final dimensions are the `event_shape` of the
        corresponding distribution in the nested structure (the shape can be
        just `[T, B]` if the `event_shape` is scalar).
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    bootstrap_value: 1-D Tensor with shape `[B]`.
    policy_vars: An optional (possibly nested structure of) iterables of
        Tensors used by `policies`. If provided, it is used in scope
        checks. For the multivariate normal example above this would be
        `[mus, sigmas]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]` for
        Generalised Advantage Estimation as per
        https://arxiv.org/abs/1506.02438.
    entropy_cost: optional scalar cost that pushes the policy to have high
        entropy, larger values cause higher entropies.
    baseline_cost: scalar cost that scales the derivatives of the baseline
        relative to the policy gradient.
    entropy_scale_op: An optional op that takes `policies` as its only
        argument and returns a scalar Tensor that is used to scale the entropy
        loss. E.g. for Diag(sigma) Gaussian policies dividing by the number of
        dimensions makes entropy loss invariant to the action space dimension.
        See `policy_entropy_loss` for more info.
    name: Customises the name_scope for this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the total loss, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: total policy entropy per sequence, shape `[B]`.
        * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
        * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
        * `policy_gradient_loss`: policy gradient loss per sequence,
            shape `[B]`.
        * `advantages`: advantage estimates per timestep, shape `[T, B]`.
        * `discounted_returns`: discounted returns per timestep,
            shape `[T, B]`.
  """
  flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
  scoped_values = (flat_policy_vars + nest.flatten(actions) +
                   [baseline_values, rewards, pcontinues, bootstrap_value])
  with tf.name_scope(name, values=scoped_values):
    # Loss for the baseline, summed over the time dimension.
    baseline_loss_td, td_lambda = value_ops.td_lambda(
        baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

    # The TD error provides an estimate of the advantages of the actions.
    advantages = td_lambda.temporal_differences
    baseline_loss = tf.multiply(
        tf.convert_to_tensor(baseline_cost, dtype=tf.float32),
        baseline_loss_td,
        name="baseline_loss")

    # Loss for the policy. Doesn't push additional gradients through
    # the advantages.
    pg_loss = policy_gradient_loss(
        policies, actions, advantages, policy_vars,
        name="policy_gradient_loss")

    total_loss = tf.add(pg_loss, baseline_loss, name="total_loss")

    if entropy_cost is not None:
      loss, extra = policy_entropy_loss(policies, policy_vars, entropy_scale_op)
      entropy = tf.reduce_sum(extra.entropy, axis=0, name="entropy")  # [B].
      entropy_loss = tf.multiply(
          tf.convert_to_tensor(entropy_cost, dtype=tf.float32),
          tf.reduce_sum(loss, axis=0),
          name="scaled_entropy_loss")  # [B].
      total_loss = tf.add(total_loss, entropy_loss,
                          name="total_loss_with_entropy")
    else:
      entropy = None
      entropy_loss = None

    extra = SequenceA2CExtra(
        entropy=entropy,
        entropy_loss=entropy_loss,
        baseline_loss=baseline_loss,
        policy_gradient_loss=pg_loss,
        advantages=advantages,
        discounted_returns=td_lambda.discounted_returns)
    return base_ops.LossOutput(total_loss, extra)
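Finally, a hedged usage sketch for the continuous-action case, mirroring the diagonal-Gaussian example in the docstring above. It assumes `tensorflow_probability` is available, a TF1-compatible graph-mode setup, and that `sequence_a2c_loss` is exported at the top level of the `trfl` package; shapes and constants are illustrative.

import tensorflow.compat.v1 as tf
import tensorflow_probability as tfp
import trfl  # assumed to export sequence_a2c_loss

tf.disable_eager_execution()

T, B, A = 5, 3, 2  # time, batch, action dimension
mus = tf.random.normal([T, B, A])
sigmas = tf.ones([T, B, A])
policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)

baseline_values = tf.random.normal([T, B])
actions = tf.random.normal([T, B, A])  # stands in for sampled actions
rewards = tf.random.normal([T, B])
pcontinues = 0.99 * tf.ones([T, B])
bootstrap_value = tf.zeros([B])

loss_output = trfl.sequence_a2c_loss(
    policies, baseline_values, actions, rewards, pcontinues, bootstrap_value,
    policy_vars=[mus, sigmas],  # as suggested in the docstring above
    lambda_=1.0, entropy_cost=0.01, baseline_cost=0.5)

train_loss = tf.reduce_mean(loss_output.loss)  # [B] -> scalar objective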