Example #1
def _entropy_loss(distributions, spec, weights=None):
  """Computes entropy loss.

  Args:
    distributions: A possibly batched tuple of distributions.
    spec: A nested tuple representing the action spec.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights.  Includes a mask for invalid timesteps.

  Returns:
    A Tensor representing the entropy loss.
  """
  with tf.name_scope('entropy_regularization'):
    entropy = -tf.cast(common.entropy(distributions, spec), tf.float32)
    if weights is not None:
      entropy *= weights
    return tf.reduce_mean(input_tensor=entropy)
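For context, here is a minimal usage sketch of the helper above, assuming _entropy_loss is in scope (along with the common / tf_agents modules it depends on); the spec, distribution, and weights below are illustrative values only:

import tensorflow as tf
import tensorflow_probability as tfp
from tf_agents.specs import tensor_spec

# A single continuous action spec and a matching batched distribution
# (batch size 3, two action dimensions).
action_spec = tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1)
distribution = tfp.distributions.Normal(
    loc=tf.zeros([3, 2]), scale=tf.ones([3, 2]))

# Per-batch-entry weights; the zero masks out the last entry.
weights = tf.constant([1.0, 1.0, 0.0])

loss = _entropy_loss(distribution, action_spec, weights)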
Example #2
  def testNestedEntropy(self):
    action_spec = [
        tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1),
        [tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
         tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)]]
    distribution = [
        tfp.distributions.Normal([0.0, 0.0], [1.0, 2.0]),
        [
            tfp.distributions.Normal([0.5], [1.0]),
            tfp.distributions.Normal([-0.5], [2.0])
        ]
    ]
    entropies = common.entropy(distribution, action_spec)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    entropies_ = self.evaluate(entropies)
    self.assertEqual(len(entropies_.shape), 0)
    self.assertNear(entropies_,
                    2.0 + np.log(2 * np.pi) + np.log(8 * np.pi),
                    0.001)
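The expected value in the assertion is the sum of the closed-form Gaussian entropies, 0.5 + 0.5 * log(2 * pi * sigma**2) per scalar component, over the four components above (sigma = 1, 2, 1, 2), which simplifies to 2 + log(2*pi) + log(8*pi). A quick NumPy check of that arithmetic:

import numpy as np

stddevs = np.array([1.0, 2.0, 1.0, 2.0])  # the four scalar components above
component_entropies = 0.5 + 0.5 * np.log(2 * np.pi * stddevs**2)
print(component_entropies.sum())                    # ~7.062
print(2.0 + np.log(2 * np.pi) + np.log(8 * np.pi))  # same value, ~7.062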
Example #3
  def testBatchedNestedEntropy(self):
    action_spec = [
        tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1),
        [tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
         tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)]]
    distribution = [
        tfp.distributions.Normal([[0.0, 0.0], [0.0, 0.0]],
                                 [[1.0, 1.0], [2.0, 2.0]]),
        [
            tfp.distributions.Normal([[0.5], [0.5]], [[1.0], [2.0]]),
            tfp.distributions.Normal([[-0.5], [-0.5]], [[1.0], [2.0]])
        ]
    ]
    entropies = common.entropy(distribution, action_spec)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    entropies_ = self.evaluate(entropies)
    self.assertEqual(entropies_.shape, (2,))
    self.assertAllClose(entropies_,
                        [4 * (0.5 + 0.5 * np.log(2 * 3.14159)),
                         4 * (0.5 + 0.5 * np.log(8 * 3.14159))], 0.001)
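In the batched case each batch entry sums the entropies of its own four scalar components (all sigma = 1 for the first entry, all sigma = 2 for the second), which is where the factor of 4 in the expected values comes from. A per-entry NumPy check:

import numpy as np

stddevs = np.array([[1.0, 1.0, 1.0, 1.0],   # batch entry 0
                    [2.0, 2.0, 2.0, 2.0]])  # batch entry 1
per_entry = (0.5 + 0.5 * np.log(2 * np.pi * stddevs**2)).sum(axis=1)
print(per_entry)  # ~[5.676, 8.448]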
Example #4
    def policy_gradient_loss(self,
                             time_steps,
                             actions,
                             sample_action_log_probs,
                             advantages,
                             current_policy_distribution,
                             weights,
                             debug_summaries=False):
        """Create tensor for policy gradient loss.

        All tensors should have a single batch dimension.

        Args:
          time_steps: TimeSteps with observations for each timestep.
          actions: Tensor of actions for timesteps, aligned on index.
          sample_action_log_probs: Tensor of the sample log probability of each
            action.
          advantages: Tensor of advantage estimate for each timestep, aligned on
            index. Works better when advantage estimates are normalized.
          current_policy_distribution: The policy distribution, evaluated on all
            time_steps.
          weights: Optional scalar or element-wise (per-batch-entry) importance
            weights. Includes a mask for invalid timesteps.
          debug_summaries: True if debug summaries should be created.

        Returns:
          policy_gradient_loss: A tensor that will contain the policy gradient
            loss for the on-policy experience.
        """
        tf.nest.assert_same_structure(time_steps, self.time_step_spec)
        action_log_prob = common.log_probability(current_policy_distribution,
                                                 actions, self._action_spec)
        action_log_prob = tf.cast(action_log_prob, tf.float32)
        if self._log_prob_clipping > 0.0:
            action_log_prob = tf.clip_by_value(action_log_prob,
                                               -self._log_prob_clipping,
                                               self._log_prob_clipping)
        if self._check_numerics:
            action_log_prob = tf.debugging.check_numerics(
                action_log_prob, 'action_log_prob')

        # Prepare both clipped and unclipped importance ratios.
        importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - self._importance_ratio_clipping,
            1 + self._importance_ratio_clipping)

        if self._check_numerics:
            importance_ratio = tf.debugging.check_numerics(
                importance_ratio, 'importance_ratio')
            if self._importance_ratio_clipping > 0.0:
                importance_ratio_clipped = tf.debugging.check_numerics(
                    importance_ratio_clipped, 'importance_ratio_clipped')

        # Pessimistically choose the minimum objective value for clipped and
        #   unclipped importance ratios.
        per_timestep_objective = importance_ratio * advantages
        per_timestep_objective_clipped = importance_ratio_clipped * advantages
        per_timestep_objective_min = tf.minimum(
            per_timestep_objective, per_timestep_objective_clipped)

        if self._importance_ratio_clipping > 0.0:
            policy_gradient_loss = -per_timestep_objective_min
        else:
            policy_gradient_loss = -per_timestep_objective

        policy_gradient_loss = tf.reduce_mean(
            input_tensor=policy_gradient_loss * weights)

        if debug_summaries:
            if self._importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(input_tensor=tf.cast(
                    tf.greater(tf.abs(importance_ratio - 1.0),
                               self._importance_ratio_clipping), tf.float32))
                tf.compat.v2.summary.scalar(name='clip_fraction',
                                            data=clip_fraction,
                                            step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='action_log_prob',
                                           data=action_log_prob,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='action_log_prob_sample',
                                           data=sample_action_log_probs,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='importance_ratio',
                                           data=importance_ratio,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='importance_ratio_mean',
                data=tf.reduce_mean(input_tensor=importance_ratio),
                step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='importance_ratio_clipped',
                                           data=importance_ratio_clipped,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='per_timestep_objective',
                                           data=per_timestep_objective,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(
                name='per_timestep_objective_clipped',
                data=per_timestep_objective_clipped,
                step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='per_timestep_objective_min',
                                           data=per_timestep_objective_min,
                                           step=self.train_step_counter)
            entropy = common.entropy(current_policy_distribution,
                                     self.action_spec)
            tf.compat.v2.summary.histogram(name='policy_entropy',
                                           data=entropy,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='policy_entropy_mean',
                data=tf.reduce_mean(input_tensor=entropy),
                step=self.train_step_counter)
            for i, (single_action, single_distribution) in enumerate(
                    zip(tf.nest.flatten(self.action_spec),
                        tf.nest.flatten(current_policy_distribution))):
                # Categorical distribution (used for discrete actions) doesn't have a
                # mean.
                distribution_index = '_{}'.format(i) if i > 0 else ''
                if not tensor_spec.is_discrete(single_action):
                    tf.compat.v2.summary.histogram(
                        name='actions_distribution_mean' + distribution_index,
                        data=single_distribution.mean(),
                        step=self.train_step_counter)
                    tf.compat.v2.summary.histogram(
                        name='actions_distribution_stddev' +
                        distribution_index,
                        data=single_distribution.stddev(),
                        step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='policy_gradient_loss',
                                           data=policy_gradient_loss,
                                           step=self.train_step_counter)

        if self._check_numerics:
            policy_gradient_loss = tf.debugging.check_numerics(
                policy_gradient_loss, 'policy_gradient_loss')

        return policy_gradient_loss
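The core of this method is the standard PPO clipped-surrogate objective: form the importance ratio exp(log pi_new - log pi_old), clip it to [1 - epsilon, 1 + epsilon], take the elementwise minimum of the clipped and unclipped ratio-weighted advantages (the pessimistic choice), then negate and average. A minimal standalone sketch of just that computation on toy tensors, using plain TensorFlow; the names and values here are illustrative, not part of the TF-Agents API:

import tensorflow as tf

epsilon = 0.2  # importance-ratio clipping parameter (illustrative)

new_log_probs = tf.constant([-1.0, -0.5, -2.0])  # log prob under current policy
old_log_probs = tf.constant([-1.2, -0.4, -1.0])  # log prob at sampling time
advantages = tf.constant([1.0, -1.0, 0.5])

ratio = tf.exp(new_log_probs - old_log_probs)
ratio_clipped = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)

# Pessimistically take the minimum objective, then negate to obtain a loss.
objective = tf.minimum(ratio * advantages, ratio_clipped * advantages)
loss = -tf.reduce_mean(objective)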
Example #5
    def policy_gradient_loss(self,
                             time_steps,
                             actions,
                             sample_action_log_probs,
                             advantages,
                             current_policy_distribution,
                             valid_mask,
                             debug_summaries=False):
        """Create tensor for policy gradient loss.

        All tensors should have a single batch dimension.

        Args:
          time_steps: TimeSteps with observations for each timestep.
          actions: Tensor of actions for timesteps, aligned on index.
          sample_action_log_probs: Tensor of the sample log probability of each
            action.
          advantages: Tensor of advantage estimate for each timestep, aligned on
            index. Works better when advantage estimates are normalized.
          current_policy_distribution: The policy distribution, evaluated on all
            time_steps.
          valid_mask: Mask for invalid timesteps. Float value 1.0 for valid
            timesteps and 0.0 for invalid timesteps. (Timesteps which are either
            between two episodes, or part of an unfinished episode at the end of
            one batch dimension.)
          debug_summaries: True if debug summaries should be created.

        Returns:
          policy_gradient_loss: A tensor that will contain the policy gradient
            loss for the on-policy experience.
        """
        nest.assert_same_structure(time_steps, self.time_step_spec())
        action_log_prob = common_utils.log_probability(
            current_policy_distribution, actions, self._action_spec)
        action_log_prob = tf.to_float(action_log_prob)
        if self._log_prob_clipping > 0.0:
            action_log_prob = tf.clip_by_value(action_log_prob,
                                               -self._log_prob_clipping,
                                               self._log_prob_clipping)
        if self._check_numerics:
            action_log_prob = tf.check_numerics(action_log_prob,
                                                'action_log_prob')

        # Prepare both clipped and unclipped importance ratios.
        importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - self._importance_ratio_clipping,
            1 + self._importance_ratio_clipping)

        if self._check_numerics:
            importance_ratio = tf.check_numerics(importance_ratio,
                                                 'importance_ratio')
            if self._importance_ratio_clipping > 0.0:
                importance_ratio_clipped = tf.check_numerics(
                    importance_ratio_clipped, 'importance_ratio_clipped')

        # Pessimistically choose the minimum objective value for clipped and
        #   unclipped importance ratios.
        per_timestep_objective = importance_ratio * advantages
        per_timestep_objective_clipped = importance_ratio_clipped * advantages
        per_timestep_objective_min = tf.minimum(
            per_timestep_objective, per_timestep_objective_clipped)

        if self._importance_ratio_clipping > 0.0:
            policy_gradient_loss = -per_timestep_objective_min
        else:
            policy_gradient_loss = -per_timestep_objective
        policy_gradient_loss = tf.reduce_mean(policy_gradient_loss *
                                              valid_mask)

        if debug_summaries:
            if self._importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(
                    tf.to_float(
                        tf.greater(tf.abs(importance_ratio - 1.0),
                                   self._importance_ratio_clipping)))
                tf.contrib.summary.scalar('clip_fraction', clip_fraction)
            tf.contrib.summary.histogram('action_log_prob', action_log_prob)
            tf.contrib.summary.histogram('action_log_prob_sample',
                                         sample_action_log_probs)
            tf.contrib.summary.histogram('importance_ratio', importance_ratio)
            tf.contrib.summary.scalar('importance_ratio_mean',
                                      tf.reduce_mean(importance_ratio))
            tf.contrib.summary.histogram('importance_ratio_clipped',
                                         importance_ratio_clipped)
            tf.contrib.summary.histogram('per_timestep_objective',
                                         per_timestep_objective)
            tf.contrib.summary.histogram('per_timestep_objective_clipped',
                                         per_timestep_objective_clipped)
            tf.contrib.summary.histogram('per_timestep_objective_min',
                                         per_timestep_objective_min)
            entropy = common_utils.entropy(current_policy_distribution,
                                           self.action_spec())
            tf.contrib.summary.histogram('policy_entropy', entropy)
            tf.contrib.summary.scalar('policy_entropy_mean',
                                      tf.reduce_mean(entropy))
            # Categorical distribution (used for discrete actions)
            # doesn't have a mean.
            if not self.action_spec().is_discrete():
                tf.contrib.summary.histogram(
                    'actions_distribution_mean',
                    current_policy_distribution.mean())
                tf.contrib.summary.histogram(
                    'actions_distribution_stddev',
                    current_policy_distribution.stddev())
            tf.contrib.summary.histogram('policy_gradient_loss',
                                         policy_gradient_loss)

        if self._check_numerics:
            policy_gradient_loss = tf.check_numerics(policy_gradient_loss,
                                                     'policy_gradient_loss')

        return policy_gradient_loss
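One detail shared by both versions: the mask (weights / valid_mask) enters through multiplication followed by a plain tf.reduce_mean, so invalid timesteps contribute zeros to the numerator but still count in the denominator; this is not the same as averaging over valid timesteps only. A small sketch of the difference, with illustrative values:

import tensorflow as tf

per_timestep_loss = tf.constant([1.0, 2.0, 3.0, 4.0])
valid_mask = tf.constant([1.0, 1.0, 0.0, 0.0])

# What the code above computes: (1 + 2) / 4 = 0.75.
weighted_mean = tf.reduce_mean(per_timestep_loss * valid_mask)

# A masked average, for comparison: (1 + 2) / 2 = 1.5.
masked_mean = (tf.reduce_sum(per_timestep_loss * valid_mask) /
               tf.reduce_sum(valid_mask))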