def _entropy_loss(distributions, spec, weights=None):
  """Computes entropy loss.

  Args:
    distributions: A possibly batched tuple of distributions.
    spec: A nested tuple representing the action spec.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights. Includes a mask for invalid timesteps.

  Returns:
    A Tensor representing the entropy loss.
  """
  with tf.name_scope('entropy_regularization'):
    entropy = -tf.cast(common.entropy(distributions, spec), tf.float32)
    if weights is not None:
      entropy *= weights
    return tf.reduce_mean(input_tensor=entropy)
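# Illustrative sketch only: one way _entropy_loss might be called with a
# single unbatched Normal action distribution, mirroring the style of the
# tests below. The spec/distribution values here are hypothetical; it assumes
# the tf_agents imports used elsewhere in this module.
#
# import tensorflow as tf
# import tensorflow_probability as tfp
# from tf_agents.specs import tensor_spec
# from tf_agents.utils import common
#
# action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
# dist = tfp.distributions.Normal(loc=[0.0], scale=[1.0])
# loss = _entropy_loss(dist, action_spec, weights=1.0)  # scalar Tensor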
def testNestedEntropy(self):
  action_spec = [
      tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1),
      [tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
       tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)]]
  distribution = [
      tfp.distributions.Normal([0.0, 0.0], [1.0, 2.0]),
      [tfp.distributions.Normal([0.5], [1.0]),
       tfp.distributions.Normal([-0.5], [2.0])]]
  entropies = common.entropy(distribution, action_spec)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  entropies_ = self.evaluate(entropies)
  self.assertEqual(len(entropies_.shape), 0)
  self.assertNear(entropies_,
                  2.0 + np.log(2 * 3.14) + np.log(8 * 3.14159),
                  0.001)
def testBatchedNestedEntropy(self):
  action_spec = [
      tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1),
      [tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
       tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)]]
  distribution = [
      tfp.distributions.Normal([[0.0, 0.0], [0.0, 0.0]],
                               [[1.0, 1.0], [2.0, 2.0]]),
      [tfp.distributions.Normal([[0.5], [0.5]], [[1.0], [2.0]]),
       tfp.distributions.Normal([[-0.5], [-0.5]], [[1.0], [2.0]])]]
  entropies = common.entropy(distribution, action_spec)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  entropies_ = self.evaluate(entropies)
  self.assertEqual(entropies_.shape, (2,))
  self.assertAllClose(entropies_,
                      [4 * (0.5 + 0.5 * np.log(2 * 3.14159)),
                       4 * (0.5 + 0.5 * np.log(8 * 3.14159))],
                      0.001)
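# Sanity-check sketch (not part of the test suite) for where the expected
# literals in the two tests above come from. As the tests exercise it,
# common.entropy appears to sum per-component differential entropies across
# the flattened action nest; for Normal(mu, sigma) that entropy is
# 0.5 + 0.5 * ln(2 * pi * sigma**2).
import numpy as np

def normal_entropy(sigma):
  # Closed-form differential entropy of a univariate Normal.
  return 0.5 + 0.5 * np.log(2 * np.pi * sigma**2)

# Unbatched test: stddevs [1, 2] from the first spec plus nested [1] and [2].
unbatched = sum(normal_entropy(s) for s in [1.0, 2.0, 1.0, 2.0])
assert np.isclose(unbatched, 2.0 + np.log(2 * np.pi) + np.log(8 * np.pi))

# Batched test: batch entry 0 holds four stddev-1 components, entry 1 holds
# four stddev-2 components, matching the asserted per-entry values.
batched = [4 * normal_entropy(1.0), 4 * normal_entropy(2.0)]
print(unbatched, batched)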
def policy_gradient_loss(self,
                         time_steps,
                         actions,
                         sample_action_log_probs,
                         advantages,
                         current_policy_distribution,
                         weights,
                         debug_summaries=False):
  """Create tensor for policy gradient loss.

  All tensors should have a single batch dimension.

  Args:
    time_steps: TimeSteps with observations for each timestep.
    actions: Tensor of actions for timesteps, aligned on index.
    sample_action_log_probs: Tensor of sample log probability of each action.
    advantages: Tensor of advantage estimate for each timestep, aligned on
      index. Works better when advantage estimates are normalized.
    current_policy_distribution: The policy distribution, evaluated on all
      time_steps.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights. Includes a mask for invalid timesteps.
    debug_summaries: True if debug summaries should be created.

  Returns:
    policy_gradient_loss: A tensor that will contain policy gradient loss for
      the on-policy experience.
  """
  tf.nest.assert_same_structure(time_steps, self.time_step_spec)
  action_log_prob = common.log_probability(current_policy_distribution,
                                           actions, self._action_spec)
  action_log_prob = tf.cast(action_log_prob, tf.float32)
  if self._log_prob_clipping > 0.0:
    action_log_prob = tf.clip_by_value(action_log_prob,
                                       -self._log_prob_clipping,
                                       self._log_prob_clipping)
  if self._check_numerics:
    action_log_prob = tf.debugging.check_numerics(action_log_prob,
                                                  'action_log_prob')

  # Prepare both clipped and unclipped importance ratios.
  importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
  importance_ratio_clipped = tf.clip_by_value(
      importance_ratio, 1 - self._importance_ratio_clipping,
      1 + self._importance_ratio_clipping)

  if self._check_numerics:
    importance_ratio = tf.debugging.check_numerics(importance_ratio,
                                                   'importance_ratio')
    if self._importance_ratio_clipping > 0.0:
      importance_ratio_clipped = tf.debugging.check_numerics(
          importance_ratio_clipped, 'importance_ratio_clipped')

  # Pessimistically choose the minimum objective value for clipped and
  # unclipped importance ratios.
  per_timestep_objective = importance_ratio * advantages
  per_timestep_objective_clipped = importance_ratio_clipped * advantages
  per_timestep_objective_min = tf.minimum(per_timestep_objective,
                                          per_timestep_objective_clipped)

  if self._importance_ratio_clipping > 0.0:
    policy_gradient_loss = -per_timestep_objective_min
  else:
    policy_gradient_loss = -per_timestep_objective

  policy_gradient_loss = tf.reduce_mean(
      input_tensor=policy_gradient_loss * weights)

  if debug_summaries:
    if self._importance_ratio_clipping > 0.0:
      clip_fraction = tf.reduce_mean(
          input_tensor=tf.cast(
              tf.greater(tf.abs(importance_ratio - 1.0),
                         self._importance_ratio_clipping), tf.float32))
      tf.compat.v2.summary.scalar(
          name='clip_fraction',
          data=clip_fraction,
          step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='action_log_prob',
        data=action_log_prob,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='action_log_prob_sample',
        data=sample_action_log_probs,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='importance_ratio',
        data=importance_ratio,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='importance_ratio_mean',
        data=tf.reduce_mean(input_tensor=importance_ratio),
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='importance_ratio_clipped',
        data=importance_ratio_clipped,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='per_timestep_objective',
        data=per_timestep_objective,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='per_timestep_objective_clipped',
        data=per_timestep_objective_clipped,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='per_timestep_objective_min',
        data=per_timestep_objective_min,
        step=self.train_step_counter)
    entropy = common.entropy(current_policy_distribution, self.action_spec)
    tf.compat.v2.summary.histogram(
        name='policy_entropy', data=entropy, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='policy_entropy_mean',
        data=tf.reduce_mean(input_tensor=entropy),
        step=self.train_step_counter)
    for i, (single_action, single_distribution) in enumerate(
        zip(tf.nest.flatten(self.action_spec),
            tf.nest.flatten(current_policy_distribution))):
      # Categorical distribution (used for discrete actions) doesn't have a
      # mean.
      distribution_index = '_{}'.format(i) if i > 0 else ''
      if not tensor_spec.is_discrete(single_action):
        tf.compat.v2.summary.histogram(
            name='actions_distribution_mean' + distribution_index,
            data=single_distribution.mean(),
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='actions_distribution_stddev' + distribution_index,
            data=single_distribution.stddev(),
            step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='policy_gradient_loss',
        data=policy_gradient_loss,
        step=self.train_step_counter)

  if self._check_numerics:
    policy_gradient_loss = tf.debugging.check_numerics(
        policy_gradient_loss, 'policy_gradient_loss')

  return policy_gradient_loss
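# A minimal numeric sketch of the clipped surrogate objective computed above,
# using hypothetical toy values rather than agent state. With clipping
# epsilon, the per-timestep objective is
#   min(r * A, clip(r, 1 - epsilon, 1 + epsilon) * A),
# and the loss is the negative weighted mean of that pessimistic minimum.
import tensorflow as tf

epsilon = 0.2
action_log_prob = tf.constant([-0.9, -1.2, -0.4])           # current policy
sample_action_log_probs = tf.constant([-1.0, -1.0, -1.0])   # behavior policy
advantages = tf.constant([1.0, -1.0, 2.0])
weights = tf.constant([1.0, 1.0, 0.0])  # e.g. last timestep masked out

importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
importance_ratio_clipped = tf.clip_by_value(
    importance_ratio, 1 - epsilon, 1 + epsilon)
per_timestep_objective_min = tf.minimum(importance_ratio * advantages,
                                        importance_ratio_clipped * advantages)
loss = tf.reduce_mean(-per_timestep_objective_min * weights)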
def policy_gradient_loss(self,
                         time_steps,
                         actions,
                         sample_action_log_probs,
                         advantages,
                         current_policy_distribution,
                         valid_mask,
                         debug_summaries=False):
  """Create tensor for policy gradient loss.

  All tensors should have a single batch dimension.

  Args:
    time_steps: TimeSteps with observations for each timestep.
    actions: Tensor of actions for timesteps, aligned on index.
    sample_action_log_probs: Tensor of sample log probability of each action.
    advantages: Tensor of advantage estimate for each timestep, aligned on
      index. Works better when advantage estimates are normalized.
    current_policy_distribution: The policy distribution, evaluated on all
      time_steps.
    valid_mask: Mask for invalid timesteps. Float value 1.0 for valid
      timesteps and 0.0 for invalid timesteps. (Timesteps which are either
      between two episodes, or part of an unfinished episode at the end of
      one batch dimension.)
    debug_summaries: True if debug summaries should be created.

  Returns:
    policy_gradient_loss: A tensor that will contain policy gradient loss for
      the on-policy experience.
  """
  nest.assert_same_structure(time_steps, self.time_step_spec())
  action_log_prob = common_utils.log_probability(current_policy_distribution,
                                                 actions, self._action_spec)
  action_log_prob = tf.to_float(action_log_prob)
  if self._log_prob_clipping > 0.0:
    action_log_prob = tf.clip_by_value(action_log_prob,
                                       -self._log_prob_clipping,
                                       self._log_prob_clipping)
  if self._check_numerics:
    action_log_prob = tf.check_numerics(action_log_prob, 'action_log_prob')

  # Prepare both clipped and unclipped importance ratios.
  importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
  importance_ratio_clipped = tf.clip_by_value(
      importance_ratio, 1 - self._importance_ratio_clipping,
      1 + self._importance_ratio_clipping)

  if self._check_numerics:
    importance_ratio = tf.check_numerics(importance_ratio, 'importance_ratio')
    if self._importance_ratio_clipping > 0.0:
      importance_ratio_clipped = tf.check_numerics(
          importance_ratio_clipped, 'importance_ratio_clipped')

  # Pessimistically choose the minimum objective value for clipped and
  # unclipped importance ratios.
  per_timestep_objective = importance_ratio * advantages
  per_timestep_objective_clipped = importance_ratio_clipped * advantages
  per_timestep_objective_min = tf.minimum(per_timestep_objective,
                                          per_timestep_objective_clipped)

  if self._importance_ratio_clipping > 0.0:
    policy_gradient_loss = -per_timestep_objective_min
  else:
    policy_gradient_loss = -per_timestep_objective

  policy_gradient_loss = tf.reduce_mean(policy_gradient_loss * valid_mask)

  if debug_summaries:
    if self._importance_ratio_clipping > 0.0:
      clip_fraction = tf.reduce_mean(
          tf.to_float(
              tf.greater(tf.abs(importance_ratio - 1.0),
                         self._importance_ratio_clipping)))
      tf.contrib.summary.scalar('clip_fraction', clip_fraction)
    tf.contrib.summary.histogram('action_log_prob', action_log_prob)
    tf.contrib.summary.histogram('action_log_prob_sample',
                                 sample_action_log_probs)
    tf.contrib.summary.histogram('importance_ratio', importance_ratio)
    tf.contrib.summary.scalar('importance_ratio_mean',
                              tf.reduce_mean(importance_ratio))
    tf.contrib.summary.histogram('importance_ratio_clipped',
                                 importance_ratio_clipped)
    tf.contrib.summary.histogram('per_timestep_objective',
                                 per_timestep_objective)
    tf.contrib.summary.histogram('per_timestep_objective_clipped',
                                 per_timestep_objective_clipped)
    tf.contrib.summary.histogram('per_timestep_objective_min',
                                 per_timestep_objective_min)
    entropy = common_utils.entropy(current_policy_distribution,
                                   self.action_spec())
    tf.contrib.summary.histogram('policy_entropy', entropy)
    tf.contrib.summary.scalar('policy_entropy_mean', tf.reduce_mean(entropy))
    # Categorical distribution (used for discrete actions) doesn't have a
    # mean.
    if not self.action_spec().is_discrete():
      tf.contrib.summary.histogram('actions_distribution_mean',
                                   current_policy_distribution.mean())
      tf.contrib.summary.histogram('actions_distribution_stddev',
                                   current_policy_distribution.stddev())
    tf.contrib.summary.histogram('policy_gradient_loss', policy_gradient_loss)

  if self._check_numerics:
    policy_gradient_loss = tf.check_numerics(policy_gradient_loss,
                                             'policy_gradient_loss')

  return policy_gradient_loss
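# A hedged sketch of how a valid_mask like the one documented above might be
# built. Illustrative only: it assumes time_steps is a tf_agents TimeStep
# whose is_last() marks episode-boundary steps, and it does not attempt to
# mask an unfinished trailing episode; the agent may construct its mask
# differently.
import tensorflow as tf

def make_valid_mask(time_steps):
  # 1.0 for ordinary timesteps, 0.0 for episode-boundary (LAST) timesteps.
  return tf.cast(tf.logical_not(time_steps.is_last()), tf.float32)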