def calc_loss(self, training_info: TrainingInfo):
    info = training_info.info  # SarsaInfo
    critic_loss = losses.element_wise_squared_loss(info.returns, info.critic)
    not_first_step = tf.not_equal(training_info.step_type, StepType.FIRST)
    critic_loss *= tf.cast(not_first_step, tf.float32)

    def _summary():
        with self.name_scope:
            tf.summary.scalar("values", tf.reduce_mean(info.critic))
            tf.summary.scalar("returns", tf.reduce_mean(info.returns))
            safe_mean_hist_summary("td_error", info.returns - info.critic)
            tf.summary.scalar(
                "explained_variance_of_return_by_value",
                common.explained_variance(info.critic, info.returns))

    if self._debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    return LossInfo(
        loss=info.actor_loss,
        # Put critic_loss in scalar_loss because loss will be masked by
        # ~is_last at train_complete(). The critic_loss here should be
        # masked by ~is_first instead, which is done above.
        scalar_loss=tf.reduce_mean(critic_loss),
        extra=SarsaLossInfo(actor=info.actor_loss, critic=critic_loss))
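
# A self-contained sketch (not part of the class) of the first-step masking
# used in calc_loss above: the bootstrapped target (`returns`) is undefined
# at FIRST steps, so their squared errors are zeroed before averaging.
# `first` stands in for StepType.FIRST; reducing to a mean here is a
# simplification of what happens downstream.
import tensorflow as tf

def masked_critic_loss(returns, critic, step_type, first=0):
    """Mean squared TD error over all non-FIRST steps."""
    loss = tf.square(returns - critic)
    mask = tf.cast(tf.not_equal(step_type, first), tf.float32)
    return tf.reduce_mean(loss * mask)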
def _pg_loss(self, training_info: TrainingInfo, advantages):
    scope = tf.name_scope(self.__class__.__name__)
    importance_ratio, importance_ratio_clipped = value_ops.action_importance_ratio(
        action_distribution=training_info.action_distribution,
        collect_action_distribution=training_info.collect_action_distribution,
        action=training_info.action,
        action_spec=self._action_spec,
        clipping_mode='double_sided',
        scope=scope,
        importance_ratio_clipping=self._importance_ratio_clipping,
        log_prob_clipping=self._log_prob_clipping,
        check_numerics=self._check_numerics,
        debug_summaries=self._debug_summaries)
    # Pessimistically choose the larger of the clipped and unclipped
    # surrogate losses (i.e. the smaller surrogate objective).
    pg_objective = -importance_ratio * advantages
    pg_objective_clipped = -importance_ratio_clipped * advantages
    policy_gradient_loss = tf.maximum(pg_objective, pg_objective_clipped)

    def _summary():
        with scope:
            tf.summary.histogram('pg_objective', pg_objective)
            tf.summary.histogram('pg_objective_clipped', pg_objective_clipped)

    if self._debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    if self._check_numerics:
        policy_gradient_loss = tf.debugging.check_numerics(
            policy_gradient_loss, 'policy_gradient_loss')

    return policy_gradient_loss
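
# A minimal standalone sketch of the pessimistic clipping rule in _pg_loss
# above, with hypothetical values; `eps` plays the role of
# self._importance_ratio_clipping.
import tensorflow as tf

def clipped_pg_loss(ratio, advantage, eps=0.2):
    """max(-r * A, -clip(r, 1-eps, 1+eps) * A): the worse of the two losses."""
    clipped = tf.clip_by_value(ratio, 1.0 - eps, 1.0 + eps)
    return tf.maximum(-ratio * advantage, -clipped * advantage)

# E.g. ratio=1.5, advantage=2.0 gives max(-3.0, -2.4) = -2.4: the clipped
# value wins, so the gradient stops pushing the ratio beyond 1 + eps.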
def calc_loss(self, training_info: EntropyTargetInfo, valid_mask=None):
    loss_info = training_info.loss
    mask = tf.cast(training_info.step_type != StepType.LAST, tf.float32)
    if valid_mask is not None:
        mask = mask * tf.cast(valid_mask, tf.float32)
    entropy = -loss_info.extra.neg_entropy * mask
    num = tf.reduce_sum(mask)
    not_empty = num > 0
    num = tf.maximum(num, 1)
    entropy2 = tf.reduce_sum(tf.square(entropy)) / num
    entropy = tf.reduce_sum(entropy) / num
    entropy_std = tf.sqrt(tf.maximum(0.0, entropy2 - entropy * entropy))

    run_if(not_empty, lambda: self.adjust_alpha(entropy))

    def _summarize():
        with self.name_scope:
            tf.summary.scalar("entropy_std", entropy_std)

    if self._debug_summaries:
        run_if(
            tf.logical_and(not_empty, should_record_summaries()), _summarize)

    alpha = tf.exp(self._log_alpha)
    return loss_info._replace(loss=loss_info.loss * alpha)
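
# A standalone sketch (assumed 0/1 float mask, not part of the class) of the
# masked mean / std computation in calc_loss above. Flooring `num` at 1
# makes an all-masked batch yield 0 instead of NaN; the caller separately
# guards on `not_empty` before acting on the statistics.
import tensorflow as tf

def masked_mean_std(x, mask):
    """Mean and standard deviation of x over entries where mask == 1."""
    x = x * mask
    num = tf.maximum(tf.reduce_sum(mask), 1.0)
    mean = tf.reduce_sum(x) / num
    second_moment = tf.reduce_sum(tf.square(x)) / num
    std = tf.sqrt(tf.maximum(0.0, second_moment - tf.square(mean)))
    return mean, std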
def __call__(self, training_info: TrainingInfo, value):
    """Calculate the actor-critic loss.

    The first dimension of all the tensors is the time dimension and the
    second dimension is the batch dimension.

    Args:
        training_info (TrainingInfo): training_info collected by
            (On/Off)PolicyDriver. All tensors in training_info are
            time-major.
        value (tf.Tensor): the time-major tensor for the value at each
            time step.
    Returns:
        loss_info (LossInfo): with loss_info.extra being ActorCriticLossInfo.
    """
    returns, advantages = self._calc_returns_and_advantages(
        training_info, value)

    def _summary():
        with tf.name_scope('ActorCriticLoss'):
            tf.summary.scalar("values", tf.reduce_mean(value))
            tf.summary.scalar("returns", tf.reduce_mean(returns))
            tf.summary.scalar("advantages/mean", tf.reduce_mean(advantages))
            tf.summary.histogram("advantages/value", advantages)
            tf.summary.scalar(
                "explained_variance_of_return_by_value",
                common.explained_variance(value, returns))

    if self._debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    if self._normalize_advantages:
        advantages = _normalize_advantages(advantages, axes=(0, 1))

    if self._advantage_clip:
        advantages = tf.clip_by_value(advantages, -self._advantage_clip,
                                      self._advantage_clip)

    pg_loss = self._pg_loss(training_info, tf.stop_gradient(advantages))
    td_loss = self._td_error_loss_fn(tf.stop_gradient(returns), value)
    loss = pg_loss + self._td_loss_weight * td_loss

    entropy_loss = ()
    if self._entropy_regularization is not None:
        entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
            training_info.action_distribution, self._action_spec)
        entropy_loss = -entropy
        loss -= self._entropy_regularization * entropy_for_gradient

    return LossInfo(
        loss=loss,
        extra=ActorCriticLossInfo(
            td_loss=td_loss, pg_loss=pg_loss, entropy_loss=entropy_loss))
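
# Hypothetical usage sketch for __call__ above. `loss_fn`, `training_info`,
# and `value` stand in for objects produced elsewhere in the training
# pipeline; the names are assumptions for illustration only.
#
#   loss_info = loss_fn(training_info, value)  # value: [T, B], time-major
#   # loss_info.loss is per-step; downstream code masks and reduces it.
#   total_loss = tf.reduce_mean(loss_info.loss)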
def train(self,
          num_updates=1,
          mini_batch_size=None,
          mini_batch_length=None,
          whole_replay_buffer_training=True,
          clear_replay_buffer=True,
          update_counter_every_mini_batch=False):
    """Train the algorithm.

    Args:
        num_updates (int): number of optimization steps.
        mini_batch_size (int): number of sequences for each minibatch.
        mini_batch_length (int): the length of the sequence for each
            sample in the minibatch.
        whole_replay_buffer_training (bool): whether to use all the data
            in the replay buffer to perform one update.
        clear_replay_buffer (bool): whether to clear the replay buffer
            after training. This flag only takes effect if
            whole_replay_buffer_training is True.
        update_counter_every_mini_batch (bool): whether to update the
            counter for every mini batch. The `summary_interval` is based
            on this counter. Typically, this should be False. Set to True
            if you want to have a summary for every mini batch for
            debugging purposes.
    Returns:
        train_steps (int): the actual number of time steps that have been
            trained (a step might be trained multiple times).
    """
    if mini_batch_size is None:
        mini_batch_size = self._exp_replayer.batch_size
    if whole_replay_buffer_training:
        experience = self._exp_replayer.replay_all()
        if clear_replay_buffer:
            self._exp_replayer.clear()
    else:
        experience = self._exp_replayer.replay(
            sample_batch_size=mini_batch_size,
            mini_batch_length=mini_batch_length)

    # We pass in an explicit value of should_summarize so that TF can
    # compile two different versions of _train(): one with
    # should_summarize=True, the other with should_summarize=False.
    # Even though the value of should_summarize should not make any
    # difference (should_record_summaries() is checked before generating
    # summaries in add_gradients_summaries() and add_variables_summaries()),
    # TF is observed to be much slower (~30% in one experiment) when
    # TrainerConfig.summarize_grads_and_vars is True and summary_interval
    # is very large, unless we explicitly pass in should_summarize.
    return self._train(
        experience,
        num_updates,
        mini_batch_size,
        mini_batch_length,
        update_counter_every_mini_batch,
        should_summarize=bool(common.should_record_summaries())
        or update_counter_every_mini_batch)
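
# Illustrative call patterns for train() above (parameter values are
# hypothetical):
#
#   # On-policy style: consume everything gathered since the last update,
#   # then wipe the buffer.
#   algorithm.train()
#
#   # Off-policy style: sample fixed-size minibatches from the buffer.
#   algorithm.train(
#       num_updates=4,
#       mini_batch_size=64,
#       mini_batch_length=2,
#       whole_replay_buffer_training=False)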
def calc_loss(self, training_info: EntropyTargetInfo):
    loss_info = training_info.loss
    mask = tf.cast(training_info.step_type != StepType.LAST, tf.float32)
    entropy = -loss_info.extra.entropy_loss * mask
    num = tf.reduce_sum(mask)
    entropy2 = tf.reduce_sum(tf.square(entropy)) / num
    entropy = tf.reduce_sum(entropy) / num
    entropy_std = tf.sqrt(tf.maximum(0.0, entropy2 - entropy * entropy))

    prev_avg_entropy = self._avg_entropy.get()
    avg_entropy = self._avg_entropy.average(entropy)

    def _init():
        crossing = avg_entropy < self._target_entropy
        self._stage.assign_add(tf.cast(crossing, tf.int32))

    def _adjust():
        previous_above = tf.cast(self._stage, tf.bool)
        above = avg_entropy > self._target_entropy
        self._stage.assign(tf.cast(above, tf.int32))
        crossing = above != previous_above
        update_rate = self._update_rate
        update_rate = tf.where(crossing, 0.9 * update_rate, update_rate)
        update_rate = tf.maximum(update_rate, self._slow_update_rate)
        update_rate = tf.where(entropy < self._fast_stage_thresh,
                               np.float32(self._fast_update_rate),
                               update_rate)
        self._update_rate.assign(update_rate)
        above = tf.cast(above, tf.float32)
        below = 1 - above
        increasing = tf.cast(avg_entropy > prev_avg_entropy, tf.float32)
        decreasing = 1 - increasing
        log_alpha = self._log_alpha + (
            (below + 0.5 * above) * decreasing -
            (above + 0.5 * below) * increasing) * update_rate
        log_alpha = tf.maximum(log_alpha, np.float32(self._min_log_alpha))
        self._log_alpha.assign(log_alpha)

    run_if(self._stage == -1, _init)
    run_if(self._stage >= 0, _adjust)
    alpha = tf.exp(self._log_alpha)

    def _summarize():
        with self.name_scope:
            tf.summary.scalar("alpha", alpha)
            tf.summary.scalar("entropy_std", entropy_std)
            tf.summary.scalar("avg_entropy", avg_entropy)
            tf.summary.scalar("stage", self._stage)
            tf.summary.scalar("update_rate", self._update_rate)

    if self._debug_summaries:
        run_if(should_record_summaries(), _summarize)

    return loss_info._replace(loss=loss_info.loss * alpha)
def after_train(self, training_info):
    """Adjust actor parameters according to the KL divergence."""
    exp_array = TracExperience(
        observation=training_info.info.observation,
        step_type=training_info.step_type,
        action_param=common.get_distribution_params(
            training_info.action_distribution),
        state=training_info.info.state)
    exp_array = common.create_and_unstack_tensor_array(
        exp_array, clear_after_read=False)
    dists, steps = self._trusted_updater.adjust_step(
        lambda: self._calc_change(exp_array), self._action_dist_clips)

    def _summarize():
        with self.name_scope:
            for i, d in enumerate(tf.nest.flatten(dists)):
                tf.summary.scalar("unadjusted_action_dist/%s" % i, d)
            tf.summary.scalar("adjust_steps", steps)

    common.run_if(common.should_record_summaries(), _summarize)
    self._ac_algorithm.after_train(
        training_info._replace(info=training_info.info.ac))
def action_importance_ratio(action_distribution, collect_action_distribution,
                            action, action_spec, clipping_mode, scope,
                            importance_ratio_clipping, log_prob_clipping,
                            check_numerics, debug_summaries):
    """Ratio for importance sampling, used in the PPO loss and the V-trace
    loss.

    The caller has to create tf.name_scope() and pass the scope to this
    function.

    Args:
        action_distribution (nested tf.distribution): distribution over
            actions under the target policy.
        collect_action_distribution (nested tf.distribution): distribution
            over actions from the behavior policy, used to sample actions
            for the rollout.
        action (nested Tensor): possibly batched action tuple taken during
            rollout.
        action_spec (nested BoundedTensorSpec): spec of the actions.
        clipping_mode (str): mode for clipping the importance ratio.
            'double_sided': clips the importance ratio into the range
                [1 - importance_ratio_clipping, 1 + importance_ratio_clipping],
                which is used by PPOLoss.
            'capping': caps the importance ratio at
                1 + importance_ratio_clipping, i.e. computes
                min(1 + importance_ratio_clipping, importance_ratio), which
                is used by VTraceLoss, where c_bar or rho_bar =
                1 + importance_ratio_clipping.
        scope (name scope manager): returned by tf.name_scope(), set outside.
        importance_ratio_clipping (float): epsilon in the clipped, surrogate
            PPO objective. See the cited paper for more detail.
        log_prob_clipping (float): if > 0, clips log probs to the range
            (-log_prob_clipping, log_prob_clipping) to prevent Inf / NaN
            values.
        check_numerics (bool): if True, adds tf.debugging.check_numerics to
            help find NaN / Inf values. For debugging only.
        debug_summaries (bool): if True, writes summary metrics to
            TensorBoard.
    Returns:
        importance_ratio (Tensor), importance_ratio_clipped (Tensor).
    """
    current_policy_distribution = action_distribution

    sample_action_log_probs = tfa_common.log_probability(
        collect_action_distribution, action, action_spec)
    sample_action_log_probs = tf.stop_gradient(sample_action_log_probs)

    action_log_prob = tfa_common.log_probability(
        current_policy_distribution, action, action_spec)
    if log_prob_clipping > 0.0:
        action_log_prob = tf.clip_by_value(action_log_prob,
                                           -log_prob_clipping,
                                           log_prob_clipping)
    if check_numerics:
        action_log_prob = tf.debugging.check_numerics(
            action_log_prob, 'action_log_prob')

    # Prepare both clipped and unclipped importance ratios.
    importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
    if check_numerics:
        importance_ratio = tf.debugging.check_numerics(
            importance_ratio, 'importance_ratio')

    if clipping_mode == 'double_sided':
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - importance_ratio_clipping,
            1 + importance_ratio_clipping)
    elif clipping_mode == 'capping':
        importance_ratio_clipped = tf.minimum(
            importance_ratio, 1 + importance_ratio_clipping)
    else:
        raise ValueError('Unsupported clipping mode: ' + clipping_mode)

    def _summary():
        with scope:
            if importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(
                    input_tensor=tf.cast(
                        tf.greater(
                            tf.abs(importance_ratio - 1.0),
                            importance_ratio_clipping), tf.float32))
                tf.summary.scalar('clip_fraction', clip_fraction)
            tf.summary.histogram('action_log_prob', action_log_prob)
            tf.summary.histogram('action_log_prob_sample',
                                 sample_action_log_probs)
            tf.summary.histogram('importance_ratio', importance_ratio)
            tf.summary.scalar(
                'importance_ratio_mean',
                tf.reduce_mean(input_tensor=importance_ratio))
            tf.summary.histogram('importance_ratio_clipped',
                                 importance_ratio_clipped)

    if debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    return importance_ratio, importance_ratio_clipped
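
# A self-contained numeric sketch of the two clipping modes above
# (hypothetical log-probs; `eps` plays the role of importance_ratio_clipping):
import tensorflow as tf

log_pi_target = tf.constant([-1.0, -2.0])    # log pi(a|s), target policy
log_pi_behavior = tf.constant([-1.5, -1.5])  # log pi(a|s), behavior policy
ratio = tf.exp(log_pi_target - log_pi_behavior)  # ~[1.65, 0.61]
eps = 0.2
double_sided = tf.clip_by_value(ratio, 1 - eps, 1 + eps)  # [1.2, 0.8] (PPO)
capping = tf.minimum(ratio, 1 + eps)                      # [1.2, 0.61] (V-trace)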
def adjust_alpha(self, entropy):
    """Adjust alpha according to the current entropy.

    Args:
        entropy (scalar Tensor): the current entropy.
    Returns:
        adjusted entropy regularization
    """
    prev_avg_entropy = self._avg_entropy.get()
    avg_entropy = self._avg_entropy.average(entropy)

    def _init_entropy():
        self._max_entropy.assign(
            tf.minimum(0.8 * avg_entropy, avg_entropy / 0.8))
        self._stage.assign_add(1)

    def _init():
        below = avg_entropy < self._max_entropy
        increasing = tf.cast(avg_entropy > prev_avg_entropy, tf.float32)
        # -1 * increasing + 0.5 * (1 - increasing)
        update_rate = (0.5 - 1.5 * increasing) * self._very_slow_update_rate
        self._stage.assign_add(tf.cast(below, tf.int32))
        self._log_alpha.assign(
            tf.maximum(self._log_alpha + update_rate,
                       np.float32(self._min_log_alpha)))

    def _free():
        crossing = avg_entropy < self._target_entropy
        self._stage.assign_add(tf.cast(crossing, tf.int32))

    def _adjust():
        previous_above = tf.cast(self._stage, tf.bool)
        above = avg_entropy > self._target_entropy
        self._stage.assign(tf.cast(above, tf.int32))
        crossing = above != previous_above
        update_rate = self._update_rate
        update_rate = tf.where(crossing, 0.9 * update_rate, update_rate)
        update_rate = tf.maximum(update_rate, self._slow_update_rate)
        update_rate = tf.where(entropy < self._fast_stage_thresh,
                               np.float32(self._fast_update_rate),
                               update_rate)
        self._update_rate.assign(update_rate)
        above = tf.cast(above, tf.float32)
        below = 1 - above
        increasing = tf.cast(avg_entropy > prev_avg_entropy, tf.float32)
        decreasing = 1 - increasing
        log_alpha = self._log_alpha + (
            (below + 0.5 * above) * decreasing -
            (above + 0.5 * below) * increasing) * update_rate
        log_alpha = tf.maximum(log_alpha, np.float32(self._min_log_alpha))
        self._log_alpha.assign(log_alpha)

    run_if(self._stage < -2, _init_entropy)
    run_if(self._stage == -2, _init)
    run_if(self._stage == -1, _free)
    run_if(self._stage >= 0, _adjust)
    alpha = tf.exp(self._log_alpha)

    def _summarize():
        with self.name_scope:
            tf.summary.scalar("alpha", alpha)
            tf.summary.scalar("avg_entropy", avg_entropy)
            tf.summary.scalar("stage", self._stage)
            tf.summary.scalar("update_rate", self._update_rate)

    if self._debug_summaries:
        run_if(should_record_summaries(), _summarize)

    return alpha
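
# A plain-Python sketch (scalar version, illustrative only) of the update
# direction computed inside _adjust() above. One reading of the rule: move
# log_alpha at the full rate when avg_entropy is drifting away from the
# target, and apply a gentler half-rate counter-move when it is already
# drifting toward the target, which damps overshoot.
def log_alpha_delta(above_target, entropy_increasing, update_rate):
    """Signed change applied to log_alpha by the _adjust() rule."""
    above = 1.0 if above_target else 0.0
    below = 1.0 - above
    increasing = 1.0 if entropy_increasing else 0.0
    decreasing = 1.0 - increasing
    return ((below + 0.5 * above) * decreasing -
            (above + 0.5 * below) * increasing) * update_rate

# E.g. below target and still decreasing: delta = +update_rate (raise alpha
# to push entropy up); below target but already increasing: delta =
# -0.5 * update_rate (brake gently).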