def calc_loss(self, training_info: TrainingInfo):
    info = training_info.info  # SarsaInfo
    critic_loss = losses.element_wise_squared_loss(info.returns, info.critic)
    not_first_step = tf.not_equal(training_info.step_type, StepType.FIRST)
    critic_loss *= tf.cast(not_first_step, tf.float32)

    def _summary():
        with self.name_scope:
            tf.summary.scalar("values", tf.reduce_mean(info.critic))
            tf.summary.scalar("returns", tf.reduce_mean(info.returns))
            safe_mean_hist_summary("td_error", info.returns - info.critic)
            tf.summary.scalar(
                "explained_variance_of_return_by_value",
                common.explained_variance(info.critic, info.returns))

    if self._debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    return LossInfo(
        loss=info.actor_loss,
        # Put critic_loss in scalar_loss because loss will be masked by
        # ~is_last at train_complete(). The critic_loss here should be
        # masked by ~is_first instead, which is done above.
        scalar_loss=tf.reduce_mean(critic_loss),
        extra=SarsaLossInfo(actor=info.actor_loss, critic=critic_loss))
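
# A self-contained sketch (not part of the class) of the first-step masking
# used in calc_loss above: the bootstrapped target (`returns`) is undefined
# at FIRST steps, so their squared errors are zeroed before averaging.
# `first` stands in for StepType.FIRST; reducing to a mean here is a
# simplification of what happens downstream.
import tensorflow as tf

def masked_critic_loss(returns, critic, step_type, first=0):
    """Mean squared TD error over all non-FIRST steps."""
    loss = tf.square(returns - critic)
    mask = tf.cast(tf.not_equal(step_type, first), tf.float32)
    return tf.reduce_mean(loss * mask)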
def _pg_loss(self, training_info: TrainingInfo, advantages):
    scope = tf.name_scope(self.__class__.__name__)
    importance_ratio, importance_ratio_clipped = value_ops.action_importance_ratio(
        action_distribution=training_info.action_distribution,
        collect_action_distribution=training_info.collect_action_distribution,
        action=training_info.action,
        action_spec=self._action_spec,
        clipping_mode='double_sided',
        scope=scope,
        importance_ratio_clipping=self._importance_ratio_clipping,
        log_prob_clipping=self._log_prob_clipping,
        check_numerics=self._check_numerics,
        debug_summaries=self._debug_summaries)
    # Pessimistically choose the larger of the clipped and unclipped
    # surrogate losses (i.e. the smaller surrogate objective).
    pg_objective = -importance_ratio * advantages
    pg_objective_clipped = -importance_ratio_clipped * advantages
    policy_gradient_loss = tf.maximum(pg_objective, pg_objective_clipped)

    def _summary():
        with scope:
            tf.summary.histogram('pg_objective', pg_objective)
            tf.summary.histogram('pg_objective_clipped', pg_objective_clipped)

    if self._debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    if self._check_numerics:
        policy_gradient_loss = tf.debugging.check_numerics(
            policy_gradient_loss, 'policy_gradient_loss')

    return policy_gradient_loss
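
# A minimal standalone sketch of the pessimistic clipping rule in _pg_loss
# above, with hypothetical values; `eps` plays the role of
# self._importance_ratio_clipping.
import tensorflow as tf

def clipped_pg_loss(ratio, advantage, eps=0.2):
    """max(-r * A, -clip(r, 1-eps, 1+eps) * A): the worse of the two losses."""
    clipped = tf.clip_by_value(ratio, 1.0 - eps, 1.0 + eps)
    return tf.maximum(-ratio * advantage, -clipped * advantage)

# E.g. ratio=1.5, advantage=2.0 gives max(-3.0, -2.4) = -2.4: the clipped
# value wins, so the gradient stops pushing the ratio beyond 1 + eps.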
def calc_loss(self, training_info: EntropyTargetInfo, valid_mask=None):
    loss_info = training_info.loss
    mask = tf.cast(training_info.step_type != StepType.LAST, tf.float32)
    if valid_mask is not None:
        mask = mask * tf.cast(valid_mask, tf.float32)
    entropy = -loss_info.extra.neg_entropy * mask
    num = tf.reduce_sum(mask)
    not_empty = num > 0
    num = tf.maximum(num, 1)
    entropy2 = tf.reduce_sum(tf.square(entropy)) / num
    entropy = tf.reduce_sum(entropy) / num
    entropy_std = tf.sqrt(tf.maximum(0.0, entropy2 - entropy * entropy))

    run_if(not_empty, lambda: self.adjust_alpha(entropy))

    def _summarize():
        with self.name_scope:
            tf.summary.scalar("entropy_std", entropy_std)

    if self._debug_summaries:
        run_if(
            tf.logical_and(not_empty, should_record_summaries()), _summarize)

    alpha = tf.exp(self._log_alpha)
    return loss_info._replace(loss=loss_info.loss * alpha)
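
# A standalone sketch (assumed 0/1 float mask, not part of the class) of the
# masked mean / std computation in calc_loss above. Flooring `num` at 1
# makes an all-masked batch yield 0 instead of NaN; the caller separately
# guards on `not_empty` before acting on the statistics.
import tensorflow as tf

def masked_mean_std(x, mask):
    """Mean and standard deviation of x over entries where mask == 1."""
    x = x * mask
    num = tf.maximum(tf.reduce_sum(mask), 1.0)
    mean = tf.reduce_sum(x) / num
    second_moment = tf.reduce_sum(tf.square(x)) / num
    std = tf.sqrt(tf.maximum(0.0, second_moment - tf.square(mean)))
    return mean, std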
def __call__(self, training_info: TrainingInfo, value):
    """Calculate the actor-critic loss.

    The first dimension of all the tensors is the time dimension and the
    second dimension is the batch dimension.

    Args:
        training_info (TrainingInfo): training_info collected by
            (On/Off)PolicyDriver. All tensors in training_info are
            time-major.
        value (tf.Tensor): the time-major tensor for the value at each
            time step.
    Returns:
        loss_info (LossInfo): with loss_info.extra being ActorCriticLossInfo.
    """
    returns, advantages = self._calc_returns_and_advantages(
        training_info, value)

    def _summary():
        with tf.name_scope('ActorCriticLoss'):
            tf.summary.scalar("values", tf.reduce_mean(value))
            tf.summary.scalar("returns", tf.reduce_mean(returns))
            tf.summary.scalar("advantages/mean", tf.reduce_mean(advantages))
            tf.summary.histogram("advantages/value", advantages)
            tf.summary.scalar(
                "explained_variance_of_return_by_value",
                common.explained_variance(value, returns))

    if self._debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    if self._normalize_advantages:
        advantages = _normalize_advantages(advantages, axes=(0, 1))

    if self._advantage_clip:
        advantages = tf.clip_by_value(advantages, -self._advantage_clip,
                                      self._advantage_clip)

    pg_loss = self._pg_loss(training_info, tf.stop_gradient(advantages))
    td_loss = self._td_error_loss_fn(tf.stop_gradient(returns), value)
    loss = pg_loss + self._td_loss_weight * td_loss

    entropy_loss = ()
    if self._entropy_regularization is not None:
        entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
            training_info.action_distribution, self._action_spec)
        entropy_loss = -entropy
        loss -= self._entropy_regularization * entropy_for_gradient

    return LossInfo(
        loss=loss,
        extra=ActorCriticLossInfo(
            td_loss=td_loss, pg_loss=pg_loss, entropy_loss=entropy_loss))
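
# Hypothetical usage sketch for __call__ above. `loss_fn`, `training_info`,
# and `value` stand in for objects produced elsewhere in the training
# pipeline; the names are assumptions for illustration only.
#
#   loss_info = loss_fn(training_info, value)  # value: [T, B], time-major
#   # loss_info.loss is per-step; downstream code masks and reduces it.
#   total_loss = tf.reduce_mean(loss_info.loss)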
def train(self,
          num_updates=1,
          mini_batch_size=None,
          mini_batch_length=None,
          whole_replay_buffer_training=True,
          clear_replay_buffer=True,
          update_counter_every_mini_batch=False):
    """Train the algorithm.

    Args:
        num_updates (int): number of optimization steps.
        mini_batch_size (int): number of sequences for each minibatch.
        mini_batch_length (int): the length of the sequence for each
            sample in the minibatch.
        whole_replay_buffer_training (bool): whether to use all the data
            in the replay buffer to perform one update.
        clear_replay_buffer (bool): whether to clear the replay buffer
            after training. This flag only takes effect if
            whole_replay_buffer_training is True.
        update_counter_every_mini_batch (bool): whether to update the
            counter for every mini batch. The `summary_interval` is based
            on this counter. Typically, this should be False. Set to True
            if you want to have a summary for every mini batch for
            debugging purposes.
    Returns:
        train_steps (int): the actual number of time steps that have been
            trained (a step might be trained multiple times).
    """
    if mini_batch_size is None:
        mini_batch_size = self._exp_replayer.batch_size
    if whole_replay_buffer_training:
        experience = self._exp_replayer.replay_all()
        if clear_replay_buffer:
            self._exp_replayer.clear()
    else:
        experience = self._exp_replayer.replay(
            sample_batch_size=mini_batch_size,
            mini_batch_length=mini_batch_length)

    # We pass in an explicit value of should_summarize so that TF can
    # compile two different versions of _train(): one with
    # should_summarize=True, the other with should_summarize=False.
    # Even though the value of should_summarize should not make any
    # difference (should_record_summaries() is checked before generating
    # summaries in add_gradients_summaries() and add_variables_summaries()),
    # TF is observed to be much slower (~30% in one experiment) when
    # TrainerConfig.summarize_grads_and_vars is True and summary_interval
    # is very large, unless we explicitly pass in should_summarize.
    return self._train(
        experience,
        num_updates,
        mini_batch_size,
        mini_batch_length,
        update_counter_every_mini_batch,
        should_summarize=bool(common.should_record_summaries())
        or update_counter_every_mini_batch)
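
# Illustrative call patterns for train() above (parameter values are
# hypothetical):
#
#   # On-policy style: consume everything gathered since the last update,
#   # then wipe the buffer.
#   algorithm.train()
#
#   # Off-policy style: sample fixed-size minibatches from the buffer.
#   algorithm.train(
#       num_updates=4,
#       mini_batch_size=64,
#       mini_batch_length=2,
#       whole_replay_buffer_training=False)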
def calc_loss(self, training_info: EntropyTargetInfo):
    loss_info = training_info.loss
    mask = tf.cast(training_info.step_type != StepType.LAST, tf.float32)
    entropy = -loss_info.extra.entropy_loss * mask
    num = tf.reduce_sum(mask)
    entropy2 = tf.reduce_sum(tf.square(entropy)) / num
    entropy = tf.reduce_sum(entropy) / num
    entropy_std = tf.sqrt(tf.maximum(0.0, entropy2 - entropy * entropy))

    prev_avg_entropy = self._avg_entropy.get()
    avg_entropy = self._avg_entropy.average(entropy)

    def _init():
        crossing = avg_entropy < self._target_entropy
        self._stage.assign_add(tf.cast(crossing, tf.int32))

    def _adjust():
        previous_above = tf.cast(self._stage, tf.bool)
        above = avg_entropy > self._target_entropy
        self._stage.assign(tf.cast(above, tf.int32))
        crossing = above != previous_above
        update_rate = self._update_rate
        update_rate = tf.where(crossing, 0.9 * update_rate, update_rate)
        update_rate = tf.maximum(update_rate, self._slow_update_rate)
        update_rate = tf.where(entropy < self._fast_stage_thresh,
                               np.float32(self._fast_update_rate),
                               update_rate)
        self._update_rate.assign(update_rate)
        above = tf.cast(above, tf.float32)
        below = 1 - above
        increasing = tf.cast(avg_entropy > prev_avg_entropy, tf.float32)
        decreasing = 1 - increasing
        log_alpha = self._log_alpha + (
            (below + 0.5 * above) * decreasing -
            (above + 0.5 * below) * increasing) * update_rate
        log_alpha = tf.maximum(log_alpha, np.float32(self._min_log_alpha))
        self._log_alpha.assign(log_alpha)

    run_if(self._stage == -1, _init)
    run_if(self._stage >= 0, _adjust)
    alpha = tf.exp(self._log_alpha)

    def _summarize():
        with self.name_scope:
            tf.summary.scalar("alpha", alpha)
            tf.summary.scalar("entropy_std", entropy_std)
            tf.summary.scalar("avg_entropy", avg_entropy)
            tf.summary.scalar("stage", self._stage)
            tf.summary.scalar("update_rate", self._update_rate)

    if self._debug_summaries:
        run_if(should_record_summaries(), _summarize)

    return loss_info._replace(loss=loss_info.loss * alpha)
def after_train(self, training_info):
    """Adjust actor parameters according to the KL divergence."""
    exp_array = TracExperience(
        observation=training_info.info.observation,
        step_type=training_info.step_type,
        action_param=common.get_distribution_params(
            training_info.action_distribution),
        state=training_info.info.state)
    exp_array = common.create_and_unstack_tensor_array(
        exp_array, clear_after_read=False)
    dists, steps = self._trusted_updater.adjust_step(
        lambda: self._calc_change(exp_array), self._action_dist_clips)

    def _summarize():
        with self.name_scope:
            for i, d in enumerate(tf.nest.flatten(dists)):
                tf.summary.scalar("unadjusted_action_dist/%s" % i, d)
            tf.summary.scalar("adjust_steps", steps)

    common.run_if(common.should_record_summaries(), _summarize)
    self._ac_algorithm.after_train(
        training_info._replace(info=training_info.info.ac))
def action_importance_ratio(action_distribution, collect_action_distribution,
                            action, action_spec, clipping_mode, scope,
                            importance_ratio_clipping, log_prob_clipping,
                            check_numerics, debug_summaries):
    """Ratio for importance sampling, used in the PPO loss and the V-trace
    loss.

    The caller has to create tf.name_scope() and pass the scope to this
    function.

    Args:
        action_distribution (nested tf.distribution): distribution over
            actions under the target policy.
        collect_action_distribution (nested tf.distribution): distribution
            over actions from the behavior policy, used to sample actions
            for the rollout.
        action (nested Tensor): possibly batched action tuple taken during
            rollout.
        action_spec (nested BoundedTensorSpec): spec of the actions.
        clipping_mode (str): mode for clipping the importance ratio.
            'double_sided': clips the importance ratio into the range
                [1 - importance_ratio_clipping, 1 + importance_ratio_clipping],
                which is used by PPOLoss.
            'capping': caps the importance ratio at
                1 + importance_ratio_clipping, i.e. computes
                min(1 + importance_ratio_clipping, importance_ratio), which
                is used by VTraceLoss, where c_bar or rho_bar =
                1 + importance_ratio_clipping.
        scope (name scope manager): returned by tf.name_scope(), set outside.
        importance_ratio_clipping (float): epsilon in the clipped, surrogate
            PPO objective. See the cited paper for more detail.
        log_prob_clipping (float): if > 0, clips log probs to the range
            (-log_prob_clipping, log_prob_clipping) to prevent Inf / NaN
            values.
        check_numerics (bool): if True, adds tf.debugging.check_numerics to
            help find NaN / Inf values. For debugging only.
        debug_summaries (bool): if True, writes summary metrics to
            TensorBoard.
    Returns:
        importance_ratio (Tensor), importance_ratio_clipped (Tensor).
    """
    current_policy_distribution = action_distribution

    sample_action_log_probs = tfa_common.log_probability(
        collect_action_distribution, action, action_spec)
    sample_action_log_probs = tf.stop_gradient(sample_action_log_probs)

    action_log_prob = tfa_common.log_probability(
        current_policy_distribution, action, action_spec)
    if log_prob_clipping > 0.0:
        action_log_prob = tf.clip_by_value(action_log_prob,
                                           -log_prob_clipping,
                                           log_prob_clipping)
    if check_numerics:
        action_log_prob = tf.debugging.check_numerics(
            action_log_prob, 'action_log_prob')

    # Prepare both clipped and unclipped importance ratios.
    importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
    if check_numerics:
        importance_ratio = tf.debugging.check_numerics(
            importance_ratio, 'importance_ratio')

    if clipping_mode == 'double_sided':
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - importance_ratio_clipping,
            1 + importance_ratio_clipping)
    elif clipping_mode == 'capping':
        importance_ratio_clipped = tf.minimum(
            importance_ratio, 1 + importance_ratio_clipping)
    else:
        raise ValueError('Unsupported clipping mode: ' + clipping_mode)

    def _summary():
        with scope:
            if importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(
                    input_tensor=tf.cast(
                        tf.greater(
                            tf.abs(importance_ratio - 1.0),
                            importance_ratio_clipping), tf.float32))
                tf.summary.scalar('clip_fraction', clip_fraction)
            tf.summary.histogram('action_log_prob', action_log_prob)
            tf.summary.histogram('action_log_prob_sample',
                                 sample_action_log_probs)
            tf.summary.histogram('importance_ratio', importance_ratio)
            tf.summary.scalar(
                'importance_ratio_mean',
                tf.reduce_mean(input_tensor=importance_ratio))
            tf.summary.histogram('importance_ratio_clipped',
                                 importance_ratio_clipped)

    if debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    return importance_ratio, importance_ratio_clipped
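
# A self-contained numeric sketch of the two clipping modes above
# (hypothetical log-probs; `eps` plays the role of importance_ratio_clipping):
import tensorflow as tf

log_pi_target = tf.constant([-1.0, -2.0])    # log pi(a|s), target policy
log_pi_behavior = tf.constant([-1.5, -1.5])  # log pi(a|s), behavior policy
ratio = tf.exp(log_pi_target - log_pi_behavior)  # ~[1.65, 0.61]
eps = 0.2
double_sided = tf.clip_by_value(ratio, 1 - eps, 1 + eps)  # [1.2, 0.8] (PPO)
capping = tf.minimum(ratio, 1 + eps)                      # [1.2, 0.61] (V-trace)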
def adjust_alpha(self, entropy):
    """Adjust alpha according to the current entropy.

    Args:
        entropy (scalar Tensor): the current entropy.
    Returns:
        adjusted entropy regularization
    """
    prev_avg_entropy = self._avg_entropy.get()
    avg_entropy = self._avg_entropy.average(entropy)

    def _init_entropy():
        self._max_entropy.assign(
            tf.minimum(0.8 * avg_entropy, avg_entropy / 0.8))
        self._stage.assign_add(1)

    def _init():
        below = avg_entropy < self._max_entropy
        increasing = tf.cast(avg_entropy > prev_avg_entropy, tf.float32)
        # -1 * increasing + 0.5 * (1 - increasing)
        update_rate = (0.5 - 1.5 * increasing) * self._very_slow_update_rate
        self._stage.assign_add(tf.cast(below, tf.int32))
        self._log_alpha.assign(
            tf.maximum(self._log_alpha + update_rate,
                       np.float32(self._min_log_alpha)))

    def _free():
        crossing = avg_entropy < self._target_entropy
        self._stage.assign_add(tf.cast(crossing, tf.int32))

    def _adjust():
        previous_above = tf.cast(self._stage, tf.bool)
        above = avg_entropy > self._target_entropy
        self._stage.assign(tf.cast(above, tf.int32))
        crossing = above != previous_above
        update_rate = self._update_rate
        update_rate = tf.where(crossing, 0.9 * update_rate, update_rate)
        update_rate = tf.maximum(update_rate, self._slow_update_rate)
        update_rate = tf.where(entropy < self._fast_stage_thresh,
                               np.float32(self._fast_update_rate),
                               update_rate)
        self._update_rate.assign(update_rate)
        above = tf.cast(above, tf.float32)
        below = 1 - above
        increasing = tf.cast(avg_entropy > prev_avg_entropy, tf.float32)
        decreasing = 1 - increasing
        log_alpha = self._log_alpha + (
            (below + 0.5 * above) * decreasing -
            (above + 0.5 * below) * increasing) * update_rate
        log_alpha = tf.maximum(log_alpha, np.float32(self._min_log_alpha))
        self._log_alpha.assign(log_alpha)

    run_if(self._stage < -2, _init_entropy)
    run_if(self._stage == -2, _init)
    run_if(self._stage == -1, _free)
    run_if(self._stage >= 0, _adjust)
    alpha = tf.exp(self._log_alpha)

    def _summarize():
        with self.name_scope:
            tf.summary.scalar("alpha", alpha)
            tf.summary.scalar("avg_entropy", avg_entropy)
            tf.summary.scalar("stage", self._stage)
            tf.summary.scalar("update_rate", self._update_rate)

    if self._debug_summaries:
        run_if(should_record_summaries(), _summarize)

    return alpha
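
# A plain-Python sketch (scalar version, illustrative only) of the update
# direction computed inside _adjust() above. One reading of the rule: move
# log_alpha at the full rate when avg_entropy is drifting away from the
# target, and apply a gentler half-rate counter-move when it is already
# drifting toward the target, which damps overshoot.
def log_alpha_delta(above_target, entropy_increasing, update_rate):
    """Signed change applied to log_alpha by the _adjust() rule."""
    above = 1.0 if above_target else 0.0
    below = 1.0 - above
    increasing = 1.0 if entropy_increasing else 0.0
    decreasing = 1.0 - increasing
    return ((below + 0.5 * above) * decreasing -
            (above + 0.5 * below) * increasing) * update_rate

# E.g. below target and still decreasing: delta = +update_rate (raise alpha
# to push entropy up); below target but already increasing: delta =
# -0.5 * update_rate (brake gently).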