    def train_step(self, distribution):
        """Train step.

        Args:
            distribution (nested Distribution): action distribution from the
                policy.
        Returns:
            AlgorithmStep. `info` field is LossInfo, other fields are empty.
        """
        entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
            distribution, self._action_spec)
        alpha_loss = self._log_alpha * tf.stop_gradient(entropy -
                                                        self._target_entropy)
        alpha = tf.stop_gradient(tf.exp(self._log_alpha))
        loss = alpha_loss
        entropy_loss = -entropy

        # Joint loss for optimizing alpha and entropy. The effect of alpha_loss
        # is to increase alpha when entropy is lower than target and decrease
        # alpha when entropy is larger than target. alpha * entropy_for_gradient
        # is to encourage higher action entropy.
        loss -= alpha * entropy_for_gradient

        return AlgorithmStep(
            outputs=(),
            state=(),
            info=LossInfo(
                loss,
                extra=EntropyTargetLossInfo(
                    alpha_loss=alpha_loss, entropy_loss=entropy_loss)))
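A standalone sketch (names local to this sketch, not part of the snippet above) of why `alpha_loss` moves `log_alpha` in the right direction: its gradient with respect to `log_alpha` is `entropy - target_entropy`, so a gradient-descent step lowers `log_alpha` when entropy is above the target and raises it when entropy is below.

import tensorflow as tf

# Minimal illustration of the alpha-adjustment mechanics described above.
log_alpha = tf.Variable(0.0)
target_entropy = 1.0

def alpha_grad(entropy):
    with tf.GradientTape() as tape:
        alpha_loss = log_alpha * tf.stop_gradient(entropy - target_entropy)
    return tape.gradient(alpha_loss, log_alpha)

print(alpha_grad(tf.constant(2.0)).numpy())  # +1.0: descent lowers log_alpha (entropy above target)
print(alpha_grad(tf.constant(0.5)).numpy())  # -0.5: descent raises log_alpha (entropy below target)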
Example #2
    def _actor_train_step(self, exp: Experience, state: DdpgActorState):
        action, actor_state = self._actor_network(exp.observation,
                                                  exp.step_type,
                                                  network_state=state.actor)

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(action)
            q_value, critic_state = self._critic_network(
                (exp.observation, action), network_state=state.critic)

        dqda = tape.gradient(q_value, action)

        def actor_loss_fn(dqda, action):
            if self._dqda_clipping:
                dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                        self._dqda_clipping)
            loss = 0.5 * losses.element_wise_squared_loss(
                tf.stop_gradient(dqda + action), action)
            loss = tf.reduce_sum(loss, axis=list(range(1, len(loss.shape))))
            return loss

        actor_loss = tf.nest.map_structure(actor_loss_fn, dqda, action)
        state = DdpgActorState(actor=actor_state, critic=critic_state)
        info = LossInfo(loss=tf.add_n(tf.nest.flatten(actor_loss)),
                        extra=actor_loss)
        return PolicyStep(action=action, state=state, info=info)
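A standalone check of the surrogate loss used in `actor_loss_fn` above: because of the `stop_gradient`, the gradient of `0.5 * (stop_gradient(dqda + action) - action)**2` with respect to `action` is exactly `-dqda`, so descending this loss pushes the action (and, through the actor network, its parameters) in the direction that increases the critic's Q value.

import tensorflow as tf

# Standalone verification of the gradient identity; dqda here is a made-up
# stand-in for the critic gradient dQ/da computed in the snippet above.
action = tf.Variable([0.3, -0.7])
dqda = tf.constant([1.5, -2.0])

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(
        0.5 * tf.square(tf.stop_gradient(dqda + action) - action))

print(tape.gradient(loss, action).numpy())  # [-1.5, 2.0] == -dqda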
    def calc_loss(self, training_info):
        if self._icm is not None:
            self.add_reward_summary("reward/intrinsic",
                                    training_info.info.icm_reward)

            training_info = training_info._replace(
                reward=self.calc_training_reward(training_info.reward,
                                                 training_info.info))

            self.add_reward_summary("reward/overall", training_info.reward)

        ac_loss = self._loss(training_info, training_info.info.value)
        loss = ac_loss.loss
        extra = ActorCriticAlgorithmLossInfo(ac=ac_loss.extra,
                                             icm=(),
                                             entropy_target=())

        if self._icm is not None:
            icm_loss = self._icm.calc_loss(training_info.info.icm_info)
            loss += icm_loss.loss
            extra = extra._replace(icm=icm_loss.extra)

        if self._entropy_target_algorithm:
            et_loss = self._entropy_target_algorithm.calc_loss(
                training_info.info.entropy_target_info)
            loss += et_loss.loss
            extra = extra._replace(entropy_target=et_loss.extra)

        return LossInfo(loss=loss, extra=extra)
Example #4
 def calc_loss(self, training_info: TrainingInfo):
     critic_loss = self._calc_critic_loss(training_info)
     alpha_loss = training_info.info.alpha.loss
     actor_loss = training_info.info.actor.loss
     return LossInfo(loss=actor_loss.loss + critic_loss.loss +
                     alpha_loss.loss,
                     extra=SacLossInfo(actor=actor_loss.extra,
                                       critic=critic_loss.extra,
                                       alpha=alpha_loss.extra))
Example #5
    def __call__(self, training_info: TrainingInfo, value):
        """Cacluate actor critic loss

        The first dimension of all the tensors is time dimension and the second
        dimesion is the batch dimension.

        Args:
            training_info (TrainingInfo): training_info collected by
                (On/Off)PolicyDriver. All tensors in training_info are time-major
            value (tf.Tensor): the time-major tensor for the value at each time
                step
            final_value (tf.Tensor): the value at one step ahead.
        Returns:
            loss_info (LossInfo): with loss_info.extra being ActorCriticLossInfo
        """

        returns, advantages = self._calc_returns_and_advantages(
            training_info, value)

        def _summary():
            with tf.name_scope('ActorCriticLoss'):
                tf.summary.scalar("values", tf.reduce_mean(value))
                tf.summary.scalar("returns", tf.reduce_mean(returns))
                tf.summary.scalar("advantages", tf.reduce_mean(advantages))
                tf.summary.histogram("advantages", advantages)
                tf.summary.scalar("explained_variance_of_return_by_value",
                                  common.explained_variance(value, returns))

        if self._debug_summaries:
            common.run_if(common.should_record_summaries(), _summary)

        if self._normalize_advantages:
            advantages = _normalize_advantages(advantages, axes=(0, 1))

        if self._advantage_clip:
            advantages = tf.clip_by_value(advantages, -self._advantage_clip,
                                          self._advantage_clip)

        pg_loss = self._pg_loss(training_info, tf.stop_gradient(advantages))

        td_loss = self._td_error_loss_fn(tf.stop_gradient(returns), value)

        loss = pg_loss + self._td_loss_weight * td_loss

        entropy_loss = ()
        if self._entropy_regularization is not None:
            entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
                training_info.action_distribution, self._action_spec)
            entropy_loss = -entropy
            loss -= self._entropy_regularization * entropy_for_gradient

        return LossInfo(
            loss,
            ActorCriticLossInfo(td_loss=td_loss,
                                pg_loss=pg_loss,
                                entropy_loss=entropy_loss))
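`_normalize_advantages` is not shown in this snippet; a plausible minimal version, assuming the usual zero-mean/unit-std normalization over the time and batch axes, could look like the following (the actual helper may differ).

import tensorflow as tf

# Hypothetical sketch of _normalize_advantages, assuming mean/std
# normalization over the given axes; not the library's actual implementation.
def _normalize_advantages(advantages, axes=(0, 1), epsilon=1e-8):
    mean, variance = tf.nn.moments(advantages, axes=list(axes), keepdims=True)
    return (advantages - mean) / (tf.sqrt(variance) + epsilon)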
Example #6
    def calc_loss(self, training_info: TrainingInfo):
        """Calculate loss."""
        self.add_reward_summary("reward", training_info.reward)
        mbp_loss_info = self._mbp.calc_loss(training_info.info.mbp_info)
        mba_loss_info = self._mba.calc_loss(
            training_info._replace(info=training_info.info.mba_info))

        return LossInfo(loss=mbp_loss_info.loss + mba_loss_info.loss,
                        extra=MerlinLossInfo(mbp=mbp_loss_info.extra,
                                             mba=mba_loss_info.extra))
Example #7
    def calc_loss(self, training_info: TrainingInfo):
        critic_loss = self._critic_loss(
            training_info=training_info,
            value=training_info.info.critic.q_value,
            target_value=training_info.info.critic.target_q_value)

        actor_loss = training_info.info.actor_loss

        return LossInfo(loss=critic_loss.loss + actor_loss.loss,
                        extra=DdpgLossInfo(critic=critic_loss.extra,
                                           actor=actor_loss.extra))
Example #8
 def decode_step(self, latent_vector, observations):
     """Calculate decoding loss."""
     decoders = tf.nest.flatten(self._decoders)
     observations = tf.nest.flatten(observations)
     decoder_losses = [
         decoder.train_step((latent_vector, obs)).info
         for decoder, obs in zip(decoders, observations)
     ]
     loss = tf.add_n([decoder_loss.loss for decoder_loss in decoder_losses])
     decoder_losses = tf.nest.pack_sequence_as(self._decoders,
                                               decoder_losses)
     return LossInfo(loss=loss, extra=decoder_losses)
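A standalone illustration of the `tf.nest` round trip used above: the per-decoder `LossInfo`s are computed over the flattened structure and then packed back so `extra` mirrors the nesting of `self._decoders`.

import tensorflow as tf

# Toy structure standing in for self._decoders; values are placeholders.
decoders = {'image': 'image_decoder', 'reward': 'reward_decoder'}
flat_losses = [0.1, 0.2]  # one entry per flattened decoder, in nest order
print(tf.nest.pack_sequence_as(decoders, flat_losses))
# {'image': 0.1, 'reward': 0.2}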
Example #9
 def __call__(self, training_info: TrainingInfo, value, target_value):
     returns = value_ops.one_step_discounted_return(
         rewards=training_info.reward,
         values=target_value,
         step_types=training_info.step_type,
         discounts=training_info.discount * self._gamma)
     returns = common.tensor_extend(returns, value[-1])
     if self._debug_summaries:
         with tf.name_scope('OneStepTDLoss'):
             tf.summary.scalar("values", tf.reduce_mean(value))
             tf.summary.scalar("returns", tf.reduce_mean(returns))
     loss = self._td_error_loss_fn(tf.stop_gradient(returns), value)
     return LossInfo(loss=loss, extra=loss)
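A conceptual, standalone sketch of the target this loss regresses `value` toward, assuming the usual bootstrapped one-step form; the real `value_ops.one_step_discounted_return` additionally handles step types and episode boundaries, and `tensor_extend` appends `value[-1]` so the target keeps the full time length.

import tensorflow as tf

# Conceptual sketch only (time-major [T, B] tensors, boundaries ignored):
# target[t] = reward[t + 1] + gamma * discount[t + 1] * target_value[t + 1]
def one_step_td_targets(rewards, discounts, target_values, gamma):
    return rewards[1:] + gamma * discounts[1:] * target_values[1:]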
Example #10
    def _train(  # pylint: disable=arguments-differ
            self,
            experience: NestedTensor,
            weights: Optional[Tensor] = None,
            **kwargs) -> LossInfo:
        """
        Train one or more of the models composing the environment model. Models need to be of a
        trainable type.

        :param experience: A batch of experience data in the form of a `Trajectory`.
            All tensors in `experience` must be shaped `[batch, time, ...]`.
        :param weights: Optional scalar or element-wise (per-batch-entry) importance
            weights. Not used at the moment.
        :param kwargs: Keyword arguments that must contain `TRAIN_ARGSPEC_COMPONENT_ID`, a
            string tensor naming the model component to train.

        :return: A `LossInfo` tuple containing the loss and info tensors of the trained model.
        """
        # TODO: The base TFAgent._train signature is missing **kwargs; until that is fixed
        # upstream we disable the arguments-differ lint check on this method.
        trainable_component_name = kwargs[TRAIN_ARGSPEC_COMPONENT_ID].numpy()

        self._train_step_counter.assign_add(1)

        if trainable_component_name not in self._trainable_components:
            warn(
                f"Trainable component {trainable_component_name} is not among the trainable"
                f" components {self._trainable_components}; skipping training.",
                RuntimeWarning,
            )
            return LossInfo(None, None)

        model, model_training_spec = self._trainable_components[
            trainable_component_name]
        if model_training_spec is not None:
            history = model.train(experience, model_training_spec)
            return LossInfo(history.history["loss"], None)
        else:
            return model.train(experience)
Example #11
    def train_step(self, inputs, state=None):
        """Perform training on one batch of inputs.

        Args:
            inputs (tuple(Tensor, Tensor)): tuple of x and y
            state: not used
        Returns:
            AlgorithmStep
                outputs (Tensor): shape=[batch_size], its mean is the estimated
                    MI
                state: not used
                info (LossInfo): info.loss is the loss
        """
        x, y = inputs
        num_outer_dims = get_outer_rank(x, self._x_spec)
        batch_squash = BatchSquash(num_outer_dims)
        x = batch_squash.flatten(x)
        y = batch_squash.flatten(y)
        x1, y1 = self._sampler(x, y)

        log_ratio = self._model([x, y])[0]
        t1 = self._model([x1, y1])[0]

        if self._type == 'DV':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mean = tf.stop_gradient(tf.reduce_mean(ratio))
            if self._mean_averager:
                self._mean_averager.update(mean)
                unbiased_mean = tf.stop_gradient(self._mean_averager.get())
            else:
                unbiased_mean = mean
            # estimated MI = reduce_mean(mi).
            # ratio / mean - 1 does not contribute to the final estimated MI,
            # since mean(ratio / mean - 1) = 0. It is included so that the
            # per-sample values give an estimate of the variance of the MI
            # estimator.
            mi = log_ratio - (tf.math.log(mean) + ratio / mean - 1)
            loss = ratio / unbiased_mean - log_ratio
        elif self._type == 'KLD':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mi = log_ratio - ratio + 1
            loss = -mi
        elif self._type == 'JSD':
            mi = -tf.nn.softplus(-log_ratio) - tf.nn.softplus(t1) + math.log(4)
            loss = -mi

        mi = batch_squash.unflatten(mi)
        loss = batch_squash.unflatten(loss)

        return AlgorithmStep(outputs=mi,
                             state=(),
                             info=LossInfo(loss, extra=()))
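A standalone numeric check of the comment in the 'DV' branch above: `ratio / mean - 1` has (near-)zero mean, so including it leaves the estimated MI unchanged while letting the per-sample `mi` values reflect the estimator's variance.

import tensorflow as tf

# ratio / mean(ratio) - 1 averages to zero by construction.
ratio = tf.math.exp(tf.random.normal([10000]))
mean = tf.reduce_mean(ratio)
print(tf.reduce_mean(ratio / mean - 1.0).numpy())  # ~0 (numerical error only)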
Example #12
    def _calc_critic_loss(self, training_info):
        critic_info = training_info.info.critic

        target_critic = critic_info.target_critic

        critic_loss1 = self._critic_loss(training_info=training_info,
                                         value=critic_info.critic1,
                                         target_value=target_critic)

        critic_loss2 = self._critic_loss(training_info=training_info,
                                         value=critic_info.critic2,
                                         target_value=target_critic)

        critic_loss = critic_loss1.loss + critic_loss2.loss
        return LossInfo(loss=critic_loss, extra=critic_loss)
Example #13
    def train_step(self, inputs, state: MBPState):
        """Train one step.

        Args:
            inputs (tuple): a tuple of (observation, action)
        """
        observation, _ = inputs
        latent_vector, kld, next_state = self.encode_step(inputs, state)

        # TODO: decoder for action
        decoder_loss = self.decode_step(latent_vector, observation)

        return AlgorithmStep(
            outputs=latent_vector,
            state=next_state,
            info=LossInfo(loss=self._loss_weight * (decoder_loss.loss + kld),
                          extra=MBPLossInfo(decoder=decoder_loss, vae=kld)))
Example #14
    def train_step(self, inputs, state=None):
        """Train one step.

        Args:
            inputs (tuple): tuple of (inputs, target)
            state (nested Tensor): network state for `decoder`

        Returns:
            AlgorithmStep with the following fields:
            outputs: decoding result
            state: rnn state
            info: loss of decoding

        """
        input, target = inputs
        pred, state = self._decoder(input, network_state=state)
        assert pred.shape == target.shape
        loss = self._loss(target, pred)
        return AlgorithmStep(outputs=pred,
                             state=state,
                             info=LossInfo(self._loss_weight * loss, extra=()))
    def _train_model_free_agent(self, experience: NestedTensor) -> LossInfo:
        """
        Train the model-free agent virtually for multiple iterations.
        :param experience: A batch of experience data in the form of a `Trajectory`.
            All tensors in `experience` must be shaped `[batch, time, ...]`. Importantly,
            this is real-world experience and the agent needs to decide how to leverage this
            real-world experience for virtual training using the environment model.
        :return: A `LossInfo` tuple containing the loss and info tensors of the trained
            model-free agent.
        """
        assert tf.keras.backend.ndim(experience.observation) >= 3
        assert experience.observation.shape[0] == 1, "The real environment has batch size 1."

        mask = ~experience.is_boundary()  # [batch, time, ...]
        masked_observation = tf.boolean_mask(
            experience.observation, mask
        )  # [reduced batch, ...]

        model_free_losses = []
        for _ in range(self._model_free_training_iterations):
            random_indexes = tf.random.uniform(
                shape=(self._environment_model.batch_size,),
                maxval=masked_observation.shape[0],
                dtype=tf.int32,
            )
            initial_observation = tf.gather(
                masked_observation, random_indexes
            )  # [env model batch, ...]

            initial_time_step = self._environment_model.set_initial_observation(
                initial_observation
            )
            self._virtual_rollouts_driver.run(initial_time_step)

            policy_experience = self._virtual_rollouts_replay_buffer.gather_all()
            model_free_losses.append(self._model_free_agent.train(policy_experience))

            self._virtual_rollouts_replay_buffer.clear()

        loss_info = LossInfo(loss=model_free_losses[0].loss, extra=model_free_losses)
        return loss_info
Example #16
    def train_step(self, inputs, state):
        """
        Args:
            inputs (tuple): observation and previous action
        Returns:
            TrainStep:
                outputs: intrinsic reward
                state:
                info:
        """
        feature, prev_action = inputs
        if self._encoding_net is not None:
            feature, _ = self._encoding_net(feature)
        prev_feature = state
        prev_action = self._encode_action(prev_action)

        forward_pred, _ = self._forward_net(
            inputs=[tf.stop_gradient(prev_feature), prev_action])
        forward_loss = 0.5 * tf.reduce_mean(
            tf.square(tf.stop_gradient(feature) - forward_pred), axis=-1)

        action_pred, _ = self._inverse_net(inputs=[prev_feature, feature])

        if tensor_spec.is_discrete(self._action_spec):
            inverse_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=prev_action, logits=action_pred)
        else:
            inverse_loss = 0.5 * tf.reduce_mean(
                tf.square(prev_action - action_pred), axis=-1)

        intrinsic_reward = tf.stop_gradient(forward_loss)
        intrinsic_reward = self._reward_normalizer.normalize(intrinsic_reward)

        return AlgorithmStep(outputs=intrinsic_reward,
                             state=feature,
                             info=LossInfo(loss=forward_loss + inverse_loss,
                                           extra=ICMLossInfo(
                                               forward_loss=forward_loss,
                                               inverse_loss=inverse_loss)))
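`_encode_action` is not shown here; since the discrete branch above feeds `prev_action` as labels to `softmax_cross_entropy_with_logits`, a plausible sketch, assuming one-hot encoding for discrete actions and pass-through otherwise, is the following (the real helper may differ).

import tensorflow as tf
from tf_agents.specs import tensor_spec

# Hypothetical sketch of _encode_action; not the library's actual code.
def _encode_action(action, action_spec):
    if tensor_spec.is_discrete(action_spec):
        num_actions = int(action_spec.maximum - action_spec.minimum + 1)
        return tf.one_hot(action - action_spec.minimum, depth=num_actions)
    return action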
Example #17
  def _train(self, experience, weights):
    """Modifies the default _train step in two ways.

      1. Passes actions and next time steps to actor loss.
      2. Clips the dual parameter.

    Args:
      experience: A time-stacked trajectory object.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.

    Returns:
      A `LossInfo` tuple containing the total loss and a `SacLossInfo` extra.
    """
    transition = self._as_transition(experience)
    time_steps, policy_steps, next_time_steps = transition
    actions = policy_steps.action

    trainable_critic_variables = list(object_identity.ObjectIdentitySet(
        self._critic_network_1.trainable_variables +
        self._critic_network_2.trainable_variables))

    tf.debugging.check_numerics(
        tf.reduce_mean(time_steps.reward), 'ts.reward is inf or nan.')
    tf.debugging.check_numerics(
        tf.reduce_mean(next_time_steps.reward), 'next_ts.reward is inf or nan.')
    tf.debugging.check_numerics(
        tf.reduce_mean(actions), 'Actions is inf or nan.')

    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert trainable_critic_variables, ('No trainable critic variables to '
                                          'optimize.')
      tape.watch(trainable_critic_variables)
      critic_loss = self._critic_loss_weight*self.critic_loss(
          time_steps,
          actions,
          next_time_steps,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights,
          training=True)

    tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
    critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
    self._apply_gradients(critic_grads, trainable_critic_variables,
                          self._critic_optimizer)

    trainable_actor_variables = self._actor_network.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert trainable_actor_variables, ('No trainable actor variables to '
                                         'optimize.')
      tape.watch(trainable_actor_variables)
      actor_loss = self._actor_loss_weight*self.actor_loss(
          time_steps, actions, next_time_steps, weights=weights)
    tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
    actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
    self._apply_gradients(actor_grads, trainable_actor_variables,
                          self._actor_optimizer)

    alpha_variable = [self._log_alpha]
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert alpha_variable, 'No alpha variable to optimize.'
      tape.watch(alpha_variable)
      alpha_loss = self._alpha_loss_weight*self.alpha_loss(
          time_steps, weights=weights)
    tf.debugging.check_numerics(alpha_loss, 'Alpha loss is inf or nan.')
    alpha_grads = tape.gradient(alpha_loss, alpha_variable)
    self._apply_gradients(alpha_grads, alpha_variable, self._alpha_optimizer)

    with tf.name_scope('Losses'):
      tf.compat.v2.summary.scalar(
          name='critic_loss', data=critic_loss, step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='actor_loss', data=actor_loss, step=self.train_step_counter)
      tf.compat.v2.summary.scalar(
          name='alpha_loss', data=alpha_loss, step=self.train_step_counter)

    self.train_step_counter.assign_add(1)
    self._update_target()

    total_loss = critic_loss + actor_loss + alpha_loss

    extra = sac_agent.SacLossInfo(
        critic_loss=critic_loss, actor_loss=actor_loss, alpha_loss=alpha_loss)

    return LossInfo(loss=total_loss, extra=extra)
Example #18
 def train_model_free_agent_step() -> LossInfo:
     if not self._has_transition_model_been_trained:
         return LossInfo(None, None)
     trajectory = replay_buffer.gather_all()
     return agent.train(trajectory,
                        **train_model_free_agent_kwargs_dict)
Example #19
 def train_step():
     if (tf.data.experimental.cardinality(dataset).numpy() >=
             self._training_data_batch_size):
         experience, _ = next(iterator)
         return agent.train(experience)
     return LossInfo(None, None)
Example #20
 def __call__(self) -> LossInfo:
     return LossInfo(0.0, extra=self._identifier)
Example #21
 def _alpha_train_step(self, log_pi):
     alpha_loss = self._log_alpha * tf.stop_gradient(-log_pi -
                                                     self._target_entropy)
     info = SacAlphaInfo(loss=LossInfo(loss=alpha_loss, extra=alpha_loss))
     return info
Example #22
    def _actor_train_step(self, exp: Experience, state: SacActorState,
                          action_distribution, action, log_pi):

        if self._is_continuous:
            critic_input = (exp.observation, action)

            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(action)
                critic1, critic1_state = self._critic_network1(
                    critic_input,
                    step_type=exp.step_type,
                    network_state=state.critic1)

                critic2, critic2_state = self._critic_network2(
                    critic_input,
                    step_type=exp.step_type,
                    network_state=state.critic2)

                target_q_value = tf.minimum(critic1, critic2)

            dqda = tape.gradient(target_q_value, action)

            def actor_loss_fn(dqda, action):
                if self._dqda_clipping:
                    dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                            self._dqda_clipping)
                loss = 0.5 * losses.element_wise_squared_loss(
                    tf.stop_gradient(dqda + action), action)
                loss = tf.reduce_sum(loss,
                                     axis=list(range(1, len(loss.shape))))
                return loss

            actor_loss = tf.nest.map_structure(actor_loss_fn, dqda, action)
            alpha = tf.stop_gradient(tf.exp(self._log_alpha))
            actor_loss += alpha * log_pi
        else:
            critic1, critic1_state = self._critic_network1(
                exp.observation,
                step_type=exp.step_type,
                network_state=state.critic1)

            critic2, critic2_state = self._critic_network2(
                exp.observation,
                step_type=exp.step_type,
                network_state=state.critic2)

            assert isinstance(
                action_distribution, tfp.distributions.Categorical), \
                ("Only `tfp.distributions.Categorical` is supported, "
                 "received: " + str(type(action_distribution)))

            action_probs = action_distribution.probs
            log_action_probs = tf.math.log(action_probs + 1e-8)

            target_q_value = tf.stop_gradient(tf.minimum(critic1, critic2))
            alpha = tf.stop_gradient(tf.exp(self._log_alpha))
            actor_loss = tf.reduce_mean(
                action_probs * (alpha * log_action_probs - target_q_value),
                axis=-1)

        state = SacActorState(critic1=critic1_state, critic2=critic2_state)
        info = SacActorInfo(loss=LossInfo(loss=actor_loss, extra=actor_loss))
        return state, info
Example #23
 def _none_returning_train_step():
     return LossInfo(loss=None, extra=None)