def train_step(self, distribution):
    """Train step.

    Args:
        distribution (nested Distribution): action distribution from the
            policy.
    Returns:
        AlgorithmStep. `info` field is LossInfo, other fields are empty.
    """
    entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
        distribution, self._action_spec)
    alpha_loss = self._log_alpha * tf.stop_gradient(entropy -
                                                    self._target_entropy)
    alpha = tf.stop_gradient(tf.exp(self._log_alpha))
    loss = alpha_loss
    entropy_loss = -entropy

    # Joint loss for optimizing alpha and entropy. The effect of alpha_loss
    # is to increase alpha when entropy is lower than the target and to
    # decrease alpha when entropy is larger than the target. The term
    # alpha * entropy_for_gradient encourages higher action entropy.
    loss -= alpha * entropy_for_gradient
    return AlgorithmStep(
        outputs=(),
        state=(),
        info=LossInfo(
            loss,
            extra=EntropyTargetLossInfo(
                alpha_loss=alpha_loss, entropy_loss=entropy_loss)))
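# Illustration (not part of the original module): a minimal runnable sketch of
# why `alpha_loss` above drives alpha toward the target entropy. Since
# alpha_loss = log_alpha * stop_gradient(entropy - target), we have
# d(alpha_loss)/d(log_alpha) = entropy - target, so gradient *descent*
# increases log_alpha (hence alpha) when entropy is below the target and
# decreases it when entropy is above. The numbers below are made up.
import tensorflow as tf

log_alpha = tf.Variable(0.0)
target_entropy = 1.0
for entropy in [0.5, 1.5]:  # below the target, then above it
    with tf.GradientTape() as tape:
        alpha_loss = log_alpha * tf.stop_gradient(entropy - target_entropy)
    grad = tape.gradient(alpha_loss, log_alpha)
    # grad == entropy - target_entropy: -0.5 (alpha goes up), +0.5 (goes down)
    print(float(grad))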
def _actor_train_step(self, exp: Experience, state: DdpgActorState):
    action, actor_state = self._actor_network(
        exp.observation, exp.step_type, network_state=state.actor)

    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(action)
        q_value, critic_state = self._critic_network(
            (exp.observation, action), network_state=state.critic)

    dqda = tape.gradient(q_value, action)

    def actor_loss_fn(dqda, action):
        if self._dqda_clipping:
            dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                    self._dqda_clipping)
        # Surrogate loss whose gradient w.r.t. `action` is -dqda, i.e.
        # gradient descent on this loss is gradient ascent on Q.
        loss = 0.5 * losses.element_wise_squared_loss(
            tf.stop_gradient(dqda + action), action)
        loss = tf.reduce_sum(loss, axis=list(range(1, len(loss.shape))))
        return loss

    actor_loss = tf.nest.map_structure(actor_loss_fn, dqda, action)
    state = DdpgActorState(actor=actor_state, critic=critic_state)
    info = LossInfo(loss=tf.add_n(tf.nest.flatten(actor_loss)),
                    extra=actor_loss)
    return PolicyStep(action=action, state=state, info=info)
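# Illustration (standalone, not from the original file): a tiny sketch of the
# `dqda` surrogate loss used in `actor_loss_fn` above. Minimizing
# 0.5 * (stop_gradient(dqda + a) - a)^2 with respect to `a` yields the
# gradient -dqda, so an optimizer stepping down this loss pushes the action
# in the direction that increases Q. All values here are made up for the demo.
import tensorflow as tf

action = tf.Variable([0.2, -0.3])
dqda = tf.constant([1.0, -2.0])  # pretend gradient of Q w.r.t. the action
with tf.GradientTape() as tape:
    loss = 0.5 * tf.reduce_sum(
        tf.square(tf.stop_gradient(dqda + action) - action))
grad = tape.gradient(loss, action)
print(grad.numpy())  # -> [-1.  2.], exactly -dqda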
def calc_loss(self, training_info):
    if self._icm is not None:
        self.add_reward_summary("reward/intrinsic",
                                training_info.info.icm_reward)
        training_info = training_info._replace(
            reward=self.calc_training_reward(training_info.reward,
                                             training_info.info))
        self.add_reward_summary("reward/overall", training_info.reward)

    ac_loss = self._loss(training_info, training_info.info.value)
    loss = ac_loss.loss
    extra = ActorCriticAlgorithmLossInfo(
        ac=ac_loss.extra, icm=(), entropy_target=())

    if self._icm is not None:
        icm_loss = self._icm.calc_loss(training_info.info.icm_info)
        loss += icm_loss.loss
        extra = extra._replace(icm=icm_loss.extra)

    if self._entropy_target_algorithm:
        et_loss = self._entropy_target_algorithm.calc_loss(
            training_info.info.entropy_target_info)
        loss += et_loss.loss
        extra = extra._replace(entropy_target=et_loss.extra)

    return LossInfo(loss=loss, extra=extra)
def calc_loss(self, training_info: TrainingInfo):
    critic_loss = self._calc_critic_loss(training_info)
    alpha_loss = training_info.info.alpha.loss
    actor_loss = training_info.info.actor.loss
    return LossInfo(
        loss=actor_loss.loss + critic_loss.loss + alpha_loss.loss,
        extra=SacLossInfo(actor=actor_loss.extra,
                          critic=critic_loss.extra,
                          alpha=alpha_loss.extra))
def __call__(self, training_info: TrainingInfo, value):
    """Calculate the actor critic loss.

    The first dimension of all the tensors is the time dimension and the
    second dimension is the batch dimension.

    Args:
        training_info (TrainingInfo): training_info collected by
            (On/Off)PolicyDriver. All tensors in training_info are
            time-major.
        value (tf.Tensor): the time-major tensor for the value at each time
            step.
    Returns:
        loss_info (LossInfo): with loss_info.extra being ActorCriticLossInfo
    """
    returns, advantages = self._calc_returns_and_advantages(
        training_info, value)

    def _summary():
        with tf.name_scope('ActorCriticLoss'):
            tf.summary.scalar("values", tf.reduce_mean(value))
            tf.summary.scalar("returns", tf.reduce_mean(returns))
            tf.summary.scalar("advantages", tf.reduce_mean(advantages))
            tf.summary.histogram("advantages", advantages)
            tf.summary.scalar(
                "explained_variance_of_return_by_value",
                common.explained_variance(value, returns))

    if self._debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    if self._normalize_advantages:
        advantages = _normalize_advantages(advantages, axes=(0, 1))

    if self._advantage_clip:
        advantages = tf.clip_by_value(advantages, -self._advantage_clip,
                                      self._advantage_clip)

    pg_loss = self._pg_loss(training_info, tf.stop_gradient(advantages))
    td_loss = self._td_error_loss_fn(tf.stop_gradient(returns), value)
    loss = pg_loss + self._td_loss_weight * td_loss

    entropy_loss = ()
    if self._entropy_regularization is not None:
        entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
            training_info.action_distribution, self._action_spec)
        entropy_loss = -entropy
        loss -= self._entropy_regularization * entropy_for_gradient

    return LossInfo(
        loss,
        ActorCriticLossInfo(
            td_loss=td_loss, pg_loss=pg_loss, entropy_loss=entropy_loss))
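# Hedged sketch: the module-level helper `_normalize_advantages` used above is
# not shown here; the following is a plausible implementation (an assumption,
# not the original), standardizing over the time and batch axes
# (axes=(0, 1)) so advantages have roughly zero mean and unit variance before
# the policy-gradient loss.
import tensorflow as tf

def _normalize_advantages(advantages, axes=(0, 1), epsilon=1e-8):
    # Standardize across the given axes; epsilon guards against division by 0.
    mean = tf.reduce_mean(advantages, axis=axes, keepdims=True)
    var = tf.reduce_mean(tf.square(advantages - mean), axis=axes,
                         keepdims=True)
    return (advantages - mean) / (tf.sqrt(var) + epsilon)

# Usage: advantages shaped [T, B]; the output has ~zero mean and ~unit std.
adv = tf.random.normal([8, 4]) * 3.0 + 5.0
print(_normalize_advantages(adv).numpy().std())  # ~1.0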
def calc_loss(self, training_info: TrainingInfo): """Calculate loss.""" self.add_reward_summary("reward", training_info.reward) mbp_loss_info = self._mbp.calc_loss(training_info.info.mbp_info) mba_loss_info = self._mba.calc_loss( training_info._replace(info=training_info.info.mba_info)) return LossInfo(loss=mbp_loss_info.loss + mba_loss_info.loss, extra=MerlinLossInfo(mbp=mbp_loss_info.extra, mba=mba_loss_info.extra))
def calc_loss(self, training_info: TrainingInfo):
    critic_loss = self._critic_loss(
        training_info=training_info,
        value=training_info.info.critic.q_value,
        target_value=training_info.info.critic.target_q_value)
    actor_loss = training_info.info.actor_loss
    return LossInfo(loss=critic_loss.loss + actor_loss.loss,
                    extra=DdpgLossInfo(critic=critic_loss.extra,
                                       actor=actor_loss.extra))
def decode_step(self, latent_vector, observations):
    """Calculate decoding loss."""
    decoders = tf.nest.flatten(self._decoders)
    observations = tf.nest.flatten(observations)
    decoder_losses = [
        decoder.train_step((latent_vector, obs)).info
        for decoder, obs in zip(decoders, observations)
    ]
    loss = tf.add_n([decoder_loss.loss for decoder_loss in decoder_losses])
    decoder_losses = tf.nest.pack_sequence_as(self._decoders,
                                              decoder_losses)
    return LossInfo(loss=loss, extra=decoder_losses)
def __call__(self, training_info: TrainingInfo, value, target_value):
    returns = value_ops.one_step_discounted_return(
        rewards=training_info.reward,
        values=target_value,
        step_types=training_info.step_type,
        discounts=training_info.discount * self._gamma)
    returns = common.tensor_extend(returns, value[-1])
    if self._debug_summaries:
        with tf.name_scope('OneStepTDLoss'):
            tf.summary.scalar("values", tf.reduce_mean(value))
            tf.summary.scalar("returns", tf.reduce_mean(returns))
    loss = self._td_error_loss_fn(tf.stop_gradient(returns), value)
    return LossInfo(loss=loss, extra=loss)
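# Hedged sketch (an assumption about the semantics of
# `value_ops.one_step_discounted_return`, not the library code): for
# time-major tensors it plausibly computes the one-step bootstrapped target
# R[t] = r[t+1] + d[t+1] * V[t+1], ignoring step-type handling for brevity.
# The result has length T-1 and is then extended with value[-1] above so it
# matches `value` in length.
import tensorflow as tf

def one_step_discounted_return(rewards, values, discounts):
    # rewards, values, discounts: time-major, shape [T, B].
    # discounts are assumed to already include the gamma factor, matching
    # `training_info.discount * self._gamma` above.
    return rewards[1:] + discounts[1:] * values[1:]  # shape [T-1, B]

rewards = tf.ones([4, 2])
values = tf.fill([4, 2], 10.0)
discounts = tf.fill([4, 2], 0.99)
print(one_step_discounted_return(rewards, values, discounts).numpy())
# each entry: 1 + 0.99 * 10 = 10.9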
def _train(  # pylint: disable=arguments-differ
        self,
        experience: NestedTensor,
        weights: Optional[Tensor] = None,
        **kwargs) -> LossInfo:
    """
    Train one or more of the models composing the environment model.
    Models need to be of a trainable type.

    :param experience: A batch of experience data in the form of a
        `Trajectory`. All tensors in `experience` must be shaped
        `[batch, time, ...]`.
    :param weights: Optional scalar or element-wise (per-batch-entry)
        importance weights. Not used at the moment.
    :param kwargs: A dictionary that contains a key with a string tensor
        value indicating which model should be trained.
    :return: A `LossInfo` tuple containing the loss and info tensors of the
        trained model.
    """
    # TODO: The TFAgent class has an error in its _train method (missing
    # kwargs). It will probably be fixed in due time; until then we disable
    # linting on the signature.
    trainable_component_name = kwargs[TRAIN_ARGSPEC_COMPONENT_ID].numpy()

    self._train_step_counter.assign_add(1)

    if trainable_component_name not in self._trainable_components:
        warn(
            f"Trainable component {trainable_component_name} name not in "
            f"trainable components {self._trainable_components}, no train!",
            RuntimeWarning,
        )
        return LossInfo(None, None)

    model, model_training_spec = self._trainable_components[
        trainable_component_name]
    if model_training_spec is not None:
        history = model.train(experience, model_training_spec)
        return LossInfo(history.history["loss"], None)
    else:
        return model.train(experience)
def train_step(self, inputs, state=None): """Perform training on one batch of inputs. Args: inputs (tuple(Tensor, Tensor)): tuple of x and y state: not used Returns: AlgorithmStep outputs (Tensor): shape=[batch_size], its mean is the estimated MI state: not used info (LossInfo): info.loss is the loss """ x, y = inputs num_outer_dims = get_outer_rank(x, self._x_spec) batch_squash = BatchSquash(num_outer_dims) x = batch_squash.flatten(x) y = batch_squash.flatten(y) x1, y1 = self._sampler(x, y) log_ratio = self._model([x, y])[0] t1 = self._model([x1, y1])[0] if self._type == 'DV': ratio = tf.math.exp(tf.minimum(t1, 20)) mean = tf.stop_gradient(tf.reduce_mean(ratio)) if self._mean_averager: self._mean_averager.update(mean) unbiased_mean = tf.stop_gradient(self._mean_averager.get()) else: unbiased_mean = mean # estimated MI = reduce_mean(mi) # ratio/mean-1 does not contribute to the final estimated MI, since # mean(ratio/mean-1) = 0. We add it so that we can have an estimation # of the variance of the MI estimator mi = log_ratio - (tf.math.log(mean) + ratio / mean - 1) loss = ratio / unbiased_mean - log_ratio elif self._type == 'KLD': ratio = tf.math.exp(tf.minimum(t1, 20)) mi = log_ratio - ratio + 1 loss = -mi elif self._type == 'JSD': mi = -tf.nn.softplus(-log_ratio) - tf.nn.softplus(t1) + math.log(4) loss = -mi mi = batch_squash.unflatten(mi) loss = batch_squash.unflatten(loss) return AlgorithmStep(outputs=mi, state=(), info=LossInfo(loss, extra=()))
def _calc_critic_loss(self, training_info):
    critic_info = training_info.info.critic
    target_critic = critic_info.target_critic
    critic_loss1 = self._critic_loss(
        training_info=training_info,
        value=critic_info.critic1,
        target_value=target_critic)
    critic_loss2 = self._critic_loss(
        training_info=training_info,
        value=critic_info.critic2,
        target_value=target_critic)
    critic_loss = critic_loss1.loss + critic_loss2.loss
    return LossInfo(loss=critic_loss, extra=critic_loss)
def train_step(self, inputs, state: MBPState):
    """Train one step.

    Args:
        inputs (tuple): a tuple of (observation, action)
    """
    observation, _ = inputs
    latent_vector, kld, next_state = self.encode_step(inputs, state)
    # TODO: decoder for action
    decoder_loss = self.decode_step(latent_vector, observation)
    return AlgorithmStep(
        outputs=latent_vector,
        state=next_state,
        info=LossInfo(
            loss=self._loss_weight * (decoder_loss.loss + kld),
            extra=MBPLossInfo(decoder=decoder_loss, vae=kld)))
def train_step(self, inputs, state=None): """Train one step. Args: inputs (tuple): tuple of (inputs, target) state (nested Tensor): network state for `decoder` Returns: AlgorithmStep with the following fields: outputs: decoding result state: rnn state info: loss of decoding """ input, target = inputs pred, state = self._decoder(input, network_state=state) assert pred.shape == target.shape loss = self._loss(target, pred) return AlgorithmStep(outputs=pred, state=state, info=LossInfo(self._loss_weight * loss, extra=()))
def _train_model_free_agent(self, experience: NestedTensor) -> LossInfo:
    """
    Train the model-free agent virtually for multiple iterations.

    :param experience: A batch of experience data in the form of a
        `Trajectory`. All tensors in `experience` must be shaped
        `[batch, time, ...]`. Importantly, this is real-world experience and
        the agent needs to decide how to leverage this real-world experience
        for virtual training using the environment model.
    :return: A `LossInfo` tuple containing the loss and info tensors of the
        trained model.
    """
    assert tf.keras.backend.ndim(experience.observation) >= 3
    assert experience.observation.shape[0] == 1, \
        "The real environment has batch size 1."

    mask = ~experience.is_boundary()  # [batch, time, ...]
    masked_observation = tf.boolean_mask(
        experience.observation, mask)  # [reduced batch, ...]

    model_free_losses = []
    for _ in range(self._model_free_training_iterations):
        random_indexes = tf.random.uniform(
            shape=(self._environment_model.batch_size,),
            maxval=masked_observation.shape[0],
            dtype=tf.int32,
        )
        initial_observation = tf.gather(
            masked_observation, random_indexes)  # [env model batch, ...]
        initial_time_step = self._environment_model.set_initial_observation(
            initial_observation)

        self._virtual_rollouts_driver.run(initial_time_step)
        policy_experience = self._virtual_rollouts_replay_buffer.gather_all()
        model_free_losses.append(
            self._model_free_agent.train(policy_experience))
        self._virtual_rollouts_replay_buffer.clear()

    loss_info = LossInfo(
        loss=model_free_losses[0].loss, extra=model_free_losses)
    return loss_info
def train_step(self, inputs, state): """ Args: inputs (tuple): observation and previous action Returns: TrainStep: outputs: intrinsic reward state: info: """ feature, prev_action = inputs if self._encoding_net is not None: feature, _ = self._encoding_net(feature) prev_feature = state prev_action = self._encode_action(prev_action) forward_pred, _ = self._forward_net( inputs=[tf.stop_gradient(prev_feature), prev_action]) forward_loss = 0.5 * tf.reduce_mean( tf.square(tf.stop_gradient(feature) - forward_pred), axis=-1) action_pred, _ = self._inverse_net(inputs=[prev_feature, feature]) if tensor_spec.is_discrete(self._action_spec): inverse_loss = tf.nn.softmax_cross_entropy_with_logits( labels=prev_action, logits=action_pred) else: inverse_loss = 0.5 * tf.reduce_mean( tf.square(prev_action - action_pred), axis=-1) intrinsic_reward = tf.stop_gradient(forward_loss) intrinsic_reward = self._reward_normalizer.normalize(intrinsic_reward) return AlgorithmStep(outputs=intrinsic_reward, state=feature, info=LossInfo(loss=forward_loss + inverse_loss, extra=ICMLossInfo( forward_loss=forward_loss, inverse_loss=inverse_loss)))
def _train(self, experience, weights):
    """Modifies the default _train step in two ways.

    1. Passes actions and next time steps to actor loss.
    2. Clips the dual parameter.

    Args:
        experience: A time-stacked trajectory object.
        weights: Optional scalar or elementwise (per-batch-entry)
            importance weights.
    Returns:
        A train_op.
    """
    transition = self._as_transition(experience)
    time_steps, policy_steps, next_time_steps = transition
    actions = policy_steps.action

    trainable_critic_variables = list(object_identity.ObjectIdentitySet(
        self._critic_network_1.trainable_variables +
        self._critic_network_2.trainable_variables))

    tf.debugging.check_numerics(
        tf.reduce_mean(time_steps.reward), 'ts.reward is inf or nan.')
    tf.debugging.check_numerics(
        tf.reduce_mean(next_time_steps.reward),
        'next_ts.reward is inf or nan.')
    tf.debugging.check_numerics(
        tf.reduce_mean(actions), 'Actions is inf or nan.')

    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert trainable_critic_variables, (
            'No trainable critic variables to optimize.')
        tape.watch(trainable_critic_variables)
        critic_loss = self._critic_loss_weight * self.critic_loss(
            time_steps,
            actions,
            next_time_steps,
            td_errors_loss_fn=self._td_errors_loss_fn,
            gamma=self._gamma,
            reward_scale_factor=self._reward_scale_factor,
            weights=weights,
            training=True)
    tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
    critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
    self._apply_gradients(critic_grads, trainable_critic_variables,
                          self._critic_optimizer)

    trainable_actor_variables = self._actor_network.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert trainable_actor_variables, (
            'No trainable actor variables to optimize.')
        tape.watch(trainable_actor_variables)
        actor_loss = self._actor_loss_weight * self.actor_loss(
            time_steps, actions, next_time_steps, weights=weights)
    tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
    actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
    self._apply_gradients(actor_grads, trainable_actor_variables,
                          self._actor_optimizer)

    alpha_variable = [self._log_alpha]
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        assert alpha_variable, 'No alpha variable to optimize.'
        tape.watch(alpha_variable)
        alpha_loss = self._alpha_loss_weight * self.alpha_loss(
            time_steps, weights=weights)
    tf.debugging.check_numerics(alpha_loss, 'Alpha loss is inf or nan.')
    alpha_grads = tape.gradient(alpha_loss, alpha_variable)
    self._apply_gradients(alpha_grads, alpha_variable,
                          self._alpha_optimizer)

    with tf.name_scope('Losses'):
        tf.compat.v2.summary.scalar(
            name='critic_loss', data=critic_loss,
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='actor_loss', data=actor_loss,
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='alpha_loss', data=alpha_loss,
            step=self.train_step_counter)

    self.train_step_counter.assign_add(1)
    self._update_target()

    total_loss = critic_loss + actor_loss + alpha_loss
    extra = sac_agent.SacLossInfo(
        critic_loss=critic_loss, actor_loss=actor_loss,
        alpha_loss=alpha_loss)
    return LossInfo(loss=total_loss, extra=extra)
def train_model_free_agent_step() -> LossInfo:
    if not self._has_transition_model_been_trained:
        return LossInfo(None, None)
    trajectory = replay_buffer.gather_all()
    return agent.train(trajectory, **train_model_free_agent_kwargs_dict)
def train_step():
    if (tf.data.experimental.cardinality(dataset).numpy() >=
            self._training_data_batch_size):
        experience, _ = next(iterator)
        return agent.train(experience)
    return LossInfo(None, None)
def __call__(self) -> LossInfo:
    return LossInfo(0.0, extra=self._identifier)
def _alpha_train_step(self, log_pi):
    alpha_loss = self._log_alpha * tf.stop_gradient(
        -log_pi - self._target_entropy)
    info = SacAlphaInfo(loss=LossInfo(loss=alpha_loss, extra=alpha_loss))
    return info
def _actor_train_step(self, exp: Experience, state: SacActorState,
                      action_distribution, action, log_pi):
    if self._is_continuous:
        critic_input = (exp.observation, action)
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(action)
            critic1, critic1_state = self._critic_network1(
                critic_input,
                step_type=exp.step_type,
                network_state=state.critic1)
            critic2, critic2_state = self._critic_network2(
                critic_input,
                step_type=exp.step_type,
                network_state=state.critic2)
            target_q_value = tf.minimum(critic1, critic2)

        dqda = tape.gradient(target_q_value, action)

        def actor_loss_fn(dqda, action):
            if self._dqda_clipping:
                dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                        self._dqda_clipping)
            loss = 0.5 * losses.element_wise_squared_loss(
                tf.stop_gradient(dqda + action), action)
            loss = tf.reduce_sum(loss,
                                 axis=list(range(1, len(loss.shape))))
            return loss

        actor_loss = tf.nest.map_structure(actor_loss_fn, dqda, action)
        alpha = tf.stop_gradient(tf.exp(self._log_alpha))
        actor_loss += alpha * log_pi
    else:
        critic1, critic1_state = self._critic_network1(
            exp.observation,
            step_type=exp.step_type,
            network_state=state.critic1)
        critic2, critic2_state = self._critic_network2(
            exp.observation,
            step_type=exp.step_type,
            network_state=state.critic2)
        assert isinstance(
            action_distribution, tfp.distributions.Categorical), (
                "Only `tfp.distributions.Categorical` is supported, "
                "received: " + str(type(action_distribution)))
        action_probs = action_distribution.probs
        log_action_probs = tf.math.log(action_probs + 1e-8)
        target_q_value = tf.stop_gradient(tf.minimum(critic1, critic2))
        alpha = tf.stop_gradient(tf.exp(self._log_alpha))
        actor_loss = tf.reduce_mean(
            action_probs * (alpha * log_action_probs - target_q_value),
            axis=-1)

    state = SacActorState(critic1=critic1_state, critic2=critic2_state)
    info = SacActorInfo(loss=LossInfo(loss=actor_loss, extra=actor_loss))
    return state, info
def _none_returning_train_step():
    return LossInfo(loss=None, extra=None)