def train_step(self, inputs, state=None): """Train one step. Args: inputs (tuple): tuple of (inputs, target) state (nested Tensor): network state for `decoder` Returns: AlgorithmStep with the following fields: outputs: decoding result state: rnn state info: loss of decoding """ input, target = inputs pred, state = self._decoder(input, network_state=state) assert pred.shape == target.shape loss = self._loss(target, pred) if len(loss.shape) > 1: # reduce to (B,) reduce_dims = list(range(1, len(loss.shape))) loss = tf.reduce_sum(loss, axis=reduce_dims) return AlgorithmStep(outputs=pred, state=state, info=LossInfo(loss=self._loss_weight * loss))
def train_step(self, inputs, loss_func, batch_size=None, state=None):
    """
    Args:
        inputs (nested Tensor): if None, the outputs are generated only from
            noise.
        loss_func (Callable): loss_func([outputs, inputs]) (loss_func(outputs)
            if inputs is None) returns a Tensor with shape [batch_size] as
            the loss for optimizing the generator.
        batch_size (int): batch size. Must be provided if inputs is None.
            It is ignored if inputs is not None.
        state: not used
    Returns:
        AlgorithmStep:
            outputs: Tensor with shape (batch_size, dim)
            info: LossInfo
    """
    outputs, gen_inputs = self._predict(inputs, batch_size)
    loss, grad = self._grad_func(inputs, outputs, loss_func)
    loss_propagated = tf.reduce_sum(
        tf.stop_gradient(grad) * outputs, axis=-1)

    mi_loss = ()
    if self._mi_estimator is not None:
        mi_step = self._mi_estimator.train_step([gen_inputs, outputs])
        mi_loss = mi_step.info.loss
        loss_propagated += self._mi_weight * mi_loss

    return AlgorithmStep(
        outputs=outputs,
        state=(),
        info=LossInfo(
            loss=loss_propagated,
            extra=GeneratorLossInfo(generator=loss, mi_estimator=mi_loss)))
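
# Minimal sketch (not the ALF API) of the surrogate-loss trick used above:
# backpropagating sum(stop_gradient(grad) * outputs) pushes an externally
# computed per-element gradient `grad` into the parameters that produced
# `outputs`, without recomputing the loss inside this tape. All values here
# are made up for illustration.
import tensorflow as tf

w = tf.Variable([1.0, 2.0])
with tf.GradientTape() as tape:
    outputs = 3.0 * w                      # stands in for the generator output
    grad = tf.constant([0.5, -1.0])        # d(loss)/d(outputs), computed elsewhere
    loss_propagated = tf.reduce_sum(tf.stop_gradient(grad) * outputs)
dw = tape.gradient(loss_propagated, w)
print(dw.numpy())                          # [ 1.5 -3. ] == grad * d(outputs)/dw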
def train_step(self, distribution): """Train step. Args: distribution (nested Distribution): action distribution from the policy. Returns: AlgorithmStep. `info` field is LossInfo, other fields are empty. """ entropy, entropy_for_gradient = dist_utils.entropy_with_fallback( distribution, self._action_spec) alpha_loss = self._log_alpha * tf.stop_gradient(entropy - self._target_entropy) alpha = tf.stop_gradient(tf.exp(self._log_alpha)) loss = alpha_loss entropy_loss = -entropy # Joint loss for optimizing alpha and entropy. The effect of alpha_loss # is to increase alpha when entropy is lower than target and decrease # alpha when entropy is larger than target. alpha * entropy_for_gradient # is to encourage higher action entropy. loss -= alpha * entropy_for_gradient return AlgorithmStep( outputs=(), state=(), info=LossInfo( loss, extra=EntropyTargetLossInfo( alpha_loss=alpha_loss, entropy_loss=entropy_loss)))
def train_step(self, inputs, state, calc_intrinsic_reward=True):
    """
    Args:
        inputs (tuple): observation
        state (tuple): empty tuple ()
        calc_intrinsic_reward (bool): if False, only return the losses
    Returns:
        TrainStep:
            outputs: empty tuple ()
            state: empty tuple ()
            info: RNDInfo
    """
    observation, _ = inputs
    if self._observation_normalizer is not None:
        observation = self._observation_normalizer.normalize(observation)

    pred_embedding, _ = self._predictor_net(observation)
    target_embedding, _ = self._target_net(observation)

    loss = 0.5 * tf.reduce_mean(
        tf.square(pred_embedding - tf.stop_gradient(target_embedding)),
        axis=-1)

    intrinsic_reward = ()
    if calc_intrinsic_reward:
        intrinsic_reward = tf.stop_gradient(loss)
        intrinsic_reward = self._reward_normalizer.normalize(intrinsic_reward)

    return AlgorithmStep(
        outputs=(),
        state=(),
        info=RNDInfo(reward=intrinsic_reward, loss=LossInfo(loss=loss)))
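
# Toy sketch of the RND quantity computed above (names are illustrative, not
# ALF's): a frozen, randomly initialized "target" network and a trainable
# "predictor"; the per-sample squared error is both the training loss and the
# intrinsic reward, and it stays larger for rarely visited observations.
import tensorflow as tf

target_net = tf.keras.layers.Dense(16, trainable=False)   # fixed random mapping
predictor_net = tf.keras.layers.Dense(16)
obs = tf.random.normal([4, 8])                             # batch of 4 observations

pred = predictor_net(obs)
target = tf.stop_gradient(target_net(obs))
loss = 0.5 * tf.reduce_mean(tf.square(pred - target), axis=-1)  # shape (4,)
intrinsic_reward = tf.stop_gradient(loss)                  # novelty bonus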
def train_step(self,
               time_step: ActionTimeStep,
               state,
               calc_intrinsic_reward=True):
    """
    Args:
        time_step (ActionTimeStep): input time_step data
        state (tuple): state for MISC (previous observation,
            previous previous action)
        calc_intrinsic_reward (bool): if False, only return the losses
    Returns:
        TrainStep:
            outputs: empty tuple ()
            state: tuple of observation and previous action
            info (MISCInfo):
    """
    feature_state = time_step.observation
    prev_action = time_step.prev_action
    feature = tf.concat([feature_state, prev_action], axis=-1)
    prev_feature = tf.concat(state, axis=-1)

    feature_reshaped = tf.expand_dims(feature, axis=1)
    prev_feature_reshaped = tf.expand_dims(prev_feature, axis=1)
    feature_pair = tf.concat([prev_feature_reshaped, feature_reshaped], 1)
    feature_reshaped_tran = transpose2(feature_reshaped, 1, 0)

    def add_batch():
        self._buffer.add_batch(feature_reshaped_tran)

    if calc_intrinsic_reward:
        add_batch()

    if self._n_objects < 2:
        obs_tau_excludes_goal, obs_tau_achieved_goal = \
            self._split_observation_fn(feature_pair)
        loss = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal)
    elif self._n_objects == 2:
        obs_tau_excludes_goal, obs_tau_achieved_goal_1, obs_tau_achieved_goal_2 = \
            self._split_observation_fn(feature_pair)
        loss_1 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_1)
        loss_2 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_2)
        loss = loss_1 + loss_2

    intrinsic_reward = ()
    if calc_intrinsic_reward:
        # scale/normalize the MISC intrinsic reward
        if self._n_objects < 2:
            intrinsic_reward = tf.clip_by_value(self._mi_r_scale * loss, 0, 1)
        elif self._n_objects == 2:
            intrinsic_reward = (
                tf.clip_by_value(self._mi_r_scale * loss_1, 0, 1) +
                tf.clip_by_value(self._mi_r_scale * loss_2, 0, 1))

    return AlgorithmStep(
        outputs=(),
        state=[feature_state, prev_action],
        info=MISCInfo(reward=intrinsic_reward))
def train_step(self, time_step: ActionTimeStep, state): """ Args: time_step (ActionTimeStep): input data for dynamics learning state (Tensor): state for dynamics learning (previous observation) Returns: TrainStep: outputs: empty tuple () state (DynamicsState): state for training info (DynamicsInfo): """ return AlgorithmStep(outputs=(), state=(), info=())
def predict(self, time_step: ActionTimeStep, state: DynamicsState):
    """Predict the next observation given the current time_step.

    The next step is predicted using the prev_action from time_step and
    the feature from state.
    """
    action = self._encode_action(time_step.prev_action)
    obs = state.feature
    forward_delta, network_state = self._dynamics_network(
        inputs=[obs, action], network_state=state.network)
    forward_pred = obs + forward_delta
    state = state._replace(feature=forward_pred, network=network_state)
    return AlgorithmStep(outputs=forward_pred, state=state, info=())
def train_step(self, time_step: ActionTimeStep, state, calc_intrinsic_reward=True): """ Args: time_step (ActionTimeStep): input time_step data, where the observation is skill-augmened observation state (Tensor): state for DIAYN (previous skill) calc_intrinsic_reward (bool): if False, only return the losses Returns: TrainStep: outputs: empty tuple () state: skill info (DIAYNInfo): """ observations_aug = time_step.observation step_type = time_step.step_type observation, skill = observations_aug prev_skill = state if self._encoding_net is not None: feature, _ = self._encoding_net(observation) skill_pred, _ = self._discriminator_net(inputs=feature) skill_discriminate_loss = tf.nn.softmax_cross_entropy_with_logits( labels=prev_skill, logits=skill_pred) valid_masks = tf.cast( tf.not_equal(step_type, StepType.FIRST), tf.float32) skill_discriminate_loss = skill_discriminate_loss * valid_masks intrinsic_reward = () if calc_intrinsic_reward: # use negative cross-entropy as reward # neglect neg-prior term as it is constant intrinsic_reward = tf.stop_gradient(-skill_discriminate_loss) intrinsic_reward = self._reward_normalizer.normalize( intrinsic_reward) return AlgorithmStep( outputs=(), state=skill, info=DIAYNInfo( reward=intrinsic_reward, loss=LossInfo( loss=skill_discriminate_loss, extra=dict( skill_discriminate_loss=skill_discriminate_loss))))
def train_step(self, inputs, state=None): """Perform training on one batch of inputs. Args: inputs (tuple(Tensor, Tensor)): tuple of x and y state: not used Returns: AlgorithmStep outputs (Tensor): shape=[batch_size], its mean is the estimated MI state: not used info (LossInfo): info.loss is the loss """ x, y = inputs num_outer_dims = get_outer_rank(x, self._x_spec) batch_squash = BatchSquash(num_outer_dims) x = batch_squash.flatten(x) y = batch_squash.flatten(y) x1, y1 = self._sampler(x, y) log_ratio = self._model([x, y])[0] t1 = self._model([x1, y1])[0] if self._type == 'DV': ratio = tf.math.exp(tf.minimum(t1, 20)) mean = tf.stop_gradient(tf.reduce_mean(ratio)) if self._mean_averager: self._mean_averager.update(mean) unbiased_mean = tf.stop_gradient(self._mean_averager.get()) else: unbiased_mean = mean # estimated MI = reduce_mean(mi) # ratio/mean-1 does not contribute to the final estimated MI, since # mean(ratio/mean-1) = 0. We add it so that we can have an estimation # of the variance of the MI estimator mi = log_ratio - (tf.math.log(mean) + ratio / mean - 1) loss = ratio / unbiased_mean - log_ratio elif self._type == 'KLD': ratio = tf.math.exp(tf.minimum(t1, 20)) mi = log_ratio - ratio + 1 loss = -mi elif self._type == 'JSD': mi = -tf.nn.softplus(-log_ratio) - tf.nn.softplus(t1) + math.log(4) loss = -mi mi = batch_squash.unflatten(mi) loss = batch_squash.unflatten(loss) return AlgorithmStep(outputs=mi, state=(), info=LossInfo(loss, extra=()))
def predict(self, inputs, batch_size=None, state=None): """Generate outputs given inputs. Args: inputs (nested Tensor): if None, the outputs is generated only from noise. batch_size (int): batch_size. Must be provided if inputs is None. Its is ignored if inputs is not None state: not used Returns: AlgorithmStep: outputs with shape (batch_size, output_dim) """ outputs, _ = self._predict(inputs, batch_size, training=False) return AlgorithmStep(outputs=outputs, state=(), info=())
def train_step(self, time_step: ActionTimeStep, state, calc_intrinsic_reward=True): """ Args: time_step (ActionTimeStep): input time_step data for ICM state (Tensor): state for ICM (previous observation) calc_intrinsic_reward (bool): if False, only return the losses Returns: TrainStep: outputs: empty tuple () state: observation info (ICMInfo): """ feature = time_step.observation prev_action = time_step.prev_action if self._encoding_net is not None: feature, _ = self._encoding_net(feature) prev_feature = state prev_action = self._encode_action(prev_action) forward_pred, _ = self._forward_net( inputs=[tf.stop_gradient(prev_feature), prev_action]) forward_loss = 0.5 * tf.reduce_mean( tf.square(tf.stop_gradient(feature) - forward_pred), axis=-1) action_pred, _ = self._inverse_net(inputs=[prev_feature, feature]) if tensor_spec.is_discrete(self._action_spec): inverse_loss = tf.nn.softmax_cross_entropy_with_logits( labels=prev_action, logits=action_pred) else: inverse_loss = 0.5 * tf.reduce_mean( tf.square(prev_action - action_pred), axis=-1) intrinsic_reward = () if calc_intrinsic_reward: intrinsic_reward = tf.stop_gradient(forward_loss) intrinsic_reward = self._reward_normalizer.normalize( intrinsic_reward) return AlgorithmStep( outputs=(), state=feature, info=ICMInfo(reward=intrinsic_reward, loss=LossInfo(loss=forward_loss + inverse_loss, extra=dict(forward_loss=forward_loss, inverse_loss=inverse_loss))))
def train_step(self, inputs, state: MBPState):
    """Train one step.

    Args:
        inputs (tuple): a tuple of (observation, action)
    """
    observation, _ = inputs
    latent_vector, kld, next_state = self.encode_step(inputs, state)

    # TODO: decoder for action
    decoder_loss = self.decode_step(latent_vector, observation)

    return AlgorithmStep(
        outputs=latent_vector,
        state=next_state,
        info=LossInfo(
            loss=self._loss_weight * (decoder_loss.loss + kld),
            extra=MBPLossInfo(decoder=decoder_loss, vae=kld)))
def train_step(self, time_step: ActionTimeStep, state, calc_intrinsic_reward=True): """ Args: time_step (ActionTimeStep): input time_step data state (tuple): empty tuple () calc_intrinsic_reward (bool): if False, only return the losses Returns: TrainStep: outputs: empty tuple () state: empty tuple () info: ICMInfo """ observation = time_step.observation if self._stacked_frames: # Assuming stacking in the last dim, we only keep the last frame. observation = observation[..., -1:] if self._observation_normalizer is not None: observation = self._observation_normalizer.normalize(observation) if self._encoder_net is not None: observation = tf.stop_gradient(self._encoder_net(observation)[0]) pred_embedding, _ = self._predictor_net(observation) target_embedding, _ = self._target_net(observation) loss = tf.reduce_sum( tf.square(pred_embedding - tf.stop_gradient(target_embedding)), axis=-1) intrinsic_reward = () if calc_intrinsic_reward: intrinsic_reward = tf.stop_gradient(loss) if self._reward_normalizer: intrinsic_reward = self._reward_normalizer.normalize( intrinsic_reward, clip_value=self._reward_clip_value) return AlgorithmStep( outputs=(), state=(), info=ICMInfo(reward=intrinsic_reward, loss=LossInfo(loss=loss)))
def train_step(self, distribution, step_type): """Train step. Args: distribution (nested Distribution): action distribution from the policy. Returns: AlgorithmStep. `info` field is LossInfo, other fields are empty. """ entropy, entropy_for_gradient = dist_utils.entropy_with_fallback( distribution, self._action_spec) return AlgorithmStep( outputs=(), state=(), info=EntropyTargetInfo( step_type=step_type, loss=LossInfo( loss=-entropy_for_gradient, extra=EntropyTargetLossInfo(entropy_loss=-entropy))))
def train_step(self, inputs, state=None): """Train one step. Args: inputs (tuple): tuple of (inputs, target) state (nested Tensor): network state for `decoder` Returns: AlgorithmStep with the following fields: outputs: decoding result state: rnn state info: loss of decoding """ input, target = inputs pred, state = self._decoder(input, network_state=state) assert pred.shape == target.shape loss = self._loss(target, pred) return AlgorithmStep(outputs=pred, state=state, info=LossInfo(self._loss_weight * loss, extra=()))
def train_step(self, inputs, state): """ Args: inputs (tuple): observation and previous action Returns: TrainStep: outputs: intrinsic reward state: info: """ feature, prev_action = inputs if self._encoding_net is not None: feature, _ = self._encoding_net(feature) prev_feature = state prev_action = self._encode_action(prev_action) forward_pred, _ = self._forward_net( inputs=[tf.stop_gradient(prev_feature), prev_action]) forward_loss = 0.5 * tf.reduce_mean( tf.square(tf.stop_gradient(feature) - forward_pred), axis=-1) action_pred, _ = self._inverse_net(inputs=[prev_feature, feature]) if tensor_spec.is_discrete(self._action_spec): inverse_loss = tf.nn.softmax_cross_entropy_with_logits( labels=prev_action, logits=action_pred) else: inverse_loss = 0.5 * tf.reduce_mean( tf.square(prev_action - action_pred), axis=-1) intrinsic_reward = tf.stop_gradient(forward_loss) intrinsic_reward = self._reward_normalizer.normalize(intrinsic_reward) return AlgorithmStep(outputs=intrinsic_reward, state=feature, info=LossInfo(loss=forward_loss + inverse_loss, extra=ICMLossInfo( forward_loss=forward_loss, inverse_loss=inverse_loss)))
def train_step(self, time_step: ActionTimeStep, state: DynamicsState):
    """
    Args:
        time_step (ActionTimeStep): input data for dynamics learning
        state (DynamicsState): state for dynamics learning (previous
            observation)
    Returns:
        TrainStep:
            outputs: empty tuple ()
            state (DynamicsState): state for training
            info (DynamicsInfo):
    """
    feature = time_step.observation
    dynamics_step = self.predict(time_step, state)
    forward_pred = dynamics_step.outputs
    forward_loss = 0.5 * tf.reduce_mean(
        tf.square(feature - forward_pred), axis=-1)

    info = DynamicsInfo(
        loss=LossInfo(
            loss=forward_loss, extra=dict(forward_loss=forward_loss)))
    state = DynamicsState(feature=feature)

    return AlgorithmStep(outputs=(), state=state, info=info)
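
# Minimal sketch (illustrative, not the ALF API) of the delta-style dynamics
# model used by predict/train_step above: the network outputs a change in the
# observation, and the forward loss is the per-sample squared error of
# obs + delta against the actually observed next feature.
import tensorflow as tf

dynamics_net = tf.keras.layers.Dense(8)
obs = tf.random.normal([4, 8])
action = tf.random.normal([4, 2])
next_obs = tf.random.normal([4, 8])

forward_delta = dynamics_net(tf.concat([obs, action], axis=-1))
forward_pred = obs + forward_delta                 # predicted next observation
forward_loss = 0.5 * tf.reduce_mean(tf.square(next_obs - forward_pred), axis=-1)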
def _ml_step(self, x, y, y_distribution):
    pmi = self._ml_pmi(x, y, y_distribution)
    return AlgorithmStep(outputs=pmi, state=(), info=LossInfo(loss=-pmi))
def train_step(self, inputs, y_distribution=None, state=None):
    """Perform training on one batch of inputs.

    Args:
        inputs (tuple(nested Tensor, nested Tensor)): tuple of x and y
        y_distribution (nested tfp.distributions.Distribution): distribution
            for the marginal distribution of y. If None, will use the
            sampling method `sampler` provided at constructor to generate
            the samples for the marginal distribution of Y.
        state: not used
    Returns:
        AlgorithmStep:
            outputs (Tensor): shape=[batch_size], its mean is the estimated
                MI for estimator 'KL', 'DV' and 'KLD', and the
                Jensen-Shannon divergence for estimator 'JSD'
            state: not used
            info (LossInfo): info.loss is the loss
    """
    x, y = inputs
    if self._type == 'ML':
        return self._ml_step(x, y, y_distribution)

    num_outer_dims = get_outer_rank(x, self._x_spec)
    batch_squash = BatchSquash(num_outer_dims)
    x = batch_squash.flatten(x)
    y = batch_squash.flatten(y)

    if y_distribution is None:
        x1, y1 = self._sampler(x, y)
    else:
        x1 = x
        y1 = y_distribution.sample()
        y1 = batch_squash.flatten(y1)

    log_ratio = self._model([x, y])[0]
    t1 = self._model([x1, y1])[0]

    if self._type == 'DV':
        ratio = tf.math.exp(tf.minimum(t1, 20))
        mean = tf.stop_gradient(tf.reduce_mean(ratio))
        if self._mean_averager:
            self._mean_averager.update(mean)
            unbiased_mean = tf.stop_gradient(self._mean_averager.get())
        else:
            unbiased_mean = mean
        # estimated MI = reduce_mean(mi)
        # ratio/mean-1 does not contribute to the final estimated MI, since
        # mean(ratio/mean-1) = 0. We add it so that we can have an estimation
        # of the variance of the MI estimator
        mi = log_ratio - (tf.math.log(mean) + ratio / mean - 1)
        loss = ratio / unbiased_mean - log_ratio
    elif self._type == 'KLD':
        ratio = tf.math.exp(tf.minimum(t1, 20))
        mi = log_ratio - ratio + 1
        loss = -mi
    elif self._type == 'JSD':
        mi = -tf.nn.softplus(-log_ratio) - tf.nn.softplus(t1) + math.log(4)
        loss = -mi

    mi = batch_squash.unflatten(mi)
    loss = batch_squash.unflatten(loss)

    return AlgorithmStep(outputs=mi, state=(), info=LossInfo(loss, extra=()))
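
# Toy numerical check (not part of the ALF API) of the DV-style estimate used
# above: if the critic output equals the true log density ratio for a
# correlated Gaussian pair, the mean of `mi` approaches the analytic mutual
# information -0.5 * log(1 - rho^2).
import numpy as np
import tensorflow as tf

rho = 0.8
n = 100000
x = np.random.randn(n).astype(np.float32)
y = (rho * x + np.sqrt(1 - rho**2) * np.random.randn(n)).astype(np.float32)
y1 = np.random.permutation(y)        # samples from the product of marginals

def true_log_ratio(a, b):
    # log p(a, b) - log p(a) - log p(b) for a standard bivariate Gaussian
    return (-0.5 * np.log(1 - rho**2)
            - (rho**2 * a**2 - 2 * rho * a * b + rho**2 * b**2)
            / (2 * (1 - rho**2)))

log_ratio = tf.constant(true_log_ratio(x, y))    # critic on joint samples
t1 = tf.constant(true_log_ratio(x, y1))          # critic on shuffled samples
ratio = tf.math.exp(tf.minimum(t1, 20))
mean = tf.reduce_mean(ratio)
mi = log_ratio - (tf.math.log(mean) + ratio / mean - 1)
print(float(tf.reduce_mean(mi)), -0.5 * np.log(1 - rho**2))  # both close to 0.51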