Example #1
    def train_step(self, inputs, loss_func, batch_size=None, state=None):
        """
        Args:
            inputs (nested Tensor): if None, the output is generated only from
                noise.
            loss_func (Callable): loss_func([outputs, inputs]), or
                loss_func(outputs) if inputs is None, returns a Tensor with
                shape [batch_size] as the loss for optimizing the generator.
            batch_size (int): batch size. Must be provided if inputs is None.
                It is ignored if inputs is not None.
            state: not used
        Returns:
            AlgorithmStep:
                outputs: Tensor with shape (batch_size, dim)
                info: LossInfo
        """
        outputs, gen_inputs = self._predict(inputs, batch_size)
        loss, grad = self._grad_func(inputs, outputs, loss_func)
        loss_propagated = tf.reduce_sum(
            tf.stop_gradient(grad) * outputs, axis=-1)

        mi_loss = ()
        if self._mi_estimator is not None:
            mi_step = self._mi_estimator.train_step([gen_inputs, outputs])
            mi_loss = mi_step.info.loss
            loss_propagated += self._mi_weight * mi_loss

        return AlgorithmStep(
            outputs=outputs,
            state=(),
            info=LossInfo(
                loss=loss_propagated,
                extra=GeneratorLossInfo(generator=loss, mi_estimator=mi_loss)))
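
A minimal sketch (assuming TensorFlow 2.x; the tensors are toy stand-ins, not taken from the algorithm above) of why loss_propagated is built this way: multiplying outputs by a stopped gradient makes the gradient of loss_propagated with respect to outputs equal exactly to the externally computed grad, so backpropagation pushes that gradient into the generator.

import tensorflow as tf

outputs = tf.Variable([[1.0, 2.0]])
grad = tf.constant([[0.5, -0.3]])  # stand-in for the gradient from _grad_func
with tf.GradientTape() as tape:
    loss_propagated = tf.reduce_sum(tf.stop_gradient(grad) * outputs, axis=-1)
# The gradient w.r.t. outputs equals grad exactly.
print(tape.gradient(loss_propagated, outputs))
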
Example #2
    def train_step(self, inputs, state=None):
        """Train one step.

        Args:
            inputs (tuple): tuple of (inputs, target)
            state (nested Tensor): network state for `decoder`

        Returns:
            AlgorithmStep with the following fields:
            outputs: decoding result
            state: rnn state
            info: loss of decoding

        """
        input, target = inputs
        pred, state = self._decoder(input, network_state=state)
        assert pred.shape == target.shape
        loss = self._loss(target, pred)

        if len(loss.shape) > 1:
            # reduce to (B,)
            reduce_dims = list(range(1, len(loss.shape)))
            loss = tf.reduce_sum(loss, axis=reduce_dims)
        return AlgorithmStep(outputs=pred,
                             state=state,
                             info=LossInfo(loss=self._loss_weight * loss))
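
A minimal sketch (TensorFlow 2.x, toy shapes) of the reduction above: a per-element loss of shape (B, T, D) is summed over every non-batch dimension down to one loss value per batch item.

import tensorflow as tf

loss = tf.ones([3, 4, 5])                      # e.g. (B, T, D)
reduce_dims = list(range(1, len(loss.shape)))  # [1, 2]
loss = tf.reduce_sum(loss, axis=reduce_dims)   # shape (3,)
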
Example #3
    def train_step(self, inputs, state, calc_intrinsic_reward=True):
        """
        Args:
            inputs (tuple): observation
            state (tuple):  empty tuple ()
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            AlgorithmStep:
                outputs: empty tuple ()
                state: empty tuple ()
                info: RNDInfo
        """
        observation, _ = inputs
        if self._observation_normalizer is not None:
            observation = self._observation_normalizer.normalize(observation)

        pred_embedding, _ = self._predictor_net(observation)
        target_embedding, _ = self._target_net(observation)

        loss = 0.5 * tf.reduce_mean(
            tf.square(pred_embedding - tf.stop_gradient(target_embedding)),
            axis=-1)

        intrinsic_reward = ()
        if calc_intrinsic_reward:
            intrinsic_reward = tf.stop_gradient(loss)
            intrinsic_reward = self._reward_normalizer.normalize(
                intrinsic_reward)

        return AlgorithmStep(outputs=(),
                             state=(),
                             info=RNDInfo(reward=intrinsic_reward,
                                          loss=LossInfo(loss=loss)))
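
A minimal sketch (TensorFlow 2.x, with hypothetical toy networks standing in for _predictor_net and _target_net) of the RND idea above: the predictor is trained to match a fixed, randomly initialized target network, and the remaining prediction error serves as an intrinsic novelty reward.

import tensorflow as tf

target_net = tf.keras.Sequential([tf.keras.layers.Dense(8)])     # never trained
predictor_net = tf.keras.Sequential([tf.keras.layers.Dense(8)])  # trained

observation = tf.random.normal([4, 16])
pred_embedding = predictor_net(observation)
target_embedding = tf.stop_gradient(target_net(observation))
loss = 0.5 * tf.reduce_mean(
    tf.square(pred_embedding - target_embedding), axis=-1)        # shape [4]
intrinsic_reward = tf.stop_gradient(loss)
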
Example #4
    def train_step(self, inputs, state=None):
        """Perform training on one batch of inputs.

        Args:
            inputs (tuple(Tensor, Tensor)): tuple of x and y
            state: not used
        Returns:
            AlgorithmStep
                outputs (Tensor): shape=[batch_size], its mean is the estimated
                    MI
                state: not used
                info (LossInfo): info.loss is the loss
        """
        x, y = inputs
        num_outer_dims = get_outer_rank(x, self._x_spec)
        batch_squash = BatchSquash(num_outer_dims)
        x = batch_squash.flatten(x)
        y = batch_squash.flatten(y)
        x1, y1 = self._sampler(x, y)

        log_ratio = self._model([x, y])[0]
        t1 = self._model([x1, y1])[0]

        if self._type == 'DV':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mean = tf.stop_gradient(tf.reduce_mean(ratio))
            if self._mean_averager:
                self._mean_averager.update(mean)
                unbiased_mean = tf.stop_gradient(self._mean_averager.get())
            else:
                unbiased_mean = mean
            # estimated MI = reduce_mean(mi)
            # ratio/mean-1 does not contribute to the final estimated MI, since
            # mean(ratio/mean-1) = 0. We add it so that we can estimate the
            # variance of the MI estimator.
            mi = log_ratio - (tf.math.log(mean) + ratio / mean - 1)
            loss = ratio / unbiased_mean - log_ratio
        elif self._type == 'KLD':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mi = log_ratio - ratio + 1
            loss = -mi
        elif self._type == 'JSD':
            mi = -tf.nn.softplus(-log_ratio) - tf.nn.softplus(t1) + math.log(4)
            loss = -mi

        mi = batch_squash.unflatten(mi)
        loss = batch_squash.unflatten(loss)

        return AlgorithmStep(outputs=mi,
                             state=(),
                             info=LossInfo(loss, extra=()))
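
A minimal numeric check (NumPy only, toy values) of the comment in the 'DV' branch: the ratio / mean - 1 term has zero mean, so it leaves the estimated MI reduce_mean(mi) unchanged while adding per-sample variance information.

import numpy as np

log_ratio = np.random.randn(1000)
t1 = np.random.randn(1000)
ratio = np.exp(np.minimum(t1, 20))
mean = ratio.mean()
mi = log_ratio - (np.log(mean) + ratio / mean - 1)
# mean(ratio / mean - 1) == 0, so mean(mi) == mean(log_ratio) - log(mean)
assert np.isclose(mi.mean(), log_ratio.mean() - np.log(mean))
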
Example #5
    def train_step(self,
                   time_step: ActionTimeStep,
                   state,
                   calc_intrinsic_reward=True):
        """
        Args:
            time_step (ActionTimeStep): input time_step data, where the
                observation is the skill-augmented observation
            state (Tensor): state for DIAYN (previous skill)
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            AlgorithmStep:
                outputs: empty tuple ()
                state: skill
                info (DIAYNInfo):
        """
        observations_aug = time_step.observation
        step_type = time_step.step_type
        observation, skill = observations_aug
        prev_skill = state

        feature = observation
        if self._encoding_net is not None:
            feature, _ = self._encoding_net(observation)

        skill_pred, _ = self._discriminator_net(inputs=feature)

        skill_discriminate_loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=prev_skill, logits=skill_pred)

        valid_masks = tf.cast(
            tf.not_equal(step_type, StepType.FIRST), tf.float32)
        skill_discriminate_loss = skill_discriminate_loss * valid_masks

        intrinsic_reward = ()

        if calc_intrinsic_reward:
            # use negative cross-entropy as reward
            # neglect neg-prior term as it is constant
            intrinsic_reward = tf.stop_gradient(-skill_discriminate_loss)
            intrinsic_reward = self._reward_normalizer.normalize(
                intrinsic_reward)

        return AlgorithmStep(
            outputs=(),
            state=skill,
            info=DIAYNInfo(
                reward=intrinsic_reward,
                loss=LossInfo(
                    loss=skill_discriminate_loss,
                    extra=dict(
                        skill_discriminate_loss=skill_discriminate_loss))))
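
A minimal sketch (TensorFlow 2.x, toy tensors standing in for the discriminator output) of the reward above: the better the discriminator recovers the skill from the state feature, the lower the cross-entropy and the higher the intrinsic reward.

import tensorflow as tf

prev_skill = tf.constant([[0., 1.], [1., 0.]])       # one-hot skills
skill_pred = tf.constant([[0.1, 2.0], [1.5, -0.5]])  # discriminator logits
xent = tf.nn.softmax_cross_entropy_with_logits(
    labels=prev_skill, logits=skill_pred)
intrinsic_reward = tf.stop_gradient(-xent)            # shape [2]
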
Example #6
    def train_step(self,
                   time_step: ActionTimeStep,
                   state,
                   calc_intrinsic_reward=True):
        """
        Args:
            time_step (ActionTimeStep): input time_step data for ICM
            state (Tensor): state for ICM (previous observation)
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            AlgorithmStep:
                outputs: empty tuple ()
                state: observation
                info (ICMInfo):
        """
        feature = time_step.observation
        prev_action = time_step.prev_action

        if self._encoding_net is not None:
            feature, _ = self._encoding_net(feature)
        prev_feature = state
        prev_action = self._encode_action(prev_action)

        forward_pred, _ = self._forward_net(
            inputs=[tf.stop_gradient(prev_feature), prev_action])
        forward_loss = 0.5 * tf.reduce_mean(
            tf.square(tf.stop_gradient(feature) - forward_pred), axis=-1)

        action_pred, _ = self._inverse_net(inputs=[prev_feature, feature])

        if tensor_spec.is_discrete(self._action_spec):
            inverse_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=prev_action, logits=action_pred)
        else:
            inverse_loss = 0.5 * tf.reduce_mean(
                tf.square(prev_action - action_pred), axis=-1)

        intrinsic_reward = ()
        if calc_intrinsic_reward:
            intrinsic_reward = tf.stop_gradient(forward_loss)
            intrinsic_reward = self._reward_normalizer.normalize(
                intrinsic_reward)

        return AlgorithmStep(
            outputs=(),
            state=feature,
            info=ICMInfo(reward=intrinsic_reward,
                         loss=LossInfo(loss=forward_loss + inverse_loss,
                                       extra=dict(forward_loss=forward_loss,
                                                  inverse_loss=inverse_loss))))
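
A minimal sketch (TensorFlow 2.x, hypothetical toy networks, with the action input omitted) of the stop_gradient routing above: the forward loss sees stopped features on both sides, so it does not train the feature encoder; only the inverse loss does.

import tensorflow as tf

encoder = tf.keras.layers.Dense(4)
forward_net = tf.keras.layers.Dense(4)
prev_obs, obs = tf.random.normal([2, 8]), tf.random.normal([2, 8])

with tf.GradientTape() as tape:
    prev_feature = encoder(prev_obs)
    feature = encoder(obs)
    forward_pred = forward_net(tf.stop_gradient(prev_feature))
    forward_loss = tf.reduce_mean(
        tf.square(tf.stop_gradient(feature) - forward_pred))
# No gradient reaches the encoder from the forward loss: prints [None, None].
print(tape.gradient(forward_loss, encoder.trainable_variables))
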
Example #7
    def train_step(self,
                   time_step: ActionTimeStep,
                   state,
                   calc_intrinsic_reward=True):
        """
        Args:
            time_step (ActionTimeStep): input time_step data
            state (tuple):  empty tuple ()
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            AlgorithmStep:
                outputs: empty tuple ()
                state: empty tuple ()
                info: ICMInfo
        """
        observation = time_step.observation

        if self._stacked_frames:
            # Assuming stacking in the last dim, we only keep the last frame.
            observation = observation[..., -1:]

        if self._observation_normalizer is not None:
            observation = self._observation_normalizer.normalize(observation)

        if self._encoder_net is not None:
            observation = tf.stop_gradient(self._encoder_net(observation)[0])

        pred_embedding, _ = self._predictor_net(observation)
        target_embedding, _ = self._target_net(observation)

        loss = tf.reduce_sum(
            tf.square(pred_embedding - tf.stop_gradient(target_embedding)),
            axis=-1)

        intrinsic_reward = ()
        if calc_intrinsic_reward:
            intrinsic_reward = tf.stop_gradient(loss)
            if self._reward_normalizer:
                intrinsic_reward = self._reward_normalizer.normalize(
                    intrinsic_reward, clip_value=self._reward_clip_value)

        return AlgorithmStep(
            outputs=(),
            state=(),
            info=ICMInfo(reward=intrinsic_reward, loss=LossInfo(loss=loss)))
Example #8
    def calc_loss(self, info: MISCInfo):
        feature_tau_sampled = self._buffer.get_batch(
            batch_size=self._buffer_size)
        feature_tau_sampled_tran = transpose2(feature_tau_sampled, 1, 0)
        if self._n_objects < 2:
            obs_tau_excludes_goal, obs_tau_achieved_goal = (
                self._split_observation_fn(feature_tau_sampled_tran))
            loss = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal)
        elif self._n_objects == 2:
            (obs_tau_excludes_goal, obs_tau_achieved_goal_1,
             obs_tau_achieved_goal_2
             ) = self._split_observation_fn(feature_tau_sampled_tran)
            loss_1 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_1)
            loss_2 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_2)
            loss = loss_1 + loss_2

        neg_loss = -loss
        neg_loss_scalar = tf.reduce_mean(neg_loss)
        return LossInfo(scalar_loss=neg_loss_scalar)
Example #9
    def train_step(self, distribution, step_type):
        """Train step.

        Args:
            distribution (nested Distribution): action distribution from the
                policy.
            step_type: the step type of the time steps; it is included in the
                returned info.
        Returns:
            AlgorithmStep. The `info` field is an EntropyTargetInfo wrapping a
                LossInfo; other fields are empty.
        """
        entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
            distribution, self._action_spec)
        return AlgorithmStep(
            outputs=(),
            state=(),
            info=EntropyTargetInfo(
                step_type=step_type,
                loss=LossInfo(
                    loss=-entropy_for_gradient,
                    extra=EntropyTargetLossInfo(entropy_loss=-entropy))))
Example #10
    def train_step(self, inputs, state=None):
        """Train one step.

        Args:
            inputs (tuple): tuple of (inputs, target)
            state (nested Tensor): network state for `decoder`

        Returns:
            AlgorithmStep with the following fields:
            outputs: decoding result
            state: rnn state
            info: loss of decoding

        """
        input, target = inputs
        pred, state = self._decoder(input, network_state=state)
        assert pred.shape == target.shape
        loss = self._loss(target, pred)
        return AlgorithmStep(outputs=pred,
                             state=state,
                             info=LossInfo(loss=self._loss_weight * loss))
Example #11
    def train_step(self, time_step: ActionTimeStep, state: DynamicsState):
        """
        Args:
            time_step (ActionTimeStep): input data for dynamics learning
            state (DynamicsState): state for dynamics learning (holds the
                previous observation feature)
        Returns:
            AlgorithmStep:
                outputs: empty tuple ()
                state (DynamicsState): state for training
                info (DynamicsInfo):
        """
        feature = time_step.observation
        dynamics_step = self.predict(time_step, state)
        forward_pred = dynamics_step.outputs
        forward_loss = 0.5 * tf.reduce_mean(
            tf.square(feature - forward_pred), axis=-1)

        info = DynamicsInfo(
            loss=LossInfo(
                loss=forward_loss, extra=dict(forward_loss=forward_loss)))
        state = DynamicsState(feature=feature)

        return AlgorithmStep(outputs=(), state=state, info=info)
Example #12
    def calc_loss(self, info):
        loss = tf.nest.map_structure(tf.reduce_mean, info.loss)
        return LossInfo(
            loss=info.loss, scalar_loss=loss.loss, extra=loss.extra)
Example #13
File: mi_estimator.py, Project: zhuboli/alf
    def _ml_step(self, x, y, y_distribution):
        pmi = self._ml_pmi(x, y, y_distribution)
        return AlgStep(output=pmi, state=(), info=LossInfo(loss=-pmi))
Example #14
File: mi_estimator.py, Project: zhuboli/alf
    def train_step(self, inputs, y_distribution=None, state=None):
        """Perform training on one batch of inputs.

        Args:
            inputs (tuple(nested Tensor, nested Tensor)): tuple of ``x`` and ``y``
            y_distribution (nested td.Distribution): distribution
                for the marginal distribution of ``y``. If None, will use the
                sampling method ``sampler`` provided at constructor to generate
                the samples for the marginal distribution of :math:`Y`.
            state: not used
        Returns:
            AlgStep:
            - outputs (Tensor): shape is ``[batch_size]``, its mean is the
              estimated MI for estimator 'KL', 'DV' and 'KLD', and
              Jensen-Shannon divergence for estimator 'JSD'
            - state: not used
            - info (LossInfo): ``info.loss`` is the loss
        """
        x, y = inputs

        if self._type == 'ML':
            return self._ml_step(x, y, y_distribution)

        num_outer_dims = get_outer_rank(x, self._x_spec)
        batch_squash = BatchSquash(num_outer_dims)
        x = batch_squash.flatten(x)
        y = batch_squash.flatten(y)
        if y_distribution is None:
            x1, y1 = self._sampler(x, y)
        else:
            x1 = x
            y1 = y_distribution.sample()
            y1 = batch_squash.flatten(y1)

        log_ratio = self._model([x, y])[0]
        t1 = self._model([x1, y1])[0]

        if self._type == 'DV':
            ratio = torch.min(t1, torch.tensor(20.)).exp()
            mean = ratio.mean().detach()
            if self._mean_averager:
                self._mean_averager.update(mean)
                unbiased_mean = self._mean_averager.get().detach()
            else:
                unbiased_mean = mean
            # estimated MI = reduce_mean(mi)
            # ratio/mean-1 does not contribute to the final estimated MI, since
            # mean(ratio/mean-1) = 0. We add it so that we can estimate the
            # variance of the MI estimator.
            mi = log_ratio - (mean.log() + ratio / mean - 1)
            loss = ratio / unbiased_mean - log_ratio
        elif self._type == 'KLD':
            ratio = torch.min(t1, torch.tensor(20.)).exp()
            mi = log_ratio - ratio + 1
            loss = -mi
        elif self._type == 'JSD':
            mi = -F.softplus(-log_ratio) - F.softplus(t1) + math.log(4)
            loss = -mi
        mi = batch_squash.unflatten(mi)
        loss = batch_squash.unflatten(loss)

        return AlgStep(output=mi, state=(), info=LossInfo(loss, extra=()))
Example #15
    def train_step(self, inputs, y_distribution=None, state=None):
        """Perform training on one batch of inputs.

        Args:
            inputs (tuple(nested Tensor, nested Tensor)): tuple of x and y
            y_distribution (nested tfp.distributions.Distribution): distribution
                for the marginal distribution of y. If None, will use the
                sampling method `sampler` provided at constructor to generate
                the samples for the marginal distribution of Y.
            state: not used
        Returns:
            AlgorithmStep
                outputs (Tensor): shape=[batch_size], its mean is the estimated
                    MI for estimator 'KL', 'DV' and 'KLD', and Jensen-Shannon
                    divergence for estimator 'JSD'
                state: not used
                info (LossInfo): info.loss is the loss
        """
        x, y = inputs

        if self._type == 'ML':
            return self._ml_step(x, y, y_distribution)

        num_outer_dims = get_outer_rank(x, self._x_spec)
        batch_squash = BatchSquash(num_outer_dims)
        x = batch_squash.flatten(x)
        y = batch_squash.flatten(y)
        if y_distribution is None:
            x1, y1 = self._sampler(x, y)
        else:
            x1 = x
            y1 = y_distribution.sample()
            y1 = batch_squash.flatten(y1)

        log_ratio = self._model([x, y])[0]
        t1 = self._model([x1, y1])[0]

        if self._type == 'DV':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mean = tf.stop_gradient(tf.reduce_mean(ratio))
            if self._mean_averager:
                self._mean_averager.update(mean)
                unbiased_mean = tf.stop_gradient(self._mean_averager.get())
            else:
                unbiased_mean = mean
            # estimated MI = reduce_mean(mi)
            # ratio/mean-1 does not contribute to the final estimated MI, since
            # mean(ratio/mean-1) = 0. We add it so that we can estimate the
            # variance of the MI estimator.
            mi = log_ratio - (tf.math.log(mean) + ratio / mean - 1)
            loss = ratio / unbiased_mean - log_ratio
        elif self._type == 'KLD':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mi = log_ratio - ratio + 1
            loss = -mi
        elif self._type == 'JSD':
            mi = -tf.nn.softplus(-log_ratio) - tf.nn.softplus(t1) + math.log(4)
            loss = -mi
        mi = batch_squash.unflatten(mi)
        loss = batch_squash.unflatten(loss)

        return AlgorithmStep(
            outputs=mi, state=(), info=LossInfo(loss, extra=()))
Example #16
    def calc_loss(self, info: ICMInfo):
        return LossInfo(scalar_loss=tf.reduce_mean(info.loss.loss))