def kl(self, other):
        """Compute KL(self || other) between two categorical distributions.

        Probabilities are obtained from a softmax over ``logits``; both sets
        of logits are shifted by their maximum over dim 1 before being
        exponentiated.

        Args:
            other: object of CategoricalDistribution

        Returns:
            kl: A float32 tensor with shape [BATCH_SIZE]
        """
        assert isinstance(other, CategoricalDistribution)

        # NOTE(review): the reductions below drop dim 1, so the subtraction
        # and division rely on the framework broadcasting the reduced operand
        # against the un-reduced logits -- confirm for the Paddle version used.
        shifted_self = self.logits - layers.reduce_max(self.logits, dim=1)
        shifted_other = other.logits - layers.reduce_max(other.logits, dim=1)

        exp_self = layers.exp(shifted_self)
        exp_other = layers.exp(shifted_other)

        # Softmax normalisers of each distribution.
        norm_self = layers.reduce_sum(exp_self, dim=1)
        norm_other = layers.reduce_sum(exp_other, dim=1)

        probs_self = exp_self / norm_self
        # log p - log q, each log-softmax written as (shifted logit - log Z).
        log_ratio = (shifted_self - layers.log(norm_self) - shifted_other +
                     layers.log(norm_other))
        return layers.reduce_sum(probs_self * log_ratio, dim=1)
    def logp(self, actions, eps=1e-6):
        """Log-probability of the given actions under this distribution.

        Args:
            actions: An int64 tensor with shape [BATCH_SIZE]
            eps: A small float constant added to the selected probability
                 before the log is taken

        Returns:
            actions_log_prob: A float32 tensor with shape [BATCH_SIZE]
        """
        assert len(actions.shape) == 1

        # Softmax over the logits, shifted by their max over dim 1.
        shifted = self.logits - layers.reduce_max(self.logits, dim=1)
        exp_shifted = layers.exp(shifted)
        norm = layers.reduce_sum(exp_shifted, dim=1)
        probs = exp_shifted / norm

        # A one-hot mask selects the probability of each taken action.
        action_ids = layers.unsqueeze(actions, axes=[1])
        mask = layers.cast(
            layers.one_hot(action_ids, probs.shape[1]), dtype='float32')
        picked = layers.reduce_sum(probs * mask, dim=1)

        return layers.log(picked + eps)
Beispiel #3
0
    def policy_learn(self, obs, actions, advantages, beta=None):
        """ Build the policy loss and its Adam update step.

            Two objectives are supported:
                1. CLIP loss: Clipped Surrogate Objective
                2. KLPEN loss: Adaptive KL Penalty Objective
            See: https://arxiv.org/pdf/1707.02286.pdf

        Args:
            obs: Tensor, (batch_size, obs_dim)
            actions: Tensor, (batch_size, act_dim)
            advantages: Tensor (batch_size, )
            beta: Tensor (1) or None
                  if None, use CLIP Loss; else, use KLPEN loss.

        Returns:
            (loss, kl): the loss being minimised and the batch-mean KL
            between the old and the current policy.
        """
        # The old policy is a fixed reference: no gradient flows through it.
        ref_means, ref_logvars = self.old_policy_model.policy(obs)
        ref_means.stop_gradient = True
        ref_logvars.stop_gradient = True
        ref_logprob = self._calc_logprob(actions, ref_means, ref_logvars)

        cur_means, cur_logvars = self.model.policy(obs)
        cur_logprob = self._calc_logprob(actions, cur_means, cur_logvars)

        mean_kl = layers.reduce_mean(
            self._calc_kl(cur_means, cur_logvars, ref_means, ref_logvars))

        # Probability ratio new/old, shared by both objectives.
        ratio = layers.exp(cur_logprob - ref_logprob)

        if beta is not None:
            # Adaptive KL Penalty Objective: policy-gradient term + beta * KL.
            pg_loss = 0 - layers.reduce_mean(advantages * ratio)
            kl_penalty = mean_kl * beta
            loss = pg_loss + kl_penalty
        else:
            # Clipped Surrogate Objective.
            bounded_ratio = layers.clip(ratio, 1 - self.epsilon,
                                        1 + self.epsilon)
            objective = layers.elementwise_min(advantages * ratio,
                                               advantages * bounded_ratio)
            loss = 0 - layers.reduce_mean(objective)

        fluid.optimizer.AdamOptimizer(self.policy_lr).minimize(loss)
        return loss, mean_kl
Beispiel #4
0
    def _calc_kl(self, means, logvars, old_means, old_logvars):
        """ KL divergence from the old to the new diagonal Gaussian policy.
            See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback.E2.80.93Leibler_divergence

        Args:
            means: shape (batch_size, act_dim)
            logvars: shape (act_dim)
            old_means: shape (batch_size, act_dim)
            old_logvars: shape (act_dim)

        Returns:
            kl: shape (batch_size)
        """
        # With diagonal covariances, log-determinants are sums of log-variances
        # and the trace term is the sum of the variance ratios old/new.
        log_det_old = layers.reduce_sum(old_logvars)
        log_det_new = layers.reduce_sum(logvars)
        trace_term = layers.reduce_sum(layers.exp(old_logvars - logvars))

        # Squared mean difference scaled by the new variance, summed per sample.
        mean_term = layers.reduce_sum(
            layers.square(means - old_means) / layers.exp(logvars), dim=1)
        log_det_diff = log_det_new - log_det_old

        return 0.5 * (mean_term + log_det_diff + trace_term - self.act_dim)
Beispiel #5
0
 def sample(self, obs):
     """Sample a tanh-squashed, scaled action from the policy's Normal.

     Args:
         obs: observations fed to ``self.model.policy``.

     Returns:
         (action, mean, log_std): ``action`` is ``tanh(x) * self.max_action``
         for a draw ``x`` from ``Normal(mean, exp(log_std))``; ``mean`` and
         ``log_std`` are the raw policy outputs. No log-probability is
         computed here.
     """
     mean, log_std = self.model.policy(obs)
     dist = Normal(mean, layers.exp(log_std))
     # Request one sample and take that single draw.
     raw_action = dist.sample([1])[0]
     squashed = layers.tanh(raw_action)
     return squashed * self.max_action, mean, log_std
    def entropy(self):
        """Entropy of the softmax distribution defined by ``self.logits``.

        Returns:
            entropy: A float32 tensor with shape [BATCH_SIZE] of entropy of self policy distribution.
        """
        # Shift by the max over dim 1 before exponentiating.
        shifted = self.logits - layers.reduce_max(self.logits, dim=1)
        exp_shifted = layers.exp(shifted)
        norm = layers.reduce_sum(exp_shifted, dim=1)

        probs = exp_shifted / norm
        # log-softmax written as (shifted logit - log Z).
        log_probs = shifted - layers.log(norm)

        return -1.0 * layers.reduce_sum(probs * log_probs, dim=1)
Beispiel #7
0
    def _calc_logprob(self, actions, means, logvars):
        """ Log-probability of actions under a diagonal normal distribution
            described by means and logvars.
            The constant term involving sqrt(2 * pi) is left out.

        Args:
            actions: shape (batch_size, act_dim)
            means:   shape (batch_size, act_dim)
            logvars: shape (act_dim)

        Returns:
            logprob: shape (batch_size)
        """
        # Quadratic term: -0.5 * sum((a - mu)^2 / var) over the action dims.
        sq_err = layers.square(actions - means)
        variances = layers.exp(logvars)
        scaled_err = layers.elementwise_div(sq_err, variances, axis=1)
        quad_term = -0.5 * layers.reduce_sum(scaled_err, dim=1)

        # Normaliser term: -0.5 * sum(log var).
        log_det_term = -0.5 * layers.reduce_sum(logvars)

        return quad_term + log_det_term
Beispiel #8
0
def from_importance_weights(behaviour_actions_log_probs,
                            target_actions_log_probs,
                            discounts,
                            rewards,
                            values,
                            bootstrap_value,
                            clip_rho_threshold=1.0,
                            clip_pg_rho_threshold=1.0,
                            name='vtrace_from_logits'):
    r"""V-trace targets computed from per-action log-probabilities.

    Calculates V-trace actor critic targets as described in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    Target policy refers to the policy we are interested in improving and
    behaviour policy refers to the policy that generated the given
    rewards and actions.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1 and B refers to the batch size.

    Args:
      behaviour_actions_log_probs: A float32 tensor of shape [T, B] of
        log-probabilities of actions in behaviour policy.
      target_actions_log_probs: A float32 tensor of shape [T, B] of
        log-probabilities of actions in target policy.
      discounts: A float32 tensor of shape [T, B] with the discount encountered
        when following the behaviour policy.
      rewards: A float32 tensor of shape [T, B] with the rewards generated by
        following the behaviour policy.
      values: A float32 tensor of shape [T, B] with the value function estimates
        wrt. the target policy.
      bootstrap_value: A float32 of shape [B] with the value function estimate at
        time T.
      clip_rho_threshold: A float (or None to disable clipping) with the
        clipping threshold for importance weights (rho) when calculating the
        baseline targets (vs). rho^bar in the paper.
      clip_pg_rho_threshold: A float (or None to disable clipping) with the
        clipping threshold on rho_s in
        \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
      name: Kept for interface compatibility; not used by this function.

    Returns:
      A VTraceReturns namedtuple (vs, pg_advantages) where:
        vs: A float32 tensor of shape [T, B]. Can be used as target to
          train a baseline (V(x_t) - vs_t)^2.
        pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
          advantage in the calculation of policy gradients.
    """
    # Log importance sampling weights: rho = pi(a|x) / mu(a|x), i.e. target
    # over behaviour. V-trace performs operations on rhos in log-space for
    # numerical stability.
    log_rhos = target_actions_log_probs - behaviour_actions_log_probs

    # Turn the Python-float thresholds into constant tensors for elementwise_min.
    if clip_rho_threshold is not None:
        clip_rho_threshold = layers.fill_constant([1], 'float32',
                                                  clip_rho_threshold)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold = layers.fill_constant([1], 'float32',
                                                     clip_pg_rho_threshold)

    rhos = layers.exp(log_rhos)
    if clip_rho_threshold is not None:
        clipped_rhos = layers.elementwise_min(rhos, clip_rho_threshold)
    else:
        clipped_rhos = rhos

    # Trace-cutting coefficients c_s = min(1, rho_s).
    constant_one = layers.fill_constant([1], 'float32', 1.0)
    cs = layers.elementwise_min(rhos, constant_one)

    # Append bootstrapped value to get [v1, ..., v_t+1]
    values_1_t = layers.slice(values, axes=[0], starts=[1], ends=[MAX_INT32])
    values_t_plus_1 = layers.concat(
        [values_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    # \delta_s * V
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    vs_minus_v_xs = recursively_scan(discounts, cs, deltas)

    # Add V(x_s) to get v_s.
    vs = layers.elementwise_add(vs_minus_v_xs, values)

    # Advantage for policy gradient: shift vs by one step, bootstrapping the
    # final entry with bootstrap_value.
    vs_1_t = layers.slice(vs, axes=[0], starts=[1], ends=[MAX_INT32])
    vs_t_plus_1 = layers.concat(
        [vs_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    if clip_pg_rho_threshold is not None:
        clipped_pg_rhos = layers.elementwise_min(rhos, clip_pg_rho_threshold)
    else:
        clipped_pg_rhos = rhos
    pg_advantages = (clipped_pg_rhos *
                     (rewards + discounts * vs_t_plus_1 - values))

    # Make sure no gradients backpropagated through the returned values.
    vs.stop_gradient = True
    pg_advantages.stop_gradient = True
    return VTraceReturns(vs=vs, pg_advantages=pg_advantages)
Beispiel #9
0
    def learn(self, obs, actions, means, log_std, rewards, dones,
              learning_rate, entropy_coeff):
        """Build the V-trace loss for one batch and minimise it with Adam.

        Args:
            obs: An float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in atari.
            actions: Tensor of the actions taken. It is divided by
                 ``self.max_action`` and its log-probs are summed over dim 1,
                 so presumably float32 of shape [B, act_dim] -- TODO confirm.
            means: Mean of the behaviour policy's Normal for each sample.
            log_std: Log standard deviation of the behaviour policy's Normal.
            rewards: A float32 tensor of shape [B].
            dones: A float32 tensor of shape [B].
            learning_rate: float scalar of learning rate.
            entropy_coeff: float scalar of entropy coefficient.

        Returns:
            (vtrace_loss, kl): the VTraceLoss object whose ``total_loss`` is
            minimised, and the KL divergence between the target and behaviour
            Normals averaged over dim 1 (for debugging).
        """
        values = self.model.value(obs)

        # Behaviour policy: Normal built from the means / log-std passed in.
        behaviour_std = layers.exp(log_std)
        behaviour_dist = Normal(means, behaviour_std)
        scaled_actions = actions / self.max_action

        def corrected_log_prob(dist):
            # Log-density of `actions` under `dist`, minus the
            # log(max_action * (1 - y^2) + epsilon) term, summed over dim 1.
            log_prob = dist.log_prob(actions)
            log_prob -= layers.log(
                self.max_action * (1 - layers.pow(scaled_actions, 2)) +
                epsilon)
            log_prob = layers.reduce_sum(log_prob, dim=1, keep_dim=True)
            return layers.squeeze(log_prob, axes=[1])

        behaviour_actions_log_probs = corrected_log_prob(behaviour_dist)

        # Target policy: Normal built from the current model's outputs.
        target_means, target_log_std = self.model.policy(obs)
        target_std = layers.exp(target_log_std)
        target_dist = Normal(target_means, target_std)
        target_actions_log_probs = corrected_log_prob(target_dist)

        policy_entropy = target_dist.entropy()

        # KL between target and behaviour policies, kept for debugging.
        kl = layers.reduce_mean(
            target_dist.kl_divergence(behaviour_dist), dim=1)

        # Split the tensor into batches at known episode cut boundaries.
        # [B * T] -> [T, B]
        steps = self.sample_batch_steps

        def to_time_major(flat):
            batch = flat.shape[0] // steps
            trailing = list(flat.shape[1:])
            reshaped = layers.reshape(flat, [batch, steps] + trailing)
            # Swap the batch and time axes; trailing axes keep their order.
            perm = [1, 0] + list(range(2, 1 + len(flat.shape)))
            return layers.transpose(reshaped, perm)

        def drop_last_step(time_major):
            return layers.slice(time_major, axes=[0], starts=[0], ends=[-1])

        (behaviour_actions_log_probs, target_actions_log_probs,
         policy_entropy, dones, rewards, values) = [
             to_time_major(t)
             for t in (behaviour_actions_log_probs, target_actions_log_probs,
                       policy_entropy, dones, rewards, values)
         ]

        # [T, B] -> [T - 1, B] for V-trace calc.
        (behaviour_actions_log_probs, target_actions_log_probs,
         policy_entropy, dones, rewards) = [
             drop_last_step(t)
             for t in (behaviour_actions_log_probs, target_actions_log_probs,
                       policy_entropy, dones, rewards)
         ]
        # The value at the final step bootstraps the V-trace recursion.
        bootstrap_value = layers.slice(values,
                                       axes=[0],
                                       starts=[steps - 1],
                                       ends=[steps])
        values = drop_last_step(values)
        bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

        vtrace_loss = VTraceLoss(
            behaviour_actions_log_probs=behaviour_actions_log_probs,
            target_actions_log_probs=target_actions_log_probs,
            policy_entropy=policy_entropy,
            dones=dones,
            discount=self.gamma,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            entropy_coeff=entropy_coeff,
            vf_loss_coeff=self.vf_loss_coeff,
            clip_rho_threshold=self.clip_rho_threshold,
            clip_pg_rho_threshold=self.clip_pg_rho_threshold)

        global_norm_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0)
        fluid.clip.set_gradient_clip(clip=global_norm_clip)

        optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
        optimizer.minimize(vtrace_loss.total_loss)
        return vtrace_loss, kl