Example No. 1
    def critic_learn(self, obs, action, reward, next_obs, terminal):
        # Target policy smoothing: perturb the target action with clipped
        # Gaussian noise, then clip back into the valid action range.
        noise = layers.gaussian_random_batch_size_like(
            action, shape=[-1, action.shape[1]])
        noise = layers.clip(noise * self.policy_noise,
                            min=-self.noise_clip,
                            max=self.noise_clip)
        next_action = self.target_model.policy(next_obs) + noise
        next_action = layers.clip(next_action, -self.max_action,
                                  self.max_action)

        # Clipped double Q-learning: bootstrap from the smaller of the two
        # target-critic estimates.
        next_Q1, next_Q2 = self.target_model.value(next_obs, next_action)
        next_Q = layers.elementwise_min(next_Q1, next_Q2)

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        # Minimize the summed MSE of both critics against the shared target.
        current_Q1, current_Q2 = self.model.value(obs, action)
        cost = layers.square_error_cost(current_Q1,
                                        target_Q) + layers.square_error_cost(
                                            current_Q2, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
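This TD3 critic update builds its target from the target policy plus clipped Gaussian noise (target policy smoothing) and bootstraps from the minimum of the two target critics (clipped double Q-learning). Below is a minimal NumPy sketch of the same target computation; the function names and default hyperparameters are illustrative, not part of the library shown.

import numpy as np

def smooth_target_action(target_action, max_action,
                         policy_noise=0.2, noise_clip=0.5):
    # Target policy smoothing: clipped Gaussian noise, then clip the
    # perturbed action back into the valid action range.
    noise = np.clip(np.random.normal(size=target_action.shape) * policy_noise,
                    -noise_clip, noise_clip)
    return np.clip(target_action + noise, -max_action, max_action)

def td3_target(reward, terminal, next_q1, next_q2, gamma=0.99):
    # Clipped double Q-learning: bootstrap from the minimum of the two
    # target-critic values; terminal transitions do not bootstrap.
    next_q = np.minimum(next_q1, next_q2)
    return reward + (1.0 - terminal) * gamma * next_q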
Example No. 2
    def actor_learn(self, obs):
        action, log_pi = self.sample(obs)
        qf1_pi, qf2_pi = self.critic.value(obs, action)
        min_qf_pi = layers.elementwise_min(qf1_pi, qf2_pi)
        # Entropy-regularized objective: alpha * log_pi - min(Q1, Q2).
        cost = log_pi * self.alpha - min_qf_pi
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
        optimizer.minimize(cost, parameter_list=self.actor.parameters())

        return cost
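The actor loss here is the entropy-regularized SAC objective, the mean of alpha * log pi(a|s) - min(Q1, Q2). A NumPy sketch of that scalar, assuming the sampled log-probabilities and the two critic values are already available as arrays (the function name is illustrative):

import numpy as np

def sac_actor_loss(log_pi, q1_pi, q2_pi, alpha=0.2):
    # Entropy-regularized policy objective: prefer actions with a high
    # (pessimistic, i.e. minimum) Q-value and high entropy.
    return np.mean(alpha * log_pi - np.minimum(q1_pi, q2_pi))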
Example No. 3
    def critic_learn(self, obs, action, reward, next_obs, terminal):
        # Soft Bellman target: sample a fresh next action, take the minimum
        # of the two target critics and subtract the entropy term.
        next_obs_action, next_obs_log_pi = self.sample(next_obs)
        qf1_next_target, qf2_next_target = self.target_critic.value(
            next_obs, next_obs_action)
        min_qf_next_target = layers.elementwise_min(
            qf1_next_target, qf2_next_target) - next_obs_log_pi * self.alpha

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target
        target_Q.stop_gradient = True

        # Minimize the summed MSE of both critics against the shared target.
        current_Q1, current_Q2 = self.critic.value(obs, action)
        cost = layers.square_error_cost(current_Q1,
                                        target_Q) + layers.square_error_cost(
                                            current_Q2, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
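The critic target above is the soft Bellman backup: the minimum of the two target-critic values for a freshly sampled next action, minus the entropy term alpha * log pi. A NumPy sketch under the same array-valued assumptions (names and defaults are illustrative):

import numpy as np

def sac_critic_target(reward, terminal, next_q1, next_q2, next_log_pi,
                      gamma=0.99, alpha=0.2):
    # Soft Bellman target: clipped double-Q estimate with an entropy bonus
    # (subtracting alpha * log pi raises the target for high-entropy actions).
    soft_next_q = np.minimum(next_q1, next_q2) - alpha * next_log_pi
    return reward + (1.0 - terminal) * gamma * soft_next_q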
Example No. 4
    def policy_learn(self, obs, actions, advantages, beta=None):
        """ Learn policy model with: 
                1. CLIP loss: Clipped Surrogate Objective 
                2. KLPEN loss: Adaptive KL Penalty Objective
            See: https://arxiv.org/pdf/1707.02286.pdf

        Args:
            obs: Tensor, (batch_size, obs_dim)
            actions: Tensor, (batch_size, act_dim)
            advantages: Tensor (batch_size, )
            beta: Tensor (1) or None
                  if None, use CLIP Loss; else, use KLPEN loss. 
        """
        old_means, old_logvars = self.old_policy_model.policy(obs)
        old_means.stop_gradient = True
        old_logvars.stop_gradient = True
        old_logprob = self._calc_logprob(actions, old_means, old_logvars)

        means, logvars = self.model.policy(obs)
        logprob = self._calc_logprob(actions, means, logvars)

        kl = self._calc_kl(means, logvars, old_means, old_logvars)
        kl = layers.reduce_mean(kl)

        if beta is None:  # Clipped Surrogate Objective
            pg_ratio = layers.exp(logprob - old_logprob)
            clipped_pg_ratio = layers.clip(pg_ratio, 1 - self.epsilon,
                                           1 + self.epsilon)
            surrogate_loss = layers.elementwise_min(
                advantages * pg_ratio, advantages * clipped_pg_ratio)
            loss = 0 - layers.reduce_mean(surrogate_loss)
        else:  # Adaptive KL Penalty Objective
            # policy gradient loss
            loss1 = 0 - layers.reduce_mean(
                advantages * layers.exp(logprob - old_logprob))
            # adaptive kl loss
            loss2 = kl * beta
            loss = loss1 + loss2
        optimizer = fluid.optimizer.AdamOptimizer(self.policy_lr)
        optimizer.minimize(loss)
        return loss, kl
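Both branches optimize a surrogate built from the probability ratio r = exp(log pi - log pi_old): the CLIP branch takes the pessimistic minimum of the unclipped and clipped ratio terms, while the KLPEN branch adds beta times the KL divergence to a plain importance-weighted loss. A NumPy sketch of the two variants, with the KL passed in as a precomputed scalar (the helper names are illustrative, not library APIs):

import numpy as np

def clip_surrogate_loss(logprob, old_logprob, advantages, epsilon=0.2):
    # Clipped Surrogate Objective, negated so it can be minimized.
    ratio = np.exp(logprob - old_logprob)
    clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    return -np.mean(np.minimum(advantages * ratio,
                               advantages * clipped_ratio))

def kl_penalty_loss(logprob, old_logprob, advantages, kl, beta):
    # Adaptive KL Penalty Objective: importance-weighted policy gradient
    # loss plus a KL penalty scaled by beta.
    ratio = np.exp(logprob - old_logprob)
    return -np.mean(advantages * ratio) + beta * kl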
Example No. 5
def from_importance_weights(behaviour_actions_log_probs,
                            target_actions_log_probs,
                            discounts,
                            rewards,
                            values,
                            bootstrap_value,
                            clip_rho_threshold=1.0,
                            clip_pg_rho_threshold=1.0,
                            name='vtrace_from_logits'):
    r"""V-trace for softmax policies.

    Calculates V-trace actor critic targets for softmax policies as described in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    Target policy refers to the policy we are interested in improving and
    behaviour policy refers to the policy that generated the given
    rewards and actions.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1. B refers to the batch size and
    NUM_ACTIONS refers to the number of actions.

    Args:
      behaviour_actions_log_probs: A float32 tensor of shape [T, B] of
        log-probabilities of actions in behaviour policy.
      target_actions_log_probs: A float32 tensor of shape [T, B] of
        log-probabilities of actions in target policy.
      discounts: A float32 tensor of shape [T, B] with the discount encountered
        when following the behaviour policy.
      rewards: A float32 tensor of shape [T, B] with the rewards generated by
        following the behaviour policy.
      values: A float32 tensor of shape [T, B] with the value function estimates
        wrt. the target policy.
      bootstrap_value: A float32 tensor of shape [B] with the value function
        estimate at time T.
      clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
        importance weights (rho) when calculating the baseline targets (vs).
        rho^bar in the paper.
      clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
        on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
      name: The name scope that all V-trace operations will be created in.

    Returns:
      A VTraceReturns namedtuple (vs, pg_advantages) where:
        vs: A float32 tensor of shape [T, B]. Can be used as target to
          train a baseline (V(x_t) - vs_t)^2.
        pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
          advantage in the calculation of policy gradients.
    """

    # rank = len(behaviour_actions_log_probs.shape)  # Usually 2.
    # assert len(target_actions_log_probs.shape) == rank
    # assert len(values.shape) == rank
    # assert len(bootstrap_value.shape) == (rank - 1)
    # assert len(discounts.shape) == rank
    # assert len(rewards.shape) == rank

    # log importance sampling weights.
    # V-trace performs operations on rhos in log-space for numerical stability.
    # rho_s = pi(a_s|x_s) / mu(a_s|x_s), so log rho = target - behaviour.
    log_rhos = target_actions_log_probs - behaviour_actions_log_probs

    if clip_rho_threshold is not None:
        clip_rho_threshold = layers.fill_constant([1], 'float32',
                                                  clip_rho_threshold)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold = layers.fill_constant([1], 'float32',
                                                     clip_pg_rho_threshold)

    rhos = layers.exp(log_rhos)
    if clip_rho_threshold is not None:
        clipped_rhos = layers.elementwise_min(rhos, clip_rho_threshold)
    else:
        clipped_rhos = rhos

    constant_one = layers.fill_constant([1], 'float32', 1.0)
    cs = layers.elementwise_min(rhos, constant_one)

    # Append bootstrapped value to get [v1, ..., v_t+1]
    values_1_t = layers.slice(values, axes=[0], starts=[1], ends=[MAX_INT32])
    values_t_plus_1 = layers.concat(
        [values_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    # \delta_s * V
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    vs_minus_v_xs = recursively_scan(discounts, cs, deltas)

    # Add V(x_s) to get v_s.
    vs = layers.elementwise_add(vs_minus_v_xs, values)

    # Advantage for policy gradient.
    vs_1_t = layers.slice(vs, axes=[0], starts=[1], ends=[MAX_INT32])
    vs_t_plus_1 = layers.concat(
        [vs_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    if clip_pg_rho_threshold is not None:
        clipped_pg_rhos = layers.elementwise_min(rhos, clip_pg_rho_threshold)
    else:
        clipped_pg_rhos = rhos
    pg_advantages = (clipped_pg_rhos *
                     (rewards + discounts * vs_t_plus_1 - values))

    # Make sure no gradients backpropagated through the returned values.
    vs.stop_gradient = True
    pg_advantages.stop_gradient = True
    return VTraceReturns(vs=vs, pg_advantages=pg_advantages)
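The fluid code above assembles the whole V-trace computation as graph ops; its core is the backward recursion (v_s - V(x_s)) = delta_s + discount_s * c_s * (v_{s+1} - V(x_{s+1})), which the recursively_scan call evaluates on tensors. A plain NumPy reference of the full calculation over [T, B] arrays can help when checking shapes and clipping; the explicit reverse loop and the function name are illustrative, not the library's implementation:

import numpy as np

def vtrace_reference(behaviour_log_probs, target_log_probs, discounts,
                     rewards, values, bootstrap_value,
                     clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0):
    # Truncated importance weights rho_s = pi(a|x) / mu(a|x).
    rhos = np.exp(target_log_probs - behaviour_log_probs)
    clipped_rhos = np.minimum(rhos, clip_rho_threshold)
    cs = np.minimum(rhos, 1.0)

    # Values at t+1: shift by one step and bootstrap the final entry.
    values_t_plus_1 = np.concatenate([values[1:], bootstrap_value[None]], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    # Backward recursion: vs - V(x_s) = delta_s + discount_s * c_s * acc.
    vs_minus_v_xs = np.zeros_like(values)
    acc = np.zeros_like(bootstrap_value)
    for t in reversed(range(values.shape[0])):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v_xs[t] = acc
    vs = vs_minus_v_xs + values

    # Policy-gradient advantages use vs shifted one step forward.
    vs_t_plus_1 = np.concatenate([vs[1:], bootstrap_value[None]], axis=0)
    clipped_pg_rhos = np.minimum(rhos, clip_pg_rho_threshold)
    pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values)
    return vs, pg_advantages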