Example no. 1
    def kl(self, other):
        """
        Args:
            other: another CategoricalDistribution instance

        Returns:
            kl: A float32 tensor with shape [BATCH_SIZE]
        """
        assert isinstance(other, CategoricalDistribution)

        logits = self.logits - layers.reduce_max(self.logits, dim=1)
        other_logits = other.logits - layers.reduce_max(other.logits, dim=1)

        e_logits = layers.exp(logits)
        other_e_logits = layers.exp(other_logits)

        z = layers.reduce_sum(e_logits, dim=1)
        other_z = layers.reduce_sum(other_e_logits, dim=1)

        prob = e_logits / z
        kl = layers.reduce_sum(
            prob *
            (logits - layers.log(z) - other_logits + layers.log(other_z)),
            dim=1)
        return kl
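
For reference, here is a minimal NumPy sketch (not part of the original snippet) of the same numerically stabilized categorical KL computation; logits_p and logits_q are hypothetical [BATCH_SIZE, NUM_ACTIONS] arrays:

import numpy as np

def categorical_kl(logits_p, logits_q):
    # Shift by the per-row max for numerical stability, as above.
    logits_p = logits_p - logits_p.max(axis=1, keepdims=True)
    logits_q = logits_q - logits_q.max(axis=1, keepdims=True)
    z_p = np.exp(logits_p).sum(axis=1, keepdims=True)
    z_q = np.exp(logits_q).sum(axis=1, keepdims=True)
    prob_p = np.exp(logits_p) / z_p
    # KL(p || q) = sum_a p(a) * (log p(a) - log q(a))
    return (prob_p * (logits_p - np.log(z_p) - logits_q + np.log(z_q))).sum(axis=1)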
Example no. 2
    def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
        # print("obs:",obs)
        # raise NotImplementedError
        # obs = layers.squeeze(input=obs,axes=[-1])
        pred_value = self.model.value(obs)
        action_onehot = layers.one_hot(action, self.act_dim)
        pred_action_value = layers.reduce_sum(action_onehot * pred_value,
                                              dim=1)

        # calculate the target q value
        next_action_value = self.model.value(next_obs)
        greedy_action = layers.argmax(next_action_value, axis=-1)
        greedy_action = layers.unsqueeze(greedy_action, axes=[1])
        greedy_action_onehot = layers.one_hot(greedy_action, self.act_dim)
        next_pred_value = self.target_model.value(next_obs)
        max_v = layers.reduce_sum(greedy_action_onehot * next_pred_value,
                                  dim=1)
        max_v.stop_gradient = True

        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * max_v
        delta = layers.abs(target - pred_action_value)
        cost = sample_weight * layers.square_error_cost(
            pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, delta
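
This learn method builds a Double DQN target: the greedy action is chosen with the online network (self.model) but evaluated with self.target_model. A minimal NumPy sketch of that target, with hypothetical array names, might look like:

import numpy as np

def double_dqn_target(q_online_next, q_target_next, reward, terminal, gamma):
    # Pick the greedy action with the online network ...
    greedy_action = q_online_next.argmax(axis=1)
    # ... but score it with the target network (Double DQN).
    max_v = q_target_next[np.arange(len(greedy_action)), greedy_action]
    return reward + (1.0 - terminal.astype(np.float32)) * gamma * max_v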
Example no. 3
    def logp(self, actions, eps=1e-6):
        """
        Args:
            actions: An int64 tensor with shape [BATCH_SIZE]
            eps: A small float constant that avoids underflows when computing the log probability

        Returns:
            actions_log_prob: A float32 tensor with shape [BATCH_SIZE]
        """
        assert len(actions.shape) == 1

        logits = self.logits - layers.reduce_max(self.logits, dim=1)
        e_logits = layers.exp(logits)
        z = layers.reduce_sum(e_logits, dim=1)
        prob = e_logits / z

        actions = layers.unsqueeze(actions, axes=[1])
        actions_onehot = layers.one_hot(actions, prob.shape[1])
        actions_onehot = layers.cast(actions_onehot, dtype='float32')
        actions_prob = layers.reduce_sum(prob * actions_onehot, dim=1)

        actions_prob = actions_prob + eps
        actions_log_prob = layers.log(actions_prob)

        return actions_log_prob
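
The same gather-the-taken-action's-probability computation, written as a NumPy sketch for reference (logits is a hypothetical [BATCH_SIZE, NUM_ACTIONS] array, actions a [BATCH_SIZE] integer array):

import numpy as np

def categorical_logp(logits, actions, eps=1e-6):
    # Numerically stable softmax, mirroring the method above.
    shifted = logits - logits.max(axis=1, keepdims=True)
    prob = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)
    # Select the probability of each taken action, then take the log.
    return np.log(prob[np.arange(len(actions)), actions] + eps)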
Example no. 4
    def __init__(self,
                 behaviour_actions_log_probs,
                 target_actions_log_probs,
                 policy_entropy,
                 dones,
                 discount,
                 rewards,
                 values,
                 bootstrap_value,
                 entropy_coeff=-0.01,
                 vf_loss_coeff=0.5,
                 clip_rho_threshold=1.0,
                 clip_pg_rho_threshold=1.0):
        """Policy gradient loss with vtrace importance weighting.

        VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
        batch_size. The reason we need to know `B` is for V-trace to properly
        handle episode cut boundaries.

        Args:
            behaviour_actions_log_probs: A float32 tensor of shape [T, B].
            target_actions_log_probs: A float32 tensor of shape [T, B].
            policy_entropy: A float32 tensor of shape [T, B].
            dones: A float32 tensor of shape [T, B].
            discount: A float32 scalar.
            rewards: A float32 tensor of shape [T, B].
            values: A float32 tensor of shape [T, B].
            bootstrap_value: A float32 tensor of shape [B].
        """

        self.vtrace_returns = from_importance_weights(
            behaviour_actions_log_probs=behaviour_actions_log_probs,
            target_actions_log_probs=target_actions_log_probs,
            discounts=inverse(dones) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold)

        # The policy gradients loss
        self.pi_loss = -1.0 * layers.reduce_sum(
            target_actions_log_probs * self.vtrace_returns.pg_advantages)

        # The baseline loss
        delta = values - self.vtrace_returns.vs
        self.vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))

        # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
        self.entropy = layers.reduce_sum(policy_entropy)

        # The summed weighted loss
        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
                           self.entropy * entropy_coeff)
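
Assuming from_importance_weights returns the V-trace value targets vs and the importance-weighted advantages pg_advantages, the three loss terms above combine as in this minimal NumPy sketch (the array names are hypothetical stand-ins for the [T, B] tensors):

import numpy as np

def vtrace_total_loss(target_log_probs, pg_advantages, values, vs,
                      policy_entropy, vf_loss_coeff=0.5, entropy_coeff=-0.01):
    pi_loss = -np.sum(target_log_probs * pg_advantages)  # policy gradient term
    vf_loss = 0.5 * np.sum(np.square(values - vs))       # baseline regression term
    entropy = np.sum(policy_entropy)                     # entropy bonus (coeff < 0 to maximize it)
    return pi_loss + vf_loss * vf_loss_coeff + entropy * entropy_coeff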
Example no. 5
    def entropy(self):
        """
        Returns:
            entropy: A float32 tensor with shape [BATCH_SIZE], the entropy of this policy distribution.
        """
        logits = self.logits - layers.reduce_max(self.logits, dim=1)
        e_logits = layers.exp(logits)
        z = layers.reduce_sum(e_logits, dim=1)
        prob = e_logits / z
        entropy = -1.0 * layers.reduce_sum(prob * (logits - layers.log(z)),
                                           dim=1)

        return entropy
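
A short NumPy sketch of the same entropy computation, using the identity log p(a) = (logits - max) - log z (logits is a hypothetical [BATCH_SIZE, NUM_ACTIONS] array):

import numpy as np

def categorical_entropy(logits):
    shifted = logits - logits.max(axis=1, keepdims=True)
    z = np.exp(shifted).sum(axis=1, keepdims=True)
    prob = np.exp(shifted) / z
    # H = -sum_a p(a) * log p(a)
    return -np.sum(prob * (shifted - np.log(z)), axis=1)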
Example no. 6
    def learn(self,
              obs,
              action,
              reward,
              next_obs,
              terminal,
              learning_rate=None):
        """ update value model self.model with DQN algorithm
        """
        # Support the modification of learning_rate
        if learning_rate is None:
            assert isinstance(
                self.lr,
                float), "Please set the learning rate of DQN in initialization."
            learning_rate = self.lr

        pred_value = self.model.value(obs)
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(
            learning_rate=learning_rate, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost
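
Unlike Example no. 2, this is the vanilla DQN target: both the max and its evaluation use the target network. A minimal NumPy sketch with hypothetical array names:

import numpy as np

def dqn_target(q_target_next, reward, terminal, gamma):
    best_v = q_target_next.max(axis=1)  # max over target-network Q-values
    return reward + (1.0 - terminal.astype(np.float32)) * gamma * best_v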
Example no. 7
    def _calc_kl(self, means, logvars, old_means, old_logvars):
        """ Calculate KL divergence between old and new distributions
            See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback.E2.80.93Leibler_divergence

        Args:
            means: shape (batch_size, act_dim)
            logvars: shape (act_dim)
            old_means: shape (batch_size, act_dim)
            old_logvars: shape (act_dim)

        Returns:
            kl: shape (batch_size)
        """
        log_det_cov_old = layers.reduce_sum(old_logvars)
        log_det_cov_new = layers.reduce_sum(logvars)
        tr_old_new = layers.reduce_sum(layers.exp(old_logvars - logvars))
        kl = 0.5 * (layers.reduce_sum(
            layers.square(means - old_means) / layers.exp(logvars), dim=1) +
                    (log_det_cov_new - log_det_cov_old) + tr_old_new -
                    self.act_dim)
        return kl
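
For a framework-free reference, the same closed-form KL between two diagonal Gaussians, KL(N(old_means, exp(old_logvars)) || N(means, exp(logvars))), as a NumPy sketch:

import numpy as np

def diag_gaussian_kl(means, logvars, old_means, old_logvars):
    act_dim = means.shape[1]
    quad = np.sum(np.square(means - old_means) / np.exp(logvars), axis=1)
    log_det_new = logvars.sum()
    log_det_old = old_logvars.sum()
    tr_old_new = np.exp(old_logvars - logvars).sum()
    return 0.5 * (quad + (log_det_new - log_det_old) + tr_old_new - act_dim)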
Example no. 8
    def _calc_logprob(self, actions, means, logvars):
        """ Calculate log probabilities of actions, when given means and logvars
            of normal distribution.
            The constant sqrt(2 * pi) is omitted, which will be eliminated in later.

        Args:
            actions: shape (batch_size, act_dim)
            means:   shape (batch_size, act_dim)
            logvars: shape (act_dim)

        Returns:
            logprob: shape (batch_size)
        """
        exp_item = layers.elementwise_div(layers.square(actions - means),
                                          layers.exp(logvars),
                                          axis=1)
        exp_item = -0.5 * layers.reduce_sum(exp_item, dim=1)

        vars_item = -0.5 * layers.reduce_sum(logvars)
        logprob = exp_item + vars_item
        return logprob
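
The same diagonal-Gaussian log density, with the -0.5 * act_dim * log(2 * pi) constant dropped exactly as above, written as a NumPy sketch:

import numpy as np

def diag_gaussian_logprob(actions, means, logvars):
    exp_item = -0.5 * np.sum(np.square(actions - means) / np.exp(logvars), axis=1)
    vars_item = -0.5 * np.sum(logvars)
    return exp_item + vars_item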
Example no. 9
    def sample(self, obs):
        mean, log_std = self.actor.policy(obs)
        std = layers.exp(log_std)
        normal = Normal(mean, std)
        x_t = normal.sample([1])[0]
        y_t = layers.tanh(x_t)
        action = y_t * self.max_action
        log_prob = normal.log_prob(x_t)
        log_prob -= layers.log(self.max_action * (1 - layers.pow(y_t, 2)) +
                               epsilon)
        log_prob = layers.reduce_sum(log_prob, dim=1, keep_dim=True)
        log_prob = layers.squeeze(log_prob, axes=[1])
        return action, log_prob
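
The log-prob subtraction above is the change-of-variables correction for a = max_action * tanh(x), whose Jacobian is max_action * (1 - tanh(x)^2). A minimal NumPy sketch (mean and std are hypothetical arrays of shape [act_dim]; epsilon is assumed to be a small module-level constant, written here as eps):

import numpy as np

def sample_squashed_gaussian(mean, std, max_action, eps=1e-6, rng=np.random):
    x = rng.normal(mean, std)              # pre-squash Gaussian sample
    y = np.tanh(x)
    action = y * max_action
    # Per-dimension Gaussian log density of x.
    log_prob = -0.5 * np.square((x - mean) / std) - np.log(std) - 0.5 * np.log(2 * np.pi)
    # Change-of-variables correction: |da/dx| = max_action * (1 - tanh(x)^2).
    log_prob -= np.log(max_action * (1 - np.square(y)) + eps)
    return action, log_prob.sum(axis=-1)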
Example no. 10
    def learn(self, obs, actions, advantages, target_values, learning_rate,
              entropy_coeff):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in Atari.
            actions: An int64 tensor of shape [B].
            advantages: A float32 tensor of shape [B].
            target_values: A float32 tensor of shape [B].
            learning_rate: float scalar of learning rate.
            entropy_coeff: float scalar of entropy coefficient.
        """
        logits = self.model.policy(obs)
        policy_distribution = CategoricalDistribution(logits)
        actions_log_probs = policy_distribution.logp(actions)

        # The policy gradient loss
        pi_loss = -1.0 * layers.reduce_sum(actions_log_probs * advantages)

        # The value function loss
        values = self.model.value(obs)
        delta = values - target_values
        vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))

        # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
        policy_entropy = policy_distribution.entropy()
        entropy = layers.reduce_sum(policy_entropy)

        total_loss = (pi_loss + vf_loss * self.vf_loss_coeff +
                      entropy * entropy_coeff)

        fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
            clip_norm=40.0))

        optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
        optimizer.minimize(total_loss)

        return total_loss, pi_loss, vf_loss, entropy
Example no. 11
    def cal_bellman_residual(self, obs, action, reward, next_obs, terminal):
        """ use self.model to get squared Bellman residual with fed data
        """
        pred_value = self.model.value(obs)
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        return cost
Example no. 12
    def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
        """ update value model self.model with DQN algorithm
        """

        pred_value = self.model.value(obs)
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(action_onehot * pred_value,
                                              dim=1)
        delta = layers.abs(target - pred_action_value)
        cost = sample_weight * layers.square_error_cost(
            pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, delta  # `delta` is the TD-error
Example no. 13
    def learn(self, obs, actions, means, log_std, rewards, dones,
              learning_rate, entropy_coeff):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in Atari.
            actions: A float32 tensor of shape [B, act_dim].
            means: A float32 tensor of shape [B, act_dim]; means of the
                behaviour policy's Gaussian.
            log_std: A float32 tensor with the log standard deviations of the
                behaviour policy's Gaussian.
            rewards: A float32 tensor of shape [B].
            dones: A float32 tensor of shape [B].
            learning_rate: float scalar of learning rate.
            entropy_coeff: float scalar of entropy coefficient.
        """
        values = self.model.value(obs)
        # pi: the behaviour policy, built from the means / log_std fed in
        std = layers.exp(log_std)
        normal_pi = Normal(means, std)
        y_t1 = actions / self.max_action
        log_prob1 = normal_pi.log_prob(actions)
        log_prob1 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                                epsilon)
        log_prob1 = layers.reduce_sum(log_prob1, dim=1, keep_dim=True)
        log_prob_pi = layers.squeeze(log_prob1, axes=[1])

        # mu: the target policy, from the current model
        actions_mu, log_std_mu = self.model.policy(obs)
        std_mu = layers.exp(log_std_mu)
        normal_mu = Normal(actions_mu, std_mu)
        log_prob2 = normal_mu.log_prob(actions)
        log_prob2 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                                epsilon)
        log_prob2 = layers.reduce_sum(log_prob2, dim=1, keep_dim=True)
        log_prob_mu = layers.squeeze(log_prob2, axes=[1])

        policy_entropy = normal_mu.entropy()
        target_actions_log_probs = log_prob_mu
        behaviour_actions_log_probs = log_prob_pi

        # Compute the KL divergence between mu and pi for monitoring/debugging.
        kl = normal_mu.kl_divergence(normal_pi)
        kl = layers.reduce_mean(kl, dim=1)
        """
        Split the tensor into batches at known episode cut boundaries. 
        [B * T] -> [T, B]
        """
        T = self.sample_batch_steps

        def split_batches(tensor):
            B = tensor.shape[0] // T
            split_tensor = layers.reshape(tensor,
                                          [B, T] + list(tensor.shape[1:]))
            # Transpose B and T: [B, T, ...] -> [T, B, ...]
            return layers.transpose(split_tensor, [1, 0] +
                                    list(range(2, 1 + len(tensor.shape))))

        behaviour_actions_log_probs = split_batches(
            behaviour_actions_log_probs)
        target_actions_log_probs = split_batches(target_actions_log_probs)
        policy_entropy = split_batches(policy_entropy)
        dones = split_batches(dones)
        rewards = split_batches(rewards)
        values = split_batches(values)

        # [T, B] -> [T - 1, B] for V-trace calc.
        behaviour_actions_log_probs = layers.slice(behaviour_actions_log_probs,
                                                   axes=[0],
                                                   starts=[0],
                                                   ends=[-1])
        target_actions_log_probs = layers.slice(target_actions_log_probs,
                                                axes=[0],
                                                starts=[0],
                                                ends=[-1])
        policy_entropy = layers.slice(policy_entropy,
                                      axes=[0],
                                      starts=[0],
                                      ends=[-1])
        dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
        rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
        bootstrap_value = layers.slice(values,
                                       axes=[0],
                                       starts=[T - 1],
                                       ends=[T])
        values = layers.slice(values, axes=[0], starts=[0], ends=[-1])

        bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

        vtrace_loss = VTraceLoss(
            behaviour_actions_log_probs=behaviour_actions_log_probs,
            target_actions_log_probs=target_actions_log_probs,
            policy_entropy=policy_entropy,
            dones=dones,
            discount=self.gamma,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            entropy_coeff=entropy_coeff,
            vf_loss_coeff=self.vf_loss_coeff,
            clip_rho_threshold=self.clip_rho_threshold,
            clip_pg_rho_threshold=self.clip_pg_rho_threshold)

        fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
            clip_norm=40.0))

        optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
        optimizer.minimize(vtrace_loss.total_loss)
        return vtrace_loss, kl
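
A toy NumPy check of the [B * T] -> [T, B] reshuffle performed by split_batches, with hypothetical B = 2 parallel episodes of T = 3 steps each:

import numpy as np

B, T = 2, 3
flat = np.arange(B * T)                     # [b0t0, b0t1, b0t2, b1t0, b1t1, b1t2]
split = flat.reshape(B, T).transpose(1, 0)  # -> shape [T, B]
print(split)
# [[0 3]
#  [1 4]
#  [2 5]]   row t holds step t of every episode in the batch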