def kl(self, other):
    """
    Args:
        other: object of CategoricalDistribution

    Returns:
        kl: A float32 tensor with shape [BATCH_SIZE]
    """
    assert isinstance(other, CategoricalDistribution)

    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    other_logits = other.logits - layers.reduce_max(other.logits, dim=1)

    e_logits = layers.exp(logits)
    other_e_logits = layers.exp(other_logits)

    z = layers.reduce_sum(e_logits, dim=1)
    other_z = layers.reduce_sum(other_e_logits, dim=1)

    prob = e_logits / z
    kl = layers.reduce_sum(
        prob *
        (logits - layers.log(z) - other_logits + layers.log(other_z)),
        dim=1)
    return kl
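# A minimal NumPy sketch (not part of the original code) of the same numerically
# stable categorical KL: logits are shifted by their row-wise maximum before
# exponentiation, and KL(p || q) = sum_a p(a) * (log p(a) - log q(a)).
import numpy as np

def categorical_kl_np(logits, other_logits):
    logits = logits - logits.max(axis=1, keepdims=True)
    other_logits = other_logits - other_logits.max(axis=1, keepdims=True)
    e, other_e = np.exp(logits), np.exp(other_logits)
    z = e.sum(axis=1, keepdims=True)
    other_z = other_e.sum(axis=1, keepdims=True)
    prob = e / z
    return (prob *
            (logits - np.log(z) - other_logits + np.log(other_z))).sum(axis=1)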
def logp(self, actions, eps=1e-6):
    """
    Args:
        actions: An int64 tensor with shape [BATCH_SIZE]
        eps: A small float constant that avoids underflows when computing
             the log probability

    Returns:
        actions_log_prob: A float32 tensor with shape [BATCH_SIZE]
    """
    assert len(actions.shape) == 1

    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    e_logits = layers.exp(logits)
    z = layers.reduce_sum(e_logits, dim=1)
    prob = e_logits / z

    actions = layers.unsqueeze(actions, axes=[1])
    actions_onehot = layers.one_hot(actions, prob.shape[1])
    actions_onehot = layers.cast(actions_onehot, dtype='float32')
    actions_prob = layers.reduce_sum(prob * actions_onehot, dim=1)

    actions_prob = actions_prob + eps
    actions_log_prob = layers.log(actions_prob)
    return actions_log_prob
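# NumPy sketch (illustrative only, not the framework code) of the same logp
# computation: softmax over shifted logits, gather each row's action
# probability, then take log after adding eps to avoid log(0).
import numpy as np

def categorical_logp_np(logits, actions, eps=1e-6):
    logits = logits - logits.max(axis=1, keepdims=True)
    prob = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    actions_prob = prob[np.arange(len(actions)), actions]
    return np.log(actions_prob + eps)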
def policy_learn(self, obs, actions, advantages, beta=None):
    """ Learn the policy model with either:
        1. CLIP loss: Clipped Surrogate Objective
        2. KLPEN loss: Adaptive KL Penalty Objective
        See: https://arxiv.org/pdf/1707.02286.pdf

    Args:
        obs: Tensor, (batch_size, obs_dim)
        actions: Tensor, (batch_size, act_dim)
        advantages: Tensor, (batch_size, )
        beta: Tensor (1) or None
              if None, use CLIP loss; else, use KLPEN loss.
    """
    old_means, old_logvars = self.old_policy_model.policy(obs)
    old_means.stop_gradient = True
    old_logvars.stop_gradient = True
    old_logprob = self._calc_logprob(actions, old_means, old_logvars)

    means, logvars = self.model.policy(obs)
    logprob = self._calc_logprob(actions, means, logvars)

    kl = self._calc_kl(means, logvars, old_means, old_logvars)
    kl = layers.reduce_mean(kl)

    if beta is None:  # Clipped Surrogate Objective
        pg_ratio = layers.exp(logprob - old_logprob)
        clipped_pg_ratio = layers.clip(pg_ratio, 1 - self.epsilon,
                                       1 + self.epsilon)
        surrogate_loss = layers.elementwise_min(
            advantages * pg_ratio, advantages * clipped_pg_ratio)
        loss = 0 - layers.reduce_mean(surrogate_loss)
    else:  # Adaptive KL Penalty Objective
        # policy gradient loss
        loss1 = 0 - layers.reduce_mean(
            advantages * layers.exp(logprob - old_logprob))
        # adaptive kl loss
        loss2 = kl * beta
        loss = loss1 + loss2

    optimizer = fluid.optimizer.AdamOptimizer(self.policy_lr)
    optimizer.minimize(loss)
    return loss, kl
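# Illustrative NumPy sketch (an assumption, not the framework graph code) of the
# two objectives above, with a hypothetical epsilon default: the CLIP loss clips
# the probability ratio to [1 - epsilon, 1 + epsilon]; the KLPEN loss adds
# beta * KL to the plain surrogate.
import numpy as np

def ppo_loss_np(logprob, old_logprob, advantages, kl, epsilon=0.2, beta=None):
    ratio = np.exp(logprob - old_logprob)
    if beta is None:  # Clipped Surrogate Objective
        clipped_ratio = np.clip(ratio, 1 - epsilon, 1 + epsilon)
        surrogate = np.minimum(advantages * ratio, advantages * clipped_ratio)
        return -surrogate.mean()
    # Adaptive KL Penalty Objective
    return -(advantages * ratio).mean() + beta * kl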
def _calc_kl(self, means, logvars, old_means, old_logvars):
    """ Calculate KL divergence between old and new distributions
        See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback.E2.80.93Leibler_divergence

    Args:
        means: shape (batch_size, act_dim)
        logvars: shape (act_dim)
        old_means: shape (batch_size, act_dim)
        old_logvars: shape (act_dim)

    Returns:
        kl: shape (batch_size)
    """
    log_det_cov_old = layers.reduce_sum(old_logvars)
    log_det_cov_new = layers.reduce_sum(logvars)
    tr_old_new = layers.reduce_sum(layers.exp(old_logvars - logvars))
    kl = 0.5 * (layers.reduce_sum(
        layers.square(means - old_means) / layers.exp(logvars), dim=1) +
                (log_det_cov_new - log_det_cov_old) + tr_old_new -
                self.act_dim)
    return kl
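# NumPy sketch (illustrative, not the framework code) of the same
# diagonal-Gaussian KL(old || new), following the formula referenced above:
# 0.5 * [sum((mu - mu_old)^2 / var) + (log det new - log det old)
#        + tr(var_old / var) - act_dim].
import numpy as np

def diag_gaussian_kl_np(means, logvars, old_means, old_logvars):
    act_dim = means.shape[1]
    quad = ((means - old_means) ** 2 / np.exp(logvars)).sum(axis=1)
    log_det_diff = logvars.sum() - old_logvars.sum()
    trace = np.exp(old_logvars - logvars).sum()
    return 0.5 * (quad + log_det_diff + trace - act_dim)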
def sample(self, obs):
    mean, log_std = self.model.policy(obs)
    std = layers.exp(log_std)
    normal = Normal(mean, std)
    # Sample from the pre-squash Gaussian, then squash with tanh and scale
    # into the action range.
    x_t = normal.sample([1])[0]
    y_t = layers.tanh(x_t)
    action = y_t * self.max_action
    # log_prob = normal.log_prob(x_t)
    # log_prob -= layers.log(self.max_action * (1 - layers.pow(y_t, 2)) + epsilon)
    # log_prob = layers.reduce_sum(log_prob, dim=1, keep_dim=True)
    # log_prob = layers.squeeze(log_prob, axes=[1])
    return action, mean, log_std
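# Plain-NumPy sketch (an illustrative assumption, not the framework API) of the
# tanh squashing used above, including the commented-out change-of-variables
# correction: log pi(a) = log N(x) - sum log(max_action * (1 - tanh(x)^2) + eps).
import numpy as np

def squashed_sample_np(mean, log_std, max_action, eps=1e-6):
    std = np.exp(log_std)
    x = mean + std * np.random.randn(*mean.shape)  # pre-squash Gaussian sample
    y = np.tanh(x)
    action = y * max_action
    log_prob = -0.5 * (((x - mean) / std) ** 2 + 2 * log_std + np.log(2 * np.pi))
    log_prob -= np.log(max_action * (1 - y ** 2) + eps)
    return action, log_prob.sum(axis=1)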
def entropy(self):
    """
    Returns:
        entropy: A float32 tensor with shape [BATCH_SIZE] of entropy of
                 self policy distribution.
    """
    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    e_logits = layers.exp(logits)
    z = layers.reduce_sum(e_logits, dim=1)
    prob = e_logits / z
    entropy = -1.0 * layers.reduce_sum(
        prob * (logits - layers.log(z)), dim=1)
    return entropy
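# NumPy sketch (not part of the original code) of the same categorical entropy:
# H = -sum_a p(a) log p(a), with log p(a) = shifted_logits - log(z).
import numpy as np

def categorical_entropy_np(logits):
    logits = logits - logits.max(axis=1, keepdims=True)
    e = np.exp(logits)
    z = e.sum(axis=1, keepdims=True)
    prob = e / z
    return -(prob * (logits - np.log(z))).sum(axis=1)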
def _calc_logprob(self, actions, means, logvars):
    """ Calculate the log probabilities of actions, given the means and
        logvars of a diagonal Gaussian distribution. The constant term
        involving sqrt(2 * pi) is omitted, since it cancels out later
        when log probabilities are subtracted (e.g. in probability ratios).

    Args:
        actions: shape (batch_size, act_dim)
        means: shape (batch_size, act_dim)
        logvars: shape (act_dim)

    Returns:
        logprob: shape (batch_size)
    """
    exp_item = layers.elementwise_div(
        layers.square(actions - means), layers.exp(logvars), axis=1)
    exp_item = -0.5 * layers.reduce_sum(exp_item, dim=1)

    vars_item = -0.5 * layers.reduce_sum(logvars)
    logprob = exp_item + vars_item
    return logprob
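# NumPy sketch (illustrative only) of the same unnormalized diagonal-Gaussian
# log-probability: -0.5 * sum((a - mu)^2 / var) - 0.5 * sum(log var), with the
# constant -0.5 * act_dim * log(2 * pi) dropped as described above.
import numpy as np

def calc_logprob_np(actions, means, logvars):
    exp_item = -0.5 * ((actions - means) ** 2 / np.exp(logvars)).sum(axis=1)
    vars_item = -0.5 * logvars.sum()
    return exp_item + vars_item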
def from_importance_weights(behaviour_actions_log_probs,
                            target_actions_log_probs,
                            discounts,
                            rewards,
                            values,
                            bootstrap_value,
                            clip_rho_threshold=1.0,
                            clip_pg_rho_threshold=1.0,
                            name='vtrace_from_logits'):
    r"""V-trace for softmax policies.

    Calculates V-trace actor critic targets for softmax policies as described in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    Target policy refers to the policy we are interested in improving and
    behaviour policy refers to the policy that generated the given rewards and
    actions.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1. B refers to the batch size and
    NUM_ACTIONS refers to the number of actions.

    Args:
        behaviour_actions_log_probs: A float32 tensor of shape [T, B] of
            log-probabilities of actions in the behaviour policy.
        target_actions_log_probs: A float32 tensor of shape [T, B] of
            log-probabilities of actions in the target policy.
        discounts: A float32 tensor of shape [T, B] with the discount
            encountered when following the behaviour policy.
        rewards: A float32 tensor of shape [T, B] with the rewards generated by
            following the behaviour policy.
        values: A float32 tensor of shape [T, B] with the value function
            estimates wrt. the target policy.
        bootstrap_value: A float32 tensor of shape [B] with the value function
            estimate at time T.
        clip_rho_threshold: A scalar float32 tensor with the clipping threshold
            for importance weights (rho) when calculating the baseline targets
            (vs). rho^bar in the paper.
        clip_pg_rho_threshold: A scalar float32 tensor with the clipping
            threshold on rho_s in
            \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
        name: The name scope that all V-trace operations will be created in.

    Returns:
        A VTraceReturns namedtuple (vs, pg_advantages) where:
            vs: A float32 tensor of shape [T, B]. Can be used as target to train
                a baseline (V(x_t) - vs_t)^2.
            pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
                advantage in the calculation of policy gradients.
    """
    # rank = len(behaviour_actions_log_probs.shape)  # Usually 2.
    # assert len(target_actions_log_probs.shape) == rank
    # assert len(values.shape) == rank
    # assert len(bootstrap_value.shape) == (rank - 1)
    # assert len(discounts.shape) == rank
    # assert len(rewards.shape) == rank

    # Log importance sampling weights: rho_s = pi(a_s|x_s) / mu(a_s|x_s),
    # i.e. target log-prob minus behaviour log-prob.
    # V-trace performs operations on rhos in log-space for numerical stability.
    log_rhos = target_actions_log_probs - behaviour_actions_log_probs

    if clip_rho_threshold is not None:
        clip_rho_threshold = layers.fill_constant([1], 'float32',
                                                  clip_rho_threshold)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold = layers.fill_constant([1], 'float32',
                                                     clip_pg_rho_threshold)

    rhos = layers.exp(log_rhos)
    if clip_rho_threshold is not None:
        clipped_rhos = layers.elementwise_min(rhos, clip_rho_threshold)
    else:
        clipped_rhos = rhos

    constant_one = layers.fill_constant([1], 'float32', 1.0)
    cs = layers.elementwise_min(rhos, constant_one)

    # Append bootstrapped value to get [v1, ..., v_t+1]
    values_1_t = layers.slice(values, axes=[0], starts=[1], ends=[MAX_INT32])
    values_t_plus_1 = layers.concat(
        [values_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    # \delta_s * V
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    vs_minus_v_xs = recursively_scan(discounts, cs, deltas)

    # Add V(x_s) to get v_s.
    vs = layers.elementwise_add(vs_minus_v_xs, values)

    # Advantage for policy gradient.
    vs_1_t = layers.slice(vs, axes=[0], starts=[1], ends=[MAX_INT32])
    vs_t_plus_1 = layers.concat(
        [vs_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    if clip_pg_rho_threshold is not None:
        clipped_pg_rhos = layers.elementwise_min(rhos, clip_pg_rho_threshold)
    else:
        clipped_pg_rhos = rhos
    pg_advantages = (
        clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))

    # Make sure no gradients are backpropagated through the returned values.
    vs.stop_gradient = True
    pg_advantages.stop_gradient = True
    return VTraceReturns(vs=vs, pg_advantages=pg_advantages)
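# Reference NumPy sketch (an illustrative assumption, not the production op) of
# the V-trace targets computed above: a backward scan accumulates
# delta_s + discount_s * c_s * acc, V(x_s) is added back to get v_s, and the
# policy-gradient advantage uses the clipped rho and the shifted v_{s+1}.
import numpy as np

def vtrace_np(behaviour_logp, target_logp, discounts, rewards, values,
              bootstrap_value, clip_rho=1.0, clip_pg_rho=1.0):
    rhos = np.exp(target_logp - behaviour_logp)  # rho_s = pi / mu
    clipped_rhos = np.minimum(rhos, clip_rho)
    cs = np.minimum(rhos, 1.0)

    values_t_plus_1 = np.concatenate([values[1:], bootstrap_value[None]], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    vs_minus_v_xs = np.zeros_like(values)
    acc = np.zeros_like(bootstrap_value)
    for t in reversed(range(len(values))):  # backward recursion over time
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v_xs[t] = acc
    vs = vs_minus_v_xs + values

    vs_t_plus_1 = np.concatenate([vs[1:], bootstrap_value[None]], axis=0)
    clipped_pg_rhos = np.minimum(rhos, clip_pg_rho)
    pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values)
    return vs, pg_advantages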
def learn(self, obs, actions, means, log_std, rewards, dones, learning_rate,
          entropy_coeff):
    """
    Args:
        obs: A float32 tensor of shape ([B] + observation_space),
             e.g. [B, C, H, W] in Atari.
        actions: A float32 tensor of shape [B, act_dim] with the actions
                 taken by the behaviour policy.
        means: A float32 tensor of shape [B, act_dim] with the means of the
               behaviour (actor-side) Gaussian policy.
        log_std: A float32 tensor with the log standard deviations of the
                 behaviour policy.
        rewards: A float32 tensor of shape [B].
        dones: A float32 tensor of shape [B].
        learning_rate: float scalar of learning rate.
        entropy_coeff: float scalar of entropy coefficient.
    """
    values = self.model.value(obs)

    # pi: the behaviour policy that generated the actions.
    std = layers.exp(log_std)
    normal_pi = Normal(means, std)
    # x_t1 = normal_pi.sample([1])[0]
    # x_t1.stop_gradient = True
    y_t1 = actions / self.max_action
    # action1 = y_t1 * self.max_action
    log_prob1 = normal_pi.log_prob(actions)
    log_prob1 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                            epsilon)
    log_prob1 = layers.reduce_sum(log_prob1, dim=1, keep_dim=True)
    log_prob_pi = layers.squeeze(log_prob1, axes=[1])

    # mu: the target (learner) policy being improved.
    actions_mu, log_std_mu = self.model.policy(obs)
    std_mu = layers.exp(log_std_mu)
    normal_mu = Normal(actions_mu, std_mu)
    # x_t2 = normal_mu.sample([1])[0]
    # x_t2.stop_gradient = True
    # y_t2 = actions
    # action2 = y_t2 * self.max_action
    log_prob2 = normal_mu.log_prob(actions)
    log_prob2 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                            epsilon)
    log_prob2 = layers.reduce_sum(log_prob2, dim=1, keep_dim=True)
    log_prob_mu = layers.squeeze(log_prob2, axes=[1])

    # target_policy_distribution = CategoricalDistribution(target_logits)
    # behaviour_policy_distribution = CategoricalDistribution(
    #     behaviour_logits)

    policy_entropy = normal_mu.entropy()
    # policy_entropy = layers.reduce_mean(policy_entropy, dim=1)
    target_actions_log_probs = log_prob_mu
    behaviour_actions_log_probs = log_prob_pi

    # Calculating kl for debug
    # kl = target_policy_distribution.kl(behaviour_policy_distribution)
    kl = normal_mu.kl_divergence(normal_pi)
    kl = layers.reduce_mean(kl, dim=1)
    # kl = layers.unsqueeze(kl, axes=[1])
    """
    Split the tensor into batches at known episode cut boundaries.
    [B * T] -> [T, B]
    """
    T = self.sample_batch_steps

    def split_batches(tensor):
        B = tensor.shape[0] // T
        splited_tensor = layers.reshape(tensor,
                                        [B, T] + list(tensor.shape[1:]))
        # transpose B and T
        return layers.transpose(
            splited_tensor, [1, 0] + list(range(2, 1 + len(tensor.shape))))

    behaviour_actions_log_probs = split_batches(behaviour_actions_log_probs)
    target_actions_log_probs = split_batches(target_actions_log_probs)
    policy_entropy = split_batches(policy_entropy)
    dones = split_batches(dones)
    rewards = split_batches(rewards)
    values = split_batches(values)

    # [T, B] -> [T - 1, B] for V-trace calc.
    behaviour_actions_log_probs = layers.slice(
        behaviour_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    target_actions_log_probs = layers.slice(
        target_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    policy_entropy = layers.slice(
        policy_entropy, axes=[0], starts=[0], ends=[-1])
    dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
    rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
    bootstrap_value = layers.slice(
        values, axes=[0], starts=[T - 1], ends=[T])
    values = layers.slice(values, axes=[0], starts=[0], ends=[-1])
    bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

    vtrace_loss = VTraceLoss(
        behaviour_actions_log_probs=behaviour_actions_log_probs,
        target_actions_log_probs=target_actions_log_probs,
        policy_entropy=policy_entropy,
        dones=dones,
        discount=self.gamma,
        rewards=rewards,
        values=values,
        bootstrap_value=bootstrap_value,
        entropy_coeff=entropy_coeff,
        vf_loss_coeff=self.vf_loss_coeff,
        clip_rho_threshold=self.clip_rho_threshold,
        clip_pg_rho_threshold=self.clip_pg_rho_threshold)

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))

    optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
    optimizer.minimize(vtrace_loss.total_loss)
    return vtrace_loss, kl
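# Small NumPy check (illustrative, not part of the training graph) of the
# split_batches reshape used above: a flat [B * T, ...] rollout buffer becomes
# [T, B, ...] so that V-trace can scan along the time dimension.
import numpy as np

def split_batches_np(tensor, T):
    B = tensor.shape[0] // T
    splited = tensor.reshape((B, T) + tensor.shape[1:])
    return splited.transpose((1, 0) + tuple(range(2, tensor.ndim)))

# e.g. a buffer holding 2 rollouts of 3 steps each:
# split_batches_np(np.arange(6), T=3) -> [[0, 3], [1, 4], [2, 5]]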