def kl(self, other):
    """
    Args:
        other: object of CategoricalDistribution

    Returns:
        kl: A float32 tensor with shape [BATCH_SIZE]
    """
    assert isinstance(other, CategoricalDistribution)

    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    other_logits = other.logits - layers.reduce_max(other.logits, dim=1)

    e_logits = layers.exp(logits)
    other_e_logits = layers.exp(other_logits)

    z = layers.reduce_sum(e_logits, dim=1)
    other_z = layers.reduce_sum(other_e_logits, dim=1)

    prob = e_logits / z
    kl = layers.reduce_sum(
        prob *
        (logits - layers.log(z) - other_logits + layers.log(other_z)),
        dim=1)
    return kl
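# For reference: the method above computes the numerically stabilized softmax
# KL divergence KL(p || q) = sum_i p_i * (log p_i - log q_i), where
# log p_i = logits_i - log z after subtracting the max logit to avoid
# overflow. Below is a minimal NumPy sketch of the same arithmetic; the
# function and variable names are illustrative only, not part of this
# library's API.
import numpy as np


def kl_from_logits(logits, other_logits):
    # Stabilize both sets of logits by subtracting their row-wise max.
    logits = logits - logits.max(axis=1, keepdims=True)
    other_logits = other_logits - other_logits.max(axis=1, keepdims=True)
    e_logits = np.exp(logits)
    other_e_logits = np.exp(other_logits)
    z = e_logits.sum(axis=1, keepdims=True)
    other_z = other_e_logits.sum(axis=1, keepdims=True)
    prob = e_logits / z
    # sum_i p_i * (log p_i - log q_i)
    return (prob * (logits - np.log(z) - other_logits +
                    np.log(other_z))).sum(axis=1)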
def logp(self, actions, eps=1e-6):
    """
    Args:
        actions: An int64 tensor with shape [BATCH_SIZE]
        eps: A small float constant that avoids underflows when
             computing the log probability

    Returns:
        actions_log_prob: A float32 tensor with shape [BATCH_SIZE]
    """
    assert len(actions.shape) == 1

    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    e_logits = layers.exp(logits)
    z = layers.reduce_sum(e_logits, dim=1)
    prob = e_logits / z

    actions = layers.unsqueeze(actions, axes=[1])
    actions_onehot = layers.one_hot(actions, prob.shape[1])
    actions_onehot = layers.cast(actions_onehot, dtype='float32')
    actions_prob = layers.reduce_sum(prob * actions_onehot, dim=1)

    actions_prob = actions_prob + eps
    actions_log_prob = layers.log(actions_prob)
    return actions_log_prob
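# A minimal NumPy sketch of the same log-probability lookup: softmax over the
# stabilized logits, select the probability of each chosen action (fancy
# indexing plays the role of the one-hot reduce_sum above), add eps, take the
# log. Names are illustrative only, not part of this library's API.
import numpy as np


def logp_from_logits(logits, actions, eps=1e-6):
    logits = logits - logits.max(axis=1, keepdims=True)
    e_logits = np.exp(logits)
    prob = e_logits / e_logits.sum(axis=1, keepdims=True)
    # Probability assigned to the action actually taken in each row.
    actions_prob = prob[np.arange(len(actions)), actions]
    return np.log(actions_prob + eps)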
def learn(self, obs, action, reward, next_obs, terminal,
          learning_rate=None):
    """ Update the value model self.model with the DQN algorithm. """
    # Allow the learning rate to be overridden at call time; otherwise fall
    # back to the one set at initialization.
    if learning_rate is None:
        assert isinstance(self.lr, float), (
            "Please set the learning rate of DQN in initialization.")
        learning_rate = self.lr

    pred_value = self.model.value(obs)
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    # Do not backpropagate through the target network's value.
    best_v.stop_gradient = True
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)
    cost = layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(
        learning_rate=learning_rate, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost
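# For reference, the target above is the standard DQN target
# y = r + (1 - terminal) * gamma * max_a' Q_target(s', a'), with the gradient
# stopped through the target network. A minimal NumPy sketch of the resulting
# mean squared loss, assuming dense arrays; names are illustrative only.
import numpy as np


def dqn_loss(pred_value, next_pred_value, action, reward, terminal, gamma):
    best_v = next_pred_value.max(axis=1)  # max_a' Q_target(s', a')
    target = reward + (1.0 - terminal.astype('float32')) * gamma * best_v
    # Q(s, a) for the action actually taken in each transition.
    pred_action_value = pred_value[np.arange(len(action)), action]
    return np.mean((pred_action_value - target) ** 2)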
def entropy(self):
    """
    Returns:
        entropy: A float32 tensor with shape [BATCH_SIZE], the entropy of
                 the policy distribution.
    """
    logits = self.logits - layers.reduce_max(self.logits, dim=1)
    e_logits = layers.exp(logits)
    z = layers.reduce_sum(e_logits, dim=1)
    prob = e_logits / z
    entropy = -1.0 * layers.reduce_sum(
        prob * (logits - layers.log(z)), dim=1)
    return entropy
def cal_bellman_residual(self, obs, action, reward, next_obs, terminal):
    """ Use self.model to compute the squared Bellman residual on the
    fed data (same forward pass as learn, but without an update).
    """
    pred_value = self.model.value(obs)
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)
    cost = layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    return cost
def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
    """ Update the value model self.model with the DQN algorithm,
    weighting each sample's loss by sample_weight.
    """
    pred_value = self.model.value(obs)
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        action_onehot * pred_value, dim=1)
    delta = layers.abs(target - pred_action_value)
    cost = sample_weight * layers.square_error_cost(
        pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost, delta  # `delta` is the TD-error
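# A minimal NumPy sketch of the weighted loss and TD-error above, as used with
# prioritized experience replay: each sample's squared error is scaled by its
# importance-sampling weight, and |target - Q(s, a)| is returned so the caller
# can refresh replay priorities. Names are illustrative only, not part of this
# library's API.
import numpy as np


def weighted_dqn_loss(pred_value, next_pred_value, action, reward, terminal,
                      gamma, sample_weight):
    best_v = next_pred_value.max(axis=1)
    target = reward + (1.0 - terminal.astype('float32')) * gamma * best_v
    pred_action_value = pred_value[np.arange(len(action)), action]
    delta = np.abs(target - pred_action_value)  # per-sample TD-error
    cost = np.mean(sample_weight * (pred_action_value - target) ** 2)
    return cost, delta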