def logp(self, actions, eps=1e-6):
    """Return the log-probability of each given action under `self.logits`.

    Args:
        actions: An int64 tensor with shape [BATCH_SIZE].
        eps: Small float constant added to the selected probability before
            taking the log, so that the log never sees an exact zero.

    Returns:
        A float32 tensor with shape [BATCH_SIZE] holding
        log(softmax(logits)[action] + eps) for every batch entry.
    """
    assert len(actions.shape) == 1

    # Softmax computed by hand; subtracting the per-row maximum first keeps
    # exp() from overflowing without changing the resulting probabilities.
    row_max = layers.reduce_max(self.logits, dim=1)
    shifted_logits = self.logits - row_max
    exp_shifted = layers.exp(shifted_logits)
    normalizer = layers.reduce_sum(exp_shifted, dim=1)
    probs = exp_shifted / normalizer

    # Pick out the probability of the chosen action with a one-hot mask.
    action_column = layers.unsqueeze(actions, axes=[1])
    selector = layers.one_hot(action_column, probs.shape[1])
    selector = layers.cast(selector, dtype='float32')
    chosen_prob = layers.reduce_sum(probs * selector, dim=1)

    return layers.log(chosen_prob + eps)
def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
    """Build the sample-weighted TD update for `self.model`.

    The greedy next action is chosen with `self.model` and evaluated with
    `self.target_model`; the resulting bootstrap value is excluded from
    back-propagation.

    Args:
        obs: Batch of current observations fed to `self.model.value`.
        action: Batch of taken actions, one-hot encoded with `self.act_dim`.
        reward: Batch of rewards.
        next_obs: Batch of next observations.
        terminal: Batch of episode-termination flags (cast to float32 here).
        sample_weight: Per-sample factor multiplied into the squared error.

    Returns:
        A tuple (cost, delta): the mean weighted squared TD error that the
        Adam optimizer minimizes, and the absolute TD error per sample.
    """
    # Q-value of the action that was actually taken.
    q_values = self.model.value(obs)
    taken_mask = layers.one_hot(action, self.act_dim)
    q_taken = layers.reduce_sum(taken_mask * q_values, dim=1)

    # Action selection by the online model, evaluation by the target model.
    online_next_q = self.model.value(next_obs)
    best_action = layers.argmax(online_next_q, axis=-1)
    best_action = layers.unsqueeze(best_action, axes=[1])
    best_mask = layers.one_hot(best_action, self.act_dim)
    target_next_q = self.target_model.value(next_obs)
    bootstrap_q = layers.reduce_sum(best_mask * target_next_q, dim=1)
    bootstrap_q.stop_gradient = True

    # Terminal transitions do not bootstrap from the next state.
    not_done = 1.0 - layers.cast(terminal, dtype='float32')
    td_target = reward + not_done * self.gamma * bootstrap_q

    td_error = layers.abs(td_target - q_taken)
    weighted_sq_err = sample_weight * layers.square_error_cost(
        q_taken, td_target)
    loss = layers.reduce_mean(weighted_sq_err)

    adam = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
    adam.minimize(loss)
    return loss, td_error
def learn(self, obs, action, reward, next_obs, terminal, learning_rate=None):
    """Build the TD update for the value model `self.model`.

    The greedy next action is chosen with `self.model` and evaluated with
    `self.target_model`; the resulting bootstrap value is excluded from
    back-propagation.

    Args:
        obs: Batch of current observations fed to `self.model.value`.
        action: Batch of taken actions, one-hot encoded with `self.act_dim`.
        reward: Batch of rewards.
        next_obs: Batch of next observations.
        terminal: Batch of episode-termination flags (cast to float32 here).
        learning_rate: Optional learning rate for the Adam optimizer. When
            None, `self.lr` is used and must be a float.

    Returns:
        The mean squared TD error that the optimizer minimizes.
    """
    # Allow the caller to override the learning rate; otherwise use self.lr.
    if learning_rate is None:
        assert isinstance(
            self.lr,
            float), "Please set the learning rate of DQN in initializaion."
        learning_rate = self.lr

    # Q-value of the action that was actually taken.
    q_values = self.model.value(obs)
    taken_mask = layers.one_hot(action, self.act_dim)
    taken_mask = layers.cast(taken_mask, dtype='float32')
    masked_q = layers.elementwise_mul(taken_mask, q_values)
    q_taken = layers.reduce_sum(masked_q, dim=1)

    # Action selection by the online model, evaluation by the target model.
    online_next_q = self.model.value(next_obs)
    best_action = layers.argmax(online_next_q, axis=-1)
    best_action = layers.unsqueeze(best_action, axes=[1])
    best_mask = layers.one_hot(best_action, self.act_dim)
    target_next_q = self.target_model.value(next_obs)
    bootstrap_q = layers.reduce_sum(best_mask * target_next_q, dim=1)
    bootstrap_q.stop_gradient = True

    # Terminal transitions do not bootstrap from the next state.
    not_done = 1.0 - layers.cast(terminal, dtype='float32')
    td_target = reward + not_done * self.gamma * bootstrap_q

    sq_err = layers.square_error_cost(q_taken, td_target)
    loss = layers.reduce_mean(sq_err)

    adam = fluid.optimizer.Adam(learning_rate=learning_rate, epsilon=1e-3)
    adam.minimize(loss)
    return loss
def from_importance_weights(behaviour_actions_log_probs,
                            target_actions_log_probs,
                            discounts,
                            rewards,
                            values,
                            bootstrap_value,
                            clip_rho_threshold=1.0,
                            clip_pg_rho_threshold=1.0,
                            name='vtrace_from_logits'):
    r"""V-trace for softmax policies.

    Calculates V-trace actor critic targets for softmax polices as described
    in "IMPALA: Scalable Distributed Deep-RL with Importance Weighted
    Actor-Learner Architectures" by Espeholt, Soyer, Munos et al.

    Target policy refers to the policy we are interested in improving and
    behaviour policy refers to the policy that generated the given
    rewards and actions.

    In the notation used throughout documentation and comments, T refers to
    the time dimension ranging from 0 to T-1. B refers to the batch size.

    Args:
        behaviour_actions_log_probs: A float32 tensor of shape [T, B] of
            log-probabilities of the taken actions under the behaviour
            policy (log mu(a|x)).
        target_actions_log_probs: A float32 tensor of shape [T, B] of
            log-probabilities of the taken actions under the target policy
            (log pi(a|x)).
        discounts: A float32 tensor of shape [T, B] with the discount
            encountered when following the behaviour policy.
        rewards: A float32 tensor of shape [T, B] with the rewards generated
            by following the behaviour policy.
        values: A float32 tensor of shape [T, B] with the value function
            estimates wrt. the target policy.
        bootstrap_value: A float32 of shape [B] with the value function
            estimate at time T.
        clip_rho_threshold: A scalar float with the clipping threshold for
            importance weights (rho) when calculating the baseline targets
            (vs). rho^bar in the paper. None disables this clipping.
        clip_pg_rho_threshold: A scalar float with the clipping threshold on
            rho_s in
            \rho_s \nabla log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
            None disables this clipping.
        name: Kept for interface compatibility; not used by this function.

    Returns:
        A VTraceReturns namedtuple (vs, pg_advantages) where:
            vs: A float32 tensor of shape [T, B]. Can be used as target to
                train a baseline (V(x_t) - vs_t)^2.
            pg_advantages: A float32 tensor of shape [T, B]. Can be used as
                the advantage in the calculation of policy gradients.
    """
    # NOTE: input ranks/shapes are not validated here.

    # Log importance-sampling weights. The V-trace weight is
    # rho = pi(a|x) / mu(a|x), i.e. target over behaviour, so in log-space
    # it is log pi - log mu (NOT the other way round).
    log_rhos = target_actions_log_probs - behaviour_actions_log_probs

    # Turn the Python-float thresholds into tensors usable by elementwise_min.
    if clip_rho_threshold is not None:
        clip_rho_threshold = layers.fill_constant([1], 'float32',
                                                  clip_rho_threshold)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold = layers.fill_constant([1], 'float32',
                                                     clip_pg_rho_threshold)

    rhos = layers.exp(log_rhos)
    if clip_rho_threshold is not None:
        clipped_rhos = layers.elementwise_min(rhos, clip_rho_threshold)
    else:
        clipped_rhos = rhos

    # Trace-cutting coefficients c_s = min(1, rho_s).
    constant_one = layers.fill_constant([1], 'float32', 1.0)
    cs = layers.elementwise_min(rhos, constant_one)

    # Append bootstrapped value to get [v1, ..., v_t+1]
    values_1_t = layers.slice(values, axes=[0], starts=[1], ends=[MAX_INT32])
    values_t_plus_1 = layers.concat(
        [values_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    # \delta_s V = clipped_rho_s * (r_s + \gamma_s V(x_{s+1}) - V(x_s))
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    # recursively_scan is defined elsewhere in this file; presumably it
    # accumulates the deltas backwards in time to yield v_s - V(x_s).
    vs_minus_v_xs = recursively_scan(discounts, cs, deltas)

    # Add V(x_s) to get v_s.
    vs = layers.elementwise_add(vs_minus_v_xs, values)

    # Advantage for policy gradient: shift vs by one step and bootstrap the
    # last entry with bootstrap_value.
    vs_1_t = layers.slice(vs, axes=[0], starts=[1], ends=[MAX_INT32])
    vs_t_plus_1 = layers.concat(
        [vs_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    if clip_pg_rho_threshold is not None:
        clipped_pg_rhos = layers.elementwise_min(rhos, clip_pg_rho_threshold)
    else:
        clipped_pg_rhos = rhos
    pg_advantages = (
        clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))

    # Make sure no gradients backpropagated through the returned values.
    vs.stop_gradient = True
    pg_advantages.stop_gradient = True
    return VTraceReturns(vs=vs, pg_advantages=pg_advantages)