def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
    """ Update self.model with a Double-DQN style target and per-sample
        weights (e.g. importance weights from a prioritized replay buffer).
    """
    pred_value = self.model.value(obs)
    action_onehot = layers.one_hot(action, self.act_dim)
    pred_action_value = layers.reduce_sum(action_onehot * pred_value, dim=1)

    # calculate the target q value: the online model selects the greedy
    # next action and the target model evaluates it (Double DQN).
    next_action_value = self.model.value(next_obs)
    greedy_action = layers.argmax(next_action_value, axis=-1)
    greedy_action = layers.unsqueeze(greedy_action, axes=[1])
    greedy_action_onehot = layers.one_hot(greedy_action, self.act_dim)
    next_pred_value = self.target_model.value(next_obs)
    max_v = layers.reduce_sum(greedy_action_onehot * next_pred_value, dim=1)
    max_v.stop_gradient = True

    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * max_v
    delta = layers.abs(target - pred_action_value)
    cost = sample_weight * layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost, delta
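# The target above follows Double DQN (van Hasselt et al., 2016): the online
# network selects the greedy next action and the target network evaluates it.
# A minimal NumPy sketch of that target and the TD-error, outside the fluid
# graph; the function and argument names are illustrative, not this repo's API.
import numpy as np

def double_dqn_target(q_next_online, q_next_target, reward, terminal, gamma=0.99):
    """q_next_online, q_next_target: [B, act_dim]; reward, terminal: [B]."""
    greedy_action = np.argmax(q_next_online, axis=-1)      # select with the online net
    batch_idx = np.arange(q_next_target.shape[0])
    max_v = q_next_target[batch_idx, greedy_action]        # evaluate with the target net
    return reward + (1.0 - terminal.astype('float32')) * gamma * max_v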
def critic_learn(self, obs, action, reward, next_obs, terminal):
    # Target policy smoothing: perturb the target action with clipped Gaussian noise.
    noise = layers.gaussian_random_batch_size_like(
        action, shape=[-1, action.shape[1]])
    noise = layers.clip(
        noise * self.policy_noise, min=-self.noise_clip, max=self.noise_clip)
    next_action = self.target_model.policy(next_obs) + noise
    next_action = layers.clip(next_action, -self.max_action, self.max_action)

    # Clipped double-Q learning: bootstrap from the minimum of the two target critics.
    next_Q1, next_Q2 = self.target_model.value(next_obs, next_action)
    next_Q = layers.elementwise_min(next_Q1, next_Q2)

    terminal = layers.cast(terminal, dtype='float32')
    target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
    target_Q.stop_gradient = True

    current_Q1, current_Q2 = self.model.value(obs, action)
    cost = layers.square_error_cost(current_Q1,
                                    target_Q) + layers.square_error_cost(
                                        current_Q2, target_Q)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
    optimizer.minimize(cost)
    return cost
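# A minimal NumPy sketch of the TD3 target built above (Fujimoto et al., 2018):
# target policy smoothing plus the clipped double-Q minimum. `policy_target`,
# `q1_target`, and `q2_target` stand in for the target networks and are
# illustrative callables, not this repo's API.
import numpy as np

def td3_target(next_obs, reward, terminal, policy_target, q1_target, q2_target,
               gamma=0.99, policy_noise=0.2, noise_clip=0.5, max_action=1.0):
    next_action = policy_target(next_obs)
    noise = np.clip(np.random.normal(size=next_action.shape) * policy_noise,
                    -noise_clip, noise_clip)
    next_action = np.clip(next_action + noise, -max_action, max_action)
    next_q = np.minimum(q1_target(next_obs, next_action),
                        q2_target(next_obs, next_action))
    return reward + (1.0 - terminal.astype('float32')) * gamma * next_q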
def _actor_learn(self, obs):
    action = self.model.policy(obs)
    Q = self.model.value(obs, action)
    cost = layers.reduce_mean(-1.0 * Q)
    optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
    optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
    return cost
def learn(self, obs, action, reward, next_obs, terminal, learning_rate=None):
    """ update value model self.model with DQN algorithm
    """
    # Support the modification of learning_rate
    if learning_rate is None:
        assert isinstance(
            self.lr,
            float), "Please set the learning rate of DQN in initialization."
        learning_rate = self.lr

    pred_value = self.model.value(obs)
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)
    cost = layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(
        learning_rate=learning_rate, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost
def value_learn(self, obs, val):
    """ Learn the value model with square error cost
    """
    predict_val = self.model.value(obs)
    loss = layers.square_error_cost(predict_val, val)
    loss = layers.reduce_mean(loss)
    optimizer = fluid.optimizer.AdamOptimizer(self.value_lr)
    optimizer.minimize(loss)
    return loss
def learn(self, obs, action, reward):
    """ update policy model self.model with policy gradient algorithm
    """
    act_prob = self.model(obs)
    log_prob = layers.cross_entropy(act_prob, action)
    cost = log_prob * reward
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(cost)
    return cost
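# In REINFORCE-style training, the `reward` fed to the learn() above is
# typically not the raw per-step reward but the discounted return-to-go,
# often normalized, computed per episode before feeding. A minimal NumPy
# sketch of that preprocessing, under that assumption (the helper name is
# illustrative, not this repo's API).
import numpy as np

def calc_discounted_return_to_go(reward_list, gamma=0.99):
    """reward_list: per-step rewards of one episode, in time order."""
    returns = np.zeros(len(reward_list), dtype='float32')
    running = 0.0
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running
        returns[t] = running
    # Normalizing the returns usually stabilizes the policy-gradient update.
    return (returns - returns.mean()) / (returns.std() + 1e-8)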
def actor_learn(self, obs):
    action, log_pi = self.sample(obs)
    qf1_pi, qf2_pi = self.critic.value(obs, action)
    min_qf_pi = layers.elementwise_min(qf1_pi, qf2_pi)
    cost = log_pi * self.alpha - min_qf_pi
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
    optimizer.minimize(cost, parameter_list=self.actor.parameters())
    return cost
def _critic_learn(self, obs_n, act_n, target_q):
    pred_q = self.Q(obs_n, act_n)
    cost = layers.reduce_mean(layers.square_error_cost(pred_q, target_q))

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
        param_list=self.model.get_critic_params())

    optimizer = fluid.optimizer.AdamOptimizer(self.lr)
    optimizer.minimize(cost, parameter_list=self.model.get_critic_params())
    return cost
def policy_learn(self, obs, actions, advantages, beta=None):
    """ Learn policy model with:
            1. CLIP loss: Clipped Surrogate Objective
            2. KLPEN loss: Adaptive KL Penalty Objective
        See: https://arxiv.org/pdf/1707.02286.pdf

    Args:
        obs: Tensor, (batch_size, obs_dim)
        actions: Tensor, (batch_size, act_dim)
        advantages: Tensor (batch_size, )
        beta: Tensor (1) or None
              if None, use CLIP Loss; else, use KLPEN loss.
    """
    old_means, old_logvars = self.old_policy_model.policy(obs)
    old_means.stop_gradient = True
    old_logvars.stop_gradient = True
    old_logprob = self._calc_logprob(actions, old_means, old_logvars)

    means, logvars = self.model.policy(obs)
    logprob = self._calc_logprob(actions, means, logvars)

    kl = self._calc_kl(means, logvars, old_means, old_logvars)
    kl = layers.reduce_mean(kl)

    if beta is None:  # Clipped Surrogate Objective
        pg_ratio = layers.exp(logprob - old_logprob)
        clipped_pg_ratio = layers.clip(pg_ratio, 1 - self.epsilon,
                                       1 + self.epsilon)
        surrogate_loss = layers.elementwise_min(
            advantages * pg_ratio, advantages * clipped_pg_ratio)
        loss = 0 - layers.reduce_mean(surrogate_loss)
    else:  # Adaptive KL Penalty Objective
        # policy gradient loss
        loss1 = 0 - layers.reduce_mean(
            advantages * layers.exp(logprob - old_logprob))
        # adaptive kl loss
        loss2 = kl * beta
        loss = loss1 + loss2

    optimizer = fluid.optimizer.AdamOptimizer(self.policy_lr)
    optimizer.minimize(loss)
    return loss, kl
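# A minimal NumPy sketch of the clipped surrogate objective used in the
# `beta is None` branch above (PPO, Schulman et al., 2017); the log-probs are
# assumed precomputed per sample and the names are illustrative only.
import numpy as np

def clip_surrogate_loss(logprob, old_logprob, advantages, epsilon=0.2):
    """All inputs: [B] arrays; returns the scalar loss to be minimized."""
    ratio = np.exp(logprob - old_logprob)
    clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    surrogate = np.minimum(advantages * ratio, advantages * clipped_ratio)
    return -np.mean(surrogate)  # negated because the objective is maximized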
def _critic_learn(self, obs, action, reward, next_obs, terminal):
    next_action = self.target_model.policy(next_obs)
    next_Q = self.target_model.value(next_obs, next_action)

    terminal = layers.cast(terminal, dtype='float32')
    target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
    target_Q.stop_gradient = True

    Q = self.model.value(obs, action)
    cost = layers.square_error_cost(Q, target_Q)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
    optimizer.minimize(cost)
    return cost
def _actor_learn(self, obs_n, act_n):
    i = self.agent_index
    this_policy = self.model.policy(obs_n[i])
    sample_this_action = SoftPDistribution(
        logits=this_policy,
        act_space=self.act_space[self.agent_index]).sample()

    action_input_n = act_n + []
    action_input_n[i] = sample_this_action
    eval_q = self.Q(obs_n, action_input_n)
    act_cost = layers.reduce_mean(-1.0 * eval_q)

    act_reg = layers.reduce_mean(layers.square(this_policy))

    cost = act_cost + act_reg * 1e-3

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
        param_list=self.model.get_actor_params())

    optimizer = fluid.optimizer.AdamOptimizer(self.lr)
    optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
    return cost
def cal_bellman_residual(self, obs, action, reward, next_obs, terminal):
    """ use self.model to get squared Bellman residual with fed data
    """
    pred_value = self.model.value(obs)
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)
    cost = layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    return cost
def critic_learn(self, obs, action, reward, next_obs, terminal):
    next_obs_action, next_obs_log_pi = self.sample(next_obs)
    qf1_next_target, qf2_next_target = self.target_critic.value(
        next_obs, next_obs_action)
    min_qf_next_target = layers.elementwise_min(
        qf1_next_target, qf2_next_target) - next_obs_log_pi * self.alpha

    terminal = layers.cast(terminal, dtype='float32')
    target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target
    target_Q.stop_gradient = True

    current_Q1, current_Q2 = self.critic.value(obs, action)
    cost = layers.square_error_cost(current_Q1,
                                    target_Q) + layers.square_error_cost(
                                        current_Q2, target_Q)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
    optimizer.minimize(cost)
    return cost
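# A minimal NumPy sketch of the soft Bellman target built above (SAC,
# Haarnoja et al., 2018): the clipped double-Q minimum minus the entropy term
# alpha * log pi(a'|s'). The inputs stand in for the target-critic values and
# the sampled next-action log-prob; the names are illustrative only.
import numpy as np

def sac_critic_target(reward, terminal, next_q1, next_q2, next_log_pi,
                      alpha=0.2, gamma=0.99):
    """next_q1, next_q2, next_log_pi: [B] arrays for the sampled next action."""
    min_next_q = np.minimum(next_q1, next_q2) - alpha * next_log_pi
    return reward + (1.0 - terminal.astype('float32')) * gamma * min_next_q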
def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
    """ update value model self.model with DQN algorithm
    """
    pred_value = self.model.value(obs)
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(action_onehot * pred_value, dim=1)
    delta = layers.abs(target - pred_action_value)
    cost = sample_weight * layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost, delta  # `delta` is the TD-error
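# The returned `delta` is typically consumed to refresh priorities in a
# proportional prioritized replay buffer (Schaul et al., 2016), matching the
# `sample_weight` importance weights fed in above. A hedged NumPy sketch of
# that step; `per_alpha` and the helper name are illustrative, not this repo's API.
import numpy as np

def updated_priorities(delta, per_alpha=0.6, eps=1e-6):
    """delta: [B] TD-errors returned by learn(); larger error -> higher priority."""
    return (np.abs(delta) + eps) ** per_alpha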
def learn(self, obs, actions, means, log_std, rewards, dones,
          learning_rate, entropy_coeff):
    """
    Args:
        obs: A float32 tensor of shape ([B] + observation_space).
             E.g. [B, C, H, W] in atari.
        actions: A float32 tensor of shape [B, action_dim].
        means: A float32 tensor of shape [B, action_dim], means of the
               Gaussian behaviour policy that generated `actions`.
        log_std: A float32 tensor of shape [B, action_dim], log standard
                 deviations of the behaviour policy.
        rewards: A float32 tensor of shape [B].
        dones: A float32 tensor of shape [B].
        learning_rate: float scalar of learning rate.
        entropy_coeff: float scalar of entropy coefficient.
    """
    values = self.model.value(obs)

    # pi: distribution that generated `actions` (the sampled means/log_std)
    log_std = layers.exp(log_std)
    normal_pi = Normal(means, log_std)
    y_t1 = actions / self.max_action
    log_prob1 = normal_pi.log_prob(actions)
    log_prob1 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                            epsilon)
    log_prob1 = layers.reduce_sum(log_prob1, dim=1, keep_dim=True)
    log_prob_pi = layers.squeeze(log_prob1, axes=[1])

    # mu: distribution of the current learner policy
    actions_mu, log_std_mu = self.model.policy(obs)
    log_std_mu = layers.exp(log_std_mu)
    normal_mu = Normal(actions_mu, log_std_mu)
    log_prob2 = normal_mu.log_prob(actions)
    log_prob2 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                            epsilon)
    log_prob2 = layers.reduce_sum(log_prob2, dim=1, keep_dim=True)
    log_prob_mu = layers.squeeze(log_prob2, axes=[1])

    policy_entropy = normal_mu.entropy()
    target_actions_log_probs = log_prob_mu
    behaviour_actions_log_probs = log_prob_pi

    # Calculating kl for debug
    kl = normal_mu.kl_divergence(normal_pi)
    kl = layers.reduce_mean(kl, dim=1)
    """
    Split the tensor into batches at known episode cut boundaries.
    [B * T] -> [T, B]
    """
    T = self.sample_batch_steps

    def split_batches(tensor):
        B = tensor.shape[0] // T
        splited_tensor = layers.reshape(tensor,
                                        [B, T] + list(tensor.shape[1:]))
        # transpose B and T
        return layers.transpose(
            splited_tensor, [1, 0] + list(range(2, 1 + len(tensor.shape))))

    behaviour_actions_log_probs = split_batches(behaviour_actions_log_probs)
    target_actions_log_probs = split_batches(target_actions_log_probs)
    policy_entropy = split_batches(policy_entropy)
    dones = split_batches(dones)
    rewards = split_batches(rewards)
    values = split_batches(values)

    # [T, B] -> [T - 1, B] for V-trace calc.
    behaviour_actions_log_probs = layers.slice(
        behaviour_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    target_actions_log_probs = layers.slice(
        target_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    policy_entropy = layers.slice(
        policy_entropy, axes=[0], starts=[0], ends=[-1])
    dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
    rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
    bootstrap_value = layers.slice(
        values, axes=[0], starts=[T - 1], ends=[T])
    values = layers.slice(values, axes=[0], starts=[0], ends=[-1])

    bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

    vtrace_loss = VTraceLoss(
        behaviour_actions_log_probs=behaviour_actions_log_probs,
        target_actions_log_probs=target_actions_log_probs,
        policy_entropy=policy_entropy,
        dones=dones,
        discount=self.gamma,
        rewards=rewards,
        values=values,
        bootstrap_value=bootstrap_value,
        entropy_coeff=entropy_coeff,
        vf_loss_coeff=self.vf_loss_coeff,
        clip_rho_threshold=self.clip_rho_threshold,
        clip_pg_rho_threshold=self.clip_pg_rho_threshold)

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))

    optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
    optimizer.minimize(vtrace_loss.total_loss)
    return vtrace_loss, kl
def learn(self, obs, actions, behaviour_logits, rewards, dones,
          learning_rate, entropy_coeff):
    """
    Args:
        obs: A float32 tensor of shape ([B] + observation_space).
             E.g. [B, C, H, W] in atari.
        actions: An int64 tensor of shape [B].
        behaviour_logits: A float32 tensor of shape [B, NUM_ACTIONS].
        rewards: A float32 tensor of shape [B].
        dones: A float32 tensor of shape [B].
        learning_rate: float scalar of learning rate.
        entropy_coeff: float scalar of entropy coefficient.
    """
    values = self.model.value(obs)
    target_logits = self.model.policy(obs)

    target_policy_distribution = CategoricalDistribution(target_logits)
    behaviour_policy_distribution = CategoricalDistribution(
        behaviour_logits)

    policy_entropy = target_policy_distribution.entropy()
    target_actions_log_probs = target_policy_distribution.logp(actions)
    behaviour_actions_log_probs = behaviour_policy_distribution.logp(
        actions)

    # Calculating kl for debug
    kl = target_policy_distribution.kl(behaviour_policy_distribution)
    kl = layers.reduce_mean(kl)
    """
    Split the tensor into batches at known episode cut boundaries.
    [B * T] -> [T, B]
    """
    T = self.sample_batch_steps

    def split_batches(tensor):
        B = tensor.shape[0] // T
        splited_tensor = layers.reshape(tensor,
                                        [B, T] + list(tensor.shape[1:]))
        # transpose B and T
        return layers.transpose(
            splited_tensor, [1, 0] + list(range(2, 1 + len(tensor.shape))))

    behaviour_actions_log_probs = split_batches(behaviour_actions_log_probs)
    target_actions_log_probs = split_batches(target_actions_log_probs)
    policy_entropy = split_batches(policy_entropy)
    dones = split_batches(dones)
    rewards = split_batches(rewards)
    values = split_batches(values)

    # [T, B] -> [T - 1, B] for V-trace calc.
    behaviour_actions_log_probs = layers.slice(
        behaviour_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    target_actions_log_probs = layers.slice(
        target_actions_log_probs, axes=[0], starts=[0], ends=[-1])
    policy_entropy = layers.slice(
        policy_entropy, axes=[0], starts=[0], ends=[-1])
    dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
    rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
    bootstrap_value = layers.slice(
        values, axes=[0], starts=[T - 1], ends=[T])
    values = layers.slice(values, axes=[0], starts=[0], ends=[-1])

    bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

    vtrace_loss = VTraceLoss(
        behaviour_actions_log_probs=behaviour_actions_log_probs,
        target_actions_log_probs=target_actions_log_probs,
        policy_entropy=policy_entropy,
        dones=dones,
        discount=self.gamma,
        rewards=rewards,
        values=values,
        bootstrap_value=bootstrap_value,
        entropy_coeff=entropy_coeff,
        vf_loss_coeff=self.vf_loss_coeff,
        clip_rho_threshold=self.clip_rho_threshold,
        clip_pg_rho_threshold=self.clip_pg_rho_threshold)

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))

    optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
    optimizer.minimize(vtrace_loss.total_loss)
    return vtrace_loss, kl
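# Both learn() variants above delegate the actual target computation to
# VTraceLoss. A minimal NumPy sketch of the V-trace value targets and
# policy-gradient advantages (Espeholt et al., 2018), assuming [T, B] inputs
# like the sliced tensors above; the function and argument names are
# illustrative, not this repo's API.
import numpy as np

def vtrace_targets(behaviour_log_probs, target_log_probs, rewards, values,
                   bootstrap_value, dones, gamma=0.99,
                   clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0):
    """All arrays are [T, B] except bootstrap_value, which is [B]."""
    rhos = np.exp(target_log_probs - behaviour_log_probs)
    clipped_rhos = np.minimum(rhos, clip_rho_threshold)   # truncated IS weights
    cs = np.minimum(rhos, 1.0)
    discounts = gamma * (1.0 - dones)

    values_t_plus_1 = np.concatenate([values[1:], bootstrap_value[None]], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    # Backward recursion: v_s - V(x_s) = delta_s + discount_s * c_s * (v_{s+1} - V(x_{s+1}))
    acc = np.zeros_like(bootstrap_value)
    vs_minus_v = np.zeros_like(values)
    for t in reversed(range(values.shape[0])):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v[t] = acc
    vs = vs_minus_v + values

    # Advantages for the policy-gradient term bootstrap from v_{s+1}.
    vs_t_plus_1 = np.concatenate([vs[1:], bootstrap_value[None]], axis=0)
    clipped_pg_rhos = np.minimum(rhos, clip_pg_rho_threshold)
    pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values)
    return vs, pg_advantages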