def train(self, memories):
    """Run one actor-critic update on a batch and return the losses.

    The critic is regressed onto ``dc_r``; the actor maximises
    ``log_prob * (dc_r - v)`` plus ``self.beta * entropy``, where the
    advantage is detached from the graph.

    Args:
        memories: tuple ``(s, visual_s, a, dc_r, cell_state)``. In the
            discrete branch ``a`` is multiplied element-wise with the
            log-softmax output (presumably one-hot -- TODO confirm).

    Returns:
        Tuple ``(actor_loss, critic_loss, entropy)``.
    """
    s, visual_s, a, dc_r, cell_state = memories
    with tf.device(self.device):
        # Persistent because the tape is queried twice (critic, then actor).
        with tf.GradientTape(persistent=True) as tape:
            feat, _ = self._representation_net(s, visual_s, cell_state=cell_state)
            if self.is_continuous:
                mu, log_std = self.net.policy_net(feat)
                log_act_prob = gaussian_likelihood_sum(a, mu, log_std)
                entropy = gaussian_entropy(log_std)
            else:
                logits = self.net.policy_net(feat)
                logp_all = tf.nn.log_softmax(logits)
                log_act_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
            v = self.net.value_net(feat)
            td_error = dc_r - v
            # The advantage is a constant from the actor's point of view.
            advantage = tf.stop_gradient(td_error)
            critic_loss = tf.reduce_mean(tf.square(td_error))
            actor_loss = -(tf.reduce_mean(log_act_prob * advantage) + self.beta * entropy)

        critic_vars = self.net.critic_trainable_variables
        critic_grads = tape.gradient(critic_loss, critic_vars)
        self.optimizer_critic.apply_gradients(zip(critic_grads, critic_vars))

        # The original duplicated this step in identical continuous/discrete
        # branches; a single code path is sufficient.
        actor_vars = self.net.actor_trainable_variables
        actor_grads = tape.gradient(actor_loss, actor_vars)
        self.optimizer_actor.apply_gradients(zip(actor_grads, actor_vars))

        # Drop the persistent tape so its recorded tensors can be freed.
        del tape
        self.global_step.assign_add(1)
        return actor_loss, critic_loss, entropy
def train_actor(self, memories):
    """Compute the importance-weighted surrogate loss and its flat gradient.

    No optimiser step is taken here: the gradients with respect to
    ``self.net.actor_trainable_variables`` are flattened with
    ``flat_concat`` and handed back to the caller.

    Args:
        memories: tuple
            ``(s, visual_s, a, old_log_prob, advantage, cell_state)``.

    Returns:
        Tuple ``(actor_loss, entropy, gradients)``.
    """
    s, visual_s, a, old_log_prob, advantage, cell_state = memories
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            policy_out, _ = self.net(s, visual_s, cell_state=cell_state)
            if not self.is_continuous:
                log_probs = tf.nn.log_softmax(policy_out)
                log_prob_now = tf.reduce_sum(a * log_probs, axis=1, keepdims=True)
                neg_entropy_per_sample = tf.reduce_sum(
                    tf.exp(log_probs) * log_probs, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(neg_entropy_per_sample)
            else:
                mean, log_sigma = policy_out
                log_prob_now = gaussian_likelihood_sum(a, mean, log_sigma)
                entropy = gaussian_entropy(log_sigma)
            importance = tf.exp(log_prob_now - old_log_prob)
            actor_loss = -tf.reduce_mean(importance * advantage)
        gradients = flat_concat(
            tape.gradient(actor_loss, self.net.actor_trainable_variables))
        self.global_step.assign_add(1)
        return actor_loss, entropy, gradients
def train(self, memories):
    """Apply one policy-gradient step weighted by ``dc_r``.

    The loss is ``-mean(log_prob(a) * dc_r)`` and is minimised over all of
    ``self.net.trainable_variables``. The entropy is computed for reporting
    only; it does not enter the loss.

    Args:
        memories: tuple ``(s, visual_s, a, dc_r, cell_state)``.

    Returns:
        Tuple ``(loss, entropy)``.
    """
    s, visual_s, a, dc_r, cell_state = memories
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            # The recurrent state returned by the network is not used.
            policy_out, _ = self.net(s, visual_s, cell_state=cell_state)
            if not self.is_continuous:
                log_probs = tf.nn.log_softmax(policy_out)
                log_act_prob = tf.reduce_sum(
                    tf.multiply(log_probs, a), axis=1, keepdims=True)
                neg_entropy_per_sample = tf.reduce_sum(
                    tf.exp(log_probs) * log_probs, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(neg_entropy_per_sample)
            else:
                mean, log_sigma = policy_out
                log_act_prob = gaussian_likelihood_sum(a, mean, log_sigma)
                entropy = gaussian_entropy(log_sigma)
            loss = -tf.reduce_mean(log_act_prob * dc_r)
        variables = self.net.trainable_variables
        self.optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))
        self.global_step.assign_add(1)
        return loss, entropy
def _train(self, memories, isw, cell_state):
    """Off-policy actor-critic step with an importance-weighted actor loss.

    The critic regresses ``Q(s, a)`` onto ``r + gamma * (1 - done) * Q(s', a')``
    where ``a'`` is the policy mean (continuous) or the arg-max action
    (discrete) at the next state. The actor loss is
    ``-mean(ratio * log_prob * Q)`` with ``ratio`` and ``Q`` detached.

    Args:
        memories: tuple ``(ss, vvss, a, r, done, old_log_prob)``.
        isw: weights multiplied into the squared TD error.
        cell_state: recurrent state forwarded to the representation net.

    Returns:
        Tuple ``(td_error, summaries)`` where ``summaries`` maps tag names to
        scalar tensors.
    """
    ss, vvss, a, r, done, old_log_prob = memories
    with tf.device(self.device):
        # Persistent: critic and actor gradients both come from this tape.
        with tf.GradientTape(persistent=True) as tape:
            (feat, feat_), _ = self._representation_net(
                ss, vvss, cell_state=cell_state, need_split=True)
            if self.is_continuous:
                mu, log_std = self.net.policy_net(feat)
                log_prob = gaussian_likelihood_sum(a, mu, log_std)
                entropy = gaussian_entropy(log_std)
                next_mu, _ = self.net.policy_net(feat_)
                next_q = self.net.value_net(feat_, next_mu)
            else:
                log_probs = tf.nn.log_softmax(self.net.policy_net(feat))
                log_prob = tf.reduce_sum(log_probs * a, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(log_probs) * log_probs, axis=1, keepdims=True))
                next_logits = self.net.policy_net(feat_)
                greedy_next = tf.one_hot(tf.argmax(next_logits, axis=1), self.a_dim)
                next_q = self.net.value_net(feat_, greedy_next)
            max_q_next = tf.stop_gradient(next_q)
            q = self.net.value_net(feat, a)
            ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))
            q_value = tf.stop_gradient(q)
            td_error = q - (r + self.gamma * (1 - done) * max_q_next)
            critic_loss = tf.reduce_mean(tf.square(td_error) * isw)
            actor_loss = -tf.reduce_mean(ratio * log_prob * q_value)

        critic_vars = self.net.critic_trainable_variables
        self.optimizer_critic.apply_gradients(
            zip(tape.gradient(critic_loss, critic_vars), critic_vars))
        actor_vars = self.net.actor_trainable_variables
        self.optimizer_actor.apply_gradients(
            zip(tape.gradient(actor_loss, actor_vars), actor_vars))
        self.global_step.assign_add(1)

        summaries = {
            'LOSS/actor_loss': actor_loss,
            'LOSS/critic_loss': critic_loss,
            'Statistics/q_max': tf.reduce_max(q),
            'Statistics/q_min': tf.reduce_min(q),
            'Statistics/q_mean': tf.reduce_mean(q),
            'Statistics/ratio': tf.reduce_mean(ratio),
            'Statistics/entropy': entropy,
        }
        return td_error, summaries
def train_share(self, memories, kl_coef, crsty_loss, cell_state):
    """Joint policy/value update for a network with a shared trunk.

    The total loss is the clipped-ratio policy loss, a KL penalty
    (``kl_coef * kl`` plus a quadratic term once ``kl`` exceeds
    ``self.kl_cutoff``), a clipped value loss, an entropy bonus scaled by
    ``self.beta`` and the externally supplied ``crsty_loss``.

    Args:
        memories: tuple
            ``(s, visual_s, a, dc_r, old_log_prob, advantage, old_value)``.
        kl_coef: coefficient of the linear KL penalty.
        crsty_loss: extra loss term added unchanged to the total loss.
        cell_state: recurrent state forwarded to ``self.get_feature``.

    Returns:
        Tuple ``(actor_loss, value_loss, entropy, kl)``.
    """
    s, visual_s, a, dc_r, old_log_prob, advantage, old_value = memories
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            feat = self.get_feature(s, visual_s, cell_state=cell_state)
            if not self.is_continuous:
                logits, value = self.net(feat)
                log_probs = tf.nn.log_softmax(logits)
                new_log_prob = tf.reduce_sum(a * log_probs, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(log_probs) * log_probs, axis=1, keepdims=True))
            else:
                mu, value = self.net(feat)
                new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                entropy = gaussian_entropy(self.log_std)
            ratio = tf.exp(new_log_prob - old_log_prob)

            # https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L40
            # Sample estimate of the KL divergence; cheap to compute.
            if self.kl_reverse:
                kl = tf.reduce_mean(new_log_prob - old_log_prob)
            else:
                kl = tf.reduce_mean(old_log_prob - new_log_prob)

            # Policy term: pessimistic minimum of raw and clipped surrogate.
            surrogate = ratio * advantage
            clipped_ratio = tf.clip_by_value(
                ratio, 1.0 - self.epsilon, 1.0 + self.epsilon)
            pi_loss = -tf.reduce_mean(
                tf.minimum(surrogate, clipped_ratio * advantage))

            # https://github.com/llSourcell/OpenAI_Five_vs_Dota2_Explained/blob/c5def7e57aa70785c2394ea2eeb3e5f66ad59a53/train.py#L154
            # Value term: larger of the clipped and unclipped squared errors.
            value_clip = old_value + tf.clip_by_value(
                value - old_value, -self.value_epsilon, self.value_epsilon)
            td_error = dc_r - value
            td_error_clip = dc_r - value_clip
            td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip))
            value_loss = 0.5 * tf.reduce_mean(td_square)

            kl_loss = kl_coef * kl
            extra_loss = 1000.0 * tf.square(tf.maximum(0., kl - self.kl_cutoff))
            actor_loss = pi_loss + kl_loss + extra_loss
            loss = actor_loss + 1.0 * value_loss - self.beta * entropy + crsty_loss
        grads = tape.gradient(loss, self.net_tv)
        self.optimizer.apply_gradients(zip(grads, self.net_tv))
        self.global_step.assign_add(1)
        return actor_loss, value_loss, entropy, kl
def train_actor(self, BATCH, cell_state, kl_coef):
    """Update the actor with a clipped importance-ratio surrogate.

    Args:
        BATCH: batch object; this method reads ``obs``, ``action``,
            ``log_prob`` and ``gae_adv`` from it.
        cell_state: recurrent state forwarded to ``self.net``.
        kl_coef: coefficient of the KL penalty, used only when
            ``self.use_kl_loss`` is set.

    Returns:
        Tuple ``(actor_loss, entropy, kl)``.
    """
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            output, _ = self.net(BATCH.obs, cell_state=cell_state)
            if self.is_continuous:
                mu, log_std = output
                new_log_prob = gaussian_likelihood_sum(BATCH.action, mu, log_std)
                entropy = gaussian_entropy(log_std)
            else:
                logits = output
                logp_all = tf.nn.log_softmax(logits)
                new_log_prob = tf.reduce_sum(
                    BATCH.action * logp_all, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
            ratio = tf.exp(new_log_prob - BATCH.log_prob)
            kl = tf.reduce_mean(BATCH.log_prob - new_log_prob)
            surrogate = ratio * BATCH.gae_adv
            clipped_surrogate = tf.minimum(
                surrogate,
                tf.where(BATCH.gae_adv > 0,
                         (1 + self.epsilon) * BATCH.gae_adv,
                         (1 - self.epsilon) * BATCH.gae_adv))
            if self.use_duel_clip:
                # The dual clip is a lower bound meant for negative
                # advantages only. Applied to positive advantages with
                # duel_epsilon > epsilon it would replace the objective by
                # the constant (1 + duel_epsilon) * adv and kill the
                # gradient, so the unbounded value is kept there.
                dual_clipped = tf.maximum(
                    clipped_surrogate,
                    (1.0 + self.duel_epsilon) * BATCH.gae_adv)
                clipped_surrogate = tf.where(
                    BATCH.gae_adv < 0, dual_clipped, clipped_surrogate)
            actor_loss = -(tf.reduce_mean(clipped_surrogate)
                           + self.ent_coef * entropy)
            if self.use_kl_loss:
                actor_loss += kl_coef * kl
            if self.use_extra_loss:
                # Quadratic penalty that only activates above the cutoff.
                actor_loss += self.extra_coef * tf.square(
                    tf.maximum(0., kl - self.kl_cutoff))
        actor_vars = self.net.actor_trainable_variables
        actor_grads = tape.gradient(actor_loss, actor_vars)
        self.optimizer_actor.apply_gradients(zip(actor_grads, actor_vars))
        self.global_step.assign_add(1)
        return actor_loss, entropy, kl
def train_actor(self, memories, kl_coef, cell_state):
    """Update the stand-alone actor network with a clipped surrogate loss.

    Features are computed before the gradient tape is opened, so only the
    operations of ``self.actor_net`` are recorded for differentiation.

    Args:
        memories: tuple ``(s, visual_s, a, old_log_prob, advantage)``.
        kl_coef: coefficient of the linear KL penalty.
        cell_state: recurrent state forwarded to ``self.get_feature``.

    Returns:
        Tuple ``(actor_loss, entropy, kl)``.
    """
    s, visual_s, a, old_log_prob, advantage = memories
    with tf.device(self.device):
        feat = self.get_feature(s, visual_s, cell_state=cell_state)
        with tf.GradientTape() as tape:
            if not self.is_continuous:
                log_probs = tf.nn.log_softmax(self.actor_net(feat))
                new_log_prob = tf.reduce_sum(a * log_probs, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(log_probs) * log_probs, axis=1, keepdims=True))
            else:
                mu = self.actor_net(feat)
                new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                entropy = gaussian_entropy(self.log_std)
            ratio = tf.exp(new_log_prob - old_log_prob)
            kl = tf.reduce_mean(old_log_prob - new_log_prob)
            surrogate = ratio * advantage
            # Bound on the surrogate: (1 + eps) * adv for positive
            # advantages, (1 - eps) * adv otherwise.
            bound = tf.where(advantage > 0,
                             (1 + self.epsilon) * advantage,
                             (1 - self.epsilon) * advantage)
            objective = tf.reduce_mean(tf.minimum(surrogate, bound))
            pi_loss = -(objective + self.beta * entropy)
            kl_loss = kl_coef * kl
            extra_loss = 1000.0 * tf.square(tf.maximum(0., kl - self.kl_cutoff))
            actor_loss = pi_loss + kl_loss + extra_loss
        self.optimizer_actor.apply_gradients(
            zip(tape.gradient(actor_loss, self.actor_net_tv), self.actor_net_tv))
        self.global_step.assign_add(1)
        return actor_loss, entropy, kl
def train(self, memories, crsty_loss, cell_state):
    """Actor-critic step with separate critic and actor gradient tapes.

    The critic is updated first; the actor's advantage is then computed
    from the freshly updated critic and detached from the graph.

    Args:
        memories: tuple ``(s, visual_s, a, dc_r)``.
        crsty_loss: extra loss term added to the critic loss.
        cell_state: recurrent state forwarded to ``self.get_feature``.

    Returns:
        Tuple ``(actor_loss, critic_loss, entropy)``.
    """
    s, visual_s, a, dc_r = memories
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            feat = self.get_feature(s, visual_s, cell_state=cell_state)
            v = self.critic_net(feat)
            td_error = dc_r - v
            critic_loss = tf.reduce_mean(tf.square(td_error)) + crsty_loss
        critic_grads = tape.gradient(critic_loss, self.critic_tv)
        self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))

        # NOTE: `feat` was produced under the first tape, so this second
        # tape only records the actor/critic heads, not the feature
        # extraction.
        with tf.GradientTape() as tape:
            if self.is_continuous:
                mu = self.actor_net(feat)
                log_act_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                entropy = gaussian_entropy(self.log_std)
            else:
                logits = self.actor_net(feat)
                logp_all = tf.nn.log_softmax(logits)
                log_act_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
            v = self.critic_net(feat)
            advantage = tf.stop_gradient(dc_r - v)
            actor_loss = -(tf.reduce_mean(log_act_prob * advantage)
                           + self.beta * entropy)
        # The original repeated this step in two identical
        # continuous/discrete branches; one code path is enough.
        actor_grads = tape.gradient(actor_loss, self.actor_tv)
        self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
        self.global_step.assign_add(1)
        return actor_loss, critic_loss, entropy
def _train(self, memories, isw, cell_state):
    """One option-critic style update that also trains an interest network.

    Four losses are built on a single persistent tape and each is applied
    to its own network/optimiser: the option-value loss (``q_net``), the
    intra-option policy loss (``self.actor_tv``), the termination loss
    (``termination_net``) and the interest loss (``interest_net``).

    Shape comments (``B`` batch, ``P`` options, ``A`` actions) are carried
    over from the original author and are not checked here.

    Args:
        memories: tuple ``(s, visual_s, a, r, s_, visual_s_, done,
            last_options, options)``; both option tensors are cast to int32.
        isw: weights multiplied into the squared TD error.
        cell_state: recurrent state forwarded to the representation nets.

    Returns:
        Tuple ``(td_error, summaries)`` where ``summaries`` maps tag names to
        scalar tensors.
    """
    s, visual_s, a, r, s_, visual_s_, done, last_options, options = memories
    last_options = tf.cast(last_options, tf.int32)
    options = tf.cast(options, tf.int32)
    with tf.device(self.device):
        with tf.GradientTape(persistent=True) as tape:
            feat, _ = self._representation_net(s, visual_s, cell_state=cell_state)
            feat_, _ = self._representation_target_net(
                s_, visual_s_, cell_state=cell_state)
            q = self.q_net.value_net(feat)  # [B, P]
            pi = self.intra_option_net.value_net(feat)  # [B, P, A]
            beta = self.termination_net.value_net(feat)  # [B, P]
            q_next = self.q_target_net.value_net(feat_)  # [B, P]
            beta_next = self.termination_net.value_net(feat_)  # [B, P]
            interests = self.interest_net.value_net(feat)  # [B, P]

            options_onehot = tf.one_hot(
                options, self.options_num, dtype=tf.float32)  # [B,] => [B, P]
            q_s = qu_eval = tf.reduce_sum(
                q * options_onehot, axis=-1, keepdims=True)  # [B, 1]
            beta_s_ = tf.reduce_sum(
                beta_next * options_onehot, axis=-1, keepdims=True)  # [B, 1]
            q_s_ = tf.reduce_sum(
                q_next * options_onehot, axis=-1, keepdims=True)  # [B, 1]

            if self.double_q:
                # Double-Q: the online net picks the best *next-state*
                # option, the target net evaluates it. The original took
                # the arg-max over the current-state features, which
                # indexes q_next with an option chosen for the wrong state.
                # feat_ is fed to the online head in the same way as
                # beta_next above.
                q_ = self.q_net.value_net(feat_)  # [B, P]
                max_a_idx = tf.one_hot(
                    tf.argmax(q_, axis=-1), self.options_num,
                    dtype=tf.float32)  # [B, P] => [B, ] => [B, P]
                q_s_max = tf.reduce_sum(
                    q_next * max_a_idx, axis=-1, keepdims=True)  # [B, 1]
            else:
                q_s_max = tf.reduce_max(q_next, axis=-1, keepdims=True)  # [B, 1]

            u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max  # [B, 1]
            qu_target = tf.stop_gradient(r + self.gamma * (1 - done) * u_target)
            td_error = qu_target - qu_eval  # gradient : q
            q_loss = tf.reduce_mean(tf.square(td_error) * isw)  # [B, 1] => 1

            if self.use_baseline:
                adv = tf.stop_gradient(qu_target - qu_eval)
            else:
                adv = tf.stop_gradient(qu_target)

            # Select the policy head of the option that was executed.
            options_onehot_expanded = tf.expand_dims(
                options_onehot, axis=-1)  # [B, P] => [B, P, 1]
            pi = tf.reduce_sum(
                pi * options_onehot_expanded, axis=1)  # [B, P, A] => [B, A]
            if self.is_continuous:
                log_std = tf.gather(self.log_std, options)
                mu = tf.math.tanh(pi)
                log_p = gaussian_likelihood_sum(a, mu, log_std)
                entropy = gaussian_entropy(log_std)
            else:
                pi = pi / self.boltzmann_temperature
                log_pi = tf.nn.log_softmax(pi, axis=-1)  # [B, A]
                entropy = -tf.reduce_sum(
                    tf.exp(log_pi) * log_pi, axis=1, keepdims=True)  # [B, 1]
                log_p = tf.reduce_sum(
                    a * log_pi, axis=-1, keepdims=True)  # [B, 1]
            pi_loss = tf.reduce_mean(
                -(log_p * adv + self.ent_coff * entropy)
            )  # [B, 1] * [B, 1] => [B, 1] => 1

            last_options_onehot = tf.one_hot(
                last_options, self.options_num,
                dtype=tf.float32)  # [B,] => [B, P]
            beta_s = tf.reduce_sum(
                beta * last_options_onehot, axis=-1, keepdims=True)  # [B, 1]

            # Option distribution weighted by the interest values; q is
            # detached so it acts as a constant inside pi_op.
            pi_op = tf.nn.softmax(
                interests * tf.stop_gradient(q))  # [B, P] or tf.nn.softmax(q)
            interest_loss = -tf.reduce_mean(
                beta_s
                * tf.reduce_sum(pi_op * options_onehot, axis=-1, keepdims=True)
                * q_s)  # [B, 1] => 1

            v_s = tf.reduce_sum(
                q * pi_op, axis=-1, keepdims=True)  # [B, P] * [B, P] => [B, 1]
            beta_loss = beta_s * tf.stop_gradient(q_s - v_s)  # [B, 1]
            if self.terminal_mask:
                beta_loss *= (1 - done)
            beta_loss = tf.reduce_mean(beta_loss)  # [B, 1] => 1

        q_vars = self.q_net.trainable_variables
        termination_vars = self.termination_net.trainable_variables
        interest_vars = self.interest_net.trainable_variables
        # All gradients are taken before any optimiser step, as in the
        # original.
        q_grads = tape.gradient(q_loss, q_vars)
        intra_option_grads = tape.gradient(pi_loss, self.actor_tv)
        termination_grads = tape.gradient(beta_loss, termination_vars)
        interest_grads = tape.gradient(interest_loss, interest_vars)
        # Drop the persistent tape so its recorded tensors can be freed.
        del tape

        self.q_optimizer.apply_gradients(zip(q_grads, q_vars))
        self.intra_option_optimizer.apply_gradients(
            zip(intra_option_grads, self.actor_tv))
        self.termination_optimizer.apply_gradients(
            zip(termination_grads, termination_vars))
        self.interest_optimizer.apply_gradients(
            zip(interest_grads, interest_vars))
        self.global_step.assign_add(1)
        return td_error, {
            'LOSS/q_loss': tf.reduce_mean(q_loss),
            'LOSS/pi_loss': tf.reduce_mean(pi_loss),
            'LOSS/beta_loss': tf.reduce_mean(beta_loss),
            'LOSS/interest_loss': tf.reduce_mean(interest_loss),
            'Statistics/q_option_max': tf.reduce_max(q_s),
            'Statistics/q_option_min': tf.reduce_min(q_s),
            'Statistics/q_option_mean': tf.reduce_mean(q_s),
        }
def train_continuous(self, BATCH, isw, cell_state):
    """Single-tape update of actor, twin Q critics, V critic and alpha.

    Args:
        BATCH: batch object; this method reads ``obs``, ``obs_``,
            ``action``, ``reward`` and ``done`` from it.
        isw: weights multiplied into every squared TD error.
        cell_state: recurrent state forwarded to the networks.

    Returns:
        Tuple ``(td_error, summaries)`` where ``td_error`` is the mean of
        the two Q TD errors and ``summaries`` maps tag names to tensors.
    """
    with tf.device(self.device):
        # Persistent: up to three gradient queries are made below.
        with tf.GradientTape(persistent=True) as tape:
            feat, _ = self._representation_net(BATCH.obs, cell_state=cell_state)
            v = self.v_net.value_net(feat)
            v_target, _ = self.v_target_net(BATCH.obs_, cell_state=cell_state)
            if not self.is_continuous:
                log_probs = tf.nn.log_softmax(self.actor_net.value_net(feat))
                noise = tf.cast(
                    self.gumbel_dist.sample(BATCH.action.shape), dtype=tf.float32)
                soft_pi = tf.nn.softmax((log_probs + noise) / self.discrete_tau)
                hard_pi = tf.one_hot(tf.argmax(soft_pi, axis=-1), self.a_dim)
                # Forward value is the one-hot sample; gradients flow
                # through the soft sample only.
                pi = tf.stop_gradient(hard_pi - soft_pi) + soft_pi
                log_pi = tf.reduce_sum(
                    tf.multiply(log_probs, pi), axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(log_probs) * log_probs, axis=1, keepdims=True))
            else:
                mu, log_std = self.actor_net.value_net(feat)
                pi, log_pi = squash_rsample(mu, log_std)
                entropy = gaussian_entropy(log_std)

            q1, q2 = self.q_net.get_value(feat, BATCH.action)
            q1_pi, q2_pi = self.q_net.get_value(feat, pi)
            dc_r = tf.stop_gradient(
                BATCH.reward + self.gamma * v_target * (1 - BATCH.done))
            v_from_q_stop = tf.stop_gradient(
                tf.minimum(q1_pi, q2_pi) - self.alpha * log_pi)

            td_v = v - v_from_q_stop
            td_error1 = q1 - dc_r
            td_error2 = q2 - dc_r
            q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
            q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
            v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
            critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss_stop
            actor_loss = -tf.reduce_mean(q1_pi - self.alpha * log_pi)
            if self.auto_adaption:
                alpha_loss = -tf.reduce_mean(
                    self.alpha * tf.stop_gradient(log_pi + self.target_entropy))

        actor_vars = self.actor_net.trainable_variables
        self.optimizer_actor.apply_gradients(
            zip(tape.gradient(actor_loss, actor_vars), actor_vars))

        critic_vars = self.q_net.trainable_variables + self.v_net.trainable_variables
        self.optimizer_critic.apply_gradients(
            zip(tape.gradient(critic_loss, critic_vars), critic_vars))

        if self.auto_adaption:
            alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
            self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
        self.global_step.assign_add(1)

        summaries = {
            'LOSS/actor_loss': actor_loss,
            'LOSS/q1_loss': q1_loss,
            'LOSS/q2_loss': q2_loss,
            'LOSS/v_loss': v_loss_stop,
            'LOSS/critic_loss': critic_loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/entropy': entropy,
            'Statistics/q_min': tf.reduce_min(tf.minimum(q1, q2)),
            'Statistics/q_mean': tf.reduce_mean(tf.minimum(q1, q2)),
            'Statistics/q_max': tf.reduce_max(tf.maximum(q1, q2)),
            'Statistics/v_mean': tf.reduce_mean(v),
        }
        if self.auto_adaption:
            summaries['LOSS/alpha_loss'] = alpha_loss
        return (td_error1 + td_error2) / 2, summaries
def _train(self, memories, isw, cell_state):
    """One option-critic update of the Q, intra-option and termination nets.

    Three losses are built on a single persistent tape and each is applied
    to its own network/optimiser: the option-value loss (``q_net``), the
    intra-option policy loss (``self.actor_tv``) and the termination loss
    (``termination_net``).

    Shape comments (``B`` batch, ``P`` options, ``A`` actions) are carried
    over from the original author and are not checked here.

    Args:
        memories: tuple ``(s, visual_s, a, r, s_, visual_s_, done,
            last_options, options)``; both option tensors are cast to int32.
        isw: weights multiplied into the squared TD error.
        cell_state: recurrent state forwarded to the representation nets.

    Returns:
        Tuple ``(td_error, summaries)`` where ``summaries`` maps tag names to
        scalar tensors.
    """
    s, visual_s, a, r, s_, visual_s_, done, last_options, options = memories
    last_options = tf.cast(last_options, tf.int32)
    options = tf.cast(options, tf.int32)
    with tf.device(self.device):
        with tf.GradientTape(persistent=True) as tape:
            feat, _ = self._representation_net(s, visual_s, cell_state=cell_state)
            feat_, _ = self._representation_target_net(
                s_, visual_s_, cell_state=cell_state)
            q = self.q_net.value_net(feat)  # [B, P]
            pi = self.intra_option_net.value_net(feat)  # [B, P, A]
            beta = self.termination_net.value_net(feat)  # [B, P]
            q_next = self.q_target_net.value_net(feat_)  # [B, P]
            beta_next = self.termination_net.value_net(feat_)  # [B, P]

            options_onehot = tf.one_hot(
                options, self.options_num, dtype=tf.float32)  # [B,] => [B, P]
            q_s = qu_eval = tf.reduce_sum(
                q * options_onehot, axis=-1, keepdims=True)  # [B, 1]
            beta_s_ = tf.reduce_sum(
                beta_next * options_onehot, axis=-1, keepdims=True)  # [B, 1]
            q_s_ = tf.reduce_sum(
                q_next * options_onehot, axis=-1, keepdims=True)  # [B, 1]

            # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L94
            if self.double_q:
                # Double-Q: the online net picks the best *next-state*
                # option, the target net evaluates it. The original took
                # the arg-max over the current-state features, which
                # indexes q_next with an option chosen for the wrong state.
                # feat_ is fed to the online head in the same way as
                # beta_next above.
                q_ = self.q_net.value_net(feat_)  # [B, P]
                max_a_idx = tf.one_hot(
                    tf.argmax(q_, axis=-1), self.options_num,
                    dtype=tf.float32)  # [B, P] => [B, ] => [B, P]
                q_s_max = tf.reduce_sum(
                    q_next * max_a_idx, axis=-1, keepdims=True)  # [B, 1]
            else:
                q_s_max = tf.reduce_max(q_next, axis=-1, keepdims=True)  # [B, 1]

            u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max  # [B, 1]
            qu_target = tf.stop_gradient(r + self.gamma * (1 - done) * u_target)
            td_error = qu_target - qu_eval  # gradient : q
            q_loss = tf.reduce_mean(tf.square(td_error) * isw)  # [B, 1] => 1

            # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L130
            if self.use_baseline:
                adv = tf.stop_gradient(qu_target - qu_eval)
            else:
                adv = tf.stop_gradient(qu_target)

            # Select the policy head of the option that was executed.
            options_onehot_expanded = tf.expand_dims(
                options_onehot, axis=-1)  # [B, P] => [B, P, 1]
            pi = tf.reduce_sum(
                pi * options_onehot_expanded, axis=1)  # [B, P, A] => [B, A]
            if self.is_continuous:
                log_std = tf.gather(self.log_std, options)
                mu = tf.math.tanh(pi)
                log_p = gaussian_likelihood_sum(a, mu, log_std)
                entropy = gaussian_entropy(log_std)
            else:
                pi = pi / self.boltzmann_temperature
                log_pi = tf.nn.log_softmax(pi, axis=-1)  # [B, A]
                entropy = -tf.reduce_sum(
                    tf.exp(log_pi) * log_pi, axis=1, keepdims=True)  # [B, 1]
                log_p = tf.reduce_sum(
                    a * log_pi, axis=-1, keepdims=True)  # [B, 1]
            pi_loss = tf.reduce_mean(
                -(log_p * adv + self.ent_coff * entropy)
            )  # [B, 1] * [B, 1] => [B, 1] => 1

            last_options_onehot = tf.one_hot(
                last_options, self.options_num,
                dtype=tf.float32)  # [B,] => [B, P]
            beta_s = tf.reduce_sum(
                beta * last_options_onehot, axis=-1, keepdims=True)  # [B, 1]

            q_max = tf.reduce_max(q, axis=-1, keepdims=True)  # [B, 1]
            if self.use_eps_greedy:
                v_s = q_max - self.termination_regularizer  # [B, 1]
            else:
                v_s = (1 - beta_s) * q_s + beta_s * q_max  # [B, 1]
            # (q_s - v_s) is detached, so only beta_s carries gradient.
            beta_loss = beta_s * tf.stop_gradient(q_s - v_s)  # [B, 1]
            # https://github.com/lweitkamp/option-critic-pytorch/blob/0c57da7686f8903ed2d8dded3fae832ee9defd1a/option_critic.py#L238
            if self.terminal_mask:
                beta_loss *= (1 - done)
            beta_loss = tf.reduce_mean(beta_loss)  # [B, 1] => 1

        q_vars = self.q_net.trainable_variables
        termination_vars = self.termination_net.trainable_variables
        # All gradients are taken before any optimiser step, as in the
        # original.
        q_grads = tape.gradient(q_loss, q_vars)
        intra_option_grads = tape.gradient(pi_loss, self.actor_tv)
        termination_grads = tape.gradient(beta_loss, termination_vars)
        # Drop the persistent tape so its recorded tensors can be freed.
        del tape

        self.q_optimizer.apply_gradients(zip(q_grads, q_vars))
        self.intra_option_optimizer.apply_gradients(
            zip(intra_option_grads, self.actor_tv))
        self.termination_optimizer.apply_gradients(
            zip(termination_grads, termination_vars))
        self.global_step.assign_add(1)
        return td_error, {
            'LOSS/q_loss': tf.reduce_mean(q_loss),
            'LOSS/pi_loss': tf.reduce_mean(pi_loss),
            'LOSS/beta_loss': tf.reduce_mean(beta_loss),
            'Statistics/q_option_max': tf.reduce_max(q_s),
            'Statistics/q_option_min': tf.reduce_min(q_s),
            'Statistics/q_option_mean': tf.reduce_mean(q_s),
        }
def train(self, memories, isw, crsty_loss, cell_state):
    """Update twin critics, actor and (optionally) alpha with separate tapes.

    Args:
        memories: tuple ``(ss, vvss, a, r, done)``.
        isw: weights multiplied into the squared TD errors.
        crsty_loss: extra loss term added to the critic loss.
        cell_state: recurrent state forwarded to ``self.get_feature``.

    Returns:
        Tuple ``(td_error, summaries)`` where ``td_error`` is the mean of
        the two critics' TD errors and ``summaries`` maps tag names to
        tensors.
    """
    ss, vvss, a, r, done = memories
    batch_size = tf.shape(a)[0]
    with tf.device(self.device):
        # ---- critic ----
        with tf.GradientTape() as tape:
            feat, feat_ = self.get_feature(
                ss, vvss, cell_state=cell_state, s_and_s_=True)
            if self.is_continuous:
                target_mu, target_log_std = self.actor_net(feat_)
                # Use the configured bounds, as every other
                # clip_nn_log_std call in this method does; the original
                # omitted them here.
                target_log_std = clip_nn_log_std(
                    target_log_std, self.log_std_min, self.log_std_max)
                target_pi, target_log_pi = squash_rsample(
                    target_mu, target_log_std)
            else:
                target_logits = self.actor_net(feat_)
                target_cate_dist = tfp.distributions.Categorical(target_logits)
                target_action = target_cate_dist.sample()
                target_pi = tf.one_hot(target_action, self.a_dim, dtype=tf.float32)
                # Log-probability of the sampled action, kept with a
                # trailing axis like `log_pi` in the actor section.
                # Categorical.log_prob drops that axis, which presumably
                # broadcast the target to [B, B] against r and the critic
                # outputs -- TODO confirm those are [B, 1].
                target_log_pi = tf.reduce_sum(
                    tf.nn.log_softmax(target_logits) * target_pi,
                    axis=1, keepdims=True)
            q1, q2 = self.critic_net(feat, a)
            q1_target, q2_target = self.critic_target_net(feat_, target_pi)
            dc_r_q1 = tf.stop_gradient(
                r + self.gamma * (1 - done)
                * (q1_target - self.alpha * target_log_pi))
            dc_r_q2 = tf.stop_gradient(
                r + self.gamma * (1 - done)
                * (q2_target - self.alpha * target_log_pi))
            td_error1 = q1 - dc_r_q1
            td_error2 = q2 - dc_r_q2
            q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
            q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
            critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss
        critic_grads = tape.gradient(critic_loss, self.critic_tv)
        self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))

        # ---- actor ----
        with tf.GradientTape() as tape:
            if self.is_continuous:
                mu, log_std = self.actor_net(feat)
                log_std = clip_nn_log_std(
                    log_std, self.log_std_min, self.log_std_max)
                pi, log_pi = squash_rsample(mu, log_std)
                entropy = gaussian_entropy(log_std)
            else:
                logits = self.actor_net(feat)
                logp_all = tf.nn.log_softmax(logits)
                gumbel_noise = tf.cast(
                    self.gumbel_dist.sample([batch_size, self.a_dim]),
                    dtype=tf.float32)
                _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                # Forward value is the one-hot sample; gradients flow
                # through the soft sample only.
                _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                pi = _pi_diff + _pi
                log_pi = tf.reduce_sum(
                    tf.multiply(logp_all, pi), axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
            q_s_pi = self.critic_net.get_min(feat, pi)
            actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
        actor_grads = tape.gradient(actor_loss, self.actor_tv)
        self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))

        # ---- temperature ----
        if self.auto_adaption:
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(
                        log_std, self.log_std_min, self.log_std_max)
                    norm_dist = tfp.distributions.Normal(
                        loc=mu, scale=tf.exp(log_std))
                    log_pi = tf.reduce_sum(
                        norm_dist.log_prob(norm_dist.sample()), axis=-1)
                else:
                    logits = self.actor_net(feat)
                    cate_dist = tfp.distributions.Categorical(logits)
                    log_pi = cate_dist.log_prob(cate_dist.sample())
                # $J(\alpha)=\mathbb{E}_{\mathbf{a}_{t} \sim \pi_{t}}\left[-\alpha \log \pi_{t}\left(\mathbf{a}_{t} | \mathbf{s}_{t}\right)-\alpha \overline{\mathcal{H}}\right.$
                # \overline{\mathcal{H}} is negative
                alpha_loss = -tf.reduce_mean(
                    self.alpha * tf.stop_gradient(log_pi + self.target_entropy))
            alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
            self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])

        self.global_step.assign_add(1)
        summaries = {
            'LOSS/actor_loss': actor_loss,
            'LOSS/q1_loss': q1_loss,
            'LOSS/q2_loss': q2_loss,
            'LOSS/critic_loss': critic_loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/entropy': entropy,
            'Statistics/q_min': tf.reduce_min(tf.minimum(q1, q2)),
            'Statistics/q_mean': tf.reduce_mean(tf.minimum(q1, q2)),
            'Statistics/q_max': tf.reduce_max(tf.maximum(q1, q2)),
        }
        if self.auto_adaption:
            summaries['LOSS/alpha_loss'] = alpha_loss
        return (td_error1 + td_error2) / 2., summaries
def train_persistent(self, memories, isw, crsty_loss, cell_state):
    """Single-tape update of actor, twin Q critics, V critic and alpha.

    Every loss is built under one persistent gradient tape, which is then
    queried once per optimiser.

    Args:
        memories: tuple ``(ss, vvss, a, r, done)``.
        isw: weights multiplied into every squared TD error.
        crsty_loss: extra loss term added to the critic loss.
        cell_state: recurrent state forwarded to ``self.get_feature``.

    Returns:
        Tuple ``(td_error, summaries)`` where ``td_error`` is the mean of
        the two Q TD errors and ``summaries`` maps tag names to tensors.
    """
    ss, vvss, a, r, done = memories
    batch_size = tf.shape(a)[0]
    with tf.device(self.device):
        with tf.GradientTape(persistent=True) as tape:
            feat, feat_ = self.get_feature(
                ss, vvss, cell_state=cell_state, s_and_s_=True)
            if not self.is_continuous:
                log_probs = tf.nn.log_softmax(self.actor_net(feat))
                noise = tf.cast(
                    self.gumbel_dist.sample([batch_size, self.a_dim]),
                    dtype=tf.float32)
                soft_pi = tf.nn.softmax((log_probs + noise) / self.discrete_tau)
                hard_pi = tf.one_hot(tf.argmax(soft_pi, axis=-1), self.a_dim)
                # Forward value is the one-hot sample; gradients flow
                # through the soft sample only.
                pi = tf.stop_gradient(hard_pi - soft_pi) + soft_pi
                log_pi = tf.reduce_sum(
                    tf.multiply(log_probs, pi), axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(log_probs) * log_probs, axis=1, keepdims=True))
            else:
                mu, log_std = self.actor_net(feat)
                log_std = clip_nn_log_std(
                    log_std, self.log_std_min, self.log_std_max)
                pi, log_pi = squash_rsample(mu, log_std)
                entropy = gaussian_entropy(log_std)

            q1, q2 = self.q_net(feat, a)
            v = self.v_net(feat)
            q1_pi, q2_pi = self.q_net(feat, pi)
            v_target = self.v_target_net(feat_)
            dc_r = tf.stop_gradient(r + self.gamma * v_target * (1 - done))
            v_from_q_stop = tf.stop_gradient(
                tf.minimum(q1_pi, q2_pi) - self.alpha * log_pi)

            td_v = v - v_from_q_stop
            td_error1 = q1 - dc_r
            td_error2 = q2 - dc_r
            q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
            q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
            v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
            critic_loss = (0.5 * q1_loss + 0.5 * q2_loss
                           + 0.5 * v_loss_stop + crsty_loss)
            actor_loss = -tf.reduce_mean(q1_pi - self.alpha * log_pi)
            if self.auto_adaption:
                alpha_loss = -tf.reduce_mean(
                    self.alpha * tf.stop_gradient(log_pi + self.target_entropy))

        self.optimizer_actor.apply_gradients(
            zip(tape.gradient(actor_loss, self.actor_tv), self.actor_tv))
        self.optimizer_critic.apply_gradients(
            zip(tape.gradient(critic_loss, self.critic_tv), self.critic_tv))
        if self.auto_adaption:
            alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
            self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
        self.global_step.assign_add(1)

        summaries = {
            'LOSS/actor_loss': actor_loss,
            'LOSS/q1_loss': q1_loss,
            'LOSS/q2_loss': q2_loss,
            'LOSS/v_loss': v_loss_stop,
            'LOSS/critic_loss': critic_loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/entropy': entropy,
            'Statistics/q_min': tf.reduce_min(tf.minimum(q1, q2)),
            'Statistics/q_mean': tf.reduce_mean(tf.minimum(q1, q2)),
            'Statistics/q_max': tf.reduce_max(tf.maximum(q1, q2)),
            'Statistics/v_mean': tf.reduce_mean(v),
        }
        if self.auto_adaption:
            summaries['LOSS/alpha_loss'] = alpha_loss
        return (td_error1 + td_error2) / 2, summaries
def share(self, BATCH, cell_state, kl_coef):
    """Joint update of a shared network with Q, policy, termination and
    option heads.

    The total loss sums a clipped-ratio policy loss (with KL penalties), a
    clipped option-value loss, a clipped-ratio loss over the option head, a
    termination loss and two entropy bonuses, and is minimised over
    ``self.net_tv`` with a single optimiser.

    Shape comments (``B`` batch, ``P`` options, ``A`` actions) are carried
    over from the original author and are not checked here.

    Args:
        BATCH: batch object; this method reads ``obs``, ``action``,
            ``log_prob``, ``gae_adv``, ``value``, ``discounted_reward``,
            ``beta_advantage``, ``o_log_prob``, ``last_options``,
            ``options`` and, when ``self.terminal_mask`` is set, ``done``.
        cell_state: recurrent state forwarded to ``self.net``.
        kl_coef: coefficient of the linear KL penalty.

    Returns:
        Tuple ``(loss, pi_loss, q_loss, o_loss, beta_loss, entropy,
        o_entropy, kl)``.
    """
    last_options = tf.cast(BATCH.last_options, tf.int32)  # [B,]
    options = tf.cast(BATCH.options, tf.int32)
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            # The recurrent state returned by the network is not used.
            (q, pi, beta, o), _ = self.net(
                BATCH.obs, cell_state=cell_state)  # [B, P], [B, P, A], [B, P], [B, P]
            options_onehot = tf.one_hot(
                options, self.options_num, dtype=tf.float32)  # [B, P]
            options_onehot_expanded = tf.expand_dims(
                options_onehot, axis=-1)  # [B, P, 1]
            last_options_onehot = tf.one_hot(
                last_options, self.options_num,
                dtype=tf.float32)  # [B,] => [B, P]

            # Select the policy head / value of the executed option.
            pi = tf.reduce_sum(
                pi * options_onehot_expanded, axis=1)  # [B, P, A] => [B, A]
            value = tf.reduce_sum(
                q * options_onehot, axis=1, keepdims=True)  # [B, 1]

            if self.is_continuous:
                log_std = tf.gather(self.log_std, options)
                mu = pi  # [B, A]
                new_log_prob = gaussian_likelihood_sum(BATCH.action, mu, log_std)
                entropy = gaussian_entropy(log_std)
            else:
                logits = pi  # [B, A]
                logp_all = tf.nn.log_softmax(logits)
                new_log_prob = tf.reduce_sum(
                    BATCH.action * logp_all, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
            ratio = tf.exp(new_log_prob - BATCH.log_prob)

            # Sample estimate of the KL divergence; cheap to compute.
            if self.kl_reverse:
                kl = tf.reduce_mean(new_log_prob - BATCH.log_prob)
            else:
                kl = tf.reduce_mean(BATCH.log_prob - new_log_prob)
            surrogate = ratio * BATCH.gae_adv

            value_clip = BATCH.value + tf.clip_by_value(
                value - BATCH.value, -self.value_epsilon, self.value_epsilon)
            td_error = BATCH.discounted_reward - value
            td_error_clip = BATCH.discounted_reward - value_clip
            td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip))

            pi_loss = -tf.reduce_mean(
                tf.minimum(
                    surrogate,
                    tf.clip_by_value(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon)
                    * BATCH.gae_adv))
            kl_loss = kl_coef * kl
            extra_loss = 1000.0 * tf.square(tf.maximum(0., kl - self.kl_cutoff))
            pi_loss = pi_loss + kl_loss + extra_loss
            q_loss = 0.5 * tf.reduce_mean(td_square)

            beta_s = tf.reduce_sum(
                beta * last_options_onehot, axis=-1, keepdims=True)  # [B, 1]
            beta_loss = beta_s * BATCH.beta_advantage
            if self.terminal_mask:
                # The original referenced an undefined name `done` here and
                # multiplied after the mean. The mask is taken from the
                # batch (assumes BATCH exposes `done` -- TODO confirm) and
                # applied per sample before reducing.
                beta_loss *= (1 - BATCH.done)
            beta_loss = tf.reduce_mean(beta_loss)

            # `o` is used as per-option log-probabilities below (it is
            # exponentiated for the entropy) -- presumably a log-softmax
            # output of the network.
            o_log_prob = tf.reduce_sum(
                o * options_onehot, axis=-1, keepdims=True)  # [B, 1]
            o_ratio = tf.exp(o_log_prob - BATCH.o_log_prob)
            o_entropy = -tf.reduce_mean(
                tf.reduce_sum(tf.exp(o) * o, axis=1, keepdims=True))
            o_loss = -tf.reduce_mean(
                tf.minimum(
                    o_ratio * BATCH.gae_adv,
                    tf.clip_by_value(o_ratio, 1.0 - self.epsilon, 1.0 + self.epsilon)
                    * BATCH.gae_adv))

            loss = (pi_loss + 1.0 * q_loss + o_loss + beta_loss
                    - self.pi_beta * entropy - self.o_beta * o_entropy)
        loss_grads = tape.gradient(loss, self.net_tv)
        self.optimizer.apply_gradients(zip(loss_grads, self.net_tv))
        self.global_step.assign_add(1)
        return loss, pi_loss, q_loss, o_loss, beta_loss, entropy, o_entropy, kl
def train(self, memories, isw, cell_state):
    """Update critic, actor and (optionally) the temperature in one pass.

    All losses are recorded on a single persistent tape; gradients for the
    critic, the actor and, when ``self.auto_adaption`` is set,
    ``self.log_alpha`` are then taken from it one after another.

    Args:
        memories: tuple ``(ss, vvss, a, r, done, s_, visual_s_)``.
        isw: weights multiplied into the squared critic errors.
        cell_state: recurrent state forwarded to the representation and
            target networks.

    Returns:
        ``(mean of the two critic errors, dict of summary tensors)``.
    """
    obs_pair, visual_pair, action, reward, done, next_obs, next_visual = memories
    with tf.device(self.device):
        with tf.GradientTape(persistent=True) as tape:
            (feat, feat_), _ = self._representation_net(
                obs_pair, visual_pair, cell_state=cell_state, need_split=True)

            if self.is_continuous:
                # Current-state policy sample.
                mu, log_std = self.actor_net.value_net(feat)
                log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                pi, log_pi = tsallis_squash_rsample(mu, log_std, self.entropic_index)
                entropy = gaussian_entropy(log_std)
                # Next-state policy sample used for the bootstrap target.
                next_mu, next_log_std = self.actor_net.value_net(feat_)
                next_log_std = clip_nn_log_std(
                    next_log_std, self.log_std_min, self.log_std_max)
                target_pi, target_log_pi = tsallis_squash_rsample(
                    next_mu, next_log_std, self.entropic_index)
            else:
                logp_all = tf.nn.log_softmax(self.actor_net.value_net(feat))
                noise = tf.cast(self.gumbel_dist.sample(action.shape), dtype=tf.float32)
                soft_pi = tf.nn.softmax((logp_all + noise) / self.discrete_tau)
                hard_pi = tf.one_hot(tf.argmax(soft_pi, axis=-1), self.a_dim)
                # Forward value is the one-hot sample, gradient flows through soft_pi.
                pi = tf.stop_gradient(hard_pi - soft_pi) + soft_pi
                log_pi = tf.reduce_sum(logp_all * pi, axis=1, keepdims=True)
                per_sample_neg_entropy = tf.reduce_sum(
                    tf.exp(logp_all) * logp_all, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(per_sample_neg_entropy)

                next_logits = self.actor_net.value_net(feat_)
                next_dist = tfp.distributions.Categorical(
                    logits=tf.nn.log_softmax(next_logits))
                sampled_idx = next_dist.sample()
                target_log_pi = next_dist.log_prob(sampled_idx)
                target_pi = tf.one_hot(sampled_idx, self.a_dim, dtype=tf.float32)

            q1, q2 = self.critic_net.get_value(feat, action)
            q_s_pi = self.critic_net.get_min(feat, pi)
            q1_next, q2_next, _ = self.critic_target_net(
                next_obs, next_visual, target_pi, cell_state=cell_state)
            q_next = tf.minimum(q1_next, q2_next)
            soft_next = q_next - self.alpha * target_log_pi
            backup = tf.stop_gradient(reward + self.gamma * (1 - done) * soft_next)

            err1 = q1 - backup
            err2 = q2 - backup
            q1_loss = tf.reduce_mean(tf.square(err1) * isw)
            q2_loss = tf.reduce_mean(tf.square(err2) * isw)
            critic_loss = 0.5 * q1_loss + 0.5 * q2_loss
            actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
            if self.auto_adaption:
                alpha_loss = -tf.reduce_mean(
                    self.alpha * tf.stop_gradient(log_pi + self.target_entropy))

        critic_vars = self.critic_net.trainable_variables
        self.optimizer_critic.apply_gradients(
            zip(tape.gradient(critic_loss, critic_vars), critic_vars))

        actor_vars = self.actor_net.trainable_variables
        self.optimizer_actor.apply_gradients(
            zip(tape.gradient(actor_loss, actor_vars), actor_vars))

        if self.auto_adaption:
            alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
            self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])

        self.global_step.assign_add(1)

        q_low = tf.minimum(q1, q2)
        summaries = {
            'LOSS/actor_loss': actor_loss,
            'LOSS/q1_loss': q1_loss,
            'LOSS/q2_loss': q2_loss,
            'LOSS/critic_loss': critic_loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/entropy': entropy,
            'Statistics/q_min': tf.reduce_min(q_low),
            'Statistics/q_mean': tf.reduce_mean(q_low),
            'Statistics/q_max': tf.reduce_max(tf.maximum(q1, q2)),
        }
        if self.auto_adaption:
            summaries['LOSS/alpha_loss'] = alpha_loss
        return (err1 + err2) / 2, summaries
def train_share(self, BATCH, cell_state, kl_coef):
    """Run one gradient step on a network that shares actor and critic.

    ``self.net`` returns the policy outputs together with the value
    estimate. The actor loss is a clipped surrogate objective with an
    entropy bonus and optional KL penalties; the value loss is a (optionally
    clipped) squared error. Both are summed and the gradient is applied to
    ``self.net.trainable_variables``.

    Args:
        BATCH: batch object; the fields read here are ``obs``, ``action``,
            ``log_prob``, ``gae_adv``, ``discounted_reward`` and, when
            ``self.use_vclip`` is set, ``value``.
        cell_state: recurrent state forwarded to ``self.net``.
        kl_coef: weight of the KL penalty (used when ``self.use_kl_loss``).

    Returns:
        Tuple ``(actor_loss, value_loss, entropy, kl)``.
    """
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            output, cell_state = self.net(BATCH.obs, cell_state=cell_state)
            if self.is_continuous:
                mu, log_std, value = output
                new_log_prob = gaussian_likelihood_sum(
                    BATCH.action, mu, log_std)
                entropy = gaussian_entropy(log_std)
            else:
                logits, value = output
                logp_all = tf.nn.log_softmax(logits)
                new_log_prob = tf.reduce_sum(BATCH.action * logp_all, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))

            ratio = tf.exp(new_log_prob - BATCH.log_prob)
            surrogate = ratio * BATCH.gae_adv
            clipped_surrogate = tf.minimum(
                surrogate,
                tf.clip_by_value(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * BATCH.gae_adv)
            # ref: https://github.com/thu-ml/tianshou/blob/c97aa4065ee8464bd5897bb86f1f81abd8e2cff9/tianshou/policy/modelfree/ppo.py#L159
            if self.use_duel_clip:
                # The extra lower bound is only meaningful for negative
                # advantages. Applied to positive ones it replaces the
                # surrogate by a constant and silently kills their gradient,
                # so restrict it to the adv < 0 samples.
                dual_clipped = tf.maximum(
                    clipped_surrogate,
                    (1.0 + self.duel_epsilon) * BATCH.gae_adv)
                clipped_surrogate = tf.where(
                    BATCH.gae_adv < 0, dual_clipped, clipped_surrogate)
            actor_loss = -(tf.reduce_mean(clipped_surrogate) + self.ent_coef * entropy)

            # ref: https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L40
            # ref: https://github.com/hill-a/stable-baselines/blob/b3f414f4f2900403107357a2206f80868af16da3/stable_baselines/ppo2/ppo2.py#L185
            # A sample estimate for KL-divergence, easy to compute. The
            # difference is squared, so its sign (and therefore
            # self.kl_reverse) has no influence on the result.
            kl = .5 * tf.reduce_mean(
                tf.square(new_log_prob - BATCH.log_prob))

            td_error = BATCH.discounted_reward - value
            if self.use_vclip:
                # ref: https://github.com/llSourcell/OpenAI_Five_vs_Dota2_Explained/blob/c5def7e57aa70785c2394ea2eeb3e5f66ad59a53/train.py#L154
                # ref: https://github.com/hill-a/stable-baselines/blob/b3f414f4f2900403107357a2206f80868af16da3/stable_baselines/ppo2/ppo2.py#L172
                value_clip = BATCH.value + tf.clip_by_value(
                    value - BATCH.value, -self.value_epsilon, self.value_epsilon)
                td_error_clip = BATCH.discounted_reward - value_clip
                td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip))
            else:
                td_square = tf.square(td_error)

            if self.use_kl_loss:
                kl_loss = kl_coef * kl
                actor_loss += kl_loss
            if self.use_extra_loss:
                # Quadratic penalty that only activates above the cutoff.
                extra_loss = self.extra_coef * tf.square(
                    tf.maximum(0., kl - self.kl_cutoff))
                actor_loss += extra_loss

            value_loss = 0.5 * tf.reduce_mean(td_square)
            loss = actor_loss + self.vf_coef * value_loss
        loss_grads = tape.gradient(loss, self.net.trainable_variables)
        self.optimizer.apply_gradients(
            zip(loss_grads, self.net.trainable_variables))
        self.global_step.assign_add(1)
        return actor_loss, value_loss, entropy, kl
def train(self, BATCH, isw, cell_state, visual, visual_, pos):
    """Update critic, encoder (contrastive loss), actor and temperature.

    Steps, in order:
      1. critic loss and the contrastive ``curl_loss`` are recorded on one
         persistent tape and applied to ``self.critic_tv`` and to
         ``[self.curl_w] + self.encoder.trainable_variables``;
      2. the actor loss is recorded on a fresh tape and applied to
         ``self.actor_net.trainable_variables``;
      3. when ``self.auto_adaption`` is set, ``self.log_alpha`` is updated.

    Args:
        BATCH: batch object; the fields read here are ``obs``, ``obs_``,
            ``action``, ``reward`` and ``done``.
        isw: weights multiplied into the squared critic errors.
        cell_state: accepted for interface parity; not read by this method.
        visual: visual input encoded for the current step.
        visual_: visual input encoded for the next step.
        pos: visual input encoded by ``self.encoder_target`` and used as the
            matching key of ``visual`` in the contrastive loss.

    Returns:
        ``(mean of the two critic errors, dict of summary tensors)``.
    """
    with tf.device(self.device):
        with tf.GradientTape(persistent=True) as tape:
            vis_feat = self.encoder(visual)
            vis_feat_ = self.encoder(visual_)
            target_vis_feat_ = self.encoder_target(visual_)
            feat = tf.concat(
                [vis_feat, BATCH.obs.flatten_vector()], axis=-1)
            feat_ = tf.concat(
                [vis_feat_, BATCH.obs_.flatten_vector()], axis=-1)
            # NOTE(review): target_feat_ is computed but never consumed; the
            # target critic below is fed `feat_`. Possibly it was meant to
            # receive target_feat_ instead - confirm before rewiring.
            target_feat_ = tf.concat(
                [target_vis_feat_, BATCH.obs_.flatten_vector()], axis=-1)

            if self.is_continuous:
                target_mu, target_log_std = self.actor_net.value_net(feat_)
                target_pi, target_log_pi = squash_rsample(
                    target_mu, target_log_std)
            else:
                target_logits = self.actor_net.value_net(feat_)
                target_cate_dist = tfp.distributions.Categorical(
                    logits=tf.nn.log_softmax(target_logits))
                target_pi = target_cate_dist.sample()
                target_log_pi = target_cate_dist.log_prob(target_pi)
                target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)

            q1, q2 = self.critic_net.value_net(feat, BATCH.action)
            q1_target, q2_target = self.critic_target_net.value_net(
                feat_, target_pi)
            q_target = tf.minimum(q1_target, q2_target)
            dc_r = tf.stop_gradient(
                BATCH.reward + self.gamma * (1 - BATCH.done)
                * (q_target - self.alpha * target_log_pi))
            td_error1 = q1 - dc_r
            td_error2 = q2 - dc_r
            q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
            q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
            critic_loss = 0.5 * q1_loss + 0.5 * q2_loss

            # Bilinear similarity between every anchor and every key; the
            # matching key of row i sits in column i.
            z_a = vis_feat  # [B, N]
            z_out = self.encoder_target(pos)
            curl_logits = tf.matmul(
                z_a, tf.matmul(self.curl_w, tf.transpose(z_out, [1, 0])))
            curl_logits -= tf.reduce_max(curl_logits, axis=-1, keepdims=True)
            # These are unnormalised scores, so from_logits=True is required;
            # without it Keras treats them as probabilities and clips them.
            curl_loss = tf.reduce_mean(
                tf.keras.losses.sparse_categorical_crossentropy(
                    tf.range(self.batch_size), curl_logits, from_logits=True))

        critic_grads = tape.gradient(critic_loss, self.critic_tv)
        self.optimizer_critic.apply_gradients(
            zip(critic_grads, self.critic_tv))

        curl_vars = [self.curl_w] + self.encoder.trainable_variables
        curl_grads = tape.gradient(curl_loss, curl_vars)
        self.optimizer_curl.apply_gradients(zip(curl_grads, curl_vars))

        with tf.GradientTape() as tape:
            if self.is_continuous:
                mu, log_std = self.actor_net.value_net(feat)
                pi, log_pi = squash_rsample(mu, log_std)
                entropy = gaussian_entropy(log_std)
            else:
                logits = self.actor_net.value_net(feat)
                logp_all = tf.nn.log_softmax(logits)
                gumbel_noise = tf.cast(self.gumbel_dist.sample(
                    BATCH.action.shape), dtype=tf.float32)
                _pi = tf.nn.softmax(
                    (logp_all + gumbel_noise) / self.discrete_tau)
                _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                # Forward value is the one-hot sample, gradient flows via _pi.
                _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                pi = _pi_diff + _pi
                log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
            q_s_pi = self.critic_net.get_min(feat, pi)
            actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
        actor_grads = tape.gradient(actor_loss, self.actor_net.trainable_variables)
        self.optimizer_actor.apply_gradients(
            zip(actor_grads, self.actor_net.trainable_variables))

        if self.auto_adaption:
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net.value_net(feat)
                    norm_dist = tfp.distributions.Normal(
                        loc=mu, scale=tf.exp(log_std))
                    # TF2 spells the keyword `keepdims` (was `keep_dims`).
                    log_pi = tf.reduce_sum(norm_dist.log_prob(
                        norm_dist.sample()), axis=-1, keepdims=True)  # [B, 1]
                else:
                    logits = self.actor_net.value_net(feat)
                    # Previously bound to `norm_dist` but read as `cate_dist`,
                    # which raised NameError on the discrete path.
                    cate_dist = tfp.distributions.Categorical(
                        logits=tf.nn.log_softmax(logits))
                    log_pi = cate_dist.log_prob(cate_dist.sample())
                alpha_loss = -tf.reduce_mean(
                    self.alpha * tf.stop_gradient(log_pi + self.target_entropy))
            alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
            self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])

        self.global_step.assign_add(1)
        summaries = {
            'LOSS/actor_loss': actor_loss,
            'LOSS/q1_loss': q1_loss,
            'LOSS/q2_loss': q2_loss,
            'LOSS/critic_loss': critic_loss,
            'LOSS/curl_loss': curl_loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/entropy': entropy,
            'Statistics/q_min': tf.reduce_min(tf.minimum(q1, q2)),
            'Statistics/q_mean': tf.reduce_mean(tf.minimum(q1, q2)),
            'Statistics/q_max': tf.reduce_max(tf.maximum(q1, q2)),
        }
        if self.auto_adaption:
            summaries.update({'LOSS/alpha_loss': alpha_loss})
        return (td_error1 + td_error2) / 2., summaries
def train(self, memories, kl_coef):
    """Run one joint gradient step on the option network.

    A single forward pass of ``self.net`` yields per-option values ``q``,
    per-option policy outputs ``pi`` and termination outputs ``beta``. A
    clipped surrogate policy loss (plus KL penalties), a clipped value loss
    and a termination loss are summed and the gradient is applied to
    ``self.net_tv``.

    Args:
        memories: tuple ``(s, visual_s, a, dc_r, old_log_prob, advantage,
            old_value, beta_advantage, last_options, options, cell_state)``.
        kl_coef: weight of the KL penalty added to the policy loss.

    Returns:
        Tuple ``(loss, pi_loss, q_loss, beta_loss, entropy, kl)``.

    NOTE(review): the ``self.terminal_mask`` branch reads a name ``done``
    that is neither a parameter nor part of ``memories``; enabling the flag
    raises ``NameError``. Fixing it requires passing the done flags in.
    """
    s, visual_s, a, dc_r, old_log_prob, advantage, old_value, beta_advantage, last_options, options, cell_state = memories
    last_options = tf.reshape(tf.cast(last_options, tf.int32), (-1, ))  # [B, 1] => [B,]
    options = tf.reshape(tf.cast(options, tf.int32), (-1, ))
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            (q, pi, beta), cell_state = self.net(
                s, visual_s, cell_state=cell_state)  # [B, P], [B, P, A], [B, P]
            options_onehot = tf.one_hot(options, self.options_num, dtype=tf.float32)  # [B, P]
            options_onehot_expanded = tf.expand_dims(options_onehot, axis=-1)  # [B, P, 1]
            last_options_onehot = tf.one_hot(
                last_options, self.options_num, dtype=tf.float32)  # [B,] => [B, P]
            # Select the policy head / value of the option that was active.
            pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1)  # [B, P, A] => [B, A]
            value = tf.reduce_sum(q * options_onehot, axis=1, keepdims=True)  # [B, 1]
            if self.is_continuous:
                mu = pi  # [B, A]
                log_std = tf.gather(self.log_std, options)
                new_log_prob = gaussian_likelihood_sum(a, mu, log_std)
                entropy = gaussian_entropy(log_std)
            else:
                logits = pi  # [B, A]
                logp_all = tf.nn.log_softmax(logits)
                new_log_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
            ratio = tf.exp(new_log_prob - old_log_prob)
            if self.kl_reverse:
                kl = tf.reduce_mean(new_log_prob - old_log_prob)
            else:
                kl = tf.reduce_mean(
                    old_log_prob - new_log_prob
                )  # a sample estimate for KL-divergence, easy to compute
            surrogate = ratio * advantage
            value_clip = old_value + tf.clip_by_value(
                value - old_value, -self.value_epsilon, self.value_epsilon)
            td_error = dc_r - value
            td_error_clip = dc_r - value_clip
            # Pessimistic value loss: larger of clipped / unclipped error.
            td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip))
            pi_loss = -tf.reduce_mean(
                tf.minimum(
                    surrogate,
                    tf.clip_by_value(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * advantage))
            kl_loss = kl_coef * kl
            # Quadratic penalty that only activates once kl exceeds the cutoff.
            extra_loss = 1000.0 * tf.square(
                tf.maximum(0., kl - self.kl_cutoff))
            pi_loss = pi_loss + kl_loss + extra_loss
            q_loss = 0.5 * tf.reduce_mean(td_square)
            beta_s = tf.reduce_sum(beta * last_options_onehot, axis=-1, keepdims=True)  # [B, 1]
            beta_loss = tf.reduce_mean(beta_s * beta_advantage)
            if self.terminal_mask:
                # NOTE(review): `done` is undefined in this scope (see docstring).
                beta_loss *= (1 - done)
            loss = pi_loss + 1.0 * q_loss + beta_loss - self.pi_beta * entropy
        loss_grads = tape.gradient(loss, self.net_tv)
        self.optimizer.apply_gradients(zip(loss_grads, self.net_tv))
        self.global_step.assign_add(1)
        return loss, pi_loss, q_loss, beta_loss, entropy, kl
def train(self, memories, isw, crsty_loss, cell_state):
    """Update the critic, then the actor, on one batch.

    The critic is fitted to a one-step bootstrapped target and stepped
    first; the actor loss is then built from a fresh critic evaluation and a
    stop-gradient ratio between the new and the stored log-probabilities.

    Args:
        memories: tuple ``(ss, vvss, a, r, done, old_log_prob)``.
        isw: weights multiplied into the squared critic error.
        crsty_loss: extra loss term added to the critic loss.
        cell_state: recurrent state forwarded to ``self.get_feature``.

    Returns:
        ``(td_error, dict of summary tensors)``.
    """
    obs_pair, visual_pair, action, reward, done, old_log_prob = memories
    with tf.device(self.device):
        # ---- critic step ----
        with tf.GradientTape() as critic_tape:
            feat, feat_ = self.get_feature(
                obs_pair, visual_pair, cell_state=cell_state, s_and_s_=True)
            if self.is_continuous:
                next_action = self.actor_net(feat_)
            else:
                greedy_idx = tf.argmax(self.actor_net(feat_), axis=1)
                next_action = tf.one_hot(greedy_idx, self.a_dim, dtype=tf.float32)
            max_q_next = tf.stop_gradient(self.critic_net(feat_, next_action))
            q_pred = self.critic_net(feat, action)
            bootstrap = reward + self.gamma * (1 - done) * max_q_next
            td_error = q_pred - bootstrap
            critic_loss = tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss
        self.optimizer_critic.apply_gradients(
            zip(critic_tape.gradient(critic_loss, self.critic_tv), self.critic_tv))

        # ---- actor step ----
        with tf.GradientTape() as actor_tape:
            if self.is_continuous:
                mean = self.actor_net(feat)
                log_prob = gaussian_likelihood_sum(action, mean, self.log_std)
                entropy = gaussian_entropy(self.log_std)
            else:
                logp_all = tf.nn.log_softmax(self.actor_net(feat))
                log_prob = tf.reduce_sum(logp_all * action, axis=1, keepdims=True)
                neg_entropy = tf.reduce_sum(
                    tf.exp(logp_all) * logp_all, axis=1, keepdims=True)
                entropy = -tf.reduce_mean(neg_entropy)
            # Re-evaluated after the critic step above; treated as a constant.
            q_now = self.critic_net(feat, action)
            ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))
            q_value = tf.stop_gradient(q_now)
            actor_loss = -tf.reduce_mean(ratio * log_prob * q_value)
        self.optimizer_actor.apply_gradients(
            zip(actor_tape.gradient(actor_loss, self.actor_tv), self.actor_tv))

        self.global_step.assign_add(1)
        summaries = {
            'LOSS/actor_loss': actor_loss,
            'LOSS/critic_loss': critic_loss,
            'Statistics/q_max': tf.reduce_max(q_now),
            'Statistics/q_min': tf.reduce_min(q_now),
            'Statistics/q_mean': tf.reduce_mean(q_now),
            'Statistics/ratio': tf.reduce_mean(ratio),
            'Statistics/entropy': entropy,
        }
        return td_error, summaries