def __init__(self,
             q_t,
             q_tp1,
             q_tp0,
             importance_weights,
             rewards,
             done_mask,
             twin_q_t,
             twin_q_tp1,
             actor_loss_coeff=0.1,
             critic_loss_coeff=1.0,
             gamma=0.99,
             n_step=1,
             use_huber=False,
             huber_threshold=1.0,
             twin_q=False,
             policy_delay=1):

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    if twin_q:
        # twin_q_t is assumed to have the same rank as q_t.
        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        # Clipped double-Q: take the minimum of the two target Q estimates.
        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

    # Compute the RHS of the Bellman equation.
    q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

    # Compute the TD error (optionally clipped via the Huber loss).
    if twin_q:
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        twin_td_error = twin_q_t_selected - tf.stop_gradient(
            q_t_selected_target)
        self.td_error = td_error + twin_td_error
        if use_huber:
            errors = _huber_loss(td_error, huber_threshold) + _huber_loss(
                twin_td_error, huber_threshold)
        else:
            errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
                twin_td_error)
    else:
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        if use_huber:
            errors = _huber_loss(self.td_error, huber_threshold)
        else:
            errors = 0.5 * tf.square(self.td_error)

    self.critic_loss = critic_loss_coeff * tf.reduce_mean(
        importance_weights * errors)

    # Delayed policy updates (TD3-style): update the policy net once for
    # every `policy_delay` critic updates, by masking out the actor loss
    # on all other steps.
    global_step = tf.train.get_or_create_global_step()
    policy_delay_mask = tf.to_float(
        tf.equal(tf.mod(global_step, policy_delay), 0))
    self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
                       tf.reduce_mean(q_tp0))
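All of the loss variants below call a `_huber_loss` helper that is not defined in this excerpt. A minimal sketch consistent with its call sites (an element-wise Huber penalty, quadratic inside the threshold and linear outside) would be:

import tensorflow as tf

def _huber_loss(x, delta=1.0):
    """Element-wise Huber loss: 0.5 * x^2 for |x| < delta, linear beyond."""
    return tf.where(
        tf.abs(x) < delta,
        0.5 * tf.square(x),
        delta * (tf.abs(x) - 0.5 * delta))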
def __init__(self,
             q_t,
             q_tp1,
             q_tp0,
             importance_weights,
             rewards,
             done_mask,
             gamma=0.99,
             n_step=1,
             use_huber=False,
             huber_threshold=1.0):

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

    # Compute the RHS of the Bellman equation.
    q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

    # Compute the TD error (optionally clipped via the Huber loss).
    self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
    if use_huber:
        errors = _huber_loss(self.td_error, huber_threshold)
    else:
        errors = 0.5 * tf.square(self.td_error)
    self.critic_loss = tf.reduce_mean(importance_weights * errors)

    # Policy-gradient term: maximize Q at the policy's own action.
    self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)

    self.total_loss = self.actor_loss + self.critic_loss
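For reference, a hedged usage sketch of this simpler variant. The wrapping class name `ActorCriticLoss`, the placeholder shapes, and the tensor names are assumptions for illustration, not confirmed by this excerpt:

import tensorflow as tf

# Hypothetical batch inputs; shapes are illustrative only.
q_t = tf.placeholder(tf.float32, [None, 1])    # Q(s_t, a_t)
q_tp1 = tf.placeholder(tf.float32, [None, 1])  # target Q(s_{t+1}, pi(s_{t+1}))
q_tp0 = tf.placeholder(tf.float32, [None, 1])  # Q(s_t, pi(s_t))
importance_weights = tf.placeholder(tf.float32, [None])
rewards = tf.placeholder(tf.float32, [None])
done_mask = tf.placeholder(tf.float32, [None])

loss = ActorCriticLoss(q_t, q_tp1, q_tp0, importance_weights,
                       rewards, done_mask, gamma=0.99, n_step=1)
# loss.total_loss is a scalar tensor; loss.actor_loss / loss.critic_loss
# can instead be minimized by separate optimizers over the actor and
# critic variables respectively.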
def _build_actor_critic_loss(self,
                             q_t,
                             q_tp1,
                             q_t_det_policy,
                             twin_q_t=None,
                             twin_q_tp1=None):
    twin_q = self.config["twin_q"]
    gamma = self.config["gamma"]
    n_step = self.config["n_step"]
    use_huber = self.config["use_huber"]
    huber_threshold = self.config["huber_threshold"]

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    if twin_q:
        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        # Clipped double-Q: take the minimum of the two target Q estimates.
        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

    # Compute the RHS of the Bellman equation.
    q_t_selected_target = tf.stop_gradient(
        self.rew_t + gamma**n_step * q_tp1_best_masked)

    # Compute the TD error (optionally clipped via the Huber loss).
    if twin_q:
        td_error = q_t_selected - q_t_selected_target
        twin_td_error = twin_q_t_selected - q_t_selected_target
        if use_huber:
            errors = (_huber_loss(td_error, huber_threshold) +
                      _huber_loss(twin_td_error, huber_threshold))
        else:
            errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
                twin_td_error)
        # Combine the per-critic TD errors only after computing `errors`;
        # overwriting td_error first would double-count the first critic's
        # error in the loss.
        td_error = td_error + twin_td_error
    else:
        td_error = q_t_selected - q_t_selected_target
        if use_huber:
            errors = _huber_loss(td_error, huber_threshold)
        else:
            errors = 0.5 * tf.square(td_error)

    critic_loss = tf.reduce_mean(self.importance_weights * errors)
    actor_loss = -tf.reduce_mean(q_t_det_policy)
    return critic_loss, actor_loss, td_error
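In every variant the target uses a `gamma**n_step` factor, which only yields the correct n-step return if `rewards` (or `self.rew_t`) has already been folded into the discounted sum of the intermediate rewards during sample postprocessing; that preprocessing is an assumption, not shown here. A small worked example of the target computation with invented numbers:

gamma, n_step = 0.99, 3
# Assumed pre-summed n-step return: r_t + gamma*r_{t+1} + gamma^2*r_{t+2}.
r0, r1, r2 = 1.0, 0.5, 0.25
rewards = r0 + gamma * r1 + gamma**2 * r2   # = 1.740025
q_tp1_best_masked = 2.0                     # (1.0 - done) * Q'(s_{t+n}, a')
q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
# = 1.740025 + 0.970299 * 2.0 = 3.680623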