Example no. 1
    def __init__(self,
                 q_t,
                 q_tp1,
                 q_tp0,
                 importance_weights,
                 rewards,
                 done_mask,
                 twin_q_t,
                 twin_q_tp1,
                 actor_loss_coeff=0.1,
                 critic_loss_coeff=1.0,
                 gamma=0.99,
                 n_step=1,
                 use_huber=False,
                 huber_threshold=1.0,
                 twin_q=False,
                 policy_delay=1):

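        # q_t, twin_q_t and q_tp1 arrive with a trailing size-1 dimension;
        # squeeze it away so all per-sample quantities are shaped [batch_size]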
        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
        if twin_q:
            twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
            q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
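        # zero out the bootstrapped value at terminal states (done_mask == 1.0)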
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute the RHS of the Bellman equation (the n-step TD target)
        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

        # compute the error (potentially clipped)
        if twin_q:
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            twin_td_error = twin_q_t_selected - tf.stop_gradient(
                q_t_selected_target)
            self.td_error = td_error + twin_td_error
            if use_huber:
                errors = _huber_loss(td_error, huber_threshold) + _huber_loss(
                    twin_td_error, huber_threshold)
            else:
                errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
                    twin_td_error)
        else:
            self.td_error = (
                q_t_selected - tf.stop_gradient(q_t_selected_target))
            if use_huber:
                errors = _huber_loss(self.td_error, huber_threshold)
            else:
                errors = 0.5 * tf.square(self.td_error)

        self.critic_loss = critic_loss_coeff * tf.reduce_mean(
            importance_weights * errors)

        # delayed policy updates: the actor loss is non-zero only when
        # global_step % policy_delay == 0, so the policy net is updated once
        # for every `policy_delay` updates of the critic net
        global_step = tf.train.get_or_create_global_step()
        policy_delay_mask = tf.to_float(
            tf.equal(tf.mod(global_step, policy_delay), 0))
        self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
                           tf.reduce_mean(q_tp0))
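
For reference, the single-critic (twin_q=False) branch above can be written as a small self-contained NumPy sketch. The _huber_loss helper is referenced but not defined in the snippet; the definition below is the standard Huber loss and is an assumption, as are the function and argument names.

import numpy as np

def _huber_loss(x, delta=1.0):
    # standard elementwise Huber loss: quadratic near zero, linear in the tails
    return np.where(np.abs(x) <= delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))

def actor_critic_losses(q_t, q_tp1, q_tp0, rewards, done_mask,
                        importance_weights, gamma=0.99, n_step=1,
                        use_huber=False, huber_threshold=1.0):
    # n-step TD target: r + gamma^n * Q(s', pi(s')), with bootstrapping
    # zeroed at terminal states (done_mask == 1.0)
    target = rewards + gamma ** n_step * (1.0 - done_mask) * q_tp1
    td_error = q_t - target  # the TF code stops gradients through `target`
    errors = (_huber_loss(td_error, huber_threshold) if use_huber
              else 0.5 * np.square(td_error))
    critic_loss = np.mean(importance_weights * errors)
    actor_loss = -np.mean(q_tp0)  # maximize Q of the policy's own actions
    return critic_loss, actor_loss, td_error

# toy batch of three (already squeezed) transitions
crit, act, delta = actor_critic_losses(
    q_t=np.array([1.0, 0.5, 2.0]),
    q_tp1=np.array([1.2, 0.0, 1.8]),
    q_tp0=np.array([1.1, 0.4, 2.1]),
    rewards=np.array([0.0, 1.0, -1.0]),
    done_mask=np.array([0.0, 1.0, 0.0]),
    importance_weights=np.ones(3))
print(crit, act, delta)
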
Example no. 2
    def __init__(self,
                 q_t,
                 q_tp1,
                 q_tp0,
                 importance_weights,
                 rewards,
                 done_mask,
                 twin_q_t,
                 twin_q_tp1,
                 actor_loss_coeff=0.1,
                 critic_loss_coeff=1.0,
                 gamma=0.99,
                 n_step=1,
                 use_huber=False,
                 huber_threshold=1.0,
                 twin_q=False,
                 policy_delay=1):

        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
        if twin_q:
            twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
            q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute the RHS of the Bellman equation (the n-step TD target)
        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

        # compute the error (potentially clipped)
        if twin_q:
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            twin_td_error = twin_q_t_selected - tf.stop_gradient(
                q_t_selected_target)
            self.td_error = td_error + twin_td_error
            if use_huber:
                errors = _huber_loss(td_error, huber_threshold) + _huber_loss(
                    twin_td_error, huber_threshold)
            else:
                errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
                    twin_td_error)
        else:
            self.td_error = (q_t_selected -
                             tf.stop_gradient(q_t_selected_target))
            if use_huber:
                errors = _huber_loss(self.td_error, huber_threshold)
            else:
                errors = 0.5 * tf.square(self.td_error)

        self.critic_loss = critic_loss_coeff * tf.reduce_mean(
            importance_weights * errors)

        # delayed policy updates: the actor loss is non-zero only when
        # global_step % policy_delay == 0, so the policy net is updated once
        # for every `policy_delay` updates of the critic net
        global_step = tf.train.get_or_create_global_step()
        policy_delay_mask = tf.to_float(
            tf.equal(tf.mod(global_step, policy_delay), 0))
        self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
                           tf.reduce_mean(q_tp0))
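
The two TD3-style pieces used above, clipped double-Q bootstrapping (tf.minimum(q_tp1, twin_q_tp1)) and the delayed actor update gated by policy_delay_mask, behave as in this minimal NumPy sketch; the function names here are illustrative only and not part of the original code.

import numpy as np

def clipped_double_q_target(q_tp1, twin_q_tp1, rewards, done_mask,
                            gamma=0.99, n_step=1):
    # bootstrap from the smaller of the two target critics to curb
    # overestimation bias
    q_min = np.minimum(q_tp1, twin_q_tp1)
    return rewards + gamma ** n_step * (1.0 - done_mask) * q_min

def policy_delay_mask(global_step, policy_delay=2):
    # 1.0 on steps where the actor is updated, 0.0 otherwise; multiplying the
    # actor loss by this mask zeroes its gradient on the skipped steps
    return 1.0 if global_step % policy_delay == 0 else 0.0

print([policy_delay_mask(step, policy_delay=2) for step in range(6)])
# -> [1.0, 0.0, 1.0, 0.0, 1.0, 0.0]
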
Example no. 3
    def __init__(self,
                 q_t,
                 q_tp1,
                 q_tp0,
                 importance_weights,
                 rewards,
                 done_mask,
                 gamma=0.99,
                 n_step=1,
                 use_huber=False,
                 huber_threshold=1.0):

        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)

        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute the RHS of the Bellman equation (the n-step TD target)
        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        if use_huber:
            errors = _huber_loss(self.td_error, huber_threshold)
        else:
            errors = 0.5 * tf.square(self.td_error)

        self.critic_loss = tf.reduce_mean(importance_weights * errors)

        # policy gradient: maximize Q of the policy's actions by minimizing -Q
        self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)
        self.total_loss = self.actor_loss + self.critic_loss
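
A minimal sketch, assuming TensorFlow 1.x (or tf.compat.v1 under TF 2.x), of feeding a combined total_loss to a single optimizer; the variables and losses below are toy stand-ins rather than the tensors built above.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# toy stand-ins for the actor and critic losses
actor_w = tf.Variable(0.0, name="actor_w")
critic_w = tf.Variable(0.0, name="critic_w")
actor_loss = tf.square(actor_w + 1.0)
critic_loss = tf.square(critic_w - 2.0)
total_loss = actor_loss + critic_loss

train_op = tf.train.AdamOptimizer(1e-3).minimize(total_loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        sess.run(train_op)
    print(sess.run([actor_loss, critic_loss]))

Many DDPG-style implementations instead keep actor_loss and critic_loss separate and give each its own optimizer and variable list, which the per-loss attributes above also allow.
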
Example no. 4
    def _build_actor_critic_loss(self,
                                 q_t,
                                 q_tp1,
                                 q_t_det_policy,
                                 twin_q_t=None,
                                 twin_q_tp1=None):
        twin_q = self.config["twin_q"]
        gamma = self.config["gamma"]
        n_step = self.config["n_step"]
        use_huber = self.config["use_huber"]
        huber_threshold = self.config["huber_threshold"]

        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
        if twin_q:
            twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
            q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
        q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

        # compute the RHS of the Bellman equation (the n-step TD target)
        q_t_selected_target = tf.stop_gradient(self.rew_t + gamma**n_step *
                                               q_tp1_best_masked)

        # compute the error (potentially clipped)
        if twin_q:
            td_error = q_t_selected - q_t_selected_target
            twin_td_error = twin_q_t_selected - q_t_selected_target
            if use_huber:
                errors = _huber_loss(td_error, huber_threshold) \
                    + _huber_loss(twin_td_error, huber_threshold)
            else:
                errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
                    twin_td_error)
            # sum the per-critic TD errors only after the loss terms have been
            # computed, so each critic is penalized for its own error only
            td_error = td_error + twin_td_error
        else:
            td_error = q_t_selected - q_t_selected_target
            if use_huber:
                errors = _huber_loss(td_error, huber_threshold)
            else:
                errors = 0.5 * tf.square(td_error)

        critic_loss = tf.reduce_mean(self.importance_weights * errors)
        actor_loss = -tf.reduce_mean(q_t_det_policy)
        return critic_loss, actor_loss, td_error
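
The returned td_error is commonly fed back into a prioritized replay buffer to update sample priorities, which is also what the importance_weights factor suggests; that usage is an assumption here, and the sketch below only illustrates the usual proportional-prioritization formula.

import numpy as np

def new_priorities(td_error, eps=1e-6, alpha=0.6):
    # proportional prioritization: p_i = (|delta_i| + eps) ** alpha
    return (np.abs(td_error) + eps) ** alpha

print(new_priorities(np.array([0.1, -2.0, 0.0])))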