Example #1
    def _setup_deterministic_optimizer(self, action, scope=None):
        """Create the loss and optimizer of a deterministic policy."""
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up optimizer')
            print_params_shape(scope_name, "policy")

        # Choose the loss function.
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        # Define the loss function.
        self.loss = loss_fn(action, self.policy)

        # Create an optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

        # Create the optimizer operation.
        self.optimizer = optimizer.minimize(
            loss=self.loss, var_list=get_trainable_vars(scope_name))
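
For reference, a minimal NumPy sketch (independent of the class above, with made-up numbers and the Huber loss's default delta=1.0 assumed) of what the two loss choices behind `use_huber` reduce to:

    import numpy as np

    def mse_loss(labels, preds):
        """Mean squared error, used when `use_huber` is False."""
        return np.mean((labels - preds) ** 2)

    def huber_loss(labels, preds, delta=1.0):
        """Huber loss: quadratic for small errors, linear for large ones."""
        err = np.abs(labels - preds)
        quadratic = np.minimum(err, delta)
        linear = err - quadratic
        return np.mean(0.5 * quadratic ** 2 + delta * linear)

    labels = np.zeros(3)
    preds = np.array([0.5, 2.0, -3.0])
    print(mse_loss(labels, preds))    # ~4.42, dominated by the outliers
    print(huber_loss(labels, preds))  # ~1.38, outliers penalized only linearly

The Huber loss penalizes large regression errors only linearly, which makes the update less sensitive to outlier targets.
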
Example #2
    def _setup_critic_optimizer(self, scope):
        """Create minimization operation for critic Q-function.

        Create a `tf.optimizer.minimize` operation for updating critic
        Q-function with gradient descent.

        See Equations (5, 6) in [1], for further information of the Q-function
        update rule.
        """
        scope_name = 'model/value_fns'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up critic optimizer')
            for name in ['qf1', 'qf2', 'vf']:
                scope_i = '{}/{}'.format(scope_name, name)
                print_params_shape(scope_i, name)

        # Take the min of the two Q-Values (Double-Q Learning)
        min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

        # Target for Q value regression
        q_backup = tf.stop_gradient(
            self.rew_ph +
            (1 - self.terminals1) * self.gamma * self.value_target)

        # choose the loss function
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        # Compute Q-Function loss
        qf1_loss = loss_fn(q_backup, self.qf1)
        qf2_loss = loss_fn(q_backup, self.qf2)

        # Target for value fn regression
        # We update the vf towards the min of two Q-functions in order to
        # reduce overestimation bias from function approximation error.
        v_backup = tf.stop_gradient(min_qf_pi - self.alpha * self.logp_pi)
        value_loss = loss_fn(self.value_fn, v_backup)

        self.critic_loss = (qf1_loss, qf2_loss, value_loss)

        # Combine the loss functions for the optimizer.
        critic_loss = qf1_loss + qf2_loss + value_loss

        # Critic train op
        critic_optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)
        self.critic_optimizer = critic_optimizer.minimize(
            critic_loss,
            var_list=get_trainable_vars(scope_name))
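
As a rough numeric illustration of the two stop-gradient targets built above, here is a standalone NumPy sketch with made-up batch values (the names mirror the tensors used in the method, but nothing here touches the class):

    import numpy as np

    gamma, alpha = 0.99, 0.2              # discount and (fixed) temperature
    rew = np.array([1.0, 0.5])            # self.rew_ph
    done = np.array([0.0, 1.0])           # self.terminals1
    value_target = np.array([2.0, 3.0])   # target value network V'(s')
    qf1_pi = np.array([1.5, 2.5])         # Q1(s, pi(s))
    qf2_pi = np.array([1.7, 2.2])         # Q2(s, pi(s))
    logp_pi = np.array([-1.0, -0.5])      # log pi(a|s)

    # Target for the Q-function regression: r + (1 - done) * gamma * V'(s')
    q_backup = rew + (1.0 - done) * gamma * value_target

    # Target for the value-function regression: min(Q1, Q2) - alpha * log pi
    v_backup = np.minimum(qf1_pi, qf2_pi) - alpha * logp_pi

    print(q_backup)  # [2.98, 0.5] -- terminal transitions bootstrap nothing
    print(v_backup)  # [1.7, 2.3]
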
Example #3
    def _setup_actor_optimizer(self, scope):
        """Create the actor loss, gradient, and optimizer."""
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up actor optimizer')
            print_params_shape(scope_name, "actor")

        # compute the actor loss
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0])

        # create an optimizer object
        optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.actor_optimizer = optimizer.minimize(
            self.actor_loss, var_list=get_trainable_vars(scope_name))
Example #4
    def _setup_stochastic_optimizer(self, scope):
        """Create the loss and optimizer of a stochastic policy."""
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up optimizer')
            print_params_shape(scope_name, "policy")

        # Define the loss function.
        self.loss = -tf.reduce_mean(self.logp_ac)

        # Create an optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

        # Create the optimizer operation.
        self.optimizer = optimizer.minimize(
            loss=self.loss, var_list=get_trainable_vars(scope_name))
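
The loss above is just the negative mean log-likelihood of the sampled actions. A minimal NumPy sketch, assuming purely for illustration that the policy is a diagonal Gaussian with mean `mu` and log standard deviation `logstd` (the actual distribution behind `logp_ac` is defined elsewhere in the class):

    import numpy as np

    def gaussian_logp(actions, mu, logstd):
        """Log-density of actions under a diagonal Gaussian policy."""
        std = np.exp(logstd)
        return np.sum(
            -0.5 * ((actions - mu) / std) ** 2
            - logstd
            - 0.5 * np.log(2.0 * np.pi),
            axis=-1)

    actions = np.array([[0.1, -0.2], [0.4, 0.0]])
    mu = np.array([[0.0, 0.0], [0.5, 0.1]])
    logstd = np.zeros((2, 2))

    logp_ac = gaussian_logp(actions, mu, logstd)
    loss = -np.mean(logp_ac)  # same form as `self.loss` above
    print(loss)
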
Example #5
    def _setup_actor_optimizer(self, scope):
        """Create minimization operations for policy and entropy.

        Creates a `tf.optimizer.minimize` operations for updating policy and
        entropy with gradient descent.

        See Section 4.2 in [1], for further information of the policy update,
        and Section 5 in [1] for further information of the entropy update.
        """
        scope_name = 'model/pi/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up actor and alpha optimizers')
            print_params_shape(scope_name, "actor")

        # Take the min of the two Q-Values (Double-Q Learning)
        min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

        # Compute the entropy temperature loss.
        self.alpha_loss = -tf.reduce_mean(
            self.log_alpha *
            tf.stop_gradient(self.logp_pi + self.target_entropy))

        alpha_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.alpha_optimizer = alpha_optimizer.minimize(
            self.alpha_loss, var_list=self.log_alpha)

        # Compute the policy loss
        self.actor_loss = tf.reduce_mean(self.alpha * self.logp_pi - min_qf_pi)

        # Add a regularization penalty.
        self.actor_loss += self._l2_loss(self.l2_penalty, scope_name)

        # Policy train op (has to be separate from value train op, because
        # min_qf_pi appears in policy_loss)
        actor_optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)

        self.actor_optimizer = actor_optimizer.minimize(
            self.actor_loss, var_list=get_trainable_vars(scope_name))
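
A small NumPy sketch (made-up numbers) of the temperature update implied by `alpha_loss` above: when the policy entropy, roughly `-logp_pi`, falls below `target_entropy`, the gradient with respect to `log_alpha` is negative, so gradient descent increases `log_alpha` and the entropy term in the actor loss gets a larger weight:

    import numpy as np

    target_entropy = -2.0           # typically -dim(A) for continuous actions
    logp_pi = np.array([3.0, 2.5])  # entropy (-logp_pi) is below the target

    # d(alpha_loss)/d(log_alpha) = -mean(logp_pi + target_entropy)
    grad = -np.mean(logp_pi + target_entropy)
    print(grad)  # -0.75 < 0: gradient descent raises log_alpha, so the
                 # entropy bonus in the actor loss is weighted more heavily
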
Example #6
    def _setup_critic_optimizer(self, critic_target, scope):
        """Create the critic loss, gradient, and optimizer."""
        if self.verbose >= 2:
            print('setting up critic optimizer')

        # compute the target critic term
        with tf.compat.v1.variable_scope("loss", reuse=False):
            q_obs1 = tf.minimum(critic_target[0], critic_target[1])
            target_q = tf.stop_gradient(self.rew_ph + (1. - self.terminals1) *
                                        self.gamma * q_obs1)

            tf.compat.v1.summary.scalar('critic_target',
                                        tf.reduce_mean(target_q))

        # choose the loss function
        if self.use_huber:
            loss_fn = tf.compat.v1.losses.huber_loss
        else:
            loss_fn = tf.compat.v1.losses.mean_squared_error

        self.critic_loss = [loss_fn(q, target_q) for q in self.critic_tf]

        self.critic_optimizer = []

        for i, critic_loss in enumerate(self.critic_loss):
            scope_name = 'model/qf_{}/'.format(i)
            if scope is not None:
                scope_name = scope + '/' + scope_name

            if self.verbose >= 2:
                print_params_shape(scope_name, "critic {}".format(i))

            # create an optimizer object
            optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)

            # create the optimizer operation
            self.critic_optimizer.append(
                optimizer.minimize(loss=critic_loss,
                                   var_list=get_trainable_vars(scope_name)))
Example #7
    def _setup_optimizers(self, scope):
        """Create the actor and critic optimizers."""
        scope_name = 'model/'
        old_scope_name = "oldpi/"
        if scope is not None:
            scope_name = scope + '/' + scope_name
            old_scope_name = scope + '/' + old_scope_name

        if self.verbose >= 2:
            print('setting up actor optimizer')
            print_params_shape("{}pi/".format(scope_name), "actor")
            print('setting up critic optimizer')
            print_params_shape("{}vf/".format(scope_name), "critic")

        # =================================================================== #
        # Create the policy loss and optimizers.                              #
        # =================================================================== #

        with tf.compat.v1.variable_scope("loss", reuse=False):
            # Compute the KL divergence.
            kloldnew = tf.reduce_sum(
                self.pi_logstd - self.old_pi_logstd +
                (tf.square(self.old_pi_std) +
                 tf.square(self.old_pi_mean - self.pi_mean)) /
                (2.0 * tf.square(self.pi_std)) - 0.5,
                axis=-1)
            meankl = tf.reduce_mean(kloldnew)

            # Compute the entropy bonus.
            entropy = tf.reduce_sum(self.pi_logstd +
                                    .5 * np.log(2.0 * np.pi * np.e),
                                    axis=-1)
            meanent = tf.reduce_mean(entropy)
            entbonus = self.ent_coef * meanent

            # advantage * pnew / pold
            ratio = tf.exp(
                self.logp(self.action_ph, old=False) -
                self.logp(self.action_ph, old=True))
            surrgain = tf.reduce_mean(ratio * self.advs_ph)

            optimgain = surrgain + entbonus
            self.losses = [optimgain, meankl, entbonus, surrgain, meanent]

            all_var_list = get_trainable_vars(scope_name)
            var_list = [
                v for v in all_var_list
                if "/vf" not in v.name and "/q/" not in v.name
            ]
            vf_var_list = [
                v for v in all_var_list
                if "/pi" not in v.name and "/logstd" not in v.name
            ]

            self.get_flat = GetFlat(var_list, sess=self.sess)
            self.set_from_flat = SetFromFlat(var_list, sess=self.sess)

            klgrads = tf.gradients(meankl, var_list)
            shapes = [var.get_shape().as_list() for var in var_list]
            start = 0
            tangents = []
            for shape in shapes:
                var_size = int(np.prod(shape))
                tangents.append(
                    tf.reshape(self.flat_tangent[start:start + var_size],
                               shape))
                start += var_size
            gvp = tf.add_n([
                tf.reduce_sum(grad * tangent)
                for (grad, tangent) in zip(klgrads, tangents)
            ])
            # Fisher vector products
            self.fvp = flatgrad(gvp, var_list)

        # =================================================================== #
        # Update the old model to match the new one.                          #
        # =================================================================== #

        self.assign_old_eq_new = tf.group(*[
            tf.compat.v1.assign(oldv, newv) for (oldv, newv) in zip(
                get_globals_vars(old_scope_name), get_globals_vars(scope_name))
        ])

        # =================================================================== #
        # Create the value function optimizer.                                #
        # =================================================================== #

        vferr = tf.reduce_mean(tf.square(self.value_flat - self.ret_ph))
        optimizer = tf.compat.v1.train.AdamOptimizer(self.vf_stepsize)
        self.vf_optimizer = optimizer.minimize(
            vferr,
            var_list=vf_var_list,
        )

        # Initialize the model parameters and optimizers.
        with self.sess.as_default():
            self.sess.run(tf.compat.v1.global_variables_initializer())

        th_init = self.get_flat()
        self.set_from_flat(th_init)

        self.grad = flatgrad(optimgain, var_list)
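
The tangent-reshaping loop above splits one flat vector back into per-variable shapes, i.e. the inverse of what `GetFlat` provides. A minimal NumPy sketch of that flatten/unflatten round trip, independent of the helpers used in the method:

    import numpy as np

    # Pretend these are the trainable variables of the policy.
    params = [np.ones((2, 3)), np.arange(4.0)]
    shapes = [p.shape for p in params]

    # "GetFlat": concatenate every variable into one flat vector.
    flat = np.concatenate([p.reshape(-1) for p in params])

    # "SetFromFlat" / the tangent loop: cut the flat vector back up.
    start, pieces = 0, []
    for shape in shapes:
        size = int(np.prod(shape))
        pieces.append(flat[start:start + size].reshape(shape))
        start += size

    assert all(np.array_equal(a, b) for a, b in zip(params, pieces))
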
Example #8
    def _setup_optimizers(self, scope):
        """Create the actor and critic optimizers."""
        scope_name = 'model/'
        if scope is not None:
            scope_name = scope + '/' + scope_name

        if self.verbose >= 2:
            print('setting up actor optimizer')
            print_params_shape("{}pi/".format(scope_name), "actor")
            print('setting up critic optimizer')
            print_params_shape("{}vf/".format(scope_name), "critic")

        neglogpac = self._neglogp(self.action_ph)
        self.entropy = tf.reduce_sum(tf.reshape(self.pi_logstd, [-1]) +
                                     .5 * np.log(2.0 * np.pi * np.e),
                                     axis=-1)

        # Value function clipping: not present in the original PPO
        if self.cliprange_vf is None:
            # Default behavior (legacy from OpenAI baselines):
            # use the same clipping as for the policy
            self.cliprange_vf = self.cliprange

        if self.cliprange_vf < 0:
            # Original PPO implementation: no value function clipping.
            vpred_clipped = self.value_flat
        else:
            # Clip the difference between the old and new value
            # NOTE: this depends on the reward scaling
            vpred_clipped = self.old_vpred_ph + tf.clip_by_value(
                self.value_flat - self.old_vpred_ph, -self.cliprange_vf,
                self.cliprange_vf)

        vf_losses1 = tf.square(self.value_flat - self.rew_ph)
        vf_losses2 = tf.square(vpred_clipped - self.rew_ph)
        self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
        pg_losses = -self.advs_ph * ratio
        pg_losses2 = -self.advs_ph * tf.clip_by_value(
            ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
        self.pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        self.approxkl = .5 * tf.reduce_mean(
            tf.square(neglogpac - self.old_neglog_pac_ph))
        self.clipfrac = tf.reduce_mean(
            tf.cast(tf.greater(tf.abs(ratio - 1.0), self.cliprange),
                    tf.float32))
        self.loss = self.pg_loss - self.entropy * self.ent_coef \
            + self.vf_loss * self.vf_coef

        # Compute the gradients of the loss.
        var_list = get_trainable_vars(scope_name)
        grads = tf.gradients(self.loss, var_list)

        # Perform gradient clipping if requested.
        if self.max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads,
                                                       self.max_grad_norm)
        grads = list(zip(grads, var_list))

        # Create the operation that applies the gradients.
        self.optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=self.learning_rate,
            epsilon=1e-5).apply_gradients(grads)
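
For reference, a standalone NumPy sketch (made-up numbers) of the clipped surrogate objective computed above: the probability ratio is clipped to [1 - cliprange, 1 + cliprange], and taking the elementwise maximum of the two negated terms removes any incentive to push the ratio outside that interval:

    import numpy as np

    cliprange = 0.2
    advs = np.array([1.0, -1.0, 2.0])  # advantage estimates
    ratio = np.array([1.5, 0.5, 1.1])  # pi_new(a|s) / pi_old(a|s)

    pg_losses = -advs * ratio
    pg_losses2 = -advs * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    pg_loss = np.mean(np.maximum(pg_losses, pg_losses2))
    print(pg_loss)  # the clipped term dominates wherever the ratio has left
                    # the trust region in the direction favored by advs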