Code Example #1
 def test_get_target_ops(self):
     var = tf.compat.v1.get_variable('var', [1],
                                     initializer=tf.constant_initializer(1))
     target_var = tf.compat.v1.get_variable(
         'target_var', [1], initializer=tf.constant_initializer(2))
     self.sess.run(tf.compat.v1.global_variables_initializer())
     assert target_var.eval() == 2
     update_ops = get_target_ops([var], [target_var])
     self.sess.run(update_ops)
     assert target_var.eval() == 1
Code Example #2
 def test_get_target_ops_tau(self):
     var = tf.compat.v1.get_variable('var', [1],
                                     initializer=tf.constant_initializer(1))
     target_var = tf.compat.v1.get_variable(
         'target_var', [1], initializer=tf.constant_initializer(2))
     self.sess.run(tf.compat.v1.global_variables_initializer())
     assert target_var.eval() == 2
     init_ops, update_ops = get_target_ops([var], [target_var], tau=0.2)
     self.sess.run(update_ops)
     assert np.allclose(target_var.eval(), 1.8)
     self.sess.run(init_ops)
     assert np.allclose(target_var.eval(), 1)
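
The two tests above pin down what get_target_ops has to do: without tau it returns hard-copy ops (the target becomes 1), and with tau=0.2 the soft update moves the target to 0.2 * 1 + 0.8 * 2 = 1.8 while the returned init ops still hard-copy. A minimal sketch consistent with that behaviour (an illustration, not necessarily garage's actual implementation) could look like this:

import tensorflow as tf


def get_target_ops(variables, target_variables, tau=None):
    """Return ops that sync target variables with their online counterparts.

    Without tau, only hard-copy update ops are returned; with tau, the
    result is (init_ops, update_ops), where the update ops perform the
    soft update target <- tau * online + (1 - tau) * target.
    """
    init_ops = []
    update_ops = []
    for var, target_var in zip(variables, target_variables):
        # Hard copy, used to (re)initialize the target network.
        init_ops.append(tf.compat.v1.assign(target_var, var))
        if tau is None:
            update_ops.append(tf.compat.v1.assign(target_var, var))
        else:
            # Polyak averaging toward the online network.
            update_ops.append(
                tf.compat.v1.assign(target_var,
                                    tau * var + (1.0 - tau) * target_var))
    if tau is None:
        return update_ops
    return init_ops, update_ops
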
Code Example #3
    def _init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self._name):
            # Create target policy (actor) and qf (critic) networks
            with tf.name_scope('inputs'):
                obs_dim = self._env_spec.observation_space.flat_dim
                y = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, 1),
                                             name='input_y')
                obs = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim),
                                               name='input_observation')
                actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self._env_spec.action_space.flat_dim),
                    name='input_action')

            policy_network_outputs = self._target_policy.build(obs,
                                                               name='policy')
            target_qf_outputs = self._target_qf.build(obs, actions, name='qf')
            target_qf2_outputs = self._target_qf2.build(obs,
                                                        actions,
                                                        name='qf')

            self._target_policy_f_prob_online = compile_function(
                inputs=[obs], outputs=policy_network_outputs)

            self._target_qf_f_prob_online = compile_function(
                inputs=[obs, actions], outputs=target_qf_outputs)

            self._target_qf2_f_prob_online = compile_function(
                inputs=[obs, actions], outputs=target_qf2_outputs)

            # Set up target init and update functions
            with tf.name_scope('setup_target'):
                policy_init_op, policy_update_op = get_target_ops(
                    self.policy.get_global_vars(),
                    self._target_policy.get_global_vars(), self._tau)
                qf_init_ops, qf_update_ops = get_target_ops(
                    self.qf.get_global_vars(),
                    self._target_qf.get_global_vars(), self._tau)
                qf2_init_ops, qf2_update_ops = get_target_ops(
                    self.qf2.get_global_vars(),
                    self._target_qf2.get_global_vars(), self._tau)
                target_init_op = policy_init_op + qf_init_ops + qf2_init_ops
                target_update_op = (policy_update_op + qf_update_ops +
                                    qf2_update_ops)

            f_init_target = compile_function(inputs=[], outputs=target_init_op)
            f_update_target = compile_function(inputs=[],
                                               outputs=target_update_op)

            # Set up policy training function
            next_action = self.policy.build(obs, name='policy_action')
            next_qval = self.qf.build(obs,
                                      next_action,
                                      name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)

            with tf.name_scope('minimize_action_loss'):
                policy_optimizer = make_optimizer(
                    self._policy_optimizer,
                    learning_rate=self._policy_lr,
                    name='PolicyOptimizer')
                policy_train_op = policy_optimizer.minimize(
                    action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # Set up qf training function
            qval = self.qf.build(obs, actions, name='q_value')
            q2val = self.qf2.build(obs, actions, name='q2_value')
            with tf.name_scope('qval1_loss'):
                qval1_loss = tf.reduce_mean(tf.math.squared_difference(
                    y, qval))
            with tf.name_scope('qval2_loss'):
                qval2_loss = tf.reduce_mean(
                    tf.math.squared_difference(y, q2val))

            with tf.name_scope('minimize_qf_loss'):
                qf_optimizer = make_optimizer(self._qf_optimizer,
                                              learning_rate=self._qf_lr,
                                              name='QFunctionOptimizer')
                qf_train_op = qf_optimizer.minimize(
                    qval1_loss, var_list=self.qf.get_trainable_vars())
                qf2_train_op = qf_optimizer.minimize(
                    qval2_loss, var_list=self.qf2.get_trainable_vars())

            f_train_qf = compile_function(
                inputs=[y, obs, actions],
                outputs=[qf_train_op, qval1_loss, qval])
            f_train_qf2 = compile_function(
                inputs=[y, obs, actions],
                outputs=[qf2_train_op, qval2_loss, q2val])

            self._f_train_policy = f_train_policy
            self._f_train_qf = f_train_qf
            self._f_init_target = f_init_target
            self._f_update_target = f_update_target
            self._f_train_qf2 = f_train_qf2
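
compile_function itself is not shown in these examples; in TF1 graph mode it is typically just a thin wrapper that feeds the given placeholders and runs the requested outputs in the default session. A minimal stand-in (an assumption about the helper, not garage's exact code) might be:

import tensorflow as tf


def compile_function(inputs, outputs):
    """Bundle placeholders and output tensors into a plain Python callable."""

    def run(*input_vals):
        sess = tf.compat.v1.get_default_session()
        # Feed positional arguments into the placeholders, in order.
        return sess.run(outputs, feed_dict=dict(zip(inputs, input_vals)))

    return run

With such a wrapper, f_train_qf(y_batch, obs_batch, action_batch) runs the train op and returns the loss and Q-values in a single session call.
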
Code Example #4
File: ddpg.py  Project: ziyiwu9494/garage
    def _init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self._name):
            # Create target policy and qf network
            with tf.name_scope('inputs'):
                obs_dim = self._env_spec.observation_space.flat_dim
                input_y = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='input_y')
                obs = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim),
                                               name='input_observation')
                actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self._env_spec.action_space.flat_dim),
                    name='input_action')

            policy_network_outputs = self._target_policy.build(obs,
                                                               name='policy')
            target_qf_outputs = self._target_qf.build(obs, actions, name='qf')

            self._target_policy_f_prob_online = compile_function(
                inputs=[obs], outputs=policy_network_outputs)
            self._target_qf_f_prob_online = compile_function(
                inputs=[obs, actions], outputs=target_qf_outputs)

            # Set up target init and update function
            with tf.name_scope('setup_target'):
                ops = get_target_ops(self.policy.get_global_vars(),
                                     self._target_policy.get_global_vars(),
                                     self._tau)
                policy_init_ops, policy_update_ops = ops
                qf_init_ops, qf_update_ops = get_target_ops(
                    self._qf.get_global_vars(),
                    self._target_qf.get_global_vars(), self._tau)
                target_init_op = policy_init_ops + qf_init_ops
                target_update_op = policy_update_ops + qf_update_ops

            f_init_target = compile_function(inputs=[], outputs=target_init_op)
            f_update_target = compile_function(inputs=[],
                                               outputs=target_update_op)

            # Set up policy training function
            next_action = self.policy.build(obs, name='policy_action')
            next_qval = self._qf.build(obs,
                                       next_action,
                                       name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)
                if self._policy_weight_decay > 0.:
                    regularizer = tf.keras.regularizers.l2(
                        self._policy_weight_decay)
                    for var in self.policy.get_regularizable_vars():
                        policy_reg = regularizer(var)
                        action_loss += policy_reg

            with tf.name_scope('minimize_action_loss'):
                policy_optimizer = make_optimizer(
                    self._policy_optimizer,
                    learning_rate=self._policy_lr,
                    name='PolicyOptimizer')
                policy_train_op = policy_optimizer.minimize(
                    action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # Set up qf training function
            qval = self._qf.build(obs, actions, name='q_value')
            with tf.name_scope('qval_loss'):
                qval_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(input_y, qval))
                if self._qf_weight_decay > 0.:
                    regularizer = tf.keras.regularizers.l2(
                        self._qf_weight_decay)
                    for var in self._qf.get_regularizable_vars():
                        qf_reg = regularizer(var)
                        qval_loss += qf_reg

            with tf.name_scope('minimize_qf_loss'):
                qf_optimizer = make_optimizer(self._qf_optimizer,
                                              learning_rate=self._qf_lr,
                                              name='QFunctionOptimizer')
                qf_train_op = qf_optimizer.minimize(
                    qval_loss, var_list=self._qf.get_trainable_vars())

            f_train_qf = compile_function(
                inputs=[input_y, obs, actions],
                outputs=[qf_train_op, qval_loss, qval])

            self._f_train_policy = f_train_policy
            self._f_train_qf = f_train_qf
            self._f_init_target = f_init_target
            self._f_update_target = f_update_target
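
In both the TD3 and DDPG examples the same self._tau is passed for every (online, target) variable pair, so each call to _f_update_target nudges the target weights a fraction tau toward the online weights. A small numeric illustration of one such soft-update step (the weight values are made up):

import numpy as np

tau = 0.01
online_w = np.array([0.5, -1.2])
target_w = np.array([0.4, -1.0])

# One soft update: target <- tau * online + (1 - tau) * target
target_w = tau * online_w + (1.0 - tau) * target_w
print(target_w)  # -> approximately [0.401, -1.002]
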
Code Example #5
    def _init_opt(self):
        """Initialize the networks and Ops.

        Assume a discrete action space for DQN, so the action
        dimension will always be action_space.n.
        """
        action_dim = self._env_spec.action_space.n

        # build q networks
        with tf.name_scope(self._name):
            action_t_ph = tf.compat.v1.placeholder(tf.int32,
                                                   None,
                                                   name='action')
            reward_t_ph = tf.compat.v1.placeholder(tf.float32,
                                                   None,
                                                   name='reward')
            done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done')

            with tf.name_scope('update_ops'):
                target_update_op = get_target_ops(
                    self._qf.get_global_vars(),
                    self._target_qf.get_global_vars())

            self._qf_update_ops = compile_function(inputs=[],
                                                   outputs=target_update_op)

            with tf.name_scope('td_error'):
                # Q-value of the selected action
                action = tf.one_hot(action_t_ph,
                                    action_dim,
                                    on_value=1.,
                                    off_value=0.)
                q_selected = tf.reduce_sum(
                    self._qf.q_vals * action,  # yapf: disable
                    axis=1)

                # r + Q'(s', argmax_a(Q(s', _))) - Q(s, a)
                if self._double_q:
                    target_qval_with_online_q = self._qf.build(
                        self._target_qf.input, self._qf.name)
                    future_best_q_val_action = tf.argmax(
                        target_qval_with_online_q, 1)
                    future_best_q_val = tf.reduce_sum(
                        self._target_qf.q_vals *
                        tf.one_hot(future_best_q_val_action,
                                   action_dim,
                                   on_value=1.,
                                   off_value=0.),
                        axis=1)
                else:
                    # r + max_a(Q'(s', _)) - Q(s, a)
                    future_best_q_val = tf.reduce_max(self._target_qf.q_vals,
                                                      axis=1)

                q_best_masked = (1.0 - done_t_ph) * future_best_q_val
                # if done, it's just reward
                # else reward + discount * future_best_q_val
                target_q_values = (reward_t_ph +
                                   self._discount * q_best_masked)

                # td_error = q_selected - tf.stop_gradient(target_q_values)
                loss = tf.compat.v1.losses.huber_loss(
                    q_selected, tf.stop_gradient(target_q_values))
                loss = tf.reduce_mean(loss)

            with tf.name_scope('optimize_ops'):
                qf_optimizer = make_optimizer(self._qf_optimizer,
                                              learning_rate=self._qf_lr)
                if self._grad_norm_clipping is not None:
                    gradients = qf_optimizer.compute_gradients(
                        loss, var_list=self._qf.get_trainable_vars())
                    for i, (grad, var) in enumerate(gradients):
                        if grad is not None:
                            gradients[i] = (tf.clip_by_norm(
                                grad, self._grad_norm_clipping), var)
                    # Apply the (clipped) gradients once, after all of them
                    # have been processed.
                    optimize_loss = qf_optimizer.apply_gradients(gradients)
                else:
                    optimize_loss = qf_optimizer.minimize(
                        loss, var_list=self._qf.get_trainable_vars())

            self._train_qf = compile_function(inputs=[
                self._qf.input, action_t_ph, reward_t_ph, done_t_ph,
                self._target_qf.input
            ],
                                              outputs=[loss, optimize_loss])
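
The td_error block above builds the standard one-step TD target: the bootstrap term is masked out on terminal transitions, so the target collapses to the reward when done is 1. A quick numeric check of that masking (the values are made up):

import numpy as np

discount = 0.99
reward_t = np.array([1.0, 0.5])
done_t = np.array([0.0, 1.0])              # second transition is terminal
future_best_q_val = np.array([2.0, 3.0])   # max_a Q'(s', a)

q_best_masked = (1.0 - done_t) * future_best_q_val
target_q_values = reward_t + discount * q_best_masked
print(target_q_values)  # -> [2.98, 0.5]
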