Example #1
    def test_success(self):
        dim1 = np.random.randint(10) + 1
        dim2 = np.random.randint(10) + 1
        var1 = tf.Variable(np.random.random((dim1, dim2)), name='var1')
        var2 = tf.Variable(np.random.random((dim1, dim2)), name='var2')

        # Build an optimization op restricted to variables under the 'var1' scope.
        ops = build_optim(var1, 1e-4, 'var1')

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            before_var1, before_var2 = sess.run([var1, var2])
            sess.run(ops)
            after_var1, after_var2 = sess.run([var1, var2])

            # Only var1 should have been updated; var2 must be left untouched.
            assert_variable_mismatch(before_var1, after_var1)
            assert_variable_match(before_var2, after_var2)
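
The test above exercises a build_optim(loss, lr, scope) helper whose body is not shown in these examples. As a point of reference, here is a minimal sketch of what such a helper could look like, assuming an Adam optimizer and scope-based variable collection; both the optimizer choice and the exact signature details are assumptions, not the actual implementation.

    import tensorflow as tf

    def build_optim(loss, lr, scope):
        # Only the trainable variables under `scope` are handed to the
        # optimizer, so variables outside the scope are never modified.
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        return optimizer.minimize(loss, var_list=variables)

Restricting var_list this way is what the test relies on: running the returned op changes var1 but leaves var2 unchanged.
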
Example #2
    def _build(self, params):
        with tf.variable_scope('sac'):
            self.obs_t_ph = tf.placeholder(tf.float32,
                                           (None, ) + params.state_shape,
                                           name='obs_t')
            self.actions_t_ph = tf.placeholder(tf.float32,
                                               (None, params.num_actions),
                                               name='actions_t')
            self.rewards_tp1_ph = tf.placeholder(tf.float32, (None, ),
                                                 name='rewards_tp1')
            self.obs_tp1_ph = tf.placeholder(tf.float32,
                                             (None, ) + params.state_shape,
                                             name='obs_tp1')
            self.dones_tp1_ph = tf.placeholder(tf.float32, (None, ),
                                               name='dones_tp1')

            # policy function
            pi_t = stochastic_policy_function(params.fcs,
                                              self.obs_t_ph,
                                              params.num_actions,
                                              tf.nn.relu,
                                              share=True,
                                              w_init=XAVIER_INIT,
                                              last_w_init=XAVIER_INIT,
                                              last_b_init=XAVIER_INIT,
                                              scope='pi')
            squashed_action_t, log_prob_t = squash_action(pi_t)

            # value function
            v_t = value_function(params.fcs,
                                 self.obs_t_ph,
                                 tf.nn.relu,
                                 XAVIER_INIT,
                                 XAVIER_INIT,
                                 ZEROS_INIT,
                                 scope='v')
            # target value function
            v_tp1 = value_function(params.fcs,
                                   self.obs_tp1_ph,
                                   tf.nn.relu,
                                   XAVIER_INIT,
                                   XAVIER_INIT,
                                   ZEROS_INIT,
                                   scope='target_v')

            # two q functions
            q1_t_with_pi = q_function(params.fcs,
                                      self.obs_t_ph,
                                      squashed_action_t,
                                      params.concat_index,
                                      tf.nn.relu,
                                      XAVIER_INIT,
                                      XAVIER_INIT,
                                      ZEROS_INIT,
                                      scope='q1')
            q1_t = q_function(params.fcs,
                              self.obs_t_ph,
                              self.actions_t_ph,
                              params.concat_index,
                              tf.nn.relu,
                              XAVIER_INIT,
                              XAVIER_INIT,
                              ZEROS_INIT,
                              scope='q1')
            q2_t_with_pi = q_function(params.fcs,
                                      self.obs_t_ph,
                                      squashed_action_t,
                                      params.concat_index,
                                      tf.nn.relu,
                                      XAVIER_INIT,
                                      XAVIER_INIT,
                                      ZEROS_INIT,
                                      scope='q2')
            q2_t = q_function(params.fcs,
                              self.obs_t_ph,
                              self.actions_t_ph,
                              params.concat_index,
                              tf.nn.relu,
                              XAVIER_INIT,
                              XAVIER_INIT,
                              ZEROS_INIT,
                              scope='q2')

            # prepare for loss
            rewards_tp1 = tf.reshape(self.rewards_tp1_ph, [-1, 1])
            dones_tp1 = tf.reshape(self.dones_tp1_ph, [-1, 1])

            # value function loss
            self.v_loss = build_v_loss(v_t, q1_t_with_pi, q2_t_with_pi,
                                       log_prob_t)
            # q function loss
            self.q1_loss = build_q_loss(q1_t, rewards_tp1, v_tp1, dones_tp1,
                                        params.gamma)
            self.q2_loss = build_q_loss(q2_t, rewards_tp1, v_tp1, dones_tp1,
                                        params.gamma)
            # policy function loss
            self.pi_loss = build_pi_loss(log_prob_t, q1_t_with_pi,
                                         q2_t_with_pi)
            # policy regularization
            policy_decay = build_policy_reg(pi_t, params.reg)

            # target update
            self.target_update = build_target_update('sac/v', 'sac/target_v',
                                                     params.tau)

            # optimization
            self.v_optimize_expr = build_optim(self.v_loss, params.v_lr,
                                               'sac/v')
            self.q1_optimize_expr = build_optim(self.q1_loss, params.q_lr,
                                                'sac/q1')
            self.q2_optimize_expr = build_optim(self.q2_loss, params.q_lr,
                                                'sac/q2')
            self.pi_optimize_expr = build_optim(self.pi_loss + policy_decay,
                                                params.pi_lr, 'sac/pi')

            # for inference
            self.action = squashed_action_t[0]
            self.value = tf.reshape(v_t, [-1])[0]
            self.log_prob = tf.reshape(log_prob_t, [-1])[0]
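
Example #2 references loss and target-update helpers (build_v_loss, build_q_loss, build_pi_loss, build_target_update) without showing their bodies. The sketches below follow the standard soft actor-critic formulation that these names suggest; the actual implementations may differ in details such as stop-gradient placement, reduction, or variable ordering, so treat them as assumptions.

    import tensorflow as tf

    def build_v_loss(v_t, q1_t, q2_t, log_prob_t):
        # V target: minimum of the two Q estimates minus the entropy term.
        target = tf.stop_gradient(tf.minimum(q1_t, q2_t) - log_prob_t)
        return 0.5 * tf.reduce_mean((v_t - target) ** 2)

    def build_q_loss(q_t, rewards_tp1, v_tp1, dones_tp1, gamma):
        # Soft Bellman backup through the target value function.
        target = tf.stop_gradient(
            rewards_tp1 + gamma * (1.0 - dones_tp1) * v_tp1)
        return 0.5 * tf.reduce_mean((q_t - target) ** 2)

    def build_pi_loss(log_prob_t, q1_t, q2_t):
        # The policy maximizes Q minus log-probability (entropy regularization).
        return tf.reduce_mean(log_prob_t - tf.minimum(q1_t, q2_t))

    def build_target_update(src_scope, dst_scope, tau):
        # Soft (Polyak) update of every variable in dst_scope towards src_scope.
        src_vars = sorted(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope),
            key=lambda v: v.name)
        dst_vars = sorted(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dst_scope),
            key=lambda v: v.name)
        ops = [dst.assign(tau * src + (1.0 - tau) * dst)
               for src, dst in zip(src_vars, dst_vars)]
        return tf.group(*ops)
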
Example #3
    def _build(self, params):
        with tf.variable_scope('td3', reuse=tf.AUTO_REUSE):
            self.obs_t_ph = tf.placeholder(tf.float32,
                                           [None] + list(params.state_shape),
                                           name='obs_t')
            self.actions_t_ph = tf.placeholder(tf.float32,
                                               [None, params.num_actions],
                                               name='actions_t')
            self.rewards_tp1_ph = tf.placeholder(tf.float32, [None],
                                                 name='rewards_tp1')
            self.obs_tp1_ph = tf.placeholder(tf.float32,
                                             [None] + list(params.state_shape),
                                             name='obs_tp1')
            self.dones_tp1_ph = tf.placeholder(tf.float32, [None],
                                               name='dones_tp1')

            # policy function
            raw_policy_t = _policy_function(params, self.obs_t_ph, 'actor')
            policy_t = tf.nn.tanh(raw_policy_t)

            # target policy function
            raw_policy_tp1 = _policy_function(params, self.obs_tp1_ph,
                                              'target_actor')
            policy_tp1 = tf.nn.tanh(raw_policy_tp1)

            # target policy smoothing regularization
            smoothed_policy_tp1 = build_smoothed_target(
                policy_tp1, params.target_noise_sigma,
                params.target_noise_clip)

            # first critic
            q1_t = _q_function(params, self.obs_t_ph, self.actions_t_ph,
                               'critic/1')
            q1_t_with_actor = _q_function(params, self.obs_t_ph, policy_t,
                                          'critic/1')

            # first target critic
            q1_tp1 = _q_function(params, self.obs_tp1_ph, smoothed_policy_tp1,
                                 'target_critic/1')

            # second critic
            q2_t = _q_function(params, self.obs_t_ph, self.actions_t_ph,
                               'critic/2')
            q2_t_with_actor = _q_function(params, self.obs_t_ph, policy_t,
                                          'critic/2')

            # second target critic
            q2_tp1 = _q_function(params, self.obs_tp1_ph, smoothed_policy_tp1,
                                 'target_critic/2')

            # prepare for loss calculation
            rewards_tp1 = tf.reshape(self.rewards_tp1_ph, [-1, 1])
            dones_tp1 = tf.reshape(self.dones_tp1_ph, [-1, 1])

            # critic loss
            target = build_target(rewards_tp1, q1_tp1, q2_tp1, dones_tp1,
                                  params.gamma)
            self.critic_loss = build_critic_loss(q1_t, q2_t, target)

            # actor loss
            self.actor_loss = -build_actor_loss(q1_t_with_actor,
                                                q2_t_with_actor)

            # target update
            self.update_target_critic = build_target_update(
                'td3/critic', 'td3/target_critic', params.tau)
            self.update_target_actor = build_target_update(
                'td3/actor', 'td3/target_actor', params.tau)

            # optimization
            self.critic_optimize_expr = build_optim(self.critic_loss,
                                                    params.critic_lr,
                                                    'td3/critic')
            self.actor_optimize_expr = build_optim(self.actor_loss,
                                                   params.actor_lr,
                                                   'td3/actor')

            # action
            self.action = policy_t
            self.value = tf.reshape(q1_t_with_actor, [-1])
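
Example #3 likewise calls TD3-specific helpers that are only referenced by name. The following sketches are written from the standard TD3 formulation (target policy smoothing, clipped double Q-learning) rather than from the actual source, so the exact noise handling, clipping range, and reductions are assumptions.

    import tensorflow as tf

    def build_smoothed_target(policy_tp1, sigma, noise_clip):
        # Target policy smoothing: clipped Gaussian noise on the target action,
        # with the result clipped back into the tanh action range.
        noise = tf.random_normal(tf.shape(policy_tp1), stddev=sigma)
        noise = tf.clip_by_value(noise, -noise_clip, noise_clip)
        return tf.clip_by_value(policy_tp1 + noise, -1.0, 1.0)

    def build_target(rewards_tp1, q1_tp1, q2_tp1, dones_tp1, gamma):
        # Clipped double Q-learning: bootstrap from the smaller target critic.
        q_tp1 = tf.minimum(q1_tp1, q2_tp1)
        return tf.stop_gradient(
            rewards_tp1 + gamma * (1.0 - dones_tp1) * q_tp1)

    def build_critic_loss(q1_t, q2_t, target):
        # Both critics regress onto the same target.
        return (tf.reduce_mean((q1_t - target) ** 2)
                + tf.reduce_mean((q2_t - target) ** 2))
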
Example #4
    def _build(self,
               fcs,
               concat_index,
               state_shape,
               num_actions,
               gamma,
               tau,
               pi_lr,
               q_lr,
               v_lr,
               reg):
        with tf.variable_scope('sac'):
            obs_t_ph = self.obs_t_ph = tf.placeholder(
                tf.float32, (None,) + state_shape, name='obs_t')
            actions_t_ph = self.actions_t_ph = tf.placeholder(
                tf.float32, (None, num_actions), name='actions_t')
            rewards_tp1_ph = self.rewards_tp1_ph = tf.placeholder(
                tf.float32, (None,), name='rewards_tp1')
            obs_tp1_ph = self.obs_tp1_ph = tf.placeholder(
                tf.float32, (None,) + state_shape, name='obs_tp1')
            dones_tp1_ph = self.dones_tp1_ph = tf.placeholder(
                tf.float32, (None,), name='dones_tp1')

            # initializers
            zeros_init = tf.zeros_initializer()
            w_init = tf.contrib.layers.xavier_initializer()
            last_w_init = tf.contrib.layers.xavier_initializer()
            last_b_init = tf.contrib.layers.xavier_initializer()

            # policy function
            pi_t = stochastic_policy_function(fcs, obs_t_ph, num_actions,
                                              tf.nn.relu, share=True,
                                              w_init=w_init,
                                              last_w_init=last_w_init,
                                              last_b_init=last_b_init,
                                              scope='pi')
            sampled_action_t = pi_t.sample(1)[0]
            squashed_action_t = tf.nn.tanh(sampled_action_t)
            diff = tf.reduce_sum(
                tf.log(1 - squashed_action_t ** 2 + 1e-6),
                axis=1, keepdims=True)
            log_prob_t = tf.reshape(
                pi_t.log_prob(sampled_action_t), [-1, 1]) - diff

            # value function
            v_t = value_function(
                fcs, obs_t_ph, tf.nn.relu, w_init,
                last_w_init, zeros_init, scope='v')
            # target value function
            v_tp1 = value_function(
                fcs, obs_tp1_ph, tf.nn.relu, w_init,
                last_w_init, zeros_init, scope='target_v')

            # two q functions
            q1_t_with_pi = q_function(fcs, obs_t_ph, squashed_action_t,
                                      concat_index, tf.nn.relu, w_init,
                                      last_w_init, zeros_init, scope='q1')
            q1_t = q_function(fcs, obs_t_ph, actions_t_ph, concat_index,
                              tf.nn.relu, w_init, last_w_init,
                              zeros_init, scope='q1')
            q2_t_with_pi = q_function(fcs, obs_t_ph, squashed_action_t,
                                      concat_index, tf.nn.relu, w_init,
                                      last_w_init, zeros_init, scope='q2')
            q2_t = q_function(fcs, obs_t_ph, actions_t_ph, concat_index,
                              tf.nn.relu, w_init, last_w_init,
                              zeros_init, scope='q2')

            # prepare for loss
            rewards_tp1 = tf.reshape(rewards_tp1_ph, [-1, 1])
            dones_tp1 = tf.reshape(dones_tp1_ph, [-1, 1])

            # value function loss
            self.v_loss = build_v_loss(
                v_t, q1_t_with_pi, q2_t_with_pi, log_prob_t)
            # q function loss
            self.q1_loss = build_q_loss(
                q1_t, rewards_tp1, v_tp1, dones_tp1, gamma)
            self.q2_loss = build_q_loss(
                q2_t, rewards_tp1, v_tp1, dones_tp1, gamma)
            # policy function loss
            self.pi_loss = build_pi_loss(
                log_prob_t, q1_t_with_pi, q2_t_with_pi)

            # target update
            self.target_update = build_target_update(
                'sac/v', 'sac/target_v', tau)

            # policy regularization
            pi_mean_loss = 0.5 * tf.reduce_mean(pi_t.mean() ** 2)
            pi_logstd_loss = 0.5 * tf.reduce_mean(tf.log(pi_t.stddev()) ** 2)
            policy_decay = reg * (pi_mean_loss + pi_logstd_loss)

            # optimization
            self.v_optimize_expr = build_optim(self.v_loss, v_lr, 'sac/v')
            self.q1_optimize_expr = build_optim(self.q1_loss, q_lr, 'sac/q1')
            self.q2_optimize_expr = build_optim(self.q2_loss, q_lr, 'sac/q2')
            self.pi_optimize_expr = build_optim(self.pi_loss + policy_decay,
                                                pi_lr, 'sac/pi')

            # for inference
            self.action = squashed_action_t[0]
            self.value = tf.reshape(v_t, [-1])[0]
            self.log_prob = tf.reshape(log_prob_t, [-1])[0]
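
Example #4 inlines the tanh squashing and log-probability correction that Example #2 delegates to a squash_action helper. If that inline code were factored out, the helper would look roughly like the sketch below; the signature is inferred from how Example #2 calls it, and the body is lifted directly from Example #4.

    import tensorflow as tf

    def squash_action(pi_t):
        # Sample a raw action, squash it with tanh, and correct the
        # log-probability for the change of variables, as in Example #4.
        sampled_action_t = pi_t.sample(1)[0]
        squashed_action_t = tf.nn.tanh(sampled_action_t)
        diff = tf.reduce_sum(
            tf.log(1 - squashed_action_t ** 2 + 1e-6),
            axis=1, keepdims=True)
        log_prob_t = tf.reshape(
            pi_t.log_prob(sampled_action_t), [-1, 1]) - diff
        return squashed_action_t, log_prob_t

The small epsilon inside the log keeps the correction term finite when the squashed action saturates at -1 or 1.
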