Example #1
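A DDPG-style agent constructor: it builds main and target actor-critic networks, a one-step Bellman target for the critic, separate Adam optimizers for actor and critic, and Polyak-averaged target-network updates. `cr` is assumed to be a helper module providing `placeholders`, `mlp_actor_critic`, and `get_vars`.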
    def __init__(self):
        self.sess = tf.Session()
        self.memory = replay_buffer(max_length=1e5)
        self.tau = 0.995
        self.gamma = 0.99
        self.state_size = 33
        self.output_size = 4
        self.action_limit = 1.0
        self.hidden = [400, 300]
        self.batch_size = 100
        self.pi_lr = 1e-4
        self.q_lr = 1e-4
        self.noise = OU_noise(self.output_size, 1)

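        # placeholders for state, action, next state, reward, and done flag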
        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q, self.q_pi = cr.mlp_actor_critic(
                self.x_ph,
                self.a_ph,
                self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            self.pi_targ, _, self.q_pi_targ = cr.mlp_actor_critic(
                self.x2_ph,
                self.a_ph,
                self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

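        # Bellman target: r + gamma * (1 - d) * Q'(s', pi'(s')), held fixed via stop_gradient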
        self.target = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.q_pi_targ)
        self.pi_loss = -tf.reduce_mean(self.q_pi)
        self.v_loss = tf.reduce_mean((self.q - self.target)**2) * 0.5
        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.v_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.pi_train = self.pi_optimizer.minimize(
            self.pi_loss, var_list=cr.get_vars('main/pi'))
        self.v_train = self.v_optimizer.minimize(
            self.v_loss, var_list=cr.get_vars('main/q'))

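        # Polyak averaging of target-network parameters (tau close to 1 means slow target updates)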
        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for
            v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
Example #2
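A Soft Actor-Critic (SAC) agent constructor: twin Q networks plus a state-value network, an entropy-regularized policy loss with temperature `alpha`, and chained policy, value, and target-update ops.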
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise_clip = 0.1
        self.alpha = 1e-5
        self.num_worker = 20
        self.noise = OU_noise(self.output_size, self.num_worker)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = \
                cr.sac_mlp_actor_critic(
                    x=self.x_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit
                )
        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, self.v_targ = \
                cr.sac_mlp_actor_critic(
                    x=self.x2_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit
                )

        self.pi_params = cr.get_vars('main/pi')
        self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')

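        # targets: the Q backup uses the target value network; the value backup uses the min of the two Q heads minus the entropy term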
        self.min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)
        self.q_backup = tf.stop_gradient(self.r_ph + self.gamma *
                                         (1 - self.d_ph) * self.v_targ)
        self.v_backup = tf.stop_gradient(self.min_q_pi -
                                         self.alpha * self.logp_pi)

        self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - self.q1_pi)
        self.q1_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q1)**2)
        self.q2_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q2)**2)
        self.v_loss = 0.5 * tf.reduce_mean((self.v_backup - self.v)**2)
        self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss,
                                                      var_list=self.pi_params)

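        # chain the updates: the value step runs after the policy step, and the target update after the value step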
        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        with tf.control_dependencies([self.train_pi_op]):
            self.train_value_op = self.value_optimizer.minimize(
                self.value_loss, var_list=self.value_params)

        with tf.control_dependencies([self.train_value_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

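        # ops run together in a single training step: losses, diagnostics, and the three update ops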
        self.step_ops = [
            self.pi_loss, self.q1_loss, self.q2_loss, self.v_loss, self.q1,
            self.q2, self.v, self.logp_pi, self.train_pi_op,
            self.train_value_op, self.target_update
        ]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
Example #3
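A TD3 agent constructor: twin Q networks, target policy smoothing with clipped Gaussian noise, clipped double-Q Bellman targets, and Polyak-averaged target networks.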
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise = OU_noise(self.output_size, 1)
        self.noise_clip = 0.1

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q1, self.q2, self.q1_pi = cr.td3_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            self.pi_targ, _, _, _ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

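        # target policy smoothing: add clipped Gaussian noise to the target action before evaluating the target Q networks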
        with tf.variable_scope('target', reuse=True):
            self.eps = tf.random_normal(tf.shape(self.pi_targ),
                                        stddev=self.target_noise)
            self.epsilon = tf.clip_by_value(self.eps, -self.noise_clip,
                                            self.noise_clip)
            self.a_prev = self.pi_targ + self.epsilon
            self.a2 = tf.clip_by_value(self.a_prev, -self.action_limit,
                                       self.action_limit)
            _, self.q1_targ, self.q2_targ, self.q1_pi_targ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a2,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')

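        # clipped double-Q: use the smaller of the two target Q values in the Bellman backup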
        self.min_q_targ = tf.minimum(self.q1_targ, self.q2_targ)
        self.backup = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.min_q_targ)
        self.pi_loss = -tf.reduce_mean(self.q1_pi)
        self.q1_loss = tf.reduce_mean((self.q1 - self.backup)**2)
        self.q2_loss = tf.reduce_mean((self.q2 - self.backup)**2)
        self.v_loss = self.q1_loss + self.q2_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.q_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.pi_train = self.pi_optimizer.minimize(self.pi_loss,
                                                   var_list=self.pi_params)
        self.v_train = self.q_optimizer.minimize(self.v_loss,
                                                 var_list=self.q_params)

        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for
            v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
Example #4
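A distributional (quantile-based) DDPG variant: the critic outputs `support_size` quantiles trained with a quantile-regression Huber loss, and the policy loss reweights the quantiles by a risk factor. Environment settings come from an external `env_set` dictionary.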
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.support_size = 32
        self.tau = 0.995
        self.gamma = env_set['gamma']
        self.hidden = env_set['hidden']
        self.batch_size = 64
        self.pi_lr = env_set['pi_lr']
        self.q_lr = env_set['q_lr']
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.noise = OU_noise(self.output_size, self.worker_size)
        self.kappa = 1.0
        self.risk_factor = 0

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)
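        # risk_factor_ph reweights quantiles in the policy loss; a positive value emphasizes lower quantiles (risk-averse)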
        self.risk_factor_ph = tf.placeholder(tf.float32)

        with tf.variable_scope('main'):
            self.pi, self.q, self.q_pi = cr.dqpg_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size)

        with tf.variable_scope('target'):
            _, _, self.q_pi_targ = cr.dqpg_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')
        self.backup = tf.stop_gradient(
            tf.expand_dims(self.r_ph, axis=1) +
            self.gamma * tf.expand_dims(1 - self.d_ph, axis=1) * self.q_pi_targ)
        self.quantile_weight = 1.0 - self.risk_factor_ph * (
            2.0 * tf.reshape(
                tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
                [1, self.support_size]) - 1.0)
        self.pi_loss = -tf.reduce_mean(
            tf.reduce_mean(self.q_pi * self.quantile_weight))

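        # quantile-regression Huber loss (as in QR-DQN) between the predicted quantiles and the distributional Bellman target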
        logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1),
                                   [1, self.support_size, 1])
        tau = tf.reshape(
            tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
            [1, self.support_size])
        tau = tf.tile(tf.expand_dims(tau, axis=2), [1, 1, self.support_size])

        theta_loss_tile = tf.tile(tf.expand_dims(self.q, axis=2),
                                  [1, 1, self.support_size])
        Huber_loss = tf.losses.huber_loss(logit_valid_tile,
                                          theta_loss_tile,
                                          reduction=tf.losses.Reduction.NONE,
                                          delta=self.kappa)
        bellman_errors = logit_valid_tile - theta_loss_tile
        Loss = (tf.abs(tau - tf.stop_gradient(tf.to_float(bellman_errors < 0)))
                * Huber_loss) / self.kappa
        self.v_loss = tf.reduce_mean(
            tf.reduce_mean(tf.reduce_sum(Loss, axis=1), axis=1))

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.train_value_op = self.value_optimizer.minimize(
            self.v_loss, var_list=self.q_params)

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        with tf.control_dependencies([self.train_value_op]):
            self.train_pi_op = self.pi_optimizer.minimize(
                self.pi_loss, var_list=self.pi_params)

        with tf.control_dependencies([self.train_pi_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
            self.target_update
        ]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)