Example #1
# Assumed context for these snippets: tensorflow as tf, numpy as np, and the
# project-local helpers cr (network builders), replay_buffer and OU_noise.
class DDPG:  # class name assumed; this example shows only the constructor
    def __init__(self):
        self.sess = tf.Session()
        self.memory = replay_buffer(max_length=1e5)
        self.tau = 0.995
        self.gamma = 0.99
        self.state_size = 33
        self.output_size = 4
        self.action_limit = 1.0
        self.hidden = [400, 300]
        self.batch_size = 100
        self.pi_lr = 1e-4
        self.q_lr = 1e-4
        self.noise = OU_noise(self.output_size, 1)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q, self.q_pi = cr.mlp_actor_critic(
                self.x_ph,
                self.a_ph,
                self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            self.pi_targ, _, self.q_pi_targ = cr.mlp_actor_critic(
                self.x2_ph,
                self.a_ph,
                self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        self.target = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.q_pi_targ)
        self.pi_loss = -tf.reduce_mean(self.q_pi)
        self.v_loss = tf.reduce_mean((self.q - self.target)**2) * 0.5
        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.v_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.pi_train = self.pi_optimizer.minimize(
            self.pi_loss, var_list=cr.get_vars('main/pi'))
        self.v_train = self.v_optimizer.minimize(
            self.v_loss, var_list=cr.get_vars('main/q'))

        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for
            v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
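
Example #1 shows only the constructor. For completeness, here is a minimal sketch of matching `update` and `get_action` methods, modeled on the TD3 example further down; the `replay_buffer.get_sample` keys and the `OU_noise.sample()` call are assumptions carried over from the other snippets rather than part of this example.

    def update(self):
        # Sketch only (not in the original snippet): one critic step, then one
        # actor step plus the Polyak target update, mirroring the TD3 example.
        data = self.memory.get_sample(sample_size=self.batch_size)
        feed_dict = {
            self.x_ph: data['state'],
            self.x2_ph: data['next_state'],
            self.a_ph: data['action'],
            self.r_ph: data['reward'],
            self.d_ph: data['done']
        }
        q_loss, _ = self.sess.run([self.v_loss, self.v_train],
                                  feed_dict=feed_dict)
        pi_loss, _, _ = self.sess.run(
            [self.pi_loss, self.pi_train, self.target_update],
            feed_dict=feed_dict)
        return q_loss, pi_loss

    def get_action(self, state, epsilon):
        # Deterministic policy output plus scaled OU exploration noise.
        a = self.sess.run(self.pi, feed_dict={self.x_ph: [state]})
        a += epsilon * self.noise.sample()
        return np.clip(a, -self.action_limit, self.action_limit)[0]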
Example #2
class SAC:
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise_clip = 0.1
        self.alpha = 1e-5
        self.num_worker = 20
        self.noise = OU_noise(self.output_size, self.num_worker)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = \
                cr.sac_mlp_actor_critic(
                    x=self.x_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit
                )
        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, self.v_targ = \
                cr.sac_mlp_actor_critic(
                    x=self.x2_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit
                )

        self.pi_params = cr.get_vars('main/pi')
        self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')

        self.min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)
        self.q_backup = tf.stop_gradient(self.r_ph + self.gamma *
                                         (1 - self.d_ph) * self.v_targ)
        self.v_backup = tf.stop_gradient(self.min_q_pi -
                                         self.alpha * self.logp_pi)

        self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - self.q1_pi)
        self.q1_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q1)**2)
        self.q2_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q2)**2)
        self.v_loss = 0.5 * tf.reduce_mean((self.v_backup - self.v)**2)
        self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss,
                                                      var_list=self.pi_params)

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        with tf.control_dependencies([self.train_pi_op]):
            self.train_value_op = self.value_optimizer.minimize(
                self.value_loss, var_list=self.value_params)

        with tf.control_dependencies([self.train_value_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.q1_loss, self.q2_loss, self.v_loss, self.q1,
            self.q2, self.v, self.logp_pi, self.train_pi_op,
            self.train_value_op, self.target_update
        ]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)

    def update(self):
        data = self.memory.get_sample(sample_size=self.batch_size)
        feed_dict = {
            self.x_ph: data['state'],
            self.x2_ph: data['next_state'],
            self.a_ph: data['action'],
            self.r_ph: data['reward'],
            self.d_ph: data['done']
        }

        self.sess.run(self.step_ops, feed_dict=feed_dict)

    def get_action(self, state, deterministic=False):
        act_op = self.mu if deterministic else self.pi
        return self.sess.run(act_op, feed_dict={self.x_ph: [state]})[0]

    def test(self):
        env = gym.make('Pendulum-v0')
        while True:
            state = env.reset()
            done = False
            while not done:
                env.render()
                action = self.get_action(state, deterministic=True)
                state, _, done, _ = env.step(action)

    def run(self):
        from mlagents.envs import UnityEnvironment

        writer = SummaryWriter('runs/sac')
        num_worker = self.num_worker
        state_size = self.state_size
        output_size = self.output_size
        ep = 0
        train_size = 5

        env = UnityEnvironment(file_name='env/training', worker_id=1)
        default_brain = env.brain_names[0]
        brain = env.brains[default_brain]
        initial_observation = env.reset()

        step = 0
        start_steps = 100000

        states = np.zeros([num_worker, state_size])
        for i in range(start_steps):
            actions = np.clip(np.random.randn(num_worker, output_size),
                              -self.action_limit, self.action_limit)
            actions += self.noise.sample()
            env_info = env.step(actions)[default_brain]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for s, ns, r, d, a in zip(states, next_states, rewards, dones,
                                      actions):
                self.memory.append(s, ns, r, d, a)
            states = next_states
            if dones[0]:
                self.noise.reset()
            if i % train_size == 0:
                if len(self.memory.memory) > self.batch_size:
                    self.update()
            print('data storing :', float(i / start_steps))

        while True:
            ep += 1
            states = np.zeros([num_worker, state_size])
            terminal = False
            score = 0
            while not terminal:
                step += 1
                '''
                if step > start_steps:
                    actions = [self.get_action(s) for s in states]
                    action_random = 'False'
                else:
                    actions = np.clip(np.random.randn(num_worker, output_size), -self.action_limit, self.action_limit)
                    action_random = 'True'
                '''
                actions = [self.get_action(s) for s in states]
                action_random = 'False'

                env_info = env.step(actions)[default_brain]

                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done

                terminal = dones[0]

                for s, ns, r, d, a in zip(states, next_states, rewards, dones,
                                          actions):
                    self.memory.append(s, ns, r, d, a)

                score += sum(rewards)

                states = next_states

                if len(self.memory.memory) > self.batch_size:
                    if step % train_size == 0:
                        self.update()

            if ep < 1000:
                print('step : ', step, '| start steps : ', start_steps,
                      '| episode :', ep, '| score : ', score, '| memory size',
                      len(self.memory.memory), '| action random : ',
                      action_random)
                writer.add_scalar('data/reward', score, ep)
                writer.add_scalar('data/memory_size', len(self.memory.memory),
                                  ep)
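
A minimal smoke test for the SAC class above, assuming the `replay_buffer` interface used throughout these snippets (`append(state, next_state, reward, done, action)` and a `get_sample` that returns the keyed batch consumed by `update`): fill the buffer with random transitions, run one update, and query an action.

agent = SAC()
for _ in range(agent.batch_size + 1):
    s = np.random.randn(agent.state_size)
    ns = np.random.randn(agent.state_size)
    a = np.random.uniform(-agent.action_limit, agent.action_limit,
                          agent.output_size)
    # append(state, next_state, reward, done, action), as used in run()
    agent.memory.append(s, ns, np.random.rand(), False, a)
agent.update()
print(agent.get_action(np.zeros(agent.state_size)))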
Example #3
class TD3:
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise = OU_noise(self.output_size, 1)
        self.noise_clip = 0.1

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q1, self.q2, self.q1_pi = cr.td3_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            self.pi_targ, _, _, _ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target', reuse=True):
            self.eps = tf.random_normal(tf.shape(self.pi_targ),
                                        stddev=self.target_noise)
            self.epsilon = tf.clip_by_value(self.eps, -self.noise_clip,
                                            self.noise_clip)
            self.a_prev = self.pi_targ + self.epsilon
            self.a2 = tf.clip_by_value(self.a_prev, -self.action_limit,
                                       self.action_limit)
            _, self.q1_targ, self.q2_targ, self.q1_pi_targ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a2,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')

        self.min_q_targ = tf.minimum(self.q1_targ, self.q2_targ)
        self.backup = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.min_q_targ)
        self.pi_loss = -tf.reduce_mean(self.q1_pi)
        self.q1_loss = tf.reduce_mean((self.q1 - self.backup)**2)
        self.q2_loss = tf.reduce_mean((self.q2 - self.backup)**2)
        self.v_loss = self.q1_loss + self.q2_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.q_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.pi_train = self.pi_optimizer.minimize(self.pi_loss,
                                                   var_list=self.pi_params)
        self.v_train = self.q_optimizer.minimize(self.v_loss,
                                                 var_list=self.q_params)

        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for
            v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)

    def update(self):
        data = self.memory.get_sample(sample_size=self.batch_size)
        feed_dict = {
            self.x_ph: data['state'],
            self.x2_ph: data['next_state'],
            self.a_ph: data['action'],
            self.r_ph: data['reward'],
            self.d_ph: data['done']
        }

        q_loss, _ = self.sess.run([self.v_loss, self.v_train],
                                  feed_dict=feed_dict)
        pi_loss, _, _ = self.sess.run(
            [self.pi_loss, self.pi_train, self.target_update],
            feed_dict=feed_dict)

        return q_loss, pi_loss

    def get_action(self, state, epsilon):
        a = self.sess.run(self.pi, feed_dict={self.x_ph: [state]})
        a += epsilon * self.noise.sample()
        return np.clip(a, -self.action_limit, self.action_limit)[0]

    def test(self):
        env = gym.make('Pendulum-v0')
        while True:
            state = env.reset()
            done = False
            while not done:
                env.render()
                action = self.get_action(state, 0)
                next_state, _, done, _ = env.step(action)
                state = next_state

    def run(self):
        from mlagents.envs import UnityEnvironment

        writer = SummaryWriter('runs/td3')
        num_worker = 20
        state_size = 33
        output_size = 4
        epsilon = 1.0
        ep = 0
        train_size = 5

        env = UnityEnvironment(file_name='env/training', worker_id=0)
        default_brain = env.brain_names[0]
        brain = env.brains[default_brain]
        initial_observation = env.reset()

        step = 0
        score = 0

        while True:
            ep += 1
            env_info = env.reset()
            states = np.zeros([num_worker, state_size])
            terminal = False
            self.noise.reset()
            if epsilon > 0.001:
                epsilon = -ep * 0.005 + 1.0
            while not terminal:
                step += 1

                actions = [self.get_action(s, epsilon) for s in states]
                env_info = env.step(actions)[default_brain]

                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done

                terminal = dones[0]

                for s, ns, r, d, a in zip(states, next_states, rewards, dones,
                                          actions):
                    self.memory.append(s, ns, r, d, a)

                score += sum(rewards)

                states = next_states

                if step % train_size == 0:
                    self.update()

            if ep < 1000:
                print('episode :', ep, '| score : ', score, '| epsilon :',
                      epsilon)
                writer.add_scalar('data/reward', score, ep)
                writer.add_scalar('data/epsilon', epsilon, ep)
                writer.add_scalar('data/memory_size', len(self.memory.memory),
                                  ep)
                score = 0
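
A typical entry point for the TD3 class above; the `__main__` guard is an assumption, while the Unity build path ('env/training') and the log directory ('runs/td3') come from `run()` itself.

if __name__ == '__main__':
    agent = TD3()
    # run() trains against the Unity 'env/training' build and logs to runs/td3;
    # test() instead renders a gym Pendulum rollout with the current policy.
    agent.run()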
Example #4
class DQPG:
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.support_size = 32
        self.tau = 0.995
        self.gamma = env_set['gamma']
        self.hidden = env_set['hidden']
        self.batch_size = 64
        self.pi_lr = env_set['pi_lr']
        self.q_lr = env_set['q_lr']
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.noise = OU_noise(self.output_size, self.worker_size)
        self.kappa = 1.0
        self.risk_factor = 0

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)
        self.risk_factor_ph = tf.placeholder(tf.float32)

        with tf.variable_scope('main'):
            self.pi, self.q, self.q_pi = cr.dqpg_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size)

        with tf.variable_scope('target'):
            _, _, self.q_pi_targ = cr.dqpg_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')
        self.backup = tf.stop_gradient(
            tf.expand_dims(self.r_ph, axis=1) +
            self.gamma * tf.expand_dims(1 - self.d_ph, axis=1) * self.q_pi_targ)
        self.quantile_weight = 1.0 - self.risk_factor_ph * (
            2.0 * tf.reshape(
                tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
                [1, self.support_size]) - 1.0)
        self.pi_loss = -tf.reduce_mean(
            tf.reduce_mean(self.q_pi * self.quantile_weight))

        logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1),
                                   [1, self.support_size, 1])
        tau = tf.reshape(
            tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
            [1, self.support_size])
        tau = tf.tile(tf.expand_dims(tau, axis=2), [1, 1, self.support_size])

        theta_loss_tile = tf.tile(tf.expand_dims(self.q, axis=2),
                                  [1, 1, self.support_size])
        Huber_loss = tf.losses.huber_loss(logit_valid_tile,
                                          theta_loss_tile,
                                          reduction=tf.losses.Reduction.NONE,
                                          delta=self.kappa)
        bellman_errors = logit_valid_tile - theta_loss_tile
        Loss = (tf.abs(tau - tf.stop_gradient(tf.to_float(bellman_errors < 0)))
                * Huber_loss) / self.kappa
        self.v_loss = tf.reduce_mean(
            tf.reduce_mean(tf.reduce_sum(Loss, axis=1), axis=1))

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.train_value_op = self.value_optimizer.minimize(
            self.v_loss, var_list=self.q_params)

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        with tf.control_dependencies([self.train_value_op]):
            self.train_pi_op = self.pi_optimizer.minimize(
                self.pi_loss, var_list=self.pi_params)

        with tf.control_dependencies([self.train_pi_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
            self.target_update
        ]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)

    def update(self):
        data = self.memory.get_sample(sample_size=self.batch_size)
        feed_dict = {
            self.x_ph: data['state'],
            self.x2_ph: data['next_state'],
            self.a_ph: data['action'],
            self.r_ph: data['reward'],
            self.d_ph: data['done'],
            self.risk_factor_ph: self.risk_factor
        }
        pi_loss, q_loss, _, _, _ = self.sess.run(self.step_ops,
                                                 feed_dict=feed_dict)
        return q_loss, pi_loss

    def get_action(self, state, epsilon):
        a = self.sess.run(self.pi, feed_dict={self.x_ph: state})
        a += epsilon * self.noise.sample()
        return np.clip(a, -self.action_limit, self.action_limit)

    def run_gym(self):
        import mujoco_py
        import gym

        writer = SummaryWriter('runs/dqpg_' + env_set['env_name'] + '_' +
                               "{:1.1f}".format(self.risk_factor))

        epsilon = 0.1
        ep = 0

        env = gym.make(env_set['env_name'])

        step = 0
        end_ep = env_set['ep_len']
        train_start_ep = 5
        vs = []
        ps = []
        scores = deque(maxlen=10)
        score = 0
        state = env.reset()
        while True:
            ep += 1
            if epsilon > 0.05:
                epsilon = -ep * 2.0 / end_ep + 1.0
            for i in range(1000):
                if ep > end_ep:
                    env.render()
                step += 1

                action = self.get_action([state], epsilon)
                next_state, reward, done, info = env.step(action)

                self.memory.append(state, next_state, reward, done, action[0])
                score += reward
                state = next_state

                if ep > train_start_ep:
                    v, p = self.update()
                    vs.append(v)
                    ps.append(p)

                if done:
                    scores.append(score)
                    score = 0
                    state = env.reset()

            if ep < end_ep:
                print('episode :', ep, '| score : ',
                      "{0:.2f}".format(np.mean(scores)), '| epsilon :',
                      "{0:.2f}".format(epsilon), " | v :",
                      "{0:.2f}".format(np.mean(vs)), " | p :",
                      "{0:.2f}".format(np.mean(ps)))
                writer.add_scalar('data/reward', np.mean(scores), ep)
                writer.add_scalar('data/epsilon', epsilon, ep)
                writer.add_scalar('data/memory_size', len(self.memory.memory),
                                  ep)
                writer.add_scalar('loss/value', np.mean(vs), ep)
                writer.add_scalar('loss/policy', np.mean(ps), ep)
                vs.clear()
                ps.clear()

    def run_unity(self):
        from mlagents.envs import UnityEnvironment

        writer = SummaryWriter('runs/dqpg_' + env_set['env_name'] + '_' +
                               "{:1.1f}".format(self.risk_factor))
        epsilon = 1.0
        ep = 0

        env = UnityEnvironment(file_name='env/' + env_set['env_name'],
                               worker_id=0)
        default_brain = env.brain_names[0]
        env_info = env.reset()[default_brain]

        step = 0
        scores = np.zeros([self.worker_size])
        score = deque(maxlen=10)
        end_ep = env_set['ep_len']
        train_start_ep = 5
        vs = []
        ps = []

        while True:
            ep += 1
            if ep == end_ep + 1:
                env_info = env.reset(train_mode=False)[default_brain]
            states = env_info.vector_observations
            if epsilon > 0.05:
                epsilon = -ep * 2.0 / end_ep + 1.0
            for i in range(1000):
                step += 1

                actions = self.get_action(states, epsilon)
                env_info = env.step(actions)[default_brain]
                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done

                for s, ns, r, d, a in zip(states, next_states, rewards, dones,
                                          actions):
                    self.memory.append(s, ns, r, d, a)

                scores += rewards

                states = next_states
                for idx, d in enumerate(dones):
                    if d:
                        score.append(scores[idx])
                        scores[idx] = 0

                if ep > train_start_ep:
                    v, p = self.update()
                    vs.append(v)
                    ps.append(p)

            if ep < end_ep:
                print('episode :', ep, '| score : ',
                      "{0:.2f}".format(np.mean(score)), '| epsilon :',
                      "{0:.2f}".format(epsilon), " | v :",
                      "{0:.2f}".format(np.mean(vs)), " | p :",
                      "{0:.2f}".format(np.mean(ps)))
                writer.add_scalar('data/reward', np.mean(score), ep)
                writer.add_scalar('data/epsilon', epsilon, ep)
                writer.add_scalar('data/memory_size', len(self.memory.memory),
                                  ep)
                writer.add_scalar('loss/value', np.mean(vs), ep)
                writer.add_scalar('loss/policy', np.mean(ps), ep)
                vs.clear()
                ps.clear()
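
The DQPG class reads its hyperparameters from a module-level `env_set` dict. A sketch of the expected keys follows; the key names come from the code above, but the concrete values here are illustrative assumptions for a small gym MuJoCo task.

env_set = {
    'env_name': 'Reacher-v2',  # gym id for run_gym / Unity build name for run_unity
    'state': 11,               # observation dimension -> self.state_size
    'action': 2,               # action dimension -> self.output_size
    'worker': 1,               # parallel agents -> self.worker_size (Unity only)
    'gamma': 0.99,
    'hidden': [400, 300],
    'pi_lr': 1e-3,
    'q_lr': 1e-3,
    'mem_size': 1e5,           # replay buffer capacity
    'ep_len': 1000,            # episodes of logging before rendering starts
}

agent = DQPG()
agent.run_gym()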