Code Example #1
    def __init__(self, name, sess, state_space, action_space, lr=1e-4, gamma=0.99, tau=0.98, memory_size=10**6, batch_size=64):
        super().__init__(name, state_space, action_space)

        self.sess = sess
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.memory_size = memory_size
        self.batch_size = batch_size

        self.act_dim = flatten(action_space)

        self.state_ph = tf.placeholder(tf.float32, (None,) + state_space, name='state-ph')
        self.act_ph = tf.placeholder(tf.float32, (None, self.act_dim), name='act-ph')

        with tf.variable_scope('eval'):
            self.eval_scope = tf.get_variable_scope().name
            self.eval_net = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

        with tf.variable_scope('target'):
            self.target_scope = tf.get_variable_scope().name
            self.target_net = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

        with tf.name_scope('update'):
            eval_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.eval_scope)
            target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_scope)

            # hard copy (async_op) and Polyak averaging (soft_async_op) from the eval network to the target network
            self.async_op = [tf.assign(t_var, e_var) for e_var, t_var in zip(eval_vars, target_vars)]
            self.soft_async_op = [tf.assign(t_var, self.tau * e_var + (1. - self.tau) * t_var)
                                  for e_var, t_var in zip(eval_vars, target_vars)]

        with tf.name_scope('optimization'):
            # NOTE (assumption): the loss is left undefined (None) here, which would
            # fail at graph-construction time. The TD-style placeholder below is one
            # plausible choice; a concrete subclass should override it as needed.
            self.target_q_ph = tf.placeholder(tf.float32, (None,), name='target-q-ph')
            selected_q = tf.reduce_sum(self.eval_net * self.act_ph, axis=1)
            self.loss = 0.5 * tf.reduce_mean(tf.square(selected_q - self.target_q_ph))
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        self.replay_buffer = Buffer(self.memory_size)
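
The snippet below is a minimal usage sketch, not part of the source: it assumes a concrete subclass called `Agent` that implements `_construct`, plus hypothetical `state_space`/`action_space` tuples, and shows how the hard and soft target-sync ops defined above would typically be driven.

import tensorflow as tf

sess = tf.Session()
agent = Agent('agent', sess, state_space=(4,), action_space=(2,))  # hypothetical subclass and spaces
sess.run(tf.global_variables_initializer())

sess.run(agent.async_op)       # hard copy: target <- eval (usually once, at the start)
# ... training steps via agent.train_op ...
sess.run(agent.soft_async_op)  # Polyak update: target <- tau * eval + (1 - tau) * target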
Code Example #2
    def __init__(self, name, sess, state_space, action_space, len_episode, actor_lr=1e-4, critic_lr=1e-3, gamma=0.96, epsilon=0.1, update_steps=5):
        super().__init__(name, state_space, action_space)

        self.sess = sess
        self.len_episode = len_episode

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.update_steps = update_steps
        self.act_dim = flatten(action_space)

        self.state_ph = tf.placeholder(tf.float32, (None,) + self.state_space, name='state-ph')
        self.act_ph = tf.placeholder(tf.float32, (None, self.act_dim), name='action-ph')
        self.adv_ph = tf.placeholder(tf.float32, (None,), name='advantage-ph')
        self.cum_r_ph = tf.placeholder(tf.float32, (None,), name='cum-r-ph')

        with tf.variable_scope('new_policy'):
            self.new_a_scope = tf.get_variable_scope().name
            self.new_logits = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

        with tf.variable_scope('old_policy'):
            self.old_a_scope = tf.get_variable_scope().name
            self.old_logits = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

        with tf.variable_scope('critic'):
            self.value = self._construct(input_ph=self.state_ph, out_dim=1)

        with tf.name_scope('optimization'):
            # NOTE (assumption): clipped PPO surrogate. This reconstruction assumes
            # that act_ph holds one-hot actions and that _policy maps logits to
            # per-action probabilities.
            new_prob = tf.reduce_sum(self._policy(self.new_logits) * self.act_ph, axis=1)
            old_prob = tf.reduce_sum(tf.stop_gradient(self._policy(self.old_logits)) * self.act_ph, axis=1)
            ratio = new_prob / (old_prob + 1e-8)

            surrogate = tf.minimum(
                ratio * self.adv_ph,
                tf.clip_by_value(ratio, 1. - self.epsilon, 1. + self.epsilon) * self.adv_ph)
            self.a_loss = -tf.reduce_mean(surrogate)
            self.c_loss = tf.reduce_mean(tf.squared_difference(tf.reshape(self.value, (-1,)), self.cum_r_ph))

            self.a_train_op = tf.train.AdamOptimizer(self.actor_lr).minimize(self.a_loss)
            self.c_train_op = tf.train.AdamOptimizer(self.critic_lr).minimize(self.c_loss)

        with tf.name_scope('update'):
            new_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.new_a_scope)
            old_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.old_a_scope)

            # copy the new policy weights into the old policy (named `sync` because `async` is a reserved keyword in Python 3.7+)
            self.sync = [tf.assign(o_var, n_var) for n_var, o_var in zip(new_vars, old_vars)]

        self.episode = Episode(len_episode)
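
A hedged sketch of the update loop this constructor implies; `agent`, `sess`, and the batch arrays are assumptions, not part of the source. The old policy is synchronized once per rollout, then the actor and critic are optimized for `update_steps` iterations on the same batch.

feed = {
    agent.state_ph: states,          # (batch,) + state_space
    agent.act_ph: one_hot_actions,   # (batch, act_dim), one-hot encoded
    agent.adv_ph: advantages,        # (batch,)
    agent.cum_r_ph: returns,         # (batch,)
}
sess.run(agent.sync)  # old policy <- new policy before the clipped update
for _ in range(agent.update_steps):
    sess.run([agent.a_train_op, agent.c_train_op], feed_dict=feed)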
Code Example #3
    def __init__(self, name, sess, state_space, action_space, len_episode, gamma=0.99, actor_lr=1e-4, critic_lr=1e-3):
        super().__init__(name, state_space, action_space)

        self.sess = sess
        self.len_episode = len_episode
        self.gamma = gamma
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.act_dim = flatten(self.act_space)

        self.state_ph = tf.placeholder(tf.float32, (None,) + self.state_space, name='state-ph')
        self.act_ph = tf.placeholder(tf.int32, (None,), name='act-ph')
        self.cum_r_ph = tf.placeholder(tf.float32, (None,), name='cum_r-ph')

        with tf.variable_scope("emb"):
            emb_layer = self._emb(self.state_ph)

        with tf.variable_scope("policy"):
            self.actor_scope = tf.get_variable_scope().name
            self.policy_logits = self._construct(emb=emb_layer, out_dim=self.act_dim)
            self.policy = tf.nn.softmax(self.policy_logits, axis=1)

        with tf.variable_scope("value"):
            self.critic_scope = tf.get_variable_scope().name
            self.value = self._construct(emb=emb_layer, out_dim=1)

        with tf.name_scope('optimization'):
            act_one_hot = tf.one_hot(self.act_ph, self.act_dim)
            log_policy = tf.reduce_sum(tf.log(self.policy + 1e-8) * act_one_hot, axis=1)

            # advantage: discounted return minus the baseline (no gradient through the critic here)
            advantage = self.cum_r_ph - tf.stop_gradient(tf.reshape(self.value, (-1,)))
            self.a_loss = -tf.reduce_sum(log_policy * advantage)
            self.c_loss = tf.reduce_mean(tf.square(tf.reshape(self.value, (-1,)) - self.cum_r_ph))

            self.a_train_op = tf.train.AdamOptimizer(self.actor_lr).minimize(self.a_loss)

            # restrict the critic update to its own variables so the shared
            # observation embedding is only trained through the actor loss
            c_optimizer = tf.train.AdamOptimizer(self.critic_lr)
            c_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.critic_scope)
            grad_vars = c_optimizer.compute_gradients(self.c_loss, c_vars)

            self.c_train_op = c_optimizer.apply_gradients(grad_vars)

        self.episode = Episode(self.len_episode)
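
For reference, a minimal sketch of how the discounted returns fed into `cum_r_ph` are typically computed; `agent`, `sess`, `states`, `actions`, and `rewards` are assumed to come from one finished episode and are not part of the source.

import numpy as np

def discounted_returns(rewards, gamma):
    """Backward pass computing R_t = r_t + gamma * R_{t+1}."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

feed = {
    agent.state_ph: states,
    agent.act_ph: actions,  # integer action indices, shape (batch,)
    agent.cum_r_ph: discounted_returns(rewards, agent.gamma),
}
sess.run([agent.a_train_op, agent.c_train_op], feed_dict=feed)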
Code Example #4
File: model.py  Project: zhiweixutsinghua/ModelRepo
    def __init__(self, name, sess, state_space, act_space, lr=1e-3, gamma=0.99, use_double=True, use_dueling=True,
                 tau=0.01, batch_size=64, policy_type='e_greedy', memory_size=10**6):
        super(DQN, self).__init__(name, state_space, act_space)

        self.sess = sess

        self.lr = lr
        self.gamma = gamma
        self.use_double = use_double
        self.use_dueling = use_dueling
        self.tau = tau
        self.act_dim = flatten(act_space)
        self.batch_size = batch_size
        self.policy_type = policy_type

        self.replay_buffer = Buffer(memory_size)

        self.state_ph = tf.placeholder(tf.float32, (None,) + self.state_space, name='state-ph')
        self.target_q_ph = tf.placeholder(tf.float32, (None,), name='target-q-ph')
        self.act_ph = tf.placeholder(tf.int32, (None,), name='act-ph')

        with tf.variable_scope('eval-net'):
            self.eval_scope = tf.get_variable_scope().name
            self.eval_q_tf = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

            one_hot = tf.one_hot(self.act_ph, self.act_dim)
            self.selected_q_tf = tf.reduce_sum(self.eval_q_tf * one_hot, axis=1, name='selected-q-tf')

        with tf.variable_scope('target-net'):
            self.target_scope = tf.get_variable_scope().name
            self.target_q_tf = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

        with tf.name_scope('update'):
            e_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.eval_scope)
            t_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_scope)

            # target <- eval: hard copy (sync) and Polyak averaging (soft_sync)
            self.sync = [tf.assign(t_var, e_var) for e_var, t_var in zip(e_vars, t_vars)]
            self.soft_sync = [tf.assign(t_var, self.tau * e_var + (1. - self.tau) * t_var)
                              for e_var, t_var in zip(e_vars, t_vars)]

        with tf.name_scope('optimization'):
            self.loss = 0.5 * tf.reduce_mean(tf.square(self.selected_q_tf - self.target_q_ph))
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        with tf.name_scope('cal_target'):
            if self.use_double:
                # Double DQN: the eval net picks the argmax action, the target net scores it
                act = tf.argmax(self.eval_q_tf, axis=1)
                one_hot = tf.one_hot(act, self.act_dim)
                self.reduced_target_tf = tf.reduce_sum(self.target_q_tf * one_hot, axis=1)
            else:
                self.reduced_target_tf = tf.reduce_max(self.target_q_tf, axis=1)

        with tf.name_scope('policy'):
            self.exploration_ph = tf.placeholder(tf.float32, None, name='exploration-ph')

            if self.policy_type == 'e_greedy':
                # boolean flag: True means "act randomly this step" (exploration_ph holds epsilon)
                self.policy = tf.random.uniform((1,), minval=0., maxval=1.) < self.exploration_ph
            elif self.policy_type == 'boltzman':
                # Boltzmann (softmax) exploration with temperature exploration_ph
                self.policy = tf.nn.softmax(self.eval_q_tf / self.exploration_ph)
            else:
                raise NotImplementedError
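
A hedged sketch of one training step with this graph; the `replay_buffer.sample` signature and the NumPy batch arrays are assumptions. The bootstrapped target is obtained by feeding the next states into `state_ph` (which covers both branches of `cal_target`), and the TD loss is then minimized on the current states.

states, actions, rewards, next_states, dones = agent.replay_buffer.sample(agent.batch_size)  # hypothetical API

next_q = sess.run(agent.reduced_target_tf, feed_dict={agent.state_ph: next_states})
target_q = rewards + agent.gamma * (1. - dones) * next_q

sess.run(agent.train_op, feed_dict={
    agent.state_ph: states,
    agent.act_ph: actions,       # integer action indices
    agent.target_q_ph: target_q,
})
sess.run(agent.soft_sync)  # Polyak-average the target network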
Code Example #5
    def __init__(self,
                 name,
                 sess,
                 state_space,
                 action_space,
                 gamma=0.98,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 memory_size=10**3,
                 batch_size=64,
                 tau=0.01,
                 grad_norm=5.0):
        super(DDPG, self).__init__(name, state_space, action_space)

        self.sess = sess
        self.gamma = gamma
        self.a_lr = actor_lr
        self.c_lr = critic_lr
        self.tau = tau
        self.batch_size = batch_size

        self.act_dim = flatten(action_space)
        self.replay_buffer = Buffer(memory_size)

        self.state_ph = tf.placeholder(tf.float32, (None, ) + state_space,
                                       name='state-ph')
        self.next_state_ph = tf.placeholder(tf.float32, (None, ) + state_space,
                                            name='next_state-ph')
        self.reward_ph = tf.placeholder(tf.float32, (None, ), name='reward-ph')
        self.done_ph = tf.placeholder(tf.float32, (None, ), name='done-ph')

        with tf.variable_scope('policy'):
            p_scope = tf.get_variable_scope().name
            self.logits = self._construct(input_ph=self.state_ph,
                                          out_dim=self.act_dim)
            self.policy = tf.nn.softmax(self.logits)

        with tf.variable_scope('target_policy'):
            t_p_scope = tf.get_variable_scope().name
            self.target_logits = self._construct(input_ph=self.next_state_ph,
                                                 out_dim=self.act_dim)
            self.target_policy = tf.nn.softmax(self.target_logits)

        with tf.variable_scope('value'):
            q_scope = tf.get_variable_scope().name
            self.q = self._construct(input_ph=tf.concat(
                [self.state_ph, self.policy], axis=1),
                                     out_dim=1)

        with tf.variable_scope('target_value'):
            t_q_scope = tf.get_variable_scope().name
            self.target_q = self._construct(input_ph=tf.concat(
                [self.next_state_ph, self.target_policy], axis=1),
                                            out_dim=1)

            # one-step Bellman target: r + gamma * (1 - done) * Q_target(s', pi_target(s'))
            self.next_q = gamma * (1. - self.done_ph) * tf.reshape(
                self.target_q, (-1, )) + self.reward_ph

        with tf.name_scope('update'):
            e_p_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=p_scope)
            t_p_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=t_p_scope)

            e_q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=q_scope)
            t_q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=t_q_scope)

            self.sync = [
                tf.assign(t_var, e_var)
                for e_var, t_var in zip(e_p_vars, t_p_vars)
            ] + [
                tf.assign(t_var, e_var)
                for e_var, t_var in zip(e_q_vars, t_q_vars)
            ]

            self.soft_sync = [
                tf.assign(t_var, tau * e_var + (1. - tau) * t_var)
                for e_var, t_var in zip(e_p_vars, t_p_vars)
            ] + [
                tf.assign(t_var, tau * e_var + (1. - tau) * t_var)
                for e_var, t_var in zip(e_q_vars, t_q_vars)
            ]

        with tf.name_scope('optimization'):
            policy_loss = -tf.reduce_mean(self.q)
            value_loss = tf.reduce_mean(
                tf.square(
                    tf.stop_gradient(self.next_q) -
                    tf.reshape(self.q, (-1, ))))

            optimizer = tf.train.AdamOptimizer(self.a_lr)
            grad_vars = optimizer.compute_gradients(
                policy_loss,
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=p_scope))
            grad_vars = [(tf.clip_by_value(grad, -1., 1.), _var)
                         for grad, _var in grad_vars]
            self.p_train = optimizer.apply_gradients(grad_vars)

            optimizer = tf.train.AdamOptimizer(self.c_lr)
            grad_vars = optimizer.compute_gradients(
                value_loss,
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=q_scope))
            self.c_train = optimizer.apply_gradients(grad_vars)

            self.a_loss = policy_loss
            self.c_loss = value_loss
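
A minimal training-step sketch under similar assumptions (the buffer's `sample` method is hypothetical; everything else comes from the constructor above). Because the Bellman target `next_q` is wired into the graph and the critic scores the policy's own action, a single feed of states, next states, rewards, and done flags drives both the critic and the actor updates.

states, _actions, rewards, next_states, dones = agent.replay_buffer.sample(agent.batch_size)  # hypothetical API

feed = {
    agent.state_ph: states,
    agent.next_state_ph: next_states,
    agent.reward_ph: rewards,
    agent.done_ph: dones.astype('float32'),
}
sess.run([agent.c_train, agent.p_train], feed_dict=feed)  # critic and actor updates
sess.run(agent.soft_sync)  # Polyak-average both target networks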