Example #1
    def __init__(self, a_dim, s_dim):
        self.a_dim = a_dim
        self.s_dim = s_dim
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_capacity
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment

        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.learn_step_counter = 0

        self.memory = Memory(capacity=memory_capacity)
        #        self.memory = np.zeros((self.memory_size, self.s_dim * 2 + 2))
        self.ISWeights = tf.placeholder(tf.float32, [None, 1], 'ISWeights')

        self._build_net()

        self.t_params = tf.get_collection('target_net_params')
        self.e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [
            tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)
        ]
        #        self.global_steps = tf.Variable(0, trainable=False)
        #        self.global_steps = 2000
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.t_params = self.sess.run(self.t_params)
        self.e_params = self.sess.run(self.e_params)

        self.cost_history = []
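
This fragment reads its hyperparameters from module-level names and relies on a prioritized-replay Memory class (apparently a SumTree-style buffer exposing store, sample and batch_update), none of which are shown; the same names are assumed again by the full class in Example #4. A minimal placeholder setup, with purely illustrative values, could look like this:

# Assumed module-level setup for the fragment above -- the values are
# illustrative guesses, not taken from the original project.
import numpy as np
import tensorflow as tf          # TensorFlow 1.x style API (tf.compat.v1 in TF2)

learning_rate = 0.01             # RMSProp learning rate
reward_decay = 0.9               # discount factor gamma
e_greedy = 0.9                   # maximum epsilon for epsilon-greedy
replace_target_iter = 200        # hard target-network update period (in learn steps)
memory_capacity = 10000          # prioritized replay buffer capacity
batch_size = 32                  # minibatch size
e_greedy_increment = 0.001       # per-learn-step epsilon increase (or None)
MAX_EPISODES = 500               # used by the Saver in Example #4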
Example #2
    def __init__(
        self,
        a_dim,
        s_dim,
        a_bound,
    ):
        #        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.memory = Memory(capacity=MEMORY_CAPACITY)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        #        self.ISWeight = tf.placeholder(tf.float32, [None, 1], 'ISWeight')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)
            a_ = self._build_a(self.S_, scope='target', trainable=False)
        with tf.variable_scope('Critic'):
            # assign self.a = a in memory when calculating q for td_error,
            # otherwise the self.a is from Actor when updating Actor
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)

        # networks parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Critic/target')

        # target net replacement
        self.soft_replace = [[
            tf.assign(ta, (1 - TAU) * ta + TAU * ea),
            tf.assign(tc, (1 - TAU) * tc + TAU * ec)
        ] for ta, ea, tc, ec in zip(self.at_params, self.ae_params,
                                    self.ct_params, self.ce_params)]

        q_target = self.R + GAMMA * q_
        # in the feed_dic for the td_error, the self.a should change to actions in memory
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
        self.td_error_up = tf.abs(q_target - q)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(
            td_error, var_list=self.ce_params)

        a_loss = -tf.reduce_mean(q)  # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(
            a_loss, var_list=self.ae_params)

        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(max_to_keep=MAX_EPISODES)
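
The soft_replace list built in this constructor applies the Polyak-style update theta_target <- (1 - TAU) * theta_target + TAU * theta_eval to every actor and critic target parameter. A tiny NumPy sketch (TAU value assumed for illustration) of what a single application of that rule does:

import numpy as np

TAU = 0.01                                   # assumed soft-replacement rate

target_param = np.array([0.0, 0.0])
eval_param = np.array([1.0, 2.0])

# one soft replacement: the target parameters drift slowly towards the eval net
target_param = (1 - TAU) * target_param + TAU * eval_param
print(target_param)                          # [0.01 0.02]
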
Example #3
class DDPG(object):
    def __init__(
        self,
        a_dim,
        s_dim,
        a_bound,
    ):
        #        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.memory = Memory(capacity=MEMORY_CAPACITY)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        self.ISWeights = tf.placeholder(tf.float32, [None, 1], 'ISWeights')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)
            a_ = self._build_a(self.S_, scope='target', trainable=False)
        with tf.variable_scope('Critic'):
            # assign self.a = a in memory when calculating q for td_error,
            # otherwise the self.a is from Actor when updating Actor
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)

        # networks parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Critic/target')

        # target net replacement
        self.soft_replace = [[
            tf.assign(ta, (1 - TAU) * ta + TAU * ea),
            tf.assign(tc, (1 - TAU) * tc + TAU * ec)
        ] for ta, ea, tc, ec in zip(self.at_params, self.ae_params,
                                    self.ct_params, self.ce_params)]

        q_target = self.R + GAMMA * q_
        # in the feed_dic for the td_error, the self.a should change to actions in memory
        # per-sample squared TD error, weighted by the importance-sampling weights
        td_error = tf.reduce_mean(
            self.ISWeights * tf.squared_difference(q_target, q))
        self.td_error_up = tf.abs(q_target - q) * self.ISWeights
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(
            td_error, var_list=self.ce_params)

        a_loss = -tf.reduce_mean(q)  # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(
            a_loss, var_list=self.ae_params)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        # soft target replacement
        self.sess.run(self.soft_replace)

        #        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        #        bt = self.memory[indices, :]
        tree_index, bt, ISWeights = self.memory.sample(BATCH_SIZE)
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim:self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1:-self.s_dim]
        bs_ = bt[:, -self.s_dim:]

        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(
            self.ctrain, {
                self.S: bs,
                self.a: ba,
                self.R: br,
                self.S_: bs_,
                self.ISWeights: ISWeights
            })

        abs_td_error = self.sess.run(
            self.td_error_up, {
                self.S: bs,
                self.a: ba,
                self.R: br,
                self.S_: bs_,
                self.ISWeights: ISWeights
            })
        self.memory.batch_update(tree_index, abs_td_error)

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, r, s_))
        #        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        #        self.memory[index, :] = transition
        self.memory.store(transition)
        self.pointer += 1

    def _build_a(self, s, scope, trainable):
        with tf.variable_scope(scope):
            net1 = tf.layers.dense(s,
                                   200,
                                   activation=tf.nn.relu,
                                   name='l1',
                                   trainable=trainable)
            net2 = tf.layers.dense(net1,
                                   100,
                                   activation=tf.nn.relu,
                                   name='l2',
                                   trainable=trainable)
            net3 = tf.layers.dense(net2,
                                   50,
                                   activation=tf.nn.relu,
                                   name='l3',
                                   trainable=trainable)
            a = tf.layers.dense(net3,
                                self.a_dim,
                                activation=tf.nn.sigmoid,
                                name='a',
                                trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            n_l1 = 200
            n_l2 = 100
            n_l3 = 50
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1],
                                   trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1],
                                   trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            w2 = tf.get_variable('w2', [n_l1, n_l2], trainable=trainable)
            b2 = tf.get_variable('b2', [1, n_l2], trainable=trainable)
            w3 = tf.get_variable('w3', [n_l2, n_l3], trainable=trainable)
            b3 = tf.get_variable('b3', [1, n_l3], trainable=trainable)
            net1 = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            net2 = tf.nn.relu(tf.matmul(net1, w2) + b2)
            net3 = tf.nn.relu(tf.matmul(net2, w3) + b3)
            return tf.layers.dense(net3, 1, trainable=trainable)  # Q(s,a)

    def load_partial_weights(self):
        for tv in tf.trainable_variables():
            print(tv.name)
        variables_to_restore = slim.get_variables_to_restore(include=[
            'Actor/eval/l1', 'Actor/eval/l2', 'Actor/eval/l3',
            'Critic/eval/w1_s', 'Critic/eval/w1_a', 'Critic/eval/b1',
            'Critic/eval/w2', 'Critic/eval/b2', 'Critic/eval/w3',
            'Critic/eval/b3'
        ])
        self.saver = tf.train.Saver(variables_to_restore)
        self.saver.restore(
            self.sess, os.path.join('Checkpoints/Prius', 'save_net.ckpt-500'))

    def savemodel(self):
        self.saver = tf.train.Saver(max_to_keep=MAX_EPISODES)
        self.saver.save(self.sess,
                        'Checkpoints/Series_transfer/save_net.ckpt',
                        global_step=step_episode)
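
A rough sketch of how this DDPG class is typically driven. It assumes the module-level constants the class refers to (MEMORY_CAPACITY, TAU, GAMMA, LR_A, LR_C, BATCH_SIZE, MAX_EPISODES), a gym-style env, and the episode/step limits and warm-up threshold used below; none of these come from the original snippet.

# Hypothetical training loop for the DDPG class above; `env`, MAX_EP_STEPS
# and the warm-up size of 1000 transitions are assumptions for illustration.
ddpg = DDPG(a_dim=2, s_dim=4, a_bound=1.0)   # example dimensions

for episode in range(MAX_EPISODES):
    s = env.reset()
    for step in range(MAX_EP_STEPS):
        a = ddpg.choose_action(s)            # sigmoid output scaled to [0, a_bound]
        s_, r, done, _ = env.step(a)
        ddpg.store_transition(s, a, r, s_)   # stored in the prioritized Memory
        if ddpg.pointer > 1000:              # learn once the buffer has some data
            ddpg.learn()                     # soft target update + actor/critic step
        s = s_
        if done:
            break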
Example #4
class DeepQNetwork():
    def __init__(self, a_dim, s_dim):
        self.a_dim = a_dim
        self.s_dim = s_dim
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_capacity
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment

        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.learn_step_counter = 0

        self.memory = Memory(capacity=memory_capacity)
        #        self.memory = np.zeros((self.memory_size, self.s_dim * 2 + 2))
        self.ISWeights = tf.placeholder(tf.float32, [None, 1], 'ISWeights')

        self._build_net()

        self.t_params = tf.get_collection('target_net_params')
        self.e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [
            tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)
        ]
        #        self.global_steps = tf.Variable(0, trainable=False)
        #        self.global_steps = 2000
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.t_params = self.sess.run(self.t_params)
        self.e_params = self.sess.run(self.e_params)

        self.cost_history = []

    def _build_net(self):
        #--------------------build eval_net------------------
        self.s = tf.placeholder(tf.float32, [None, self.s_dim], name='s')
        self.q_target = tf.placeholder(tf.float32, [None, self.a_dim],
                                       name='q_target')

        with tf.variable_scope('eval_net'):
            c_names, n_unit, w_initializer, b_initializer = \
            ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 200,\
            tf.random_normal_initializer(0, 0.3), tf.constant_initializer(0.1)

            with tf.variable_scope('layer1'):
                w1 = tf.get_variable('w1', [self.s_dim, n_unit],
                                     initializer=w_initializer,
                                     collections=c_names)
                b1 = tf.get_variable('b1', [1, n_unit],
                                     initializer=b_initializer,
                                     collections=c_names)
                layer1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)

            with tf.variable_scope('layer2'):
                w2 = tf.get_variable('w2', [200, 100],
                                     initializer=w_initializer,
                                     collections=c_names)
                b2 = tf.get_variable('b2', [1, 100],
                                     initializer=b_initializer,
                                     collections=c_names)
                layer2 = tf.nn.relu(tf.matmul(layer1, w2) + b2)

            with tf.variable_scope('layer3'):
                w3 = tf.get_variable('w3', [100, 50],
                                     initializer=w_initializer,
                                     collections=c_names)
                b3 = tf.get_variable('b3', [1, 50],
                                     initializer=b_initializer,
                                     collections=c_names)
                layer3 = tf.nn.relu(tf.matmul(layer2, w3) + b3)

            with tf.variable_scope('layer4'):
                w4 = tf.get_variable('w4', [50, self.a_dim],
                                     initializer=w_initializer,
                                     collections=c_names)
                b4 = tf.get_variable('b4', [1, self.a_dim],
                                     initializer=b_initializer,
                                     collections=c_names)
                self.q_eval = tf.matmul(layer3, w4) + b4

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(
                self.ISWeights *
                tf.squared_difference(self.q_target, self.q_eval))

        with tf.variable_scope('train_op'):
            self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(
                self.loss)

        #-----------------build target network  --------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.s_dim], name='s_')
        with tf.variable_scope('target_net'):
            c_names, n_unit, w_initializer, b_initializer = \
            ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 200,\
            tf.random_normal_initializer(0, 0.3), tf.constant_initializer(0.1)

            with tf.variable_scope('layer1'):
                w1 = tf.get_variable('w1', [self.s_dim, n_unit],
                                     initializer=w_initializer,
                                     collections=c_names)
                b1 = tf.get_variable('b1', [1, n_unit],
                                     initializer=b_initializer,
                                     collections=c_names)
                layer1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)

            with tf.variable_scope('layer2'):
                w2 = tf.get_variable('w2', [200, 100],
                                     initializer=w_initializer,
                                     collections=c_names)
                b2 = tf.get_variable('b2', [1, 100],
                                     initializer=b_initializer,
                                     collections=c_names)
                layer2 = tf.nn.relu(tf.matmul(layer1, w2) + b2)

            with tf.variable_scope('layer3'):
                w3 = tf.get_variable('w3', [100, 50],
                                     initializer=w_initializer,
                                     collections=c_names)
                b3 = tf.get_variable('b3', [1, 50],
                                     initializer=b_initializer,
                                     collections=c_names)
                layer3 = tf.nn.relu(tf.matmul(layer2, w3) + b3)

            with tf.variable_scope('layer4'):
                w4 = tf.get_variable('w4', [50, self.a_dim],
                                     initializer=w_initializer,
                                     collections=c_names)
                b4 = tf.get_variable('b4', [1, self.a_dim],
                                     initializer=b_initializer,
                                     collections=c_names)
                self.q_next = tf.matmul(layer3, w4) + b4

        # the persistent session is created and initialised in __init__;
        # only the saver needs to be built here
        self.saver = tf.train.Saver(max_to_keep=MAX_EPISODES)

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0

        transition = np.hstack((s, a, r, s_))
        #        index = self.memory_counter % self.memory_size
        #        self.memory[index, :] = transition
        self.memory.store(transition)
        self.memory_counter += 1

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            action_value = self.sess.run(self.q_eval,
                                         feed_dict={self.s: observation})
            action = np.argmax(action_value)
        else:
            action = np.random.randint(0, self.a_dim)

        return action

    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
#            print('\ntarget_params_replaced\n')

#        if self.memory_counter > self.memory_size:
#            sample_index = np.random.choice(self.memory_size, size = self.batch_size)
#        else:
#            sample_index = np.random.choice(self.memory_counter, size = self.batch_size)

#        batch_memory = self.memory[sample_index, :]

        tree_index, batch_memory, ISWeights = self.memory.sample(
            self.batch_size)

        q_eval, q_next = self.sess.run(
            [self.q_eval, self.q_next],
            feed_dict={
                self.s: batch_memory[:, :self.s_dim],
                self.s_: batch_memory[:, -self.s_dim:]
            })
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.s_dim].astype(int)
        reward = batch_memory[:, self.s_dim + 1]

        q_target[batch_index,
                 eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        abs_td_error = np.abs(q_target[batch_index, eval_act_index] - q_eval[
            batch_index, eval_act_index]) * np.array(ISWeights).flatten()
        self.memory.batch_update(tree_index, abs_td_error)

        _, self.cost = self.sess.run(
            [self.train_op, self.loss],
            feed_dict={
                self.s: batch_memory[:, :self.s_dim],
                self.q_target: q_target,
                self.ISWeights: ISWeights
            })
        self.cost_history.append(self.cost)
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        exploration = self.epsilon
        self.learn_step_counter += 1

        return exploration


#    def plot_cost(self):
#        import matplotlib.pyplot as plt
#        plt.plot(np.arange(len(self.cost_history)), self.cost_history)
#        plt.xlabel('training step')
#        plt.ylabel('cost')
#        plt.show()

    def savemodel(self):
        self.saver.save(self.sess,
                        'Checkpoints/DQN/save_net.ckpt',
                        global_step=step_episode)
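
And a corresponding sketch for the prioritized-replay DeepQNetwork above, again assuming a gym-style env with a discrete action space and arbitrary episode limits:

# Hypothetical training loop for the DeepQNetwork class above; `env` and the
# episode count are assumptions, not part of the original snippet.
dqn = DeepQNetwork(a_dim=3, s_dim=4)         # example dimensions

for episode in range(500):
    s = env.reset()
    while True:
        a = dqn.choose_action(s)             # epsilon-greedy over q_eval
        s_, r, done, _ = env.step(a)
        dqn.store_transition(s, a, r, s_)
        if dqn.memory_counter > dqn.batch_size:
            exploration = dqn.learn()        # returns the current epsilon
        s = s_
        if done:
            break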