Example #1
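# Shared context assumed by these examples (not shown in the snippets):
# numpy as np, random, and tensorflow as tf are imported at module level, and
# the globals batch_size, a_size, N (number of quantile outputs), max_memory,
# GLOBAL_STEP, process_frame, Q_Network, update_target_graph, and ReplayMemory
# are defined elsewhere in the repository.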
class Worker:
    def __init__(self, env, name, s_size, a_size, trainer, model_path,
                 global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        # Create the local copy of the network and the TensorFlow op that
        # copies the global parameters into the local network.
        self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = env
        self.replaymemory = ReplayMemory(max_memory)

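    # One gradient step on a prioritized-replay minibatch. Returns the loss,
    # gradient/variable norms, the next-state Q-values, and the per-sample
    # errors used to refresh the replay priorities.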
    def train(self, rollout, sess, gamma, ISWeights):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        dones = rollout[:, 4]

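        # Greedy next actions are chosen from the expected Q-values; their
        # per-quantile values are evaluated below to form the target.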
        Q_target = sess.run(
            self.local_Q.Q,
            feed_dict={self.local_Q.inputs: np.vstack(next_observations)})
        actions_ = np.argmax(Q_target, axis=1)

        action = np.zeros((batch_size, a_size))
        action_ = np.zeros((batch_size, a_size))
        for i in range(batch_size):
            action[i][actions[i]] = 1
            action_[i][actions_[i]] = 1
        action_now = np.zeros((batch_size, a_size, N))
        action_next = np.zeros((batch_size, a_size, N))
        for i in range(batch_size):
            for j in range(a_size):
                for k in range(N):
                    action_now[i][j][k] = action[i][j]
                    action_next[i][j][k] = action_[i][j]

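        # Per-quantile values of the greedy next actions (shape: batch x N).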
        q_target = sess.run(self.local_Q.q_action,
                            feed_dict={
                                self.local_Q.inputs:
                                np.vstack(next_observations),
                                self.local_Q.actions_q: action_next
                            })
        q_target_batch = []
        for i in range(len(q_target)):
            # Zero the bootstrapped value at terminal states; the original had
            # this factor commented out, letting targets leak past episode ends.
            qi = q_target[i] * (1 - dones[i])
            z_target_step = []
            for j in range(len(qi)):
                z_target_step.append(gamma * qi[j] + rewards[i])
            q_target_batch.append(z_target_step)
        q_target_batch = np.array(q_target_batch)
        isweight = np.zeros((batch_size, N))
        for i in range(batch_size):
            for j in range(N):
                isweight[i, j] = ISWeights[i]
        feed_dict = {
            self.local_Q.inputs: np.vstack(observations),
            self.local_Q.actions_q: action_now,
            self.local_Q.q_target: q_target_batch,
            self.local_Q.ISWeights: isweight
        }
        u, l, g_n, v_n, _ = sess.run(
            [self.local_Q.u, self.local_Q.loss, self.local_Q.grad_norms,
             self.local_Q.var_norms, self.local_Q.apply_grads],
            feed_dict=feed_dict)
        return l / len(rollout), g_n, v_n, Q_target, u

    def work(self, gamma, sess, coord, saver):
        global GLOBAL_STEP
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        epsilon = 0.2

        print("Starting worker " + str(self.number))
        best_mean_episode_reward = -float('inf')
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.reset()
                s = process_frame(s)
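                # Decay epsilon once per episode toward a floor of 0.01.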
                if epsilon > 0.01:
                    epsilon = epsilon * 0.997
                while not d:
                    #self.env.render()
                    GLOBAL_STEP += 1
                    # Epsilon-greedy action selection over the local network's Q-values.
                    if random.random() > epsilon:
                        a_dist_list = sess.run(
                            self.local_Q.Q,
                            feed_dict={self.local_Q.inputs: [s]})
                        a_dist = a_dist_list[0]
                        a = np.argmax(a_dist)
                    else:
                        # uniform random action (the original hard-coded 6 actions)
                        a = random.randint(0, a_size - 1)

                    s1, r, d, _ = self.env.step(a)
                    if not d:
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                    self.replaymemory.add([s, a, r, s1, d])
                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1
                    if total_steps % 2 == 0 and not d and total_steps > 50000:
                        episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(
                            batch_size)
                        l, g_n, v_n, Q_target, u = self.train(
                            episode_buffer, sess, gamma, ISWeights)
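                        # Mean absolute TD error across the N quantiles, plus a
                        # small constant so every replay priority stays positive.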
                        u = np.mean(u, axis=1) + 1e-6
                        self.replaymemory.update_priorities(tree_idx, u)
                        #sess.run(self.update_local_ops)
                    if d:
                        break
                sess.run(self.update_local_ops)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)

                # Periodically log progress and save model parameters.
                if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory:
                    if self.name == 'worker_0':
                        print('\n episode:', episode_count, 'global_step:',
                              GLOBAL_STEP, 'mean episode reward:',
                              np.mean(self.episode_rewards[-10:]),
                              'epsilon:', epsilon)

                    # l and Q_target exist only once training has started
                    # (total_steps > 50000 in the inner loop above).
                    print('loss', l, 'Qtargetmean', np.mean(Q_target))
                    if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000:
                        saver.save(sess, self.model_path + '/qr-dqn-' +
                                   str(episode_count) + '.cptk')
                        print("Saved Model")

                    mean_reward = np.mean(self.episode_rewards[-100:])
                    if episode_count > 20 and best_mean_episode_reward < mean_reward:
                        best_mean_episode_reward = mean_reward

                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
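
A note on the triple loops in train() above: tiling a one-hot action mask across the N quantile outputs reduces to NumPy broadcasting. A minimal sketch (the helper name tiled_one_hot is ours; a_size and N are the globals assumed above):

def tiled_one_hot(actions, a_size, N):
    # (batch, a_size) one-hot rows for the chosen actions
    one_hot = np.eye(a_size)[np.asarray(actions, dtype=np.int64)]
    # broadcast each row across the N quantile outputs -> (batch, a_size, N)
    return np.repeat(one_hot[:, :, np.newaxis], N, axis=2)

action_now = tiled_one_hot(actions, a_size, N)
action_next = tiled_one_hot(actions_, a_size, N)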
Example #2
class Learner:
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q

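        # Placeholders: tiled one-hot action mask, per-quantile targets, and
        # importance-sampling weights; N is the number of quantile outputs.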
        self.actions_q = tf.placeholder(shape=[None, a_size, N],
                                        dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars  #tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        self.gradients = tf.gradients(self.loss, self.local_vars)
        #grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)
        self.apply_grads = trainer.apply_gradients(
            zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())

    def run(self, gamma, s_size, a_size, batch_size, env):
        print('start learning')
        step, train1 = 0, False
        epi_q = []
        self.env = env
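        # Main learner loop: drain incoming transitions into replay memory,
        # publish fresh parameters for the actors, and train on prioritized
        # minibatches once 10000 transitions have been received.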
        while True:
            # Drain every pending transition from the actor queue into replay memory.
            while not self.queue.empty():
                t_error = self.queue.get()
                step += 1
                self.replaymemory.add(t_error)

            if self.param_queue.empty():
                params = self.sess.run(self.local_vars)
                self.param_queue.put(params)

            if step >= 10000:
                train1 = True
                step = 0

            if train1:
                episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(
                    batch_size)
                episode_buffer = np.array(episode_buffer)
                observations = episode_buffer[:, 0]
                actions = episode_buffer[:, 1]
                rewards = episode_buffer[:, 2]
                observations_next = episode_buffer[:, 3]
                dones = episode_buffer[:, 4]
                Q_target = self.sess.run(self.Q,
                                         feed_dict={
                                             self.learner_net.inputs:
                                             np.vstack(observations_next)
                                         })

                actions_ = np.argmax(Q_target, axis=1)

                action = np.zeros((batch_size, a_size))
                action_ = np.zeros((batch_size, a_size))
                for i in range(batch_size):
                    action[i][actions[i]] = 1
                    action_[i][actions_[i]] = 1
                action_now = np.zeros((batch_size, a_size, N))
                action_next = np.zeros((batch_size, a_size, N))
                for i in range(batch_size):
                    for j in range(a_size):
                        for k in range(N):
                            action_now[i][j][k] = action[i][j]
                            action_next[i][j][k] = action_[i][j]
                q_target = self.sess.run(self.q_action,
                                         feed_dict={
                                             self.learner_net.inputs:
                                             np.vstack(observations_next),
                                             self.actions_q:
                                             action_next
                                         })

                q_target_batch = []
                for i in range(len(q_target)):
                    qi = q_target[i]
                    z_target_step = []
                    for j in range(len(qi)):
                        z_target_step.append(gamma * qi[j] * (1 - dones[i]) +
                                             rewards[i])
                    q_target_batch.append(z_target_step)
                q_target_batch = np.array(q_target_batch)

                isweight = np.zeros((batch_size, N))
                for i in range(batch_size):
                    for j in range(N):
                        isweight[i, j] = ISWeights[i]
                feed_dict = {
                    self.q_target: q_target_batch,
                    self.learner_net.inputs: np.vstack(observations),
                    self.actions_q: action_now,
                    self.ISWeights: isweight
                }

                l, abs_errors, _ = self.sess.run(
                    [self.loss, self.u, self.apply_grads], feed_dict=feed_dict)
                abs_errors = np.mean(abs_errors, axis=1) + 1e-6

                self.replaymemory.update_priorities(tree_idx, abs_errors)
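
A note on the target loops in run() above: they also reduce to a broadcast. A minimal sketch, assuming rewards and dones have been cast to float arrays of shape (batch_size,) and q_target is the (batch_size, N) array computed above:

rewards_f = rewards.astype(np.float64)
dones_f = dones.astype(np.float64)
# distributional target per quantile, with terminal states masked
q_target_batch = rewards_f[:, None] + gamma * q_target * (1.0 - dones_f[:, None])
# broadcast the per-sample importance weights across the N quantiles
isweight = np.tile(np.asarray(ISWeights, dtype=np.float64).reshape(-1, 1), (1, N))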