class Worker():
    def __init__(self, env, name, s_size, a_size, trainer, model_path, global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        # Create the local copy of the network and the TensorFlow op that copies
        # the global parameters into the local network.
        self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = env
        self.replaymemory = ReplayMemory(max_memory)

    def train(self, rollout, sess, gamma, ISWeights):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        dones = rollout[:, 4]

        # Select greedy next actions from the mean Q-values of the next states.
        Q_target = sess.run(
            self.local_Q.Q,
            feed_dict={self.local_Q.inputs: np.vstack(next_observations)})
        actions_ = np.argmax(Q_target, axis=1)

        # One-hot encode the taken actions and the greedy next actions, then
        # broadcast the masks across the N quantile atoms.
        action = np.zeros((batch_size, a_size))
        action_ = np.zeros((batch_size, a_size))
        for i in range(batch_size):
            action[i][actions[i]] = 1
            action_[i][actions_[i]] = 1
        action_now = np.zeros((batch_size, a_size, N))
        action_next = np.zeros((batch_size, a_size, N))
        for i in range(batch_size):
            for j in range(a_size):
                for k in range(N):
                    action_now[i][j][k] = action[i][j]
                    action_next[i][j][k] = action_[i][j]

        # Quantile values of the greedy next action.
        q_target = sess.run(
            self.local_Q.q_action,
            feed_dict={
                self.local_Q.inputs: np.vstack(next_observations),
                self.local_Q.actions_q: action_next
            })

        # Distributional Bellman target for each quantile: r + gamma * theta_j(s', a*).
        q_target_batch = []
        for i in range(len(q_target)):
            qi = q_target[i]  # * (1 - dones[i])
            z_target_step = []
            for j in range(len(qi)):
                z_target_step.append(gamma * qi[j] + rewards[i])
            q_target_batch.append(z_target_step)
        q_target_batch = np.array(q_target_batch)

        # Replicate the importance-sampling weights across the quantile dimension.
        isweight = np.zeros((batch_size, N))
        for i in range(batch_size):
            for j in range(N):
                isweight[i, j] = ISWeights[i]

        feed_dict = {
            self.local_Q.inputs: np.vstack(observations),
            self.local_Q.actions_q: action_now,
            self.local_Q.q_target: q_target_batch,
            self.local_Q.ISWeights: isweight
        }
        u, l, g_n, v_n, _ = sess.run([
            self.local_Q.u, self.local_Q.loss, self.local_Q.grad_norms,
            self.local_Q.var_norms, self.local_Q.apply_grads
        ], feed_dict=feed_dict)
        return l / len(rollout), g_n, v_n, Q_target, u

    def work(self, gamma, sess, coord, saver):
        global GLOBAL_STEP
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        epsilon = 0.2
        print("Starting worker " + str(self.number))
        best_mean_episode_reward = -float('inf')
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.reset()
                s = process_frame(s)
                # Anneal the exploration rate once per episode, down to 0.01.
                if epsilon > 0.01:
                    epsilon = epsilon * 0.997
                while not d:
                    # self.env.render()
                    GLOBAL_STEP += 1
                    # Choose an action epsilon-greedily from the mean Q-values.
                    if random.random() > epsilon:
                        a_dist_list = sess.run(
                            self.local_Q.Q,
                            feed_dict={self.local_Q.inputs: [s]})
                        a_dist = a_dist_list[0]
                        a = np.argmax(a_dist)
                    else:
                        a = random.randint(0, 5)
                    s1, r, d, _ = self.env.step(a)
                    if not d:
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                    self.replaymemory.add([s, a, r, s1, d])
                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1

                    # Train every second step once the replay memory has warmed up.
                    if total_steps % 2 == 0 and not d and total_steps > 50000:
                        episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(batch_size)
                        l, g_n, v_n, Q_target, u = self.train(
                            episode_buffer, sess, gamma, ISWeights)
                        # New priorities: mean absolute TD error over the quantiles.
                        u = np.mean(u, axis=1) + 1e-6
                        self.replaymemory.update_priorities(tree_idx, u)
                    if d:
                        break

                sess.run(self.update_local_ops)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)

                # Periodically print training statistics and save model parameters.
                if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory:
                    if self.name == 'worker_0':
                        print('\n episode: ', episode_count,
                              'global_step:', GLOBAL_STEP,
                              'mean episode reward: ', np.mean(self.episode_rewards[-10:]),
                              'epsilon: ', epsilon)
                        print('loss', l, 'Qtargetmean', np.mean(Q_target))
                    if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000:
                        saver.save(sess, self.model_path + '/qr-dqn-' + str(episode_count) + '.cptk')
                        print("Saved Model")
                    mean_reward = np.mean(self.episode_rewards[-100:])
                    if episode_count > 20 and best_mean_episode_reward < mean_reward:
                        best_mean_episode_reward = mean_reward
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
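# A minimal usage sketch, not part of the original listing: one plausible way to
# wire the Worker above into a threaded A3C-style run. The environment id, the
# hyperparameter values, and the 'global' Q_Network scope passed to the workers'
# update_target_graph op are assumptions; Q_Network, Worker, and the module-level
# globals the Worker relies on (GLOBAL_STEP, batch_size, N, max_memory) are taken
# from the code above.
import threading

import gym
import tensorflow as tf

if __name__ == '__main__':
    s_size, a_size = 84 * 84 * 1, 6      # assumed frame and action-space sizes
    gamma, num_workers = 0.99, 4         # assumed hyperparameters
    model_path = './model'               # assumed checkpoint directory

    tf.reset_default_graph()
    global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes',
                                  trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
    # Global network whose parameters the workers copy via update_target_graph.
    master_network = Q_Network(s_size, a_size, 'global', trainer)
    workers = [Worker(gym.make('PongDeterministic-v4'), i, s_size, a_size,
                      trainer, model_path, global_episodes)
               for i in range(num_workers)]

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        saver = tf.train.Saver(max_to_keep=5)
        sess.run(tf.global_variables_initializer())
        threads = []
        for worker in workers:
            t = threading.Thread(target=worker.work, args=(gamma, sess, coord, saver))
            t.start()
            threads.append(t)
        coord.join(threads)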
class Learner():
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]        # incoming transitions from the actors
        self.param_queue = queues[1]  # outgoing network parameters for the actors
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q
        self.actions_q = tf.placeholder(shape=[None, a_size, N], dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        # Quantile values of the selected actions.
        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)

        # Absolute TD error per quantile and the importance-weighted squared loss.
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars
        self.gradients = tf.gradients(self.loss, self.local_vars)
        self.apply_grads = trainer.apply_gradients(
            zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())

    def run(self, gamma, s_size, a_size, batch_size, env):
        print('start learning')
        step, train1 = 0, False
        self.env = env
        while True:
            # Drain every transition the actors have pushed so far.
            while not self.queue.empty():
                t_error = self.queue.get()
                step += 1
                self.replaymemory.add(t_error)

            # Publish fresh parameters whenever the actors have consumed the previous ones.
            if self.param_queue.empty():
                params = self.sess.run(self.local_vars)
                self.param_queue.put(params)

            # Start training once enough transitions have been collected.
            if step >= 10000:
                train1 = True
                step = 0

            if train1:
                episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(batch_size)
                episode_buffer = np.array(episode_buffer)
                observations = episode_buffer[:, 0]
                actions = episode_buffer[:, 1]
                rewards = episode_buffer[:, 2]
                observations_next = episode_buffer[:, 3]
                dones = episode_buffer[:, 4]

                # Select greedy next actions from the mean Q-values of the next states.
                Q_target = self.sess.run(
                    self.Q,
                    feed_dict={self.learner_net.inputs: np.vstack(observations_next)})
                actions_ = np.argmax(Q_target, axis=1)

                # One-hot encode the taken actions and the greedy next actions, then
                # broadcast the masks across the N quantile atoms.
                action = np.zeros((batch_size, a_size))
                action_ = np.zeros((batch_size, a_size))
                for i in range(batch_size):
                    action[i][actions[i]] = 1
                    action_[i][actions_[i]] = 1
                action_now = np.zeros((batch_size, a_size, N))
                action_next = np.zeros((batch_size, a_size, N))
                for i in range(batch_size):
                    for j in range(a_size):
                        for k in range(N):
                            action_now[i][j][k] = action[i][j]
                            action_next[i][j][k] = action_[i][j]

                # Quantile values of the greedy next action.
                q_target = self.sess.run(
                    self.q_action,
                    feed_dict={
                        self.learner_net.inputs: np.vstack(observations_next),
                        self.actions_q: action_next
                    })

                # Distributional Bellman target: r + gamma * (1 - done) * theta_j(s', a*).
                q_target_batch = []
                for i in range(len(q_target)):
                    qi = q_target[i]
                    z_target_step = []
                    for j in range(len(qi)):
                        z_target_step.append(gamma * qi[j] * (1 - dones[i]) + rewards[i])
                    q_target_batch.append(z_target_step)
                q_target_batch = np.array(q_target_batch)

                # Replicate the importance-sampling weights across the quantile dimension.
                isweight = np.zeros((batch_size, N))
                for i in range(batch_size):
                    for j in range(N):
                        isweight[i, j] = ISWeights[i]

                feed_dict = {
                    self.q_target: q_target_batch,
                    self.learner_net.inputs: np.vstack(observations),
                    self.actions_q: action_now,
                    self.ISWeights: isweight
                }
                l, abs_errors, _ = self.sess.run(
                    [self.loss, self.u, self.apply_grads], feed_dict=feed_dict)
                # New priorities: mean absolute TD error over the quantiles.
                abs_errors = np.mean(abs_errors, axis=1) + 1e-6
                self.replaymemory.update_priorities(tree_idx, abs_errors)
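# A minimal wiring sketch, not part of the original listing: one plausible way to
# connect actor processes to the Learner above through its two queues. The queue
# capacities, hyperparameter values, and environment id are assumptions; network,
# Learner, N, and the queue layout (transitions in queues[0], parameters in
# queues[1]) are taken from the code above.
from multiprocessing import Queue

import gym
import tensorflow as tf

if __name__ == '__main__':
    s_size, a_size = 84 * 84 * 1, 6      # assumed frame and action-space sizes
    gamma, batch_size = 0.99, 32         # assumed hyperparameters

    transition_queue = Queue(maxsize=100000)  # actors push [s, a, r, s1, done] here
    param_queue = Queue(maxsize=1)            # learner publishes parameters here

    with tf.Session() as sess:
        trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
        learner = Learner(sess, s_size, a_size, 'learner',
                          [transition_queue, param_queue], trainer)
        # Actor processes (not shown) would build their own graphs, pull the
        # latest parameters from param_queue, act epsilon-greedily in their own
        # environments, and push transitions into transition_queue.
        learner.run(gamma, s_size, a_size, batch_size,
                    gym.make('PongDeterministic-v4'))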