import os
import time

import gym
import numpy as np
import tensorflow as tf

# Project-local dependencies (defined elsewhere in this repo):
# Policy_network, Value_network, ReplayMemory, OrnsteinUhlenbeckActionNoise.


class Ddpg_Agent():
    def __init__(self, params):
        self.env = gym.make('CartPole-v0')
        self.params = params
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Build the four networks in a fixed order and record where each
            # network's variables start in the flat trainable-variable list.
            self.main_actor = Policy_network(params, "primary")
            tvars = tf.trainable_variables()
            tact_start_index = len(tvars)
            self.target_actor = Policy_network(params, "target")
            tvars = tf.trainable_variables()
            mcri_start_index = len(tvars)
            self.main_critic = Value_network(params, "primary")
            tvars = tf.trainable_variables()
            tcri_start_index = len(tvars)
            self.target_critic = Value_network(params, "target")
            self.tvars = tf.trainable_variables()

            self.main_actor_tvars = self.tvars[:tact_start_index]
            self.target_actor_tvars = self.tvars[tact_start_index:mcri_start_index]
            self.main_critic_tvars = self.tvars[mcri_start_index:tcri_start_index]
            self.target_critic_tvars = self.tvars[tcri_start_index:]

            self.main_actor.backprop(self.main_actor_tvars)
            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver()

            # The tf.assign ops below must live in self.graph, so build them
            # inside this context.
            self.actor_targetOps = self.update_TargetGraph(
                self.main_actor_tvars, self.target_actor_tvars, self.params.tau)
            self.critic_targetOps = self.update_TargetGraph(
                self.main_critic_tvars, self.target_critic_tvars, self.params.tau)

        if not os.path.exists(self.params.logdir):
            os.mkdir(self.params.logdir)

        self.myBuffer = ReplayMemory(max_size=self.params.max_buffer_size)
        self.running_reward = None
        self.reward_sum = 0
        self.global_step = 0

    def update_TargetGraph(self, main_tfVar, target_tfVar, tau):
        '''Builds ops that softly move the target network toward the main one.

        Args:
            main_tfVar: trainable variables of the main network.
            target_tfVar: matching trainable variables of the target network.
            tau: update rate (a low tau gives slow target updates).
        Returns:
            op_holder: list of tf.assign() ops, run by update_Target().'''
        assert len(main_tfVar) == len(target_tfVar)
        op_holder = []
        for idx, var in enumerate(main_tfVar):
            # target <- tau * main + (1 - tau) * target
            op_holder.append(target_tfVar[idx].assign(
                (var.value() * tau) + ((1 - tau) * target_tfVar[idx].value())))
        return op_holder

    def update_Target(self, op_holder, sess):
        '''Runs the ops built in update_TargetGraph.'''
        for op in op_holder:
            sess.run(op)

    def _load_model(self, sess, load_ckpt):
        if load_ckpt:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(self.params.logdir)
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # Initialize global variables.
            print('Initialize variables...')
            sess.run(self.init)

    def train(self):
        with tf.Session(graph=self.graph) as sess:
            self._load_model(sess, self.params.load_model)
            self.total_episodes = self.params.total_episodes
            # Obtain an initial observation of the environment.
            state = self.env.reset()
            state_input = state.reshape([1, self.params.input_dim])
            # Create the exploration noise process once (not per step) so the
            # Ornstein-Uhlenbeck state persists across steps.
            actor_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(self.params.num_actions))
            for episode_number in range(self.params.total_episodes):
                done = False
                score = 0
                while not done:
                    if self.global_step > self.params.preTrainStep:
                        # Critic (value network) update from a replay batch.
                        trainBatch = self.myBuffer.sample(self.params.batch_size)
                        batch_state = np.array(trainBatch[0]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_actions = np.array(trainBatch[1]).reshape(
                            [self.params.batch_size, self.params.num_actions])
                        batch_rewards = np.array(trainBatch[2])
                        batch_next_state = np.array(trainBatch[3]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_done = np.array(trainBatch[4])
                        # 0 for terminal transitions, 1 otherwise.
                        end_multiplier = -(batch_done - 1)

                        target_action = sess.run(
                            self.target_actor.det_prob,
                            feed_dict={self.target_actor.input_x: batch_next_state})
                        # One-hot encode the discrete actions.
                        target_action = np.array(
                            [[1, 0] if i == 0 else [0, 1] for i in target_action])
                        targetQ_all = sess.run(
                            self.target_critic.Qout,
                            feed_dict={
                                self.target_critic.input_x: batch_next_state,
                                self.target_critic.actions: target_action
                            })
                        nextQ = np.sum(np.multiply(targetQ_all, target_action),
                                       axis=-1)
                        targetQ = batch_rewards + (
                            self.params.gamma * nextQ * end_multiplier)

                        pred_actions = sess.run(
                            self.main_actor.det_prob,
                            feed_dict={self.main_actor.input_x: batch_state})
                        pred_actions = np.array(
                            [[1, 0] if i == 0 else [0, 1] for i in pred_actions])

                        # Update the critic towards the target values.
                        sess.run(self.main_critic.update_value_model,
                                 feed_dict={
                                     self.main_critic.input_x: batch_state,
                                     self.main_critic.target_Q: targetQ,
                                     self.main_critic.actions: batch_actions
                                 })
                        self.update_Target(self.critic_targetOps, sess)

                        # Actor update: follow the critic's action gradients.
                        gradients = sess.run(self.main_critic.action_grads,
                                             feed_dict={
                                                 self.main_critic.input_x: batch_state,
                                                 self.main_critic.actions: pred_actions
                                             })
                        gradients = np.array(gradients).reshape(
                            self.params.batch_size, self.params.num_actions)
                        sess.run(self.main_actor.optimize,
                                 feed_dict={
                                     self.main_actor.input_x: batch_state,
                                     self.main_actor.action_gradient: gradients
                                 })
                        self.update_Target(self.actor_targetOps, sess)

                    # Collect one transition, exploring with OU noise.
                    state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []
                    action = sess.run(self.main_actor.logits,
                                      feed_dict={
                                          self.main_actor.input_x: state_input
                                      }) + actor_noise()
                    action = np.argmax(action)
                    # Step the environment and get new measurements.
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = next_state.reshape([1, self.params.input_dim])

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    # Penalize termination unless the episode reached the cap.
                    reward_buffer.append(
                        reward if not done or score == 299 else -100)
                    next_state_buffer.append(next_state)
                    done_buffer.append(done)

                    # Move to the next state and accumulate reward.
                    state_input = next_state
                    self.reward_sum += reward
                    score += reward
                    self.global_step += 1
                    self.myBuffer.append(state_buffer, action_buffer,
                                         reward_buffer, next_state_buffer,
                                         done_buffer)

                if episode_number % self.params.update_freq == 0:
                    self.running_reward = (
                        self.reward_sum if self.running_reward is None
                        else self.running_reward * 0.99 + self.reward_sum * 0.01)
                    print('Current Episode {} Average reward for episode {:.2f}. '
                          'Total average reward {:.2f}.'.format(
                              episode_number,
                              self.reward_sum / self.params.update_freq,
                              self.running_reward / self.params.update_freq))
                    self.reward_sum = 0
                    time.sleep(0.5)

                state = self.env.reset()
                state_input = state.reshape([1, self.params.input_dim])
                self.global_step += 1
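
# --- Usage sketch (not part of the original source; the parameter values
# below are assumptions inferred from the attributes this class reads).
# `params` can be any object exposing those attributes, e.g.:
#
#     from types import SimpleNamespace
#     params = SimpleNamespace(
#         logdir='./logs', load_model=False, tau=0.001, gamma=0.99,
#         input_dim=4, num_actions=2, batch_size=32, preTrainStep=1000,
#         max_buffer_size=50000, total_episodes=1000, update_freq=10)
#     agent = Ddpg_Agent(params)
#     agent.train()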
class Worker():
    def __init__(self, params, num, global_episodes, tvars, global_network):
        self.params = params
        self.name = "worker_" + str(num)
        self.number = num
        self.model_path = self.params.logdir
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" + str(self.number))
        self.global_network = global_network

        # Create the local copy of the network and the TensorFlow op to copy
        # global parameters to the local network.
        self.local_AC = AC_network(params, num, tvars, name=self.name)
        self.update_local_ops = self.update_target_graph(
            tvars, self.local_AC.local_vars)

        # Set up the environment (CartPole here) and a per-episode buffer.
        self.actions = None
        self.env = gym.make('CartPole-v0')
        self.myBuffer = ReplayMemory(max_size=self.params.max_ep_length)

    def train(self, sess):
        trainBatch = self.myBuffer.sample(self.total_steps)
        batch_state = np.array(trainBatch[0]).reshape(
            [self.total_steps, self.params.input_dim])
        batch_actions = np.array(trainBatch[1]).reshape(
            [self.total_steps, self.params.num_actions])
        batch_rewards = np.array(trainBatch[2])
        batch_next_state = np.array(trainBatch[3]).reshape(
            [self.total_steps, self.params.input_dim])
        batch_done = np.array(trainBatch[4])
        # 0 for terminal transitions, 1 otherwise.
        end_multiplier = -(batch_done - 1)

        # One-step TD targets and advantages from the sampled batch
        # (per-sample max over the action dimension).
        next_Q = np.max(
            sess.run(self.local_AC.Qout,
                     feed_dict={self.local_AC.input_x: batch_next_state}),
            axis=-1)
        state_value = np.max(
            sess.run(self.local_AC.Qout,
                     feed_dict={self.local_AC.input_x: batch_state}),
            axis=-1)
        batch_target_Q = batch_rewards + (
            self.params.gamma * next_Q * end_multiplier)
        batch_advantages = batch_target_Q - state_value

        # Update the global network using gradients from the loss; the losses
        # are fetched so they can be logged if desired.
        feed_dict = {
            self.local_AC.input_x: batch_state,
            self.local_AC.target_Q: batch_target_Q,
            self.local_AC.actions: batch_actions,
            self.local_AC.advantages: batch_advantages.reshape(self.total_steps, 1)
        }
        v_l, p_l, e_l, _ = sess.run([
            self.local_AC.value_loss, self.local_AC.policy_loss,
            self.local_AC.entropy, self.local_AC.apply_grads
        ], feed_dict=feed_dict)

    def work(self, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        self.total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                episode_values = []
                episode_reward = []
                episode_step_count = []
                score = 0
                d = False
                state_input = self.env.reset()
                state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []
                while not d:
                    state_input = state_input.reshape(
                        [1, self.params.input_dim])
                    # Run the policy network and sample an action from the
                    # predicted policy.
                    curr_policy = sess.run(
                        self.local_AC.probability,
                        feed_dict={self.local_AC.input_x: state_input})
                    action = np.random.choice(
                        np.arange(len(curr_policy)), p=curr_policy)
                    # Step the environment and get new measurements.
                    next_state, reward, d, _ = self.env.step(action)
                    next_state = next_state.reshape([1, self.params.input_dim])

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    # Penalize termination unless the episode reached the cap.
                    reward_buffer.append(
                        reward if not d or score == 399 else -200)
                    next_state_buffer.append(next_state)
                    done_buffer.append(d)

                    score += reward
                    self.total_steps += 1
                    state_input = next_state
                    self.myBuffer.append(state_buffer, action_buffer,
                                         reward_buffer, next_state_buffer,
                                         done_buffer)

                episode_reward.append(score)
                episode_step_count.append(self.total_steps)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the episode buffer at the end of
                # the episode, then sync the local copy with the global
                # parameters.
                if self.myBuffer is not None:
                    self.train(sess)
                    self.update_Target(self.update_local_ops, sess)
                    self.myBuffer.reset()
                    self.total_steps = 0

                # Periodically save model parameters and evaluate the policy.
                if episode_count % 10 == 0 and episode_count != 0:
                    if episode_count % 100 == 0 and self.name == 'worker_0':
                        saver.save(sess, self.model_path + '/model-' +
                                   str(episode_count) + '.cptk')
                        print("Saved Model")
                    if self.name == "worker_0":
                        curr_reward = 0
                        for i in range(5):
                            test_done = False
                            state = self.env.reset()
                            while not test_done:
                                state = state.reshape(1, self.params.input_dim)
                                curr_policy = sess.run(
                                    self.global_network.probability,
                                    feed_dict={
                                        self.global_network.input_x: state
                                    })
                                # Get the action from the predicted policy.
                                action = np.random.choice(
                                    np.arange(len(curr_policy)), p=curr_policy)
                                # Step the environment and get new measurements.
                                next_state, reward, test_done, _ = self.env.step(
                                    action)
                                curr_reward += reward
                                state = next_state
                        print("Episode: {}, Current global reward: {:.1f}".format(
                            episode_count, curr_reward / 5))
                        time.sleep(0.5)

                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
                if (episode_count > self.params.total_episodes
                        and self.name == "worker_0"):
                    coord.request_stop()

    def update_target_graph(self, from_vars, to_vars):
        '''Builds ops that copy the global parameters into this worker's copy.'''
        op_holder = []
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    def update_Target(self, op_holder, sess):
        '''Runs the ops built in update_target_graph.'''
        for op in op_holder:
            sess.run(op)
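
# --- Launch sketch (assumption, not in the original source): A3C-style
# workers usually run on Python threads under a tf.train.Coordinator. The
# constructor arguments for the global AC_network copy are an assumption
# based on how Worker builds its local copy above:
#
#     import threading
#     global_episodes = tf.Variable(0, dtype=tf.int32, trainable=False)
#     global_network = AC_network(params, 'global', None, name='global')
#     tvars = tf.trainable_variables()
#     workers = [Worker(params, i, global_episodes, tvars, global_network)
#                for i in range(num_workers)]
#     saver = tf.train.Saver()
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
#         coord = tf.train.Coordinator()
#         threads = []
#         for w in workers:
#             t = threading.Thread(target=w.work, args=(sess, coord, saver))
#             t.start()
#             threads.append(t)
#         coord.join(threads)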
class AC_Agent():
    def __init__(self, params):
        self.env = gym.make('CartPole-v0')
        self.params = params
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.actor = Policy_network(params)
            self.main_critic = Value_network(params, "primary")
            self.target_critic = Value_network(params, "target")
            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver()

            self.tvars = tf.trainable_variables()
            # The three networks are built in order, so the variable list
            # splits into thirds: actor, main critic, target critic.
            main_start_index = int(len(self.tvars) / 3)
            target_start_index = int(2 * len(self.tvars) / 3)
            self.actor_tvars = self.tvars[:main_start_index]
            self.main_critic_tvars = self.tvars[main_start_index:target_start_index]
            self.target_critic_tvars = self.tvars[target_start_index:]

            # The tf.assign ops must live in self.graph, so build them here.
            self.critic_targetOps = self.update_critic_TargetGraph(
                self.main_critic_tvars, self.target_critic_tvars,
                self.params.tau)

        if not os.path.exists(self.params.logdir):
            os.mkdir(self.params.logdir)

        self.running_reward = None
        self.reward_sum = 0
        self.episode_number = 0
        self.global_step = 0
        self.myBuffer = ReplayMemory(max_size=self.params.max_buffer_size)

    def update_critic_TargetGraph(self, main_tfVar, target_tfVar, tau):
        '''Builds ops that softly move the target critic toward the main one.

        Args:
            main_tfVar: trainable variables of the main critic.
            target_tfVar: matching trainable variables of the target critic.
            tau: update rate (a low tau gives slow target updates).
        Returns:
            op_holder: list of tf.assign() ops, run by update_critic_Target().'''
        assert len(main_tfVar) == len(target_tfVar)
        op_holder = []
        for idx, var in enumerate(main_tfVar):
            # target <- tau * main + (1 - tau) * target
            op_holder.append(target_tfVar[idx].assign(
                (var.value() * tau) + ((1 - tau) * target_tfVar[idx].value())))
        return op_holder

    def update_critic_Target(self, op_holder, sess):
        '''Runs the ops built in update_critic_TargetGraph.'''
        for op in op_holder:
            sess.run(op)

    def _load_model(self, sess, load_ckpt):
        if load_ckpt:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(self.params.logdir)
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # Initialize global variables.
            print('Initialize variables...')
            sess.run(self.init)

    def rendering(self, rendering):
        if self.reward_sum / self.params.update_freq >= 180 or rendering:
            self.env.render()
            rendering = True
        # Return the flag so the caller can keep rendering once it turns on.
        return rendering

    def train(self):
        with tf.Session(graph=self.graph) as sess:
            self._load_model(sess, self.params.load_model)
            self.total_episodes = self.params.total_episodes
            # Obtain an initial observation of the environment.
            self.state = self.env.reset()
            state_input = self.state.reshape([1, self.params.input_dim])
            for self.episode_number in range(self.params.total_episodes):
                done = False
                score = 0
                while not done:
                    if self.global_step > self.params.preTrainStep:
                        # Critic (value network) update from a replay batch.
                        trainBatch = self.myBuffer.sample(self.params.batch_size)
                        batch_state = np.array(trainBatch[0]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_actions = np.array(trainBatch[1]).reshape(
                            [self.params.batch_size, self.params.num_actions])
                        batch_rewards = np.array(trainBatch[2])
                        batch_next_state = np.array(trainBatch[3]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_done = np.array(trainBatch[4])
                        # 0 for terminal transitions, 1 otherwise.
                        end_multiplier = -(batch_done - 1)

                        targetQ_all = sess.run(
                            self.target_critic.Qout,
                            feed_dict={self.target_critic.input_x: batch_next_state})
                        targetQ = batch_rewards + (
                            self.params.gamma * np.max(targetQ_all, axis=-1) *
                            end_multiplier)
                        predictedQ_all = sess.run(
                            self.main_critic.Qout,
                            feed_dict={self.main_critic.input_x: batch_state})

                        # Update the critic towards the target values.
                        sess.run(self.main_critic.update_value_model,
                                 feed_dict={
                                     self.main_critic.input_x: batch_state,
                                     self.main_critic.target_Q: targetQ,
                                     self.main_critic.actions: batch_actions
                                 })
                        self.update_critic_Target(self.critic_targetOps, sess)

                        # Advantage: TD target minus the current value
                        # estimate (per-sample max over actions).
                        batch_advantage = (
                            batch_rewards +
                            (self.params.gamma * np.max(targetQ_all, axis=-1) *
                             end_multiplier) -
                            np.max(predictedQ_all, axis=-1))

                        # Policy network update.
                        batch_advantage = batch_advantage.reshape(
                            [self.params.batch_size, 1])
                        sess.run(self.actor.optimize,
                                 feed_dict={
                                     self.actor.input_x: batch_state,
                                     self.actor.input_y: batch_actions,
                                     self.actor.advantages: batch_advantage
                                 })

                    # Collect one transition by sampling from the policy.
                    state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []
                    curr_policy = sess.run(
                        self.actor.probability,
                        feed_dict={self.actor.input_x: state_input})
                    action = np.random.choice(
                        np.arange(len(curr_policy)), p=curr_policy)
                    # Step the environment and get new measurements.
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = next_state.reshape([1, self.params.input_dim])

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    # Penalize termination unless the episode reached the cap.
                    reward_buffer.append(
                        reward if not done or score == 299 else -100)
                    next_state_buffer.append(next_state)
                    done_buffer.append(done)

                    state_input = next_state  # move to the next state
                    self.reward_sum += reward
                    score += reward
                    self.global_step += 1
                    self.myBuffer.append(state_buffer, action_buffer,
                                         reward_buffer, next_state_buffer,
                                         done_buffer)

                if self.episode_number % self.params.update_freq == 0:
                    self.running_reward = (
                        self.reward_sum if self.running_reward is None
                        else self.running_reward * 0.99 + self.reward_sum * 0.01)
                    print('Current Episode {} Average reward for episode {:.2f}. '
                          'Total average reward {:.2f}.'.format(
                              self.episode_number,
                              self.reward_sum / self.params.update_freq,
                              self.running_reward / self.params.update_freq))
                    self.reward_sum = 0
                    time.sleep(0.5)

                self.state = self.env.reset()
                state_input = self.state.reshape([1, self.params.input_dim])
                self.global_step += 1
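
# --- Note on the soft target update used by update_critic_TargetGraph (and
# by Ddpg_Agent.update_TargetGraph above): each target variable is nudged a
# fraction tau toward its main-network counterpart,
#
#     target <- tau * main + (1 - tau) * target,
#
# so the target network is an exponential moving average of the main
# network. A minimal numpy illustration with hypothetical values:
#
#     import numpy as np
#     tau = 0.001
#     main_w = np.array([1.0, 2.0])
#     target_w = np.zeros(2)
#     for _ in range(1000):
#         target_w = tau * main_w + (1 - tau) * target_w
#     # target_w ~= [0.63, 1.26]; it approaches main_w as updates accumulate.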