def test(self, num_actions):
    self.saver.restore(self.session, FLAGS.checkpoint_path)
    print "Restored model weights from ", FLAGS.checkpoint_path
    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
    env = Env(monitor_env, FLAGS.width, FLAGS.height,
              FLAGS.history_length, FLAGS.game_type)

    for i_episode in xrange(FLAGS.num_eval_episodes):
        state = env.get_initial_state()
        episode_reward = 0
        done = False

        # create the state sequence, with the initial state as its newest entry
        state_sequence = np.zeros((t_max, FLAGS.history_length,
                                   FLAGS.width, FLAGS.height))
        state_sequence[t_max - 1, :, :, :] = state

        while not done:
            monitor_env.render()
            q_values = self.q_values.eval(
                session=self.session,
                feed_dict={self.state: [state_sequence]})
            action_index = np.argmax(q_values)
            new_state, reward, done = env.step(action_index)
            state = new_state

            # update the state sequence: drop the oldest state, append the newest
            state_sequence = np.delete(state_sequence, 0, 0)
            state_sequence = np.insert(state_sequence, t_max - 1, state, 0)

            episode_reward += reward

        print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)

    monitor_env.monitor.close()
def test(self, env):
    # initialize environment
    env = Env(env, 84, 84, 4)
    terminal = False

    # Get initial game observation
    state = env.get_initial_state()

    # episode's reward
    episode_reward = 0

    for _ in range(100):
        while not terminal:
            # forward pass of network. Get probability of all actions
            probs, v = self.sess.run((self.policy, self.state_value),
                                     feed_dict={self.input_state: [state]})
            probs = probs[0]
            v = v[0][0]

            # act greedily with a small amount of random exploration
            if random.random() < 0.01:
                action_index = random.choice([0, 1, 2, 3])
            else:
                action_index = np.argmax(probs)

            # Gym executes action in game environment on behalf of actor-learner
            new_state, reward, terminal = env.step(action_index)
            env.env.render()

            # Update the state and episode's counter
            state = new_state
            episode_reward += reward

        if terminal:
            terminal = False
            print "THREAD:", self.thread_id, "/ REWARD", episode_reward
            episode_reward = 0

            # Get initial game observation
            state = env.get_initial_state()
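# `Env` wraps the raw Gym (or Doom) environment but is not defined in these
# listings. The sketch below is a minimal assumed implementation, not the
# original: it follows the standard DQN preprocessing (grayscale, resize,
# stack the last `history_length` frames) and matches the four-argument form
# Env(env, 84, 84, 4) used here; the other listings pass an extra `game_type`
# argument that this sketch omits.
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize

class Env(object):
    def __init__(self, gym_env, width, height, history_length):
        self.env = gym_env
        self.width = width
        self.height = height
        self.history_length = history_length
        self.state_buffer = None

    def _preprocess(self, observation):
        # grayscale and resize to the network's input resolution
        return resize(rgb2gray(observation), (self.width, self.height))

    def get_initial_state(self):
        # repeat the first frame to fill the whole history
        frame = self._preprocess(self.env.reset())
        self.state_buffer = np.stack([frame] * self.history_length, axis=0)
        return self.state_buffer

    def step(self, action_index):
        observation, reward, done, _ = self.env.step(action_index)
        frame = self._preprocess(observation)
        # drop the oldest frame, append the newest
        self.state_buffer = np.append(self.state_buffer[1:], [frame], axis=0)
        return self.state_buffer, reward, done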
def test(self, num_actions):
    self.saver.restore(self.session, FLAGS.checkpoint_path)
    print "Restored model weights from ", FLAGS.checkpoint_path
    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
    env = Env(monitor_env, FLAGS.width, FLAGS.height,
              FLAGS.history_length, FLAGS.game_type)

    for i_episode in xrange(FLAGS.num_eval_episodes):
        state = env.get_initial_state()
        episode_reward = 0
        done = False

        while not done:
            monitor_env.render()
            probs = self.session.run(self.policy_values,
                                     feed_dict={self.state: [state]})[0]
            action_index = sample_policy_action(num_actions, probs)
            new_state, reward, done = env.step(action_index)
            state = new_state
            episode_reward += reward

        print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)

    monitor_env.monitor.close()
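# `sample_policy_action` is called above but not defined in these listings.
# Below is a minimal sketch of a common implementation seen in public
# async-RL reimplementations, not necessarily the original: it draws one
# action from the categorical distribution given by the policy's output.
# The `train` listing further down calls it with `probs` only, presumably
# the same helper without the unused `num_actions` parameter.
def sample_policy_action(num_actions, probs):
    # nudge the probabilities down so rounding error cannot make them sum
    # to more than 1, which would make np.random.multinomial raise
    probs = probs - np.finfo(np.float32).epsneg
    # one draw from the categorical distribution; histogram is a one-hot vector
    histogram = np.random.multinomial(1, probs)
    return int(np.nonzero(histogram)[0][0])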
def actor_learner_thread(self, env, thread_id, num_actions):
    # create instance of Doom environment
    env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length,
              FLAGS.game_type)

    print 'Starting thread ' + str(thread_id)
    time.sleep(3 * thread_id)

    # Get initial game observation
    state = env.get_initial_state()

    # episode's counters
    episode_reward = 0
    counter = 0

    while self.T < self.TMAX:
        done = False

        # clear rollout buffers
        states = []
        actions = []
        prev_reward = []
        t = 0
        t_start = t

        # synchronize policy and value network
        self.session.run(self.update_policy[thread_id])
        self.session.run(self.update_value[thread_id])

        while not (done or ((t - t_start) == t_max)):
            # forward pass of network. Get probability of all actions
            probs = self.session.run(
                self.local_policy[thread_id],
                feed_dict={self.local_states[thread_id]: [state]})[0]

            # one-hot action vector: all zeros except the executed action
            action_list = np.zeros([num_actions])

            # choose action based on policy
            action_index = sample_policy_action(num_actions, probs)
            action_list[action_index] = 1

            # add state and action to the rollout buffers
            actions.append(action_list)
            states.append(state)

            # Gym executes action in game environment on behalf of actor-learner
            new_state, reward, done = env.step(action_index)

            # clip reward to [-1, 1]
            clipped_reward = np.clip(reward, -1, 1)
            prev_reward.append(clipped_reward)

            # Update the state and global counters
            state = new_state
            self.T += 1
            t += 1
            counter += 1

            # update episode's counter
            episode_reward += reward

            # Save model progress
            if counter % FLAGS.checkpoint_interval == 0:
                if FLAGS.game_type == 'Doom':
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game.split("/")[1] + ".ckpt",
                                    global_step=counter)
                else:
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game + ".ckpt",
                                    global_step=counter)

        # bootstrap the return from the value network unless the episode ended
        if done:
            R_t = 0
        else:
            R_t = self.session.run(
                self.local_value[thread_id],
                feed_dict={self.local_states[thread_id]: [state]})[0][0]

        # compute n-step discounted returns, walking backwards through the rollout
        targets = np.zeros((t - t_start))
        for i in range(t - t_start - 1, -1, -1):
            R_t = prev_reward[i] + FLAGS.gamma * R_t
            targets[i] = R_t

        # update the global network
        self.session.run(self.grad_update,
                         feed_dict={self.state: states,
                                    self.actions: actions,
                                    self.targets: targets})

        if done:
            print "THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", counter, "/ REWARD", episode_reward
            episode_reward = 0

            # Get initial game observation
            state = env.get_initial_state()
def train(self, env, checkpoint_interval, checkpoint_dir, saver, gamma=0.99):
    global T
    self.saver = saver

    # initialize environment
    time.sleep(3 * self.thread_id)
    env = Env(env, 84, 84, 4)
    print 'Starting thread ' + str(self.thread_id)
    terminal = False

    # Get initial game observation
    state = env.get_initial_state()

    # episode's reward and cost
    episode_reward = 0
    total_cost = 0
    counter = 0

    while T < self.TMAX:
        # lists for feeding placeholders
        states = []
        actions = []
        prev_reward = []
        state_values = []
        t = 0
        t_start = t

        # synchronize the local network with the global one
        self.sess.run(self.sync_op)

        while not (terminal or ((t - t_start) == self.tmax)):
            # forward pass of network. Get probability of all actions
            probs, v = self.sess.run((self.policy, self.state_value),
                                     feed_dict={self.input_state: [state]})
            probs = probs[0]
            v = v[0][0]

            # print the outputs of the neural network for a sanity check
            if T % 2000 == 0:
                print probs
                print v

            # one-hot action vector: all zeros except the executed action
            action_list = np.zeros([self.output_size])

            # choose action based on policy
            action_index = sample_policy_action(probs)
            action_list[action_index] = 1

            # add state, action and value estimate to the rollout buffers
            actions.append(action_list)
            states.append(state)
            state_values.append(v)

            # Gym executes action in game environment on behalf of actor-learner
            new_state, reward, terminal = env.step(action_index)

            # clip reward to [-1, 1]
            clipped_reward = np.clip(reward, -1, 1)
            prev_reward.append(clipped_reward)

            # Update the state and global counters
            state = new_state
            T += 1
            t += 1
            counter += 1

            # update episode's counter
            episode_reward += reward

            # Save model progress; the window plus the jump in T keep racing
            # threads from saving the same checkpoint repeatedly
            if T % checkpoint_interval < 200:
                T += 200
                self.saver.save(self.sess, checkpoint_dir + "/breakout.ckpt",
                                global_step=T)

        # bootstrap the return from the value network unless the episode ended
        if terminal:
            R_t = 0
        else:
            R_t = self.sess.run(self.state_value,
                                feed_dict={self.input_state: [state]})
            R_t = R_t[0][0]
        state_values.append(R_t)

        # compute n-step discounted returns, walking backwards through the rollout
        targets = np.zeros((t - t_start))
        for i in range(t - t_start - 1, -1, -1):
            R_t = prev_reward[i] + gamma * R_t
            targets[i] = R_t

        # compute the advantage based on GAE
        # code from https://github.com/openai/universe-starter-agent
        delta = np.array(prev_reward) + gamma * np.array(state_values[1:]) \
                - np.array(state_values[:-1])
        advantage = scipy.signal.lfilter([1], [1, -gamma], delta[::-1], axis=0)[::-1]

        # update the global network
        cost, _ = self.sess.run((self.loss, self.opt),
                                feed_dict={self.input_state: states,
                                           self.actions: actions,
                                           self.targets: targets,
                                           self.advantage: advantage})
        total_cost += cost

        if terminal:
            terminal = False
            print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \
                episode_reward, "/ COST", total_cost / counter
            episode_reward = 0
            total_cost = 0
            counter = 0

            # Get initial game observation
            state = env.get_initial_state()
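# The `scipy.signal.lfilter` call above is a vectorised discounted cumulative
# sum: with coefficients b=[1], a=[1, -gamma] applied to the reversed deltas
# it computes advantage[i] = delta[i] + gamma * advantage[i+1]. Since the
# deltas are discounted by gamma alone, this corresponds to GAE with
# lambda = 1. An equivalent explicit loop, shown for reference only:
def discounted_cumsum(delta, gamma):
    advantage = np.zeros(len(delta))
    running = 0.0
    # walk backwards, accumulating the discounted sum of future deltas
    for i in range(len(delta) - 1, -1, -1):
        running = delta[i] + gamma * running
        advantage[i] = running
    return advantage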
def actor_learner_thread(self, env, thread_id, num_actions):
    # create instance of Doom environment
    env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length,
              FLAGS.game_type)

    # buffers for the accumulated gradient update
    states = []
    actions = []
    targets = []

    initial_epsilon = 1
    epsilon = 1
    final_epsilon = self.sample_final_epsilon()

    print 'Starting thread ' + str(thread_id) + ' with final epsilon ' + str(final_epsilon)
    time.sleep(3 * thread_id)
    t = 0

    while self.T < self.TMAX:
        # Get initial game observation
        state = env.get_initial_state()
        done = False

        # episode's counters
        episode_reward = 0
        mean_q = 0
        frames = 0

        while not done:
            # forward pass of network. Get Q(s,a)
            q_values = self.q_values.eval(session=self.session,
                                          feed_dict={self.state: [state]})

            # one-hot action vector: all zeros except the executed action
            action_list = np.zeros([num_actions])

            # choose action epsilon-greedily based on current policy
            if random.random() <= epsilon:
                action_index = random.randrange(num_actions)
            else:
                action_index = np.argmax(q_values)
            action_list[action_index] = 1

            # anneal epsilon
            if epsilon > final_epsilon:
                epsilon -= (initial_epsilon - final_epsilon) / FLAGS.anneal_epsilon_timesteps

            # anneal learning rate
            if self.lr > 0:
                self.lr -= FLAGS.learning_rate / self.TMAX

            # Gym executes action in game environment on behalf of actor-learner
            new_state, reward, done = env.step(action_index)

            # forward pass of target network. Get Q(s',a)
            target_q_values = self.target_q_values.eval(
                session=self.session,
                feed_dict={self.new_state: [new_state]})

            # clip reward to [-1, 1]
            clipped_reward = np.clip(reward, -1, 1)

            # compute targets based on the Q-learning update rule:
            # target = r + gamma * max_a' Q(s', a')
            if done:
                targets.append(clipped_reward)
            else:
                targets.append(clipped_reward + FLAGS.gamma * np.max(target_q_values))

            actions.append(action_list)
            states.append(state)

            # Update the state and global counters
            state = new_state
            self.T += 1
            t += 1

            # update episode's counters
            frames += 1
            episode_reward += reward
            mean_q += np.max(q_values)

            # update target network
            if self.T % FLAGS.target_network_update_frequency == 0:
                self.session.run(self.update_target)

            # train online network
            if t % FLAGS.network_update_frequency == 0 or done:
                if states:
                    self.session.run(self.grad_update,
                                     feed_dict={self.state: states,
                                                self.actions: actions,
                                                self.targets: targets,
                                                self.learning_rate: self.lr})
                # Clear gradient buffers
                states = []
                actions = []
                targets = []

            # Save model progress
            if t % FLAGS.checkpoint_interval == 0:
                if FLAGS.game_type == 'Doom':
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game.split("/")[1] + ".ckpt",
                                    global_step=t)
                else:
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game + ".ckpt",
                                    global_step=t)

            # Print end of episode stats
            if done:
                print "THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", t, \
                    "/ EPSILON", epsilon, "/ REWARD", episode_reward, \
                    "/ Q_MAX %.4f" % (mean_q / float(frames)), \
                    "/ EPSILON PROGRESS", t / float(FLAGS.anneal_epsilon_timesteps)
                break
def actor_learner_thread(self, env, thread_id, num_actions):
    # create instance of Doom environment
    env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length,
              FLAGS.game_type)

    initial_epsilon = 1
    epsilon = 1
    final_epsilon = self.sample_final_epsilon()

    print 'Starting thread ' + str(thread_id) + ' with final epsilon ' + str(final_epsilon)
    time.sleep(3 * thread_id)

    # Get initial game observation
    state = env.get_initial_state()

    # episode's counters
    episode_reward = 0
    mean_q = 0
    frames = 0
    counter = 0

    while self.T < self.TMAX:
        done = False

        # clear rollout buffers
        states = []
        actions = []
        prev_reward = []
        t = 0
        t_start = t

        # synchronize the local model with the global one
        self.session.run(self.update_local_model[thread_id])

        while not (done or ((t - t_start) == t_max)):
            # forward pass of network. Get Q(s,a)
            q_values = self.local_values[thread_id].eval(
                session=self.session,
                feed_dict={self.local_states[thread_id]: [state]})

            # one-hot action vector: all zeros except the executed action
            action_list = np.zeros([num_actions])

            # choose action epsilon-greedily based on current policy
            if random.random() <= epsilon:
                action_index = random.randrange(num_actions)
            else:
                action_index = np.argmax(q_values)
            action_list[action_index] = 1

            # add state and action to the rollout buffers
            actions.append(action_list)
            states.append(state)

            # anneal epsilon
            if epsilon > final_epsilon:
                epsilon -= (initial_epsilon - final_epsilon) / FLAGS.anneal_epsilon_timesteps

            # Gym executes action in game environment on behalf of actor-learner
            new_state, reward, done = env.step(action_index)

            # clip reward to [-1, 1]
            clipped_reward = np.clip(reward, -1, 1)
            prev_reward.append(clipped_reward)

            # Update the state and global counters
            state = new_state
            self.T += 1
            t += 1
            counter += 1

            # update episode's counters
            frames += 1
            episode_reward += reward
            mean_q += np.max(q_values)

            # update target network
            if self.T % FLAGS.target_network_update_frequency == 0:
                print "Target Network Updated"
                self.session.run(self.update_target)

            # Save model progress; the window plus the jump in self.T keep
            # racing threads from saving the same checkpoint repeatedly
            if self.T % FLAGS.checkpoint_interval < 400:
                self.T += 400
                if FLAGS.game_type == 'Doom':
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game.split("/")[1] + ".ckpt",
                                    global_step=self.T)
                else:
                    self.saver.save(self.session,
                                    FLAGS.checkpoint_dir + "/" + FLAGS.game + ".ckpt",
                                    global_step=self.T)

        # bootstrap the return from the target network unless the episode ended
        if done:
            R_t = 0
        else:
            R_t = np.max(self.target_q_values.eval(
                session=self.session,
                feed_dict={self.new_state: [state]}))

        # compute n-step discounted returns, walking backwards through the rollout
        targets = np.zeros((t - t_start))
        for i in range(t - t_start - 1, -1, -1):
            R_t = prev_reward[i] + FLAGS.gamma * R_t
            targets[i] = R_t

        # update Q-value network
        self.session.run(self.grad_update[thread_id],
                         feed_dict={self.state: states,
                                    self.actions: actions,
                                    self.targets: targets})

        if done:
            print "THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", counter, \
                "/ EPSILON", epsilon, "/ REWARD", episode_reward, \
                "/ Q_MAX %.4f" % (mean_q / float(frames)), \
                "/ EPSILON PROGRESS", counter / float(FLAGS.anneal_epsilon_timesteps)

            # log the episode's reward and mean Q before resetting the counters;
            # 'a+' creates the file if it does not exist yet
            file_path = 'rewards'
            with open(file_path, 'a+') as f:
                f.write(str(episode_reward) + ', ' + str(mean_q / float(frames)) + '\n')

            # Get initial game observation
            episode_reward = 0
            mean_q = 0
            frames = 0
            state = env.get_initial_state()
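# `sample_final_epsilon` is called in the two Q-learning threads above but not
# defined in these listings. In "Asynchronous Methods for Deep Reinforcement
# Learning" (Mnih et al., 2016) each thread samples its final exploration rate
# from {0.1, 0.01, 0.5} with probabilities {0.4, 0.3, 0.3}; a minimal sketch
# under that assumption:
def sample_final_epsilon(self):
    final_epsilons = np.array([0.1, 0.01, 0.5])
    probabilities = np.array([0.4, 0.3, 0.3])
    # each actor-learner thread gets its own final epsilon
    return np.random.choice(final_epsilons, p=probabilities)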