def play(sess, agent, no_plays, log_dir=None, show_ui=False, show_action=False):
    """
    Use a trained agent to play a required number of games
    :param sess: op, TensorFlow session instance
    :param agent: tensor, trained agent structure/graph
    :param no_plays: int, number of games to play
    :param log_dir: string, directory in which to store the log files during gameplay
    :param show_ui: bool, True -> render the game screen
    :param show_action: bool, True -> print the actions taken by the trained agent
    :return: nothing; results are printed and optionally written to the log file
    """
    rewards = []
    for p in range(no_plays):
        # Build the initial state by stacking the first preprocessed frame 4 times
        observation = env.reset()
        observation = ops.convert_to_gray_n_resize(observation)
        observation = np.expand_dims(observation, axis=2)
        state = np.repeat(observation, 4, axis=2)
        state = np.expand_dims(state, axis=0)
        done = False
        reward = 0
        while not done:
            if show_ui:
                env.render()
            # Epsilon-greedy action selection with a fixed epsilon of 0.05
            if np.random.rand() < 0.05:
                action = env.action_space.sample()
            else:
                action = np.argmax(sess.run(agent, feed_dict={X_input: state}))
            if show_action:
                print(action)
            new_state, r, done, _ = env.step(action)
            next_state = ops.convert_to_gray_n_resize(new_state)
            next_state = np.expand_dims(next_state, axis=2)
            next_state = np.expand_dims(next_state, axis=0)
            # Drop the oldest frame and prepend the newest one
            state = np.append(next_state, state[:, :, :, :3], axis=3)
            reward += r
        rewards.append(reward)
        print("Game: {}/{}".format(p + 1, no_plays))
        print("Reward: {}".format(reward))
        if log_dir is not None:
            with open(log_dir + "/log.txt", "a") as log_file:
                log_file.write("Game: {}/{}\n".format(p + 1, no_plays))
                log_file.write("Reward: {}\n".format(reward))
        print("------------------------------------------------------------------------------------------------------")
    print("Best reward: {}".format(np.amax(rewards)))
    print("Average reward: {}".format(np.mean(rewards)))
    if log_dir is not None:
        with open(log_dir + "/log.txt", "a") as log_file:
            log_file.write("Best reward: {}\n".format(np.amax(rewards)))
            log_file.write("Average reward: {}\n".format(np.mean(rewards)))
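# A minimal usage sketch for play() (not part of the original flow): restore a
# checkpoint and let the agent play. The scope name matches the one used in
# train() below; the checkpoint directory "models/run_01/saved_models/" is
# purely hypothetical.
#
#     with tf.variable_scope("Action_agent"):
#         agent = get_agent(X_input)
#     saver = tf.train.Saver()
#     with tf.Session() as sess:
#         saver.restore(sess, tf.train.latest_checkpoint("models/run_01/saved_models/"))
#         play(sess=sess, agent=agent, no_plays=5, show_ui=True)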
def collect_rand_observations(replay_memory, sess=None, agent=None):
    """
    Collects mc.rand_observation_time observations and stores them in the deque
    :param replay_memory: deque, replay memory instance
    :param sess: op, a restored TensorFlow session (optional; random actions are used if None)
    :param agent: Tensor op, the agent architecture (optional)
    :return: deque, filled with (state, action, reward, next_states, done, life_lost) tuples
    """
    print("Collecting Random Observations")
    observation = env.reset()
    observation = ops.convert_to_gray_n_resize(observation)
    observation = np.expand_dims(observation, axis=2)
    state = np.repeat(observation, 4, axis=2)
    state = np.expand_dims(state, axis=0)
    lives_left = 5
    if len(replay_memory) < mc.rand_observation_time:
        for i in range(int(mc.rand_observation_time)):
            # Act randomly unless a trained agent is supplied
            if sess is None:
                action = env.action_space.sample()
            else:
                q_prediction = sess.run(agent, feed_dict={X_input: state})
                action = np.argmax(q_prediction)
            next_state, reward, done, info = env.step(action)
            next_state = ops.convert_to_gray_n_resize(next_state)
            next_state = np.expand_dims(next_state, axis=2)
            next_state = np.expand_dims(next_state, axis=0)
            next_states = np.append(next_state, state[:, :, :, :3], axis=3)
            # Flag transitions where a life was lost (Breakout starts with 5 lives)
            life_lost = 0
            if lives_left - info['ale.lives'] > 0:
                life_lost = 1
                lives_left -= 1
            replay_memory.append((state, action, reward, next_states, done, life_lost))
            state = next_states
            if done:
                lives_left = 5
                observation = env.reset()
                observation = ops.convert_to_gray_n_resize(observation)
                observation = np.expand_dims(observation, axis=2)
                state = np.repeat(observation, 4, axis=2)
                state = np.expand_dims(state, axis=0)
            print("\rRandom Observation: {}/{}".format(i + 1, mc.rand_observation_time), end="")
            sys.stdout.flush()
    return replay_memory
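# The frame-stacking convention used throughout this file, as a standalone
# sketch: a state is a (1, 84, 84, 4) array holding the 4 most recent
# preprocessed frames, newest first along the channel axis. The 84x84 size is
# consistent with the reshapes in training_data() below.
#
#     frame = np.zeros((84, 84, 1))                                # one preprocessed frame
#     state = np.expand_dims(np.repeat(frame, 4, axis=2), axis=0)  # (1, 84, 84, 4)
#     new_frame = np.zeros((1, 84, 84, 1))                         # next preprocessed frame
#     state = np.append(new_frame, state[:, :, :, :3], axis=3)     # newest frame now in channel 0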
def training_data(self):
    # TODO: Remove the first 5 to 10 frames from each episode?
    train_input = []
    train_action = []
    train_target = []
    episode_dir = sorted([
        self.data_dir + "/train/" + p
        for p in os.listdir(self.data_dir + "/train/")
    ])
    n_episodes = len(episode_dir)
    print("Reading training images!")
    for e_i, episode in enumerate(episode_dir):
        print("Reading training images from episode: {}/{}".format(e_i + 1, n_episodes))
        frames = sorted([f for f in os.listdir(episode) if f.endswith(".png")])
        with open(episode + "/action.txt") as action_file:
            action_log = action_file.read()
        # Skip the first three actions and the trailing empty line
        train_action.extend([int(a) for a in action_log.split("\n")[3:-1]])
        # TODO: Using this for grayscale images only
        for f_indx in range(len(frames)):
            # Slide a window of 5 consecutive frames: 4 as input, the 5th as target
            frames_to_use = frames[f_indx:f_indx + 5]
            if len(frames_to_use) < 5:
                continue
            for i, f in enumerate(frames_to_use):
                img = ops.convert_to_gray_n_resize(np.array(Image.open(episode + "/" + f)))
                img = np.expand_dims(img, axis=2)
                if i == 0:
                    train_frames = img.copy()
                elif i < 4:
                    train_frames = np.append(train_frames, img, axis=2)
                else:
                    train_target.append(img)
                    train_input.append(train_frames)
    print("Input dataset constructed")
    train_input = np.array(train_input).reshape([-1, 84, 84, 4])  # the last 4 appended frames are useless
    train_action = np.array(train_action).reshape([-1, 1])
    train_target = np.array(train_target).reshape([-1, 84, 84, 1])
    return train_input, train_action, train_target
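# training_data() assumes the on-disk layout written by play_n_collect() below
# (episode and frame numbers here are illustrative only):
#
#     <data_dir>/train/00000/000000.png    # raw frames, one per step
#     <data_dir>/train/00000/000001.png
#     <data_dir>/train/00000/action.txt    # one action index per line
#     <data_dir>/train/00001/...
#
# Each sample is a sliding window of 5 consecutive frames: the first 4 form the
# (84, 84, 4) input stack and the 5th is the (84, 84, 1) prediction target.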
def play_n_collect(sess, agent, no_plays, log_dir=None, show_ui=False, show_action=False):
    """
    Use a trained agent to play a required number of games while saving every
    frame and action to disk
    :param sess: op, TensorFlow session instance
    :param agent: tensor, trained agent structure/graph
    :param no_plays: int, number of games to play
    :param log_dir: string, directory in which to store the log files during gameplay
    :param show_ui: bool, True -> render the game screen
    :param show_action: bool, True -> print the actions taken by the trained agent
    :return: nothing; results are printed and optionally written to the log file
    """
    rewards = []
    main_dir, train_dir, test_dir = make_directories()
    step = 0
    for p in range(no_plays):
        frame = 0
        observation = env.reset()
        # The first 1000 episodes go to the train split, the rest to the test split
        if p < 1000:
            episode_path = train_dir + "/{:05d}".format(p)
        else:
            episode_path = test_dir + "/{:05d}".format(p % 1000)
        os.mkdir(episode_path)
        # Save the first image
        plt.imsave(arr=observation, fname=episode_path + "/{:06d}.png".format(frame))
        observation = ops.convert_to_gray_n_resize(observation)
        observation = np.expand_dims(observation, axis=2)
        state = np.repeat(observation, 4, axis=2)
        state = np.expand_dims(state, axis=0)
        done = False
        reward = 0
        while not done:
            if show_ui:
                env.render()
            # Epsilon-greedy action selection with a fixed epsilon of 0.07
            if np.random.rand() < 0.07:
                action = env.action_space.sample()
            else:
                action = np.argmax(sess.run(agent, feed_dict={X_input: state}))
            # Save the action taken
            with open(episode_path + "/action.txt", "a") as log:
                log.write("{}\n".format(action))
            if show_action:
                print(action)
            frame += 1
            step += 1
            new_state, r, done, _ = env.step(action)
            plt.imsave(arr=new_state, fname=episode_path + "/{:06d}.png".format(frame))
            next_state = ops.convert_to_gray_n_resize(new_state)
            next_state = np.expand_dims(next_state, axis=2)
            next_state = np.expand_dims(next_state, axis=0)
            state = np.append(next_state, state[:, :, :, :3], axis=3)
            reward += r
        rewards.append(reward)
        print("Step: {}/500e3".format(step))
        print("Game: {}/{}".format(p + 1, no_plays))
        print("Reward: {}\n".format(reward))
        if log_dir is not None:
            with open(log_dir + "/log.txt", "a") as log_file:
                log_file.write("Game: {}/{}\n".format(p + 1, no_plays))
                log_file.write("Reward: {}\n".format(reward))
        print("------------------------------------------------------------------------------------------------------")
    print("Best reward: {}".format(np.amax(rewards)))
    print("Average reward: {}".format(np.mean(rewards)))
    if log_dir is not None:
        with open(log_dir + "/log.txt", "a") as log_file:
            log_file.write("Best reward: {}\n".format(np.amax(rewards)))
            log_file.write("Average reward: {}\n".format(np.mean(rewards)))
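# make_directories() (defined elsewhere in this module) is assumed to create
# and return the dataset root plus train/test sub-directories. A hedged sketch
# of that contract, with a purely hypothetical naming scheme:
#
#     def make_directories():
#         main_dir = "Dataset/" + time.strftime("%Y%m%d_%H%M%S")
#         train_dir = main_dir + "/train"
#         test_dir = main_dir + "/test"
#         for d in (main_dir, train_dir, test_dir):
#             os.mkdir(d)
#         return main_dir, train_dir, test_dir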
def train(train_model=True):
    """
    Trains the agent with hyperparameters and other info loaded from the mission_control_<game>.py file
    :param train_model: bool, True -> Trains the agent
                              False -> Loads the LATEST trained agent and plays
    :return: absolutely nothing
    """
    with tf.variable_scope("Action_agent"):
        agent = get_agent(X_input)
    with tf.variable_scope("Target_agent"):
        target_agent = get_agent(X_input)

    loss = tf.losses.mean_squared_error(labels=Y_target, predictions=agent)
    var_list = tf.trainable_variables()
    agent_vars = [t for t in var_list if t.name.startswith("Action_agent")]
    optimizer = tf.train.RMSPropOptimizer(learning_rate=mc.learning_rate,
                                          momentum=mc.momentum,
                                          epsilon=mc.epsilon).minimize(loss, var_list=agent_vars)

    # Create the summaries for TensorBoard
    # TODO: Plot the rewards per episode
    tf.summary.scalar(name='loss', tensor=loss)
    tf.summary.scalar(name='max_q_value', tensor=tf.reduce_max(agent))  # TODO: Replace this with the op in the paper
    tf.summary.histogram(name='q_values_hist', values=agent)
    # TODO: Plot the length of each episode
    # TODO: Plot the argmax of the action taken for each play

    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        if train_model:
            print("Training agent!")
            print("Preparing required directories")
            # Initialize global variables
            sess.run(init)
            # Used to measure time taken
            t1 = time.time()
            # Works like the global step, but is not a Tensor
            step = 0
            # Get the initial epsilon
            prob_rand = mc.prob_random  # TODO: Change this ASAP
            # Add epsilon to TensorBoard
            # NOTE: prob_rand is a Python float here, so this summary logs a constant value
            tf.summary.scalar('epsilon', tensor=prob_rand)
            summary_op = tf.summary.merge_all()

            replay_memory = deque()
            if mc.load_trained_model:
                saved_models = os.listdir(mc.logdir)
                latest_saved_model = sorted(saved_models)[-1]
                saver.restore(sess, tf.train.latest_checkpoint(
                    mc.logdir + latest_saved_model + "/saved_models/"))
                with open(mc.logdir + latest_saved_model + "/saved_models/checkpoint", 'r') as checkpoint_file:
                    # Parse the step number out of the first checkpoint line (fragile, position-based)
                    line_1 = checkpoint_file.readline()
                    step = int(line_1[30:-2])
                tensorboard_dir = mc.logdir + latest_saved_model + "/Tensorboard/"
                saved_model_dir = mc.logdir + latest_saved_model + "/saved_models/"
                log_dir = mc.logdir + latest_saved_model + "/logs/"
                replay_memory = collect_rand_observations(replay_memory, sess, agent)
            else:
                # Get the initial random observations
                replay_memory = collect_rand_observations(replay_memory)
                tensorboard_dir, saved_model_dir, log_dir = make_directories(mc.logdir)

            print("Tensorboard files stored in: {}".format(tensorboard_dir))
            print("Saved models stored in: {}".format(saved_model_dir))
            print("Log files stored in: {}".format(log_dir))

            # File writer for TensorBoard
            writer = tf.summary.FileWriter(logdir=tensorboard_dir, graph=sess.graph)

            game_rewards = []

            # Save the current mission control file
            with open("mission_control_breakout.py", "r") as mc_file:
                mission_control_file = mc_file.read()
            with open(log_dir + "/mission_control.txt", "w") as mc_writer:
                mc_writer.write(mission_control_file)

            for e in range(mc.n_episodes):
                with open(log_dir + "/log.txt", "a") as log_file:
                    log_file.write("--------------------------Episode: {}/{}------------------------------\n"
                                   .format(e + 1, mc.n_episodes))
                print("--------------------------Episode: {}/{}------------------------------\n"
                      .format(e + 1, mc.n_episodes))

                # Prepare the first observation
                observation = env.reset()
                observation = ops.convert_to_gray_n_resize(observation)
                observation = np.expand_dims(observation, axis=2)
                state = np.repeat(observation, 4, axis=2)
                state = np.expand_dims(state, axis=0)
                lives_left = 5  # TODO: Only for Breakout

                log_q_values = []
                episode_rewards = []
                for t in itertools.count():
                    # Sample a mini batch from the replay memory
                    mini_batch = random.sample(replay_memory, mc.batch_size)
                    agent_input = []
                    agent_target = []
                    for s in range(len(mini_batch)):
                        state_ = mini_batch[s][0]
                        action_ = mini_batch[s][1]
                        reward_ = mini_batch[s][2]
                        next_state_ = mini_batch[s][3]
                        done_ = mini_batch[s][4]
                        life_lost = mini_batch[s][5]
                        agent_input.append(state_[0])
                        target = sess.run(target_agent, feed_dict={X_input: state_})
                        if done_ or life_lost == 1:
                            # Terminal transition: the target is just the reward
                            target[0, action_] = reward_
                            agent_target.append(target[0])
                        else:
                            # Q-learning target: r + gamma * max_a' Q_target(s', a')
                            agent_output = sess.run(target_agent, feed_dict={X_input: next_state_})
                            target[0, action_] = reward_ + mc.gamma * np.amax(agent_output)
                            agent_target.append(target[0])
                    # Training the agent. Finally!!
                    for i in range(mc.fit_epochs):
                        sess.run(optimizer, feed_dict={X_input: agent_input, Y_target: agent_target})

                    # Copy trained parameters from the agent to the target network
                    if (step + 1) % mc.target_network_update == 0:
                        copy_parameters(sess)
                    l, summary = sess.run([loss, summary_op],
                                          feed_dict={X_input: agent_input, Y_target: agent_target})
                    writer.add_summary(summary, global_step=step)
                    print("\rStep: {} ({}), Episode: {}/{}, Loss: {}".format(
                        t, step, e + 1, mc.n_episodes, l), end="")
                    sys.stdout.flush()

                    # Collect the next observation
                    if np.random.rand() < prob_rand:
                        action = env.action_space.sample()
                    else:
                        q_prediction = sess.run(agent, feed_dict={X_input: state})
                        action = np.argmax(q_prediction)
                        log_q_values.extend(q_prediction)
                    next_state, reward, done, info = env.step(action)
                    next_state = ops.convert_to_gray_n_resize(next_state)
                    next_state = np.expand_dims(next_state, axis=2)
                    next_state = np.expand_dims(next_state, axis=0)
                    next_states = np.append(next_state, state[:, :, :, :3], axis=3)
                    life_lost = 0
                    if lives_left - info['ale.lives'] > 0:
                        life_lost = 1
                        lives_left -= 1
                    # Remove old samples from the replay memory if it's full
                    if len(replay_memory) > mc.observation_time:
                        replay_memory.popleft()
                    replay_memory.append((state, action, reward, next_states, done, life_lost))
                    state = next_states
                    episode_rewards.append(reward)
                    step += 1
                    if (step + 1) % 10000 == 0:
                        # Save the agent
                        saved_path = saver.save(sess, saved_model_dir + '/model', global_step=step)
                    prob_rand = anneal_epsilon(step)
                    if mc.show_ui:
                        env.render()
                    if done:
                        break

                with open(log_dir + "/log.txt", "a") as log_file:
                    log_file.write("Step: {} ({}), Play: {}/{}, Loss: {}\n".format(
                        t, step, e + 1, mc.n_episodes, l))
                    log_file.write("Reward Obtained: {}\n".format(np.sum(episode_rewards)))
                    game_rewards.append(np.sum(episode_rewards))
                    # Plot the rewards obtained so far
                    x_val = np.arange(e + 1)
                    plt.plot(x_val, game_rewards)
                    plt.xlabel("Episode")
                    plt.ylabel("Reward Obtained")
                    plt.savefig("{}/Rewards.png".format(log_dir))
                    plt.close()
                    if log_q_values:
                        log_file.write("Average Q Value: {}\n".format(np.mean(log_q_values)))
                    else:
                        log_file.write("All of the actions were random\n")
                print("\nReward Obtained: {}".format(np.sum(episode_rewards)))
                if log_q_values:
                    print("Average Q Value: {}".format(np.mean(log_q_values)))
                else:
                    print("All of the actions were random")

            print("Time taken for {} episodes on your potato: {:.4f}s".format(
                mc.n_episodes, time.time() - t1))
            print("Average time for each episode: {:.4f}s".format(
                (time.time() - t1) / mc.n_episodes))
            print("Tensorboard files saved in: {}".format(tensorboard_dir))
            print("Model saved in: {}".format(saved_path))
            print("Model parameters stored in: {}".format(log_dir + "mission_control.txt"))
            print("Agent ready to roll!")
            with open(log_dir + "/log.txt", "a") as log_file:
                log_file.write("Time taken for {} episodes on your potato: {:.4f}s\n".format(
                    mc.n_episodes, time.time() - t1))
                log_file.write("Average time for each episode: {:.4f}s\n".format(
                    (time.time() - t1) / mc.n_episodes))
        else:
            # Get the latest trained model
            saved_models = os.listdir(mc.logdir)
            latest_saved_model = sorted(saved_models)[-1]
            saver.restore(sess, tf.train.latest_checkpoint(
                mc.logdir + latest_saved_model + "/saved_models/"))
            print("Getting model from: {}".format(
                mc.logdir + latest_saved_model + "/saved_models/"))
            print("------------------------Playing----------------------------")
            play(sess=sess,
                 agent=agent,
                 no_plays=mc.n_episodes,
                 log_dir=None,
                 show_ui=mc.show_ui,
                 show_action=mc.show_action)