def play(args):
    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state))

    # Instantiate DQN network
    DQN = DeepQNetwork(num_actions, state_ph, scope='DQN_main')
    DQN_predict_op = DQN.predict()

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)

    loader.restore(sess, ckpt)
    print('%s restored.\n\n' % ckpt)

    for ep in range(0, args.num_eps):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        step = 0
        ep_done = False
        initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)

        while not ep_done:
            time.sleep(0.05)
            env.render()

            # Choose a random action for the initial steps to ensure every episode has a random start point.
            # Then choose the action with the highest Q-value according to the network's current policy.
            if step < initial_steps:
                action = env.action_space.sample()
            else:
                state = np.expand_dims(state_buf.get_state(), 0)
                action = sess.run(DQN_predict_op, {state_ph: state})

            frame, _, ep_terminal, _ = env.step(action)
            frame = preprocess_image(frame, args.frame_width, args.frame_height)
            state_buf.add(frame)
            step += 1

            # Episode can finish either by reaching the terminal state or the max episode steps
            if ep_terminal or step == args.max_ep_length:
                ep_done = True
def test(args):
    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    # Set random seeds for reproducibility
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    # Initialise state buffer
    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state))

    # Instantiate DQN network
    DQN = DeepQNetwork(num_actions, state_ph, scope='DQN_main')
    DQN_predict_op = DQN.predict()

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)

    loader.restore(sess, ckpt)
    sys.stdout.write('%s restored.\n\n' % ckpt)
    sys.stdout.flush()
    ckpt_split = ckpt.split('-')
    train_ep = ckpt_split[-1]

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Create summary op to save episode reward to Tensorboard log
    reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Average Test Reward", reward_var)
    summary_op = tf.summary.merge_all()

    ## Begin testing
    env.reset()
    rewards = []

    for test_ep in range(args.num_eps_test):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        ep_reward = 0
        step = 0
        ep_done = False
        initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)

        sys.stdout.write('\n')
        sys.stdout.flush()

        while not ep_done:
            if args.render:
                env.render()
            else:
                env.render(mode='rgb_array')

            # Choose a random action for the initial steps to ensure every episode has a random start point.
            # Then choose the action with the highest Q-value according to the network's current policy.
            if step < initial_steps:
                test_action = env.action_space.sample()
            else:
                test_state = np.expand_dims(state_buf.get_state(), 0)
                test_action = sess.run(DQN_predict_op, {state_ph: test_state})

            test_frame, test_reward, test_ep_terminal, _ = env.step(test_action)
            test_frame = preprocess_image(test_frame, args.frame_width, args.frame_height)
            state_buf.add(test_frame)
            ep_reward += test_reward
            step += 1

            sys.stdout.write('\x1b[2K\rTest episode {:d}/{:d} \t Steps = {:d} \t Reward = {:.2f}'.format(test_ep, args.num_eps_test, step, ep_reward))
            sys.stdout.flush()

            # Episode can finish either by reaching the terminal state or the max episode steps
            if test_ep_terminal or step == args.max_ep_length:
                rewards.append(ep_reward)
                ep_done = True

    mean_reward = np.mean(rewards)
    error_reward = ss.sem(rewards)

    sys.stdout.write('\n\nTesting complete \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(mean_reward, error_reward))
    sys.stdout.flush()

    # Log average episode reward for Tensorboard visualisation
    summary_str = sess.run(summary_op, {reward_var: mean_reward})
    summary_writer.add_summary(summary_str, train_ep)

    # Write results to file
    if args.results_file is not None:
        if not os.path.exists(args.results_dir):
            os.makedirs(args.results_dir)
        output_file = open(args.results_dir + '/' + args.results_file, 'a')
        output_file.write('Training Episode {}: \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(train_ep, mean_reward, error_reward))
        output_file.flush()
        sys.stdout.write('Results saved to file \n\n')
        sys.stdout.flush()

    env.close()
def train(args):
    # Function to return the exploration rate based on the current step
    def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end):
        if current_step < exp_step_end:
            # Linearly anneal from exp_rate_start to exp_rate_end over exp_step_end steps
            exploration_rate = exp_rate_start + current_step * ((exp_rate_end - exp_rate_start) / float(exp_step_end))
        else:
            exploration_rate = exp_rate_end

        return exploration_rate

    # Function to update target network parameters with main network parameters
    def update_target_network(from_scope, to_scope):
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        op_holder = []

        # Update old network parameters with new network parameters
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))

        return op_holder

    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    # Initialise replay memory and state buffer
    replay_mem = ReplayMemory(args)
    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state))
    action_ph = tf.placeholder(tf.int32, (None))
    target_ph = tf.placeholder(tf.float32, (None))

    # Instantiate DQN network
    # Note: one scope cannot be the prefix of another scope (e.g. this scope cannot be named 'DQN' with the
    # target network scope named 'DQN_target', as a search for vars in the 'DQN' scope would return both networks' vars)
    DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main')
    DQN_predict_op = DQN.predict()
    DQN_train_step_op = DQN.train_step()

    # Instantiate DQN target network
    DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target')
    update_target_op = update_target_network('DQN_main', 'DQN_target')

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Add summaries for Tensorboard visualisation
    tf.summary.scalar('Loss', DQN.loss)
    reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Episode Reward", reward_var)
    epsilon_var = tf.Variable(args.epsilon_start, trainable=False)
    tf.summary.scalar("Exploration Rate", epsilon_var)
    summary_op = tf.summary.merge_all()

    # Define saver for saving model ckpts
    model_name = 'model.ckpt'
    checkpoint_path = os.path.join(args.ckpt_dir, model_name)
    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    saver = tf.train.Saver(max_to_keep=201)

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Load ckpt file if given
    if args.ckpt_file is not None:
        loader = tf.train.Saver()  # Restore all variables from ckpt
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
        ckpt_split = ckpt.split('-')
        step_str = ckpt_split[-1]
        start_step = int(step_str)
        loader.restore(sess, ckpt)
    else:
        start_step = 0
        sess.run(tf.global_variables_initializer())
        sess.run(update_target_op)

    ## Begin training
    env.reset()

    ep_steps = 0
    episode_reward = 0
    episode_rewards = []
    duration_values = []

    # Initially populate replay memory by taking random actions
    sys.stdout.write('\nPopulating replay memory with random actions...\n')
    sys.stdout.flush()

    for random_step in range(1, args.initial_replay_mem_size + 1):
        if args.render:
            env.render()
        else:
            env.render(mode='rgb_array')

        action = env.action_space.sample()
        frame, reward, terminal, _ = env.step(action)
        frame = preprocess_image(frame, args.frame_width, args.frame_height)
        replay_mem.add(action, reward, frame, terminal)

        if terminal:
            env.reset()

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size))
        sys.stdout.flush()

    # Begin training process
    reset_env_and_state_buffer(env, state_buf, args)
    sys.stdout.write('\n\nTraining...\n\n')
    sys.stdout.flush()

    for train_step in range(start_step + 1, args.num_steps_train + 1):
        start_time = time.time()

        # Run 'train_frequency' iterations in the game for every training step
        for _ in range(0, args.train_frequency):
            ep_steps += 1

            if args.render:
                env.render()
            else:
                env.render(mode='rgb_array')

            # Use an epsilon-greedy policy to select the action
            epsilon = exploration_rate(train_step, args.epsilon_start, args.epsilon_end, args.epsilon_step_end)
            if random.random() < epsilon:
                # Choose random action
                action = env.action_space.sample()
            else:
                # Choose the action with the highest Q-value according to the network's current policy
                current_state = np.expand_dims(state_buf.get_state(), 0)
                action = sess.run(DQN_predict_op, {state_ph: current_state})

            # Take action and store experience
            frame, reward, terminal, _ = env.step(action)
            frame = preprocess_image(frame, args.frame_width, args.frame_height)
            state_buf.add(frame)
            replay_mem.add(action, reward, frame, terminal)
            episode_reward += reward

            if terminal or ep_steps == args.max_ep_steps:
                # Collect total reward of episode
                episode_rewards.append(episode_reward)
                # Reset episode reward and episode steps counters
                episode_reward = 0
                ep_steps = 0
                # Reset environment and state buffer for next episode
                reset_env_and_state_buffer(env, state_buf, args)

        ## Training step
        # Get minibatch from replay mem
        states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch()
        # Calculate targets by passing the next states through the target network and taking the max future Q
        future_Q = sess.run(DQN_target.output, {state_ph: next_states_batch})
        max_future_Q = np.max(future_Q, axis=1)
        # Q-values of terminal states are 0 by definition
        max_future_Q[terminals_batch] = 0
        targets = rewards_batch + (max_future_Q * args.discount_rate)

        # Execute training step
        if train_step % args.save_log_step == 0:
            # Train and save logs
            average_reward = sum(episode_rewards) / len(episode_rewards)
            summary_str, _ = sess.run([summary_op, DQN_train_step_op],
                                      {state_ph: states_batch,
                                       action_ph: actions_batch,
                                       target_ph: targets,
                                       reward_var: average_reward,
                                       epsilon_var: epsilon})
            summary_writer.add_summary(summary_str, train_step)
            # Reset rewards buffer
            episode_rewards = []
        else:
            # Just train
            _ = sess.run(DQN_train_step_op,
                         {state_ph: states_batch,
                          action_ph: actions_batch,
                          target_ph: targets})

        # Update target network
        if train_step % args.update_target_step == 0:
            sess.run(update_target_op)

        # Calculate time per step and display progress to console
        duration = time.time() - start_time
        duration_values.append(duration)
        ave_duration = sum(duration_values) / float(len(duration_values))

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, args.num_steps_train, ave_duration))
        sys.stdout.flush()

        # Save checkpoint
        if train_step % args.save_ckpt_step == 0:
            saver.save(sess, checkpoint_path, global_step=train_step)
            sys.stdout.write('\n Checkpoint saved\n')
            sys.stdout.flush()

            # Reset time calculation
            duration_values = []
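

# --- Illustrative entry-point sketch (not part of the original file) ---
# The functions above pull every setting from an `args` namespace. The sketch below shows
# one plausible way to build that namespace with argparse: the flag names mirror the
# `args.*` attributes actually referenced above, but the default values and the '--mode'
# switch are assumptions, not the original project's interface. ReplayMemory and
# StateBuffer may also expect extra attributes (e.g. replay capacity, batch size) that
# are not visible in this file and are therefore omitted here.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'test', 'play'], default='train')
    parser.add_argument('--env', default='BreakoutDeterministic-v4')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--random_seed', type=int, default=1234)
    # Frame/state preprocessing
    parser.add_argument('--frame_width', type=int, default=84)
    parser.add_argument('--frame_height', type=int, default=84)
    parser.add_argument('--frames_per_state', type=int, default=4)
    # Training schedule
    parser.add_argument('--num_steps_train', type=int, default=50000000)
    parser.add_argument('--train_frequency', type=int, default=4)
    parser.add_argument('--max_ep_steps', type=int, default=18000)
    parser.add_argument('--learning_rate', type=float, default=0.00025)
    parser.add_argument('--discount_rate', type=float, default=0.99)
    parser.add_argument('--epsilon_start', type=float, default=1.0)
    parser.add_argument('--epsilon_end', type=float, default=0.1)
    parser.add_argument('--epsilon_step_end', type=int, default=1000000)
    parser.add_argument('--initial_replay_mem_size', type=int, default=50000)
    parser.add_argument('--update_target_step', type=int, default=10000)
    # Checkpoints, logs and results
    parser.add_argument('--ckpt_dir', default='./ckpts')
    parser.add_argument('--ckpt_file', default=None)
    parser.add_argument('--log_dir', default='./logs/train')
    parser.add_argument('--save_ckpt_step', type=int, default=250000)
    parser.add_argument('--save_log_step', type=int, default=1000)
    parser.add_argument('--results_dir', default='./test_results')
    parser.add_argument('--results_file', default='results.txt')
    # Evaluation/visualisation
    parser.add_argument('--num_eps_test', type=int, default=20)
    parser.add_argument('--num_eps', type=int, default=5)
    parser.add_argument('--max_ep_length', type=int, default=2000)
    parser.add_argument('--max_initial_random_steps', type=int, default=10)

    args = parser.parse_args()

    # Dispatch to the requested routine
    if args.mode == 'train':
        train(args)
    elif args.mode == 'test':
        test(args)
    else:
        play(args)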