def play(args):
    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n
    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state))

    # Instantiate DQN network
    DQN = DeepQNetwork(num_actions, state_ph, scope='DQN_main')
    DQN_predict_op = DQN.predict()

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)

    loader.restore(sess, ckpt)
    print('%s restored.\n\n' % ckpt)

    for ep in range(0, args.num_eps):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        step = 0
        ep_done = False
        initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)

        while not ep_done:
            time.sleep(0.05)
            env.render()

            # Choose random action for initial steps to ensure every episode has a random start point.
            # Then choose action with highest Q-value according to network's current policy.
            if step < initial_steps:
                action = env.action_space.sample()
            else:
                state = np.expand_dims(state_buf.get_state(), 0)
                action = sess.run(DQN_predict_op, {state_ph: state})

            frame, _, ep_terminal, _ = env.step(action)
            frame = preprocess_image(frame, args.frame_width, args.frame_height)
            state_buf.add(frame)
            step += 1

            # Episode can finish either by reaching terminal state or max episode steps
            if ep_terminal or step == args.max_ep_length:
                ep_done = True
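
# NOTE: `reset_env_and_state_buffer` and `preprocess_image` are helpers defined elsewhere in this
# repo (utils), so they are not shown in this file. The sketch below shows roughly what the reset
# helper is expected to do, based on how it is called above. It is an illustrative assumption, not
# the repo's actual implementation; in particular, the `reset()` method on StateBuffer is hypothetical.
def _reset_env_and_state_buffer_sketch(env, state_buf, args):
    # Reset the environment and fill the state buffer with the preprocessed first frame,
    # so that state_buf.get_state() immediately returns a full (H, W, frames_per_state) stack.
    frame = env.reset()
    frame = preprocess_image(frame, args.frame_width, args.frame_height)
    state_buf.reset()  # hypothetical method name
    for _ in range(args.frames_per_state):
        state_buf.add(frame)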
def test(args):
    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    # Set random seeds for reproducibility
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    # Initialise state buffer
    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state))

    # Instantiate DQN network
    DQN = DeepQNetwork(num_actions, state_ph, scope='DQN_main')
    DQN_predict_op = DQN.predict()

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)

    loader.restore(sess, ckpt)
    sys.stdout.write('%s restored.\n\n' % ckpt)
    sys.stdout.flush()
    ckpt_split = ckpt.split('-')
    train_ep = int(ckpt_split[-1])

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Create summary op to save episode reward to Tensorboard log
    reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Average Test Reward", reward_var)
    summary_op = tf.summary.merge_all()

    ## Begin testing
    env.reset()
    rewards = []

    for test_ep in range(args.num_eps_test):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        ep_reward = 0
        step = 0
        ep_done = False
        initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)
        sys.stdout.write('\n')
        sys.stdout.flush()

        while not ep_done:
            if args.render:
                env.render()
            else:
                env.render(mode='rgb_array')

            # Choose random action for initial steps to ensure every episode has a random start point.
            # Then choose action with highest Q-value according to network's current policy.
            if step < initial_steps:
                test_action = env.action_space.sample()
            else:
                test_state = np.expand_dims(state_buf.get_state(), 0)
                test_action = sess.run(DQN_predict_op, {state_ph: test_state})

            test_frame, test_reward, test_ep_terminal, _ = env.step(test_action)
            test_frame = preprocess_image(test_frame, args.frame_width, args.frame_height)
            state_buf.add(test_frame)

            ep_reward += test_reward
            step += 1

            sys.stdout.write('\x1b[2K\rTest episode {:d}/{:d} \t Steps = {:d} \t Reward = {:.2f}'.format(
                test_ep, args.num_eps_test, step, ep_reward))
            sys.stdout.flush()

            # Episode can finish either by reaching terminal state or max episode steps
            if test_ep_terminal or step == args.max_ep_length:
                rewards.append(ep_reward)
                ep_done = True

    mean_reward = np.mean(rewards)
    error_reward = ss.sem(rewards)

    sys.stdout.write('\n\nTesting complete \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(
        mean_reward, error_reward))
    sys.stdout.flush()

    # Log average episode reward for Tensorboard visualisation
    summary_str = sess.run(summary_op, {reward_var: mean_reward})
    summary_writer.add_summary(summary_str, train_ep)

    # Write results to file
    if args.results_file is not None:
        if not os.path.exists(args.results_dir):
            os.makedirs(args.results_dir)
        output_file = open(args.results_dir + '/' + args.results_file, 'a')
        output_file.write('Training Episode {}: \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(
            train_ep, mean_reward, error_reward))
        output_file.flush()
        output_file.close()
        sys.stdout.write('Results saved to file \n\n')
        sys.stdout.flush()

    env.close()
def play(args):
    generate_gifs = args.gifs
    save_images = args.save or args.gifs
    render_mode = 'rgb_array'
    scale_image = 16

    ACTION_SPACE = np.array([1, 2, 3, 4], dtype=np.uint8)

    # Function to get a random action
    def sample_action_space():
        return random.choice(ACTION_SPACE)

    # Function to convert actionID (1, 2, 3, 4) to actionQID (0, 1, 2, 3)
    def actionID_to_actionQID(actionID):
        return actionID - 1

    # Function to convert actionQID (0, 1, 2, 3) to actionID (1, 2, 3, 4)
    def actionQID_to_actionID(actionQID):
        return actionQID + 1

    # Create environment
    env = gym.make(args.env)
    num_actions = 4
    state_buf = StateBuffer(args)
    state_shape = (args.grid_height, args.grid_width, args.num_surfaces, args.grids_per_state)
    load_model_path = None

    # Create target directories if images are to be stored
    if save_images:
        try:
            if not os.path.exists('images'):
                os.makedirs('images')
            os.chdir('images')
            for subdir in ('steps', 'gif'):
                if not os.path.exists(subdir):
                    os.makedirs(subdir)
        except OSError:
            print('Error: Creating images target directories.')

    if args.checkpoint_file is not None:
        # Load an existing checkpoint
        load_model_path = os.path.join(args.checkpoint_dir, args.checkpoint_file)
        assert os.path.exists(load_model_path + '.index'), 'Path "{}" does not exist!'.format(load_model_path + '.index')
        start_step = args.checkpoint_file.split('/')[-1].split('-')[-1]
        assert len(start_step) > 0, "Invalid checkpoint file for extracting start_step"
        start_step = int(start_step)
    else:
        # Play with an untrained network
        # Create another directory for this run
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, args.log_filename.split('.')[0])
        start_step = 0
        # Create checkpoint directory
        if not os.path.exists(args.checkpoint_dir):
            os.makedirs(args.checkpoint_dir)

    DQN_target = DQNModel(state_shape, num_actions, load_model_path=load_model_path, name='DQN_target')

    for ep in range(0, args.num_eps):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        step = 0
        ep_done = False
        initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)

        while not ep_done:
            time.sleep(0.05)
            img = env.render(mode=render_mode)
            plt.imshow(img)
            display.clear_output(wait=True)
            display.display(plt.gcf())

            # Choose random action for initial steps to ensure every episode has a random start point.
            # Then choose action with highest Q-value according to network's current policy.
            if step < initial_steps:
                actionID = sample_action_space()
            else:
                if random.random() < args.epsilon_value:
                    # Take random action
                    actionID = sample_action_space()
                    print("Random Action\n")
                else:
                    # Take greedy action
                    state = tf.convert_to_tensor(state_buf.get_state(), dtype=tf.float32)
                    state = state[tf.newaxis, ...]  # Add an axis for batch
                    actionQID = DQN_target.predict(state)
                    actionID = actionQID_to_actionID(int(actionQID))  # convert from Tensor to int
                    print("Greedy Action\n")

            observation, reward, terminal, _ = env.step(actionID, observation_mode='tiny_rgb_array')
            grid = preprocess_observation(args, observation)
            state_buf.add(grid)
            step += 1

            if save_images:
                img = Image.fromarray(np.array(env.render(render_mode, scale=scale_image)), 'RGB')
                img.save(os.path.join('steps', 'observation_{}_{}.png'.format(ep, step)))

            # Episode can finish either by reaching terminal state or max episode steps
            if terminal or step == args.max_ep_length:
                ep_done = True

        if generate_gifs:
            print('')
            import imageio
            with imageio.get_writer(os.path.join('gif', 'episode_{}.gif'.format(ep)), mode='I', fps=1) as writer:
                for t in range(args.max_ep_length):
                    try:
                        filename = os.path.join('steps', 'observation_{}_{}.png'.format(ep, t))
                        image = imageio.imread(filename)
                        writer.append_data(image)
                    except FileNotFoundError:
                        # Skip steps for which no frame was saved
                        pass
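
# NOTE: `preprocess_observation` is a repo helper (not shown in this file) that turns the
# environment's 'tiny_rgb_array' observation into a (grid_height, grid_width, num_surfaces) grid,
# matching the state_shape used above. Below is a minimal sketch of the likely idea, assuming each
# cell can first be mapped to an integer surface ID in [0, num_surfaces); the helper name and the
# surface-ID assumption are hypothetical, not the repo's actual code.
def _one_hot_grid_sketch(surface_ids, num_surfaces):
    # surface_ids: (grid_height, grid_width) array of integer surface types
    # returns: (grid_height, grid_width, num_surfaces) one-hot encoding
    grid = np.zeros(surface_ids.shape + (num_surfaces,), dtype=np.uint8)
    rows, cols = np.indices(surface_ids.shape)
    grid[rows, cols, surface_ids] = 1
    return grid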
def train(args):
    # Function to return exploration rate based on current step
    def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end):
        if current_step < exp_step_end:
            exp_rate = exp_rate_start + current_step * ((exp_rate_end - exp_rate_start) / float(exp_step_end))
        else:
            exp_rate = exp_rate_end
        return exp_rate

    # Function to update target network parameters with main network parameters
    def update_target_network(from_scope, to_scope):
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        op_holder = []
        # Update old network parameters with new network parameters
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    # Initialise replay memory and state buffer
    replay_mem = ReplayMemory(args)
    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state))
    action_ph = tf.placeholder(tf.int32, (None,))
    target_ph = tf.placeholder(tf.float32, (None,))

    # Instantiate DQN network
    # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and
    # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars)
    DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main')
    DQN_predict_op = DQN.predict()
    DQN_train_step_op = DQN.train_step()

    # Instantiate DQN target network
    DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target')
    update_target_op = update_target_network('DQN_main', 'DQN_target')

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Add summaries for Tensorboard visualisation
    tf.summary.scalar('Loss', DQN.loss)
    reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Episode Reward", reward_var)
    epsilon_var = tf.Variable(args.epsilon_start, trainable=False)
    tf.summary.scalar("Exploration Rate", epsilon_var)
    summary_op = tf.summary.merge_all()

    # Define saver for saving model ckpts
    model_name = 'model.ckpt'
    checkpoint_path = os.path.join(args.ckpt_dir, model_name)
    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    saver = tf.train.Saver(max_to_keep=201)

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Load ckpt file if given
    if args.ckpt_file is not None:
        # Restore all variables from ckpt
        loader = tf.train.Saver()
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
        ckpt_split = ckpt.split('-')
        step_str = ckpt_split[-1]
        start_step = int(step_str)
        loader.restore(sess, ckpt)
    else:
        start_step = 0
        sess.run(tf.global_variables_initializer())
        sess.run(update_target_op)

    ## Begin training
    env.reset()

    ep_steps = 0
    episode_reward = 0
    episode_rewards = []
    duration_values = []

    # Initially populate replay memory by taking random actions
    sys.stdout.write('\nPopulating replay memory with random actions...\n')
    sys.stdout.flush()

    for random_step in range(1, args.initial_replay_mem_size + 1):
        if args.render:
            env.render()
        else:
            env.render(mode='rgb_array')

        action = env.action_space.sample()
        frame, reward, terminal, _ = env.step(action)
        frame = preprocess_image(frame, args.frame_width, args.frame_height)
        replay_mem.add(action, reward, frame, terminal)

        if terminal:
            env.reset()

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size))
        sys.stdout.flush()

    # Begin training process
    reset_env_and_state_buffer(env, state_buf, args)
    sys.stdout.write('\n\nTraining...\n\n')
    sys.stdout.flush()

    for train_step in range(start_step + 1, args.num_steps_train + 1):
        start_time = time.time()

        # Run 'train_frequency' iterations in the game for every training step
        for _ in range(0, args.train_frequency):
            ep_steps += 1

            if args.render:
                env.render()
            else:
                env.render(mode='rgb_array')

            # Use an epsilon-greedy policy to select action
            epsilon = exploration_rate(train_step, args.epsilon_start, args.epsilon_end, args.epsilon_step_end)
            if random.random() < epsilon:
                # Choose random action
                action = env.action_space.sample()
            else:
                # Choose action with highest Q-value according to network's current policy
                current_state = np.expand_dims(state_buf.get_state(), 0)
                action = sess.run(DQN_predict_op, {state_ph: current_state})

            # Take action and store experience
            frame, reward, terminal, _ = env.step(action)
            frame = preprocess_image(frame, args.frame_width, args.frame_height)
            state_buf.add(frame)
            replay_mem.add(action, reward, frame, terminal)
            episode_reward += reward

            if terminal or ep_steps == args.max_ep_steps:
                # Collect total reward of episode
                episode_rewards.append(episode_reward)
                # Reset episode reward and episode steps counters
                episode_reward = 0
                ep_steps = 0
                # Reset environment and state buffer for next episode
                reset_env_and_state_buffer(env, state_buf, args)

        ## Training step
        # Get minibatch from replay mem
        states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch()

        # Calculate target by passing next states through the target network and finding max future Q
        future_Q = sess.run(DQN_target.output, {state_ph: next_states_batch})
        max_future_Q = np.max(future_Q, axis=1)
        # Q values of the terminal states are 0 by definition
        max_future_Q[terminals_batch] = 0
        targets = rewards_batch + (max_future_Q * args.discount_rate)

        # Execute training step
        if train_step % args.save_log_step == 0:
            # Train and save logs
            average_reward = sum(episode_rewards) / len(episode_rewards)
            summary_str, _ = sess.run([summary_op, DQN_train_step_op], {
                state_ph: states_batch,
                action_ph: actions_batch,
                target_ph: targets,
                reward_var: average_reward,
                epsilon_var: epsilon
            })
            summary_writer.add_summary(summary_str, train_step)
            # Reset rewards buffer
            episode_rewards = []
        else:
            # Just train
            _ = sess.run(DQN_train_step_op, {
                state_ph: states_batch,
                action_ph: actions_batch,
                target_ph: targets
            })

        # Update target network
        if train_step % args.update_target_step == 0:
            sess.run(update_target_op)

        # Calculate time per step and display progress to console
        duration = time.time() - start_time
        duration_values.append(duration)
        ave_duration = sum(duration_values) / float(len(duration_values))

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(
            train_step, args.num_steps_train, ave_duration))
        sys.stdout.flush()

        # Save checkpoint
        if train_step % args.save_ckpt_step == 0:
            saver.save(sess, checkpoint_path, global_step=train_step)
            sys.stdout.write('\n Checkpoint saved\n')
            sys.stdout.flush()

            # Reset time calculation
            duration_values = []
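
# The target computation above is the standard one-step Q-learning target
# y = r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed for terminal transitions.
# A small self-contained numpy illustration with made-up numbers (not taken from a real run):
def _q_target_example():
    import numpy as np
    rewards_batch = np.array([1.0, 0.0, -1.0])
    max_future_Q = np.array([2.0, 3.0, 4.0])
    terminals_batch = np.array([False, True, False])
    discount_rate = 0.99
    max_future_Q[terminals_batch] = 0          # no bootstrapping from terminal states
    targets = rewards_batch + max_future_Q * discount_rate
    return targets                             # -> [2.98, 0.0, 2.96]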
def train(args):
    ACTION_SPACE = np.array([1, 2, 3, 4], dtype=np.uint8)

    # Function to get a random actionID
    def sample_action_space():
        return random.choice(ACTION_SPACE)

    # Function to convert actionID (1, 2, 3, 4) to actionQID (0, 1, 2, 3)
    def actionID_to_actionQID(actionID):
        return actionID - 1

    # Function to convert actionQID (0, 1, 2, 3) to actionID (1, 2, 3, 4)
    def actionQID_to_actionID(actionQID):
        return actionQID + 1

    # Function to return epsilon based on current step
    def get_epsilon(current_step, epsilon_start, epsilon_end, epsilon_decay_step):
        if current_step < epsilon_decay_step:
            return epsilon_start + (epsilon_end - epsilon_start) / float(epsilon_decay_step) * current_step
        else:
            return epsilon_end

    # Get logger for training
    logger = logging.getLogger('train')

    # Check if GPU is available
    logger.info("Num GPUs Available: %d", len(tf.config.experimental.list_physical_devices('GPU')))

    # Create environment
    env = gym.make(args.env)
    num_actions = 4  # Push (up, down, left, right): 1, 2, 3, 4
    env.unwrapped.set_maxsteps(args.max_step)
    env.unwrapped.set_rewards([args.env_penalty_for_step, args.env_reward_box_on_target,
                               args.env_penalty_box_off_target, args.env_reward_finished])

    # Set random seeds for reproducibility
    random.seed(args.random_seed)
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    # Initialize replay memory and state buffer
    replay_mem = ReplayMemory(args)
    state_buf = StateBuffer(args)

    # Check if resuming from a previous training run
    load_model_path = None
    if args.checkpoint_file is not None:
        # Resume training
        load_model_path = os.path.join(args.checkpoint_dir, args.checkpoint_file)
        assert os.path.exists(load_model_path + '.index'), 'Path "{}" does not exist!'.format(load_model_path + '.index')
        start_step = args.checkpoint_file.split('/')[-1].split('-')[-1]
        assert len(start_step) > 0, "Invalid checkpoint file for extracting start_step"
        start_step = int(start_step)
    else:
        # Train from scratch
        # Create another directory for this training
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, args.log_filename.split('.')[0])
        start_step = 0
        # Create checkpoint directory
        if not os.path.exists(args.checkpoint_dir):
            os.makedirs(args.checkpoint_dir)

    # Instantiate DQN and DQN_target
    state_shape = (args.grid_height, args.grid_width, args.num_surfaces, args.grids_per_state)
    DQN = DQNModel(state_shape, num_actions, args.learning_rate, load_model_path=load_model_path, name='DQN')
    DQN_target = DQNModel(state_shape, num_actions, load_model_path=load_model_path, name='DQN_target')

    ## Begin training
    env.reset()

    # Populate replay memory to initial_replay_mem_size
    logger.info("Populating replay memory with random actions...")
    for si in range(args.initial_replay_mem_size):
        if args.render:
            env.render(mode='human')
        else:
            env.render(mode='tiny_rgb_array')

        actionID = sample_action_space()
        observation, reward, terminal, _ = env.step(actionID, observation_mode='tiny_rgb_array')
        grid = preprocess_observation(args, observation)
        replay_mem.add(actionID, reward, grid, terminal)

        if terminal:
            env.reset()

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(si + 1, args.initial_replay_mem_size))
        sys.stdout.flush()

    # Start training
    reward_one_episode = 0
    reward_episodes = []
    step_one_episode = 0
    step_episodes = []
    Qval_steps = []
    duration_steps = []

    # Create tf summary writer to write summaries to disk
    # e.g. ./logs/tf_train/20200318_120026
    tf_train_log_dir = os.path.join(args.log_dir.replace('train', 'tf_train'), args.log_filename.split('.')[0])
    if not os.path.exists(tf_train_log_dir):
        os.makedirs(tf_train_log_dir)
    train_summary_writer = tf.summary.create_file_writer(tf_train_log_dir)
    train_summary_writer.set_as_default()
    if args.save_tb_trace:
        # Model graphs
        tf.summary.trace_on(graph=True, profiler=True)

    reset_env_and_state_buffer(env, state_buf, args)
    logger.info("Start training...")

    for si in range(start_step + 1, args.num_steps_train + 1):
        start_time = time.time()

        ## Playing Step
        # Perform a step
        if args.render:
            env.render(mode='human')
        else:
            env.render(mode='tiny_rgb_array')

        # Select an action based on the epsilon-greedy algorithm
        epsilon = get_epsilon(si, args.epsilon_start, args.epsilon_end, args.epsilon_decay_step)
        if random.random() < epsilon:
            # Take random action
            actionID = sample_action_space()
        else:
            # Take greedy action
            state = tf.convert_to_tensor(state_buf.get_state(), dtype=tf.float32)
            state = state[tf.newaxis, ...]  # Add an axis for batch
            actionQID = DQN.predict(state)
            actionID = actionQID_to_actionID(int(actionQID))  # convert from Tensor to int

        # Take the action and store the state transition
        observation, reward, terminal, _ = env.step(actionID, observation_mode='tiny_rgb_array')
        grid = preprocess_observation(args, observation)
        state_buf.add(grid)
        replay_mem.add(actionID, reward, grid, terminal)

        # Accumulate reward and increment step
        reward_one_episode += reward
        step_one_episode += 1

        if terminal:
            # Save the accumulated reward for this episode
            reward_episodes.append(reward_one_episode)
            reward_one_episode = 0
            # Save the number of steps for this episode
            step_episodes.append(step_one_episode)
            step_one_episode = 0
            # Reset environment and state buffer
            reset_env_and_state_buffer(env, state_buf, args)

        ## Training Step
        # Sample a random minibatch of transitions from ReplayMemory
        states_batch, actionID_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch()
        actionQID_batch = actionID_to_actionQID(actionID_batch)

        # Infer DQN_target for Q(S', A)
        next_states_batch = tf.convert_to_tensor(next_states_batch, dtype=tf.float32)
        next_states_Qvals = DQN_target.infer(next_states_batch)
        max_next_states_Qvals = tf.math.reduce_max(next_states_Qvals, axis=1, name='maxQ')
        assert max_next_states_Qvals.shape == (args.batch_size,), "Wrong dimension for predicted next state Q vals"

        # Set Q(S', A) to 0 for all terminal states S'
        max_next_states_Qvals = tf.math.multiply(max_next_states_Qvals,
                                                 tf.cast(np.logical_not(terminals_batch), tf.float32),
                                                 name='remove_terminals')

        # Save average maximum predicted Q value (over the non-terminal next states)
        Qvals_np = max_next_states_Qvals.numpy()
        Qval_steps.append(np.mean(Qvals_np[Qvals_np != 0]))

        # Calculate the target Q values
        targetQs = rewards_batch + args.discount_rate * max_next_states_Qvals

        # Pass to DQN
        states_batch = tf.cast(states_batch, tf.float32)
        targetQs = tf.cast(targetQs, tf.float32)
        DQN.train_step(states_batch, actionQID_batch, targetQs)

        # Update DQN_target every args.update_target_step steps
        if si % args.update_target_step == 0:
            update_save_path = os.path.join(args.checkpoint_dir, 'DQN_Update')
            DQN.save_model(update_save_path)
            DQN_target.load_model(update_save_path)

        duration = time.time() - start_time
        duration_steps.append(duration)

        # Save log
        if si % args.save_log_step == 0:
            avg_training_loss = DQN.get_training_loss()
            logger.info("{Training Step: %d/%d}", si, args.num_steps_train)
            logger.info("Number of Episodes: %d", len(reward_episodes))
            logger.info("Recent Step Exploration Rate: %.5f", epsilon)
            logger.info("Average Per-Episode Reward: %.5f", sum(reward_episodes) / float(len(reward_episodes)))
            logger.info("Average Per-Episode Step: %.3f", sum(step_episodes) / float(len(step_episodes)))
logger.info("Average Per-Step Maximum Predicted Q Value: %.8f", sum(Qval_steps)/float(len(Qval_steps))) logger.info("Average Per-Step Training Loss: %.8f", avg_training_loss) logger.info("Average Per-Step Training Time: %.5f second", sum(duration_steps)/float(len(duration_steps))) tf.summary.scalar('Episodes', len(reward_episodes), step=si, description='Number of Episodes') tf.summary.scalar('epsilon', epsilon, step=si, description='Recent Step Exploration Rate') tf.summary.scalar('avgReward', sum(reward_episodes)/float(len(reward_episodes)), step=si, description='Average Per-Episode Reward') tf.summary.scalar('avgStep', sum(step_episodes)/float(len(step_episodes)), step=si, description='Average Per-Episode Step Count') tf.summary.scalar('avgQval', sum(Qval_steps)/float(len(Qval_steps)), step=si, description='Average Per-Step Maximum Predicted Q Value') tf.summary.scalar('avgTrainLoss', avg_training_loss, step=si, description='Average Per-Step Training Loss') tf.summary.scalar('avgTrainTime', sum(duration_steps)/float(len(duration_steps)), step=si, description='Average Per-Step Training Time') if args.save_tb_trace: # Save computation graph tf.summary.trace_export(name="model_trace", step=si, profiler_outdir=tf_train_log_dir) # Reset the parameters reward_episodes = [] step_episodes = [] duration_steps = [] Qval_steps = [] # Save checkpoint if si % args.save_checkpoint_step == 0: save_checkpoint_path = os.path.join(args.checkpoint_dir, 'DQN_Train') DQN.save_model(save_checkpoint_path, ckpt_number=si) # Duplicate the current logfile src_log_filepath = os.path.join(args.log_dir, args.log_filename) dst_log_filepath = os.path.join(args.checkpoint_dir, 'DQN_Train_{}.log'.format(si)) shutil.copyfile(src_log_filepath, dst_log_filepath) # Training finished logger.info("Finished training...") # Save trained network save_final_network_path = os.path.join(args.checkpoint_dir, 'DQN_Trained') DQN.save_model(save_final_network_path, ckpt_number=args.num_steps_train)
def test(args):
    ACTION_SPACE = np.array([1, 2, 3, 4], dtype=np.uint8)

    # Function to get a random action
    def sample_action_space():
        return random.choice(ACTION_SPACE)

    # Function to convert actionID (1, 2, 3, 4) to actionQID (0, 1, 2, 3)
    def actionID_to_actionQID(actionID):
        return actionID - 1

    # Function to convert actionQID (0, 1, 2, 3) to actionID (1, 2, 3, 4)
    def actionQID_to_actionID(actionQID):
        return actionQID + 1

    def get_actionID(step, initial_steps, epsilon):
        # Choose random action for initial steps to ensure every episode has a random start point.
        # Then choose action with highest Q-value according to network's current policy.
        if step < initial_steps:
            actionID = sample_action_space()
        else:
            if random.random() < epsilon:
                # Take random action
                actionID = sample_action_space()
            else:
                # Take greedy action
                state = tf.convert_to_tensor(state_buf.get_state(), dtype=tf.float32)
                state = state[tf.newaxis, ...]  # Add an axis for batch
                actionQID = DQN_target.predict(state)
                actionID = actionQID_to_actionID(int(actionQID))  # convert from Tensor to int
        return actionID

    # Create environment
    env = gym.make(args.env)

    # Set random seeds for reproducibility
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    state_buf = StateBuffer(args)
    state_shape = (args.grid_height, args.grid_width, args.num_surfaces, args.grids_per_state)
    num_actions = 4
    epsilons = [0.1]  # [0.9, 0.5, 0.28, 0.2, 0.15, 0.1, 0.05]

    checkpoint_paths = get_checkpoint_paths(args)
    num_checkpoints = len(checkpoint_paths)

    out_file = open(args.results_file, "a+")
    out_file.write("pathStr, epsilon, mean_reward, error_reward, mean_step, ons, offs, wins\n\r")

    for epsilon in epsilons:
        for cp_id in range(0, num_checkpoints):
            path = checkpoint_paths[cp_id]
            out_str = "Starting Checkpoint test: {} \t {}/{} \t Epsilon: {}\n\r".format(
                path, cp_id + 1, num_checkpoints, epsilon)
            #output(out_str, out_file)
            print(out_str)

            #if args.checkpoint_list is not None:
            DQN_target = DQNModel(state_shape, num_actions, load_model_path=path, name='DQN_target')

            # Begin testing
            rewards = []
            step_totals = []
            cp_totals = Counts()
            for ep in range(0, args.num_eps):
                # Reset environment and state buffer for next episode
                reset_env_and_state_buffer(env, state_buf, args)
                ep_reward = 0
                ep_totals = Counts()
                step = 0
                ep_done = False
                initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)

                while not ep_done:
                    if args.render:
                        env.render()
                    else:
                        env.render(mode='tiny_rgb_array')

                    actionID = get_actionID(step, initial_steps, epsilon)
                    observation, reward, terminal, _ = env.step(actionID, observation_mode='tiny_rgb_array')
                    grid = preprocess_observation(args, observation)
                    state_buf.add(grid)

                    step += 1
                    ep_reward += reward
                    ep_totals.reward_update(reward)

                    # Episode can finish either by reaching terminal state or max episode steps
                    if terminal or step == args.max_step:
                        cp_totals.update_all(ep_totals)
                        step_totals.append(step)
                        out_str = 'Test ep {:d}/{:d} \t Steps = {:d} \t Reward = {:.2f} \t\n\r'.format(
                            ep + 1, args.num_eps, step, ep_reward)
                        #output(out_str, out_file)
                        print(out_str)
                        out_str = ep_totals.get_str()
                        #output(out_str, out_file)
                        print(out_str)
                        rewards.append(ep_reward)
                        ep_done = True

            mean_step = np.mean(step_totals)
            mean_reward = np.mean(rewards)
            error_reward = ss.sem(rewards)

            if not path:
                pathStr = "Beginning"
            else:
                pathStr = path

            out_str = pathStr + ' Checkpoint Testing complete \n\r'
            #output(out_str, out_file)
            print(out_str)

            out_str = 'Average reward = {:.2f} +/- {:.2f} /ep\t Average steps: {}\n\r'.format(
                mean_reward, error_reward, mean_step)
            #output(out_str, out_file)
            print(out_str)

            out_str = 'Totals: ' + cp_totals.get_str() + '\tEpsilon: ' + str(epsilon) + '\n\r\n\r'
            print(out_str)
            #output(out_str, out_file)

            out_str = '{},{},{:.2f},{:.2f},{:.2f},{},{},{},\n\r'.format(
                pathStr, epsilon, mean_reward, error_reward, mean_step,
                cp_totals.on, cp_totals.off, cp_totals.win)
            output(out_str, out_file)

    out_file.close()
    env.close()
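
# `output(out_str, out_file)` is a small helper defined elsewhere in the repo. Based on how it is
# used above (interchangeably with print and file writes), it presumably does something like the
# sketch below; this is an assumption about its behaviour, not the actual implementation.
def _output_sketch(out_str, out_file):
    # Print to the console and append the same line to the results file.
    print(out_str)
    out_file.write(out_str)
    out_file.flush()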