game.new_episode()
game_state = game.get_state()
misc = game_state.game_variables  # [KILLCOUNT, AMMO, HEALTH]
prev_misc = misc

action_size = game.get_available_buttons_size()

img_rows, img_cols = 64, 64  # Convert the image to 64x64 grayscale.
img_channels = 4             # We stack 4 frames.
state_size = (img_rows, img_cols, img_channels)

agent = DoubleDQNAgent(state_size, action_size)
agent.model = Networks.dueling_dqn(state_size, action_size, agent.learning_rate)
agent.target_model = Networks.dueling_dqn(state_size, action_size, agent.learning_rate)

x_t = game_state.screen_buffer  # 480 x 640
x_t = preprocessImg(x_t, size=(img_rows, img_cols))
s_t = np.stack(([x_t] * 4), axis=2)  # Becomes 64x64x4
s_t = np.expand_dims(s_t, axis=0)    # 1x64x64x4

is_terminated = game.is_episode_finished()

# Start training
epsilon = agent.initial_epsilon
GAME = 0
t = 0
max_life = 0  # Maximum episode life (proxy for agent performance)
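# --- Illustrative sketch (not the project's actual helper): one plausible
# preprocessImg that matches its use above -- convert the screen buffer to
# grayscale and resize it to (img_rows, img_cols). The real implementation may
# crop or normalize differently; skimage is assumed to be available.
import numpy as np
from skimage import color, transform


def preprocessImg_sketch(img, size):
    """Return a grayscale float image of shape `size` from a screen buffer."""
    img = np.asarray(img)
    if img.ndim == 3 and img.shape[0] == 3:   # channels-first (3, H, W) buffer
        img = np.moveaxis(img, 0, -1)         # -> (H, W, 3)
    if img.ndim == 3:
        img = color.rgb2gray(img)             # -> (H, W), values in [0, 1]
    return transform.resize(img, size)        # -> (img_rows, img_cols)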
def main():
    env = make(game='SonicTheHedgehog2-Genesis', state='EmeraldHillZone.Act1')

    # Parameters for observation image size processing.
    img_rows = 128
    img_cols = 128
    img_stack = 4
    action_size = 8  # 8 valid button combinations.

    # Inputs to the agent's prediction network will have the following shape.
    input_size = (img_rows, img_cols, img_stack)

    # File paths.
    stat_path = '../statistics/dqn_n-step'
    model_path = '../models/dqn_n-step'

    # Prioritized Experience Replay.
    if PER_AGENT:
        print('PER agent')
        stat_path += '_PER'
        model_path += '_PER'
        dqn_agent = DQN_PER_Agent(input_size, action_size)
    else:
        dqn_agent = DQN_Agent(input_size, action_size)

    # Use the noisy dueling network.
    if NOISY:
        stat_path += '_noisy_dueling'
        model_path += '_noisy_dueling'
        print('NOISY Dueling agent')
        dqn_agent.main_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.target_lr)
        dqn_agent.noisy = True
    # Use the normal dueling network.
    elif DUELING:
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling agent')
        dqn_agent.main_model = Networks.dueling_dqn(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_dqn(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Normal DQN.
    else:
        dqn_agent.main_model = Networks.dqn(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dqn(input_size, action_size,
                                              dqn_agent.target_lr)

    # Append the correct suffix and file type to the paths.
    stat_path += '_stats.csv'
    main_model_path = model_path + '_main.h5'
    target_model_path = model_path + '_target.h5'

    # Load previous models, or start from freshly instantiated networks.
    if LOAD_MODELS:
        dqn_agent.load_models(main_model_path, target_model_path)

    # Set the starting epsilon value.
    if EPSILON == START:
        dqn_agent.epsilon = dqn_agent.initial_epsilon
    elif EPSILON == MIDDLE:
        # Midpoint of the annealing range.
        dqn_agent.epsilon = (dqn_agent.final_epsilon
                             + (dqn_agent.initial_epsilon
                                - dqn_agent.final_epsilon) / 2)
    else:
        dqn_agent.epsilon = dqn_agent.final_epsilon

    # Store rewards and states from the previous n (state, action) pairs to
    # build n-step experiences.
    prev_n_rewards = deque(maxlen=dqn_agent.n_step)
    prev_n_exp = deque(maxlen=dqn_agent.n_step)

    # One episode is at most 4500 steps if the zone is not completed:
    # roughly 5 minutes of gameplay, since each step spans 4 frames at 60 Hz.
    total_timestep = 0  # Total number of timesteps over all episodes.
    for episode in range(EPISODES):
        done = False
        reward_sum = 0  # Cumulative reward within the episode.
        timestep = 0    # Track timesteps within the episode.

        # Rewards and states must be consecutive to preserve temporal
        # structure, so reset the deques at the start of each episode to
        # account for the sudden scene change.
        prev_n_rewards.clear()
        prev_n_exp.clear()

        # Experiences are a stack of the img_stack most recent frames to
        # provide temporal information. Initialize this sequence to the first
        # observation stacked 4 times.
        first_obs = env.reset()
        processed = preprocess_obs(first_obs, size=(img_rows, img_cols))
        # (img_rows, img_cols, img_stack)
        exp_stack = np.stack(([processed] * img_stack), axis=2)
        # Expand dimensions so multiple exp_stacks can be submitted as a batch:
        # (1, img_rows, img_cols, img_stack).
        exp_stack = np.expand_dims(exp_stack, axis=0)

        # Continue until the end of the zone is reached or 4500 timesteps
        # have passed.
        while not done:
            # Predict an action to take based on the most recent experience.
            #
            # Note that the first dimension of (1, img_rows, img_cols,
            # img_stack) is ignored by the network here, as it represents a
            # batch size of 1.
            act_idx, action = dqn_agent.act(exp_stack)
            obs, reward, done, info = env.step(action)
            # env.render()

            timestep += 1
            total_timestep += 1
            reward_sum += reward

            # Create a 1st dimension for stacking experiences and a 4th for
            # stacking img_stack frames.
            obs = preprocess_obs(obs, size=(img_rows, img_cols))
            obs = np.reshape(obs, (1, img_rows, img_cols, 1))

            # Append the new observation to the front of the stack and remove
            # the oldest (4th) frame.
            exp_stack_new = np.append(obs, exp_stack[:, :, :, :3], axis=3)

            # Save the previous state, selected action, and resulting reward.
            prev_n_rewards.append(reward)
            prev_n_exp.append((exp_stack, act_idx, done))
            exp_stack = exp_stack_new

            # Once sufficient steps have been taken, discount the rewards and
            # save the nth previous experience.
            if len(prev_n_rewards) >= dqn_agent.n_step:
                # Compute the discounted n-step return. Both deques are
                # appended on the right, so index 0 holds the oldest entry;
                # the oldest reward is discounted the least and the most
                # recent reward the most.
                discounted_reward = 0
                for idx in range(len(prev_n_rewards)):
                    prev_reward = prev_n_rewards[idx]
                    discounted_reward += ((dqn_agent.gamma ** idx) * prev_reward)

                # The nth previous (oldest) experience sits at index 0.
                original_state, original_act, _ = prev_n_exp[0]
                # The state reached after those n steps is the freshly updated
                # exp_stack; `done` flags whether that final step ended the
                # episode.
                nth_state, nth_done = exp_stack, done

                # Save the nth previous state and its predicted action together
                # with the discounted sum of rewards and the final state over
                # the next n steps.
                dqn_agent.save_memory(original_state, original_act,
                                      discounted_reward, nth_state, nth_done)

            # In the observation phase, skip training updates and epsilon
            # decrements.
            if total_timestep >= dqn_agent.observation_timesteps:
                # Update the target model with the main model's weights.
                if (total_timestep % dqn_agent.update_target_freq) == 0:
                    dqn_agent.update_target_model()

                # Train the agent on saved experiences.
                if (total_timestep % dqn_agent.timestep_per_train) == 0:
                    dqn_agent.replay_update()
                    dqn_agent.save_models(main_model_path, target_model_path)

                if dqn_agent.epsilon > dqn_agent.final_epsilon:
                    # Decrease epsilon by a fraction of the range so that it
                    # anneals over "exploration_timesteps".
                    dec = ((dqn_agent.initial_epsilon - dqn_agent.final_epsilon)
                           / dqn_agent.exploration_timesteps)
                    dqn_agent.epsilon -= dec

            # print(info)
            print("Episode:", episode, " Timestep:", timestep,
                  " Action:", act_idx, " Episode Reward Sum:", reward_sum,
                  " Epsilon:", dqn_agent.epsilon)

        # At the end of the episode, append the cumulative reward and timestep
        # count to the stats file.
        with open(stat_path, "a") as stats_fd:
            reward_str = ("Episode Cumulative Reward: " + str(reward_sum)
                          + ", Episode Timesteps: " + str(timestep) + ",\n")
            stats_fd.write(reward_str)
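# --- Illustrative helper (not part of the original source): the n-step return
# accumulated by the loop above, written as a standalone function. Rewards are
# assumed to be ordered oldest -> newest, matching prev_n_rewards.
def n_step_return(rewards, gamma):
    """Return sum_{k=0}^{n-1} gamma**k * r_k, with r_0 the oldest reward."""
    return sum((gamma ** k) * r for k, r in enumerate(rewards))


# Example: with gamma = 0.99 and rewards [1.0, 0.0, 2.0] (oldest first), the
# 3-step return is 1.0 + 0.99 * 0.0 + 0.99**2 * 2.0 = 2.9602. The agent then
# bootstraps from the state reached after those 3 steps.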
def main():
    env = make(game='SonicTheHedgehog2-Genesis', state='EmeraldHillZone.Act1')

    # Parameters for observation image size processing.
    img_rows = 128
    img_cols = 128
    img_stack = 4
    action_size = 8  # 8 valid button combinations.

    # Inputs to the agent's prediction network will have the following shape.
    input_size = (img_rows, img_cols, img_stack)

    # File paths.
    stat_path = '../statistics/dqn'
    model_path = '../models/dqn'

    # Prioritized Experience Replay.
    if PER_AGENT:
        print('PER agent')
        stat_path += '_PER'
        model_path += '_PER'
        dqn_agent = DQN_PER_Agent(input_size, action_size)
    # Distributional (C51) agent.
    elif DIST_AGENT:
        stat_path += '_DIST'
        model_path += '_DIST'
        dqn_agent = DistributionalDQN(input_size, action_size)
    else:
        dqn_agent = DQN_Agent(input_size, action_size)

    # Use the noisy dueling network.
    if NOISY:
        stat_path += '_noisy_dueling'
        model_path += '_noisy_dueling'
        print('NOISY Dueling agent')
        dqn_agent.main_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.target_lr)
        dqn_agent.noisy = True
    # Use the dueling distributional (C51) network.
    elif DUELING and DIST_AGENT:
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling distributional')
        dqn_agent.main_model = Networks.dueling_C51(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_C51(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Use the normal dueling network.
    elif DUELING:
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling agent')
        dqn_agent.main_model = Networks.dueling_dqn(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_dqn(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Use the plain distributional (C51) network.
    elif DIST_AGENT:
        dqn_agent.main_model = Networks.C51(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.C51(input_size, action_size,
                                              dqn_agent.target_lr)
    # Normal DQN.
    else:
        dqn_agent.main_model = Networks.dqn(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dqn(input_size, action_size,
                                              dqn_agent.target_lr)

    # Append the correct suffix and file type to the paths.
    stat_path += '_stats.csv'
    main_model_path = model_path + '_main.h5'
    target_model_path = model_path + '_target.h5'

    # Load previous models.
    if LOAD_MODELS:
        dqn_agent.load_models(main_model_path, target_model_path)

    # Set the starting epsilon value.
    if EPSILON == START:
        dqn_agent.epsilon = dqn_agent.initial_epsilon
    elif EPSILON == MIDDLE:
        # Midpoint of the annealing range.
        dqn_agent.epsilon = (dqn_agent.final_epsilon
                             + (dqn_agent.initial_epsilon
                                - dqn_agent.final_epsilon) / 2)
    else:
        dqn_agent.epsilon = dqn_agent.final_epsilon

    # One episode is at most 4500 steps if the zone is not completed:
    # roughly 5 minutes of gameplay, since each step spans 4 frames at 60 Hz.
    total_timestep = 0  # Total number of timesteps over all episodes.
    for episode in range(EPISODES):
        done = False
        reward_sum = 0  # Cumulative reward within the episode.
        timestep = 0    # Track timesteps within the episode.

        first_obs = env.reset()

        # Experiences are a stack of the img_stack most recent frames to
        # provide temporal information. Initialize this sequence to the first
        # observation stacked 4 times.
        processed = preprocess_obs(first_obs, size=(img_rows, img_cols))
        # (img_rows, img_cols, img_stack)
        exp_stack = np.stack(([processed] * img_stack), axis=2)
        # Expand dimensions so multiple exp_stacks can be submitted as a batch:
        # (1, img_rows, img_cols, img_stack).
        exp_stack = np.expand_dims(exp_stack, axis=0)

        # Punish the agent for not moving forward.
        prev_state = {}
        steps_stuck = 0

        # Continue until the end of the zone is reached or 4500 timesteps
        # have passed.
        while not done:
            # Predict an action to take based on the most recent experience.
            #
            # Note that the first dimension of (1, img_rows, img_cols,
            # img_stack) is ignored by the network here, as it represents a
            # batch size of 1.
            act_idx, action = dqn_agent.act(exp_stack)
            obs, reward, done, info = env.step(action)
            # env.render()

            # Punish the agent for standing still for too long.
            if prev_state == info:
                steps_stuck += 1
            else:
                steps_stuck = 0
            prev_state = info

            # The position-based reward sum does not include the stagnation
            # punishment.
            reward_sum += reward
            if steps_stuck > 20:
                reward -= 1

            # Track timestep counters.
            timestep += 1
            total_timestep += 1

            obs = preprocess_obs(obs, size=(img_rows, img_cols))
            # Create a 1st dimension for stacking experiences and a 4th for
            # stacking img_stack frames.
            obs = np.reshape(obs, (1, img_rows, img_cols, 1))

            # Append the new observation to the front of the stack and remove
            # the oldest (4th) frame.
            exp_stack_new = np.append(obs, exp_stack[:, :, :, :3], axis=3)

            # Save the experience: <state, action, reward, next_state, done>.
            dqn_agent.save_memory(exp_stack, act_idx, reward, exp_stack_new,
                                  done)
            exp_stack = exp_stack_new

            # In the observation phase, skip training updates and epsilon
            # decrements.
            if total_timestep >= dqn_agent.observation_timesteps:
                # Update the target model with the main model's weights.
                if (total_timestep % dqn_agent.update_target_freq) == 0:
                    dqn_agent.update_target_model()

                # Train the agent on saved experiences.
                if (total_timestep % dqn_agent.timestep_per_train) == 0:
                    dqn_agent.replay_update()
                    dqn_agent.save_models(main_model_path, target_model_path)

                if dqn_agent.epsilon > dqn_agent.final_epsilon:
                    # Decrease epsilon by a fraction of the range so that it
                    # anneals over "exploration_timesteps".
                    dec = ((dqn_agent.initial_epsilon - dqn_agent.final_epsilon)
                           / dqn_agent.exploration_timesteps)
                    dqn_agent.epsilon -= dec

            # print(info)
            print("Episode:", episode, " Timestep:", timestep,
                  " Action:", act_idx, " Episode Reward Sum:", reward_sum,
                  " Epsilon:", dqn_agent.epsilon)

        # At the end of the episode, append the cumulative reward and timestep
        # count to the stats file.
        with open(stat_path, "a") as stats_fd:
            reward_str = ("Episode Cumulative Reward: " + str(reward_sum)
                          + ", Episode Timesteps: " + str(timestep) + ",\n")
            stats_fd.write(reward_str)
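# --- Hedged sketch (an assumption, not the project's actual Networks module):
# a minimal Keras dueling DQN that fits the (img_rows, img_cols, img_stack)
# input and action_size output used by the training loops above. The dueling
# head combines a state-value stream and an advantage stream as
# Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras import backend as K


def dueling_dqn_sketch(input_size, action_size, learning_rate):
    inputs = layers.Input(shape=input_size)
    x = layers.Conv2D(32, 8, strides=4, activation='relu')(inputs)
    x = layers.Conv2D(64, 4, strides=2, activation='relu')(x)
    x = layers.Conv2D(64, 3, strides=1, activation='relu')(x)
    x = layers.Flatten()(x)

    value = layers.Dense(256, activation='relu')(x)
    value = layers.Dense(1)(value)                      # V(s)

    advantage = layers.Dense(256, activation='relu')(x)
    advantage = layers.Dense(action_size)(advantage)    # A(s, a)

    # Combine the streams into per-action Q-values.
    q_values = layers.Lambda(
        lambda streams: streams[0] + streams[1]
        - K.mean(streams[1], axis=1, keepdims=True)
    )([value, advantage])

    model = models.Model(inputs=inputs, outputs=q_values)
    model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate))
    return model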