def random_play_and_save(self, env, current_state, act_list):
    # Decide action
    action_index = env.action_space.sample()
    # Check if all previous actions are NOOP
    all_zeros = all(p == 0 for p in act_list)
    # Change action if all previous actions are NOOP and the current one is too
    if all_zeros and action_index == 0:
        action_index = random.randint(1, env.action_space.n - 1)
    # Build one-hot action vector
    action = np.eye(self.action_size, dtype=np.int8)[action_index]
    # Advance the game to the next state based on the action
    obs, reward, is_done, _ = env.step(action_index)
    # Pre-process observation
    obs = preprocess(obs)
    # Build next state
    next_state = get_next_state(current_state, obs)
    # Pre-process reward
    transformed_reward = transform_reward(reward)
    # Remember the previous state, action, reward, next state, and done flag
    self.memory.append((current_state, action, transformed_reward, next_state, is_done))
    return next_state, reward, is_done, action_index
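The helpers `preprocess`, `get_next_state`, and `transform_reward` are defined elsewhere. If you are following along, here is a minimal sketch of what they are assumed to do, based on the standard DQN Atari pipeline (grayscale downsampling, 4-frame stacking, reward clipping); the exact shapes and implementations are my assumptions, not the originals.

import numpy as np

def preprocess(img):
    # Downsample by a factor of 2 and convert RGB to grayscale
    # (210x160x3 -> 105x80 for a standard Atari frame).
    return np.mean(img[::2, ::2], axis=2).astype(np.uint8)

def get_next_state(current_state, obs):
    # Drop the oldest of the 4 stacked frames and append the newest one.
    return np.append(current_state[1:], [obs], axis=0)

def transform_reward(reward):
    # Clip rewards to {-1, 0, +1} so all games share one reward scale.
    return np.sign(reward)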
def q_iteration(self, env, current_state, act_list):
    # Choose the action (epsilon-greedy behaviour policy)
    if random.random() < self.epsilon:
        action_index = env.action_space.sample()
    else:
        action_index = self.choose_best_action(current_state)
    # Check if all previous actions are NOOP
    all_zeros = all(p == 0 for p in act_list)
    # Change action if all previous actions are NOOP and the current one is too
    if all_zeros and action_index == 0:
        action_index = random.randint(1, env.action_space.n - 1)
    # Build one-hot action vector
    action = np.eye(self.action_size, dtype=np.int8)[action_index]
    # Play one game iteration
    obs, reward, is_done, _ = env.step(action_index)
    # Pre-process observation
    obs = preprocess(obs)
    # Build next state
    next_state = get_next_state(current_state, obs)
    # Pre-process reward
    transformed_reward = transform_reward(reward)
    # Remember the previous state, action, reward, next state, and done flag
    self.memory.append((current_state, action, transformed_reward, next_state, is_done))
    # Sample a batch from replay memory and fit the network on it
    batch = self.memory.sample_batch(self.batch_size)
    self.fit_batch(batch)
    return next_state, reward, is_done, action_index
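`fit_batch` and `sample_batch` are not shown in this section. As a rough sketch of what the fit step is assumed to look like: a standard one-step Q-learning update using the masked-output trick common for Keras DQN models, where the one-hot action vector is both a second model input and the loss mask. `self.model` (a two-input Keras model) and `self.gamma` are assumed attributes.

def fit_batch(self, batch):
    # Unpack the batch; the field order matches the tuples stored above.
    start_states, actions, rewards, next_states, is_dones = map(np.array, zip(*batch))
    # Q-values of the next states, with an all-ones mask so every output
    # unit is kept; terminal transitions contribute no future value.
    next_q = self.model.predict([next_states, np.ones(actions.shape)])
    next_q[is_dones] = 0
    # One-step Bellman targets: r + gamma * max_a' Q(s', a')
    targets = rewards + self.gamma * np.max(next_q, axis=1)
    # Multiplying by the one-hot actions confines the loss to the taken action.
    self.model.fit([start_states, actions], actions * targets[:, None],
                   batch_size=len(start_states), epochs=1, verbose=0)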
def nonrandom_play_and_save(self, env, current_state, act_list):
    """Use this method when you need to continue training after an
    initial training session has finished. I have found that
    re-initializing the memory with random play in that situation
    can cause the agent to diverge."""
    # Choose the action according to the behaviour policy
    if random.random() < 0.05:
        action_index = env.action_space.sample()
    else:
        action_index = self.choose_best_action(current_state)
    # Check if all previous actions are NOOP
    all_zeros = all(p == 0 for p in act_list)
    # Change action if all previous actions are NOOP and the current one is too
    if all_zeros and action_index == 0:
        action_index = random.randint(1, env.action_space.n - 1)
    # Build one-hot action vector
    action = np.eye(self.action_size, dtype=np.int8)[action_index]
    # Advance the game to the next state based on the action
    obs, reward, is_done, _ = env.step(action_index)
    # Pre-process observation
    obs = preprocess(obs)
    # Build next state
    next_state = get_next_state(current_state, obs)
    # Pre-process reward
    transformed_reward = transform_reward(reward)
    # Remember the previous state, action, reward, next state, and done flag
    self.memory.append((current_state, action, transformed_reward, next_state, is_done))
    return next_state, reward, is_done, action_index
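For completeness, `choose_best_action` is assumed to be a straightforward greedy read-out of the Q-network. This sketch carries over the two-input `[state, mask]` model assumption from the `fit_batch` sketch above; the original may differ.

def choose_best_action(self, current_state):
    # Evaluate all Q-values for this state (batch of 1, all-ones mask)
    # and return the index of the largest one.
    mask = np.ones((1, self.action_size))
    q_values = self.model.predict([current_state[np.newaxis], mask])
    return np.argmax(q_values[0])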
for time_step in xrange(20000):
    # print "episode:", e, "time_step:", time_step
    # Turn this on if you want to render
    # env.render()
    # Choose the action according to the behaviour policy
    if random.random() < 0.05:
        action_index = env.action_space.sample()
    else:
        action_index = agent.choose_best_action(current_state)
    # Play one game iteration
    raw_obs, reward, is_done, _ = env.step(action_index)
    obs = preprocess(raw_obs)
    next_state = get_next_state(current_state, obs)
    # Make next_state the new current state for the next frame
    current_state = next_state
    # Update the (untransformed) episode return
    episode_return += reward
    # episode_return += transform_reward(reward)
    # Debugging aids: display the raw frame and pause
    # imgplot = plt.imshow(raw_obs)
    # plt.show()
    # raw_input("Press Enter to continue...")
    # is_done becomes True when the game ends
    if is_done:
        # Print the score and break out of the loop
        print("episode: {}, score: {}".format(e, episode_return))
        break
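The loop above reads `e`, `env`, `agent`, `current_state`, and `episode_return` from the enclosing scope. A hedged sketch of the per-episode setup it assumes, with `num_eval_episodes` as a hypothetical placeholder since the original enclosing loop is not shown in this section:

num_eval_episodes = 10  # hypothetical value for illustration

for e in xrange(num_eval_episodes):
    episode_return = 0
    # Reset the environment and stack the first processed frame 4 times
    # to build the initial state.
    obs = preprocess(env.reset())
    current_state = np.stack([obs] * 4, axis=0)
    # ... the time_step loop above runs here, updating current_state
    # and episode_return until is_done is True ...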