def random_play_and_save(self, env, current_state, act_list):
    """Take one uniformly random action, step the env and store the transition.

    Used to seed the replay memory with random play. Returns the tuple
    (next_state, raw_reward, is_done, action_index).
    """
    # Uniformly random behaviour policy.
    chosen = env.action_space.sample()
    # If every previous action in act_list was NOOP (0) and the sampled one is
    # NOOP too, force a non-NOOP action so the game actually progresses.
    if chosen == 0 and not any(act_list):
        chosen = random.randint(1, env.action_space.n - 1)
    # One-hot vector for the chosen action (this is what gets stored).
    one_hot = np.eye(self.action_size, dtype=np.int8)[chosen]
    # Advance the emulator by one step.
    frame, raw_reward, done, _ = env.step(chosen)
    processed = preprocess(frame)
    successor = get_next_state(current_state, processed)
    clipped = transform_reward(raw_reward)
    # Remember (state, action, reward, next_state, done) for experience replay.
    self.memory.append((current_state, one_hot, clipped, successor, done))
    return successor, raw_reward, done, chosen
def q_iteration(self, env, current_state, act_list):
    """One training iteration: epsilon-greedy action, store transition, fit a batch.

    Returns the tuple (next_state, raw_reward, is_done, action_index).
    """
    # Epsilon-greedy action selection.
    if random.random() < self.epsilon:
        chosen = env.action_space.sample()
    else:
        chosen = self.choose_best_action(current_state)
    # Break out of an all-NOOP action history by forcing a non-NOOP action.
    if chosen == 0 and not any(act_list):
        chosen = random.randint(1, env.action_space.n - 1)
    # One-hot encoding of the chosen action, as stored in replay memory.
    one_hot = np.eye(self.action_size, dtype=np.int8)[chosen]
    # Play one game iteration.
    frame, raw_reward, done, _ = env.step(chosen)
    successor = get_next_state(current_state, preprocess(frame))
    # Store the transition with the transformed (clipped) reward.
    self.memory.append((current_state, one_hot, transform_reward(raw_reward), successor, done))
    # Learn from a random minibatch of stored transitions.
    minibatch = self.memory.sample_batch(self.batch_size)
    self.fit_batch(minibatch)
    return successor, raw_reward, done, chosen
def nonrandom_play_and_save(self, env, current_state, act_list):
    """Fill replay memory using the (mostly greedy) behaviour policy.

    Intended for resuming training after an initial session has finished:
    re-seeding the memory with purely random play at that point was observed
    to make the agent diverge. Uses a fixed 5% exploration rate.

    Returns the tuple (next_state, raw_reward, is_done, action_index).
    """
    # Behaviour policy: greedy action with a fixed 0.05 exploration probability.
    exploring = random.random() < 0.05
    chosen = env.action_space.sample() if exploring else self.choose_best_action(current_state)
    # If the entire recent action history is NOOP and this action is NOOP too,
    # substitute a random non-NOOP action.
    if chosen == 0 and not any(act_list):
        chosen = random.randint(1, env.action_space.n - 1)
    # One-hot vector for the chosen action.
    one_hot = np.eye(self.action_size, dtype=np.int8)[chosen]
    # Advance the game one step.
    frame, raw_reward, done, _ = env.step(chosen)
    successor = get_next_state(current_state, preprocess(frame))
    # Store the transition with the transformed (clipped) reward.
    self.memory.append((current_state, one_hot, transform_reward(raw_reward), successor, done))
    return successor, raw_reward, done, chosen
# Evaluation script: run a trained DQN agent on Pong for 100 episodes using a
# mostly-greedy behaviour policy (5% random actions).
# NOTE(review): Python 2 (`xrange`); this chunk appears truncated — the inner
# loop ends after action selection, so the env.step call is not visible here.
from DQL_agents_preprocessing import preprocess, transform_reward, get_next_state
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# initialize gym environment and the agent
env = gym.make('PongDeterministic-v4')
agent = DQLAgent('pong')
# Load pre-trained weights (13M-frame Pong checkpoint).
agent.model.load_weights('DQN_pong_weights_13000000.hdf5')
# To test transfer just change the weights
# agent.model.load_weights('DQN_breakout_weights_20000000.hdf5')
returns = []          # per-episode returns collected over the evaluation run
episode_return = 0    # running return accumulator for the current episode
for episode in xrange(100):
    # Observe reward and initialize first state
    obs = preprocess(env.reset())
    # Initialize the first state with the same 4 images
    # NOTE(review): reshaping (1, 4, 105, 80) straight to (105, 80, 4) reorders
    # elements rather than transposing axes — presumably this matches how the
    # agent was trained; confirm against the training code.
    current_state = np.array([[obs, obs, obs, obs]], dtype=np.uint8).reshape((105, 80, 4))
    for time_step in xrange(20000):
        # print "episode:", e, "time_step:", time_step
        # turn this on if you want to render
        # env.render()
        # Choose the action according to the behaviour policy
        # (epsilon = 0.05: mostly greedy, occasionally random).
        if random.random() < 0.05:
            action_index = env.action_space.sample()
        else:
            action_index = agent.choose_best_action(current_state)
# Get data from OpenAI env = gym.make('BreakoutDeterministic-v4') # Define agent agent = DQLAgent('breakout') agent.model.load_weights('DQN_breakout_weights12000000.hdf5') frame_counter = 0 # Generate random indexes to shuffle database random_indexes = np.arange(MAX_STATES) np.random.shuffle(random_indexes) for episode in xrange(MAX_EPISODES): raw_obs = env.reset() obs = preprocess(raw_obs) # Initialize the first state with the same 4 images current_state = np.array([[obs, obs, obs, obs]], dtype=np.uint8).reshape((105, 80, 4)) for t in xrange(MAX_EPISODE_STATES): # run environment # env.render() # Choose the action according to the behaviour policy if random.random() < 0.4: action_index = env.action_space.sample() else: action_index = agent.choose_best_action(current_state) # Play one game iteration raw_obs, reward, is_done, _ = env.step(action_index)