class TestReplay(unittest.TestCase):
    def setUp(self):
        '''
        Create class instance and Gym environment instance
        '''
        self.memory = Replay(4)
        self.env = gym.make('CartPole-v0')

    def test_burn_memory(self):
        '''
        Test to check burn_memory functionality
        '''
        self.memory.burn_memory(self.env, 2)
        self.assertEqual(len(self.memory.store), 2)

    def test_replace(self):
        '''
        Test to check replacement of old transition tuples after crossing capacity
        '''
        self.memory.burn_memory(self.env, 2)
        state = self.env.reset()
        for _ in range(4):
            random_action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(random_action)
            # Store the full transition; the agent stores tuples in the
            # same (next_state, reward, state, action, done) order.
            self.memory.add_to_memory((next_state, reward, state, random_action, done))
            if done:
                state = self.env.reset()
            else:
                state = next_state
        self.assertEqual(len(self.memory.store), self.memory.capacity)

    def test_sample(self):
        '''
        Test to check sampling function of replay memory
        '''
        self.memory.burn_memory(self.env, 3)
        batch = self.memory.sample_from_memory(2)
        self.assertEqual(len(batch), 2)
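# The tests above exercise a Replay class that is not shown in this section.
# Below is a minimal sketch of the interface they assume (store, capacity,
# burn_memory, add_to_memory, sample_from_memory). The default burn-in and
# batch sizes here are placeholders; the project's actual implementation and
# defaults (e.g. Network_parameters.batch_size) may differ.
import random
from collections import deque


class Replay:
    def __init__(self, capacity=50000):
        self.capacity = capacity
        # deque with maxlen evicts the oldest transition once capacity is reached
        self.store = deque(maxlen=capacity)

    def burn_memory(self, env, burn_in=10000):
        '''Pre-fill the memory with transitions generated by a random policy.'''
        state = env.reset()
        for _ in range(burn_in):
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            self.store.append((next_state, reward, state, action, done))
            state = env.reset() if done else next_state

    def add_to_memory(self, transition):
        '''Append a transition tuple, replacing the oldest one at capacity.'''
        self.store.append(transition)

    def sample_from_memory(self, batch_size=32):
        '''Return a batch of transitions sampled uniformly at random.'''
        return random.sample(list(self.store), batch_size)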
class DQN_Agent:
    def __init__(self, parameters):
        # Gym environment parameters
        self.env_name = parameters.environment_name
        self.env = gym.make(self.env_name)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n

        # Training parameters
        self.discount = Training_parameters.discount
        self.train_episodes = parameters.train_episodes
        self.test_episodes = Training_parameters.test_episodes
        self.test_frequency = Training_parameters.test_frequency
        self.render_decision = parameters.render_decision
        self.render_frequency = Training_parameters.render_frequency

        # Replay memory parameters
        self.memory = Replay()
        self.memory.burn_memory(self.env)

        # Q-network parameters
        self.Q_net = Network(self.state_dim, self.action_dim,
                             Network_parameters.Q_net_var_scope, parameters.duel)
        self.target_Q_net = Network(self.state_dim, self.action_dim,
                                    Network_parameters.target_Q_net_var_scope, parameters.duel)
        self.update_target_frequency = Training_parameters.update_target_frequency
        self.double = parameters.double

    def epsilon_greedy_policy(self, q_values, epsilon=0.05):
        '''
        Returns action as per epsilon-greedy policy
        Parameters:
            q_values: Q-values for the possible actions
            epsilon: Probability of taking an exploratory (random) action
        Output:
            Action selected by agent as per epsilon-greedy policy
        '''
        if random.random() < epsilon:
            return self.env.action_space.sample()
        return self.greedy_policy(q_values)

    def greedy_policy(self, q_values):
        '''
        Returns action as per greedy policy
        Parameters:
            q_values: Q-values for the possible actions
        Output:
            Action selected by agent corresponding to maximum Q-value
        '''
        return np.argmax(q_values)

    def train(self):
        performance = []

        # Set up video recording for the Gym environment
        if self.render_decision:
            f = lambda x: x % self.render_frequency == 0
            self.env.render()
            video_save_path = f'{Directories.output}Video_DQN_{self.env_name}/'
            self.env = gym.wrappers.Monitor(self.env, video_save_path,
                                            video_callable=f, force=True)
            self.env.reset()

        for episode in range(self.train_episodes):
            state = self.env.reset()
            done = False
            while not done:
                # Perform an action in the environment and add the transition to replay memory
                Q_values = self.Q_net.predict(state.reshape(-1, self.state_dim))
                # Anneal exploration probability epsilon linearly over training
                epsilon = Training_parameters.inital_eps - (
                    Training_parameters.scale_eps
                    * (Training_parameters.inital_eps - Training_parameters.final_eps)
                    * (episode / self.train_episodes)
                )
                action = self.epsilon_greedy_policy(Q_values, epsilon)
                next_state, reward, done, _ = self.env.step(action)
                self.memory.add_to_memory((next_state, reward, state, action, done))
                # Advance the state; without this the agent would act from a stale state
                state = next_state

                # Sample batch from memory and train model
                batch = self.memory.sample_from_memory()
                batch_next_state, batch_reward, batch_state, batch_action, check_if_terminal = \
                    map(np.array, zip(*batch))
                check_if_not_terminal = np.invert(check_if_terminal)
                if self.double:
                    # Double DQN: select greedy next actions with the online network,
                    # then evaluate those actions with the target network
                    Q_next = self.Q_net.predict(batch_next_state.reshape(-1, self.state_dim))
                    next_actions = np.argmax(Q_next, axis=1)
                    target_Q_next_all_actions = self.target_Q_net.predict(
                        batch_next_state.reshape(-1, self.state_dim))
                    # NumPy fancy indexing picks Q(s', argmax_a Q_online(s', a))
                    target_Q_next = target_Q_next_all_actions[
                        np.arange(Network_parameters.batch_size), next_actions]
                    targets = batch_reward + check_if_not_terminal * self.discount * target_Q_next
                else:
                    target_Q_next = self.target_Q_net.predict(
                        batch_next_state.reshape(-1, self.state_dim))
                    targets = batch_reward + \
                        check_if_not_terminal * self.discount * np.max(target_Q_next, axis=1)
                actions_selected = np.vstack(
                    [np.arange(Network_parameters.batch_size), batch_action]).T
                self.Q_net.fit(batch_state, targets, actions_selected)

            # Update target model as per update frequency
            if episode % self.update_target_frequency == 0:
                self.Q_net.update_target_model(self.target_Q_net)

            # Test policy as per test frequency
            if episode % self.test_frequency == 0:
                test_rewards, test_std = self.test()
                print(f'After {episode} episodes, mean test reward is {test_rewards} '
                      f'with std of {test_std}')
                performance.append((test_rewards, test_std))

        return performance

    def test(self):
        rewards = []
        for test_episode in range(self.test_episodes):
            curr_episode_reward = 0
            state = self.env.reset()
            done = False
            while not done:
                action = self.greedy_policy(self.Q_net.predict(state.reshape(1, -1)))
                next_state, reward, done, _ = self.env.step(action)
                curr_episode_reward += reward
                state = next_state
            rewards.append(curr_episode_reward)
        rewards = np.array(rewards)
        return np.mean(rewards), np.std(rewards)
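# A hypothetical driver showing how the pieces above fit together. The
# attribute names on `parameters` are exactly those read in DQN_Agent.__init__;
# SimpleNamespace and the specific values below are placeholders for whatever
# argument parser or config object the full project uses.
from types import SimpleNamespace

if __name__ == '__main__':
    parameters = SimpleNamespace(
        environment_name='CartPole-v0',
        train_episodes=500,
        render_decision=False,
        duel=False,    # dueling network architecture off
        double=True,   # use Double DQN targets
    )
    agent = DQN_Agent(parameters)
    performance = agent.train()
    print(f'Final mean test reward: {performance[-1][0]}')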