Code Example #1
File: test.py  Project: kshitij-ingale/Deep_Q_net
import unittest

import gym

# The Replay class under test comes from the project's replay-memory module;
# its import path is not shown in this excerpt.


class TestReplay(unittest.TestCase):
    def setUp(self):
        ''' Create class instance and Gym environment instance '''
        self.memory = Replay(4)
        self.env = gym.make('CartPole-v0')

    def test_burn_memory(self):
        ''' Test to check burn_memory functionality '''
        self.memory.burn_memory(self.env, 2)
        self.assertEqual(len(self.memory.store), 2)

    def test_replace(self):
        ''' Test to check replacement of old transition tuples after crossing capacity '''
        self.memory.burn_memory(self.env, 2)
        state = self.env.reset()
        for _ in range(4):
            random_action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(random_action)
            self.memory.add_to_memory((next_state, reward, state, done))
            if done:
                state = self.env.reset()
            else:
                state = next_state
        self.assertEqual(len(self.memory.store), self.memory.capacity)

    def test_sample(self):
        ''' Test to check sampling function of replay memory '''
        self.memory.burn_memory(self.env, 3)
        batch = self.memory.sample_from_memory(2)
        self.assertEqual(len(batch), 2)
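As a usage note, the test class above can be run with Python's built-in unittest runner; a minimal entry point (an assumption, not part of the original test.py excerpt) would be:

if __name__ == '__main__':
    unittest.main()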
Code Example #2
import random

import gym
import numpy as np
import tensorflow as tf

# Project-specific classes and configs (Replay, Network, Training_parameters,
# Network_parameters, Directories) come from the repository's own modules,
# which are not shown in this excerpt.


class DQN_Agent:

    def __init__(self, parameters):
        # Gym environment parameters
        self.env_name = parameters.environment_name
        self.env = gym.make(self.env_name)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n
        # Training parameters
        self.discount = Training_parameters.discount
        self.train_episodes = parameters.train_episodes
        self.test_episodes = Training_parameters.test_episodes
        self.test_frequency = Training_parameters.test_frequency
        self.render_decision = parameters.render_decision
        self.render_frequency = Training_parameters.render_frequency
        # Replay memory parameters
        self.memory = Replay()
        self.memory.burn_memory(self.env)
        # Q-networks parameters
        self.Q_net = Network(self.state_dim, self.action_dim, Network_parameters.Q_net_var_scope, parameters.duel)
        self.target_Q_net = Network(self.state_dim, self.action_dim, Network_parameters.target_Q_net_var_scope, parameters.duel)
        self.update_target_frequency = Training_parameters.update_target_frequency
        self.double = parameters.double

    def epsilon_greedy_policy(self, q_values, epsilon=0.05):
        """
        Returns action as per epsilon-greedy policy
        :param q_values: Q-values for the possible actions
        :param epsilon: Parameter to define exploratory action probability
        :return: action: Action selected by agent as per epsilon-greedy policy
        """
        if random.random() < epsilon:
            return self.env.action_space.sample()
        else:
            return self.greedy_policy(q_values)

    def greedy_policy(self, q_values):
        '''
        Returns action as per greedy policy

        Parameters:
        q_values: Q-values for the possible actions

        Output:
        Action selected by agent as per greedy policy corresponding to maximum Q-value
        '''
        return np.argmax(q_values)

    def train(self):
        performance = []
        # Setup video rendering for Gym environment
        if self.render_decision:
            f = lambda X: X % self.render_frequency == 0
            self.env.render()
            video_save_path = f'{Directories.output}Video_DQN_{self.env_name}/'
            self.env = gym.wrappers.Monitor(self.env, video_save_path, video_callable=f, force=True)
            self.env.reset()

        for episode in range(self.train_episodes):
            state = self.env.reset()
            done = False
            while not done:
                # Perform an action in environment and add to replay memory
                Q_values = self.Q_net.predict(state.reshape(-1, self.state_dim))
                # Anneal exploration probability epsilon
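                # Illustrative values (assumptions, not the project's actual config): with
                # inital_eps=0.5, final_eps=0.05 and scale_eps=1.0, epsilon starts at 0.5 on
                # episode 0 and decays linearly toward 0.05 by the final training episode.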
                epsilon = Training_parameters.inital_eps - (
                    Training_parameters.scale_eps
                    * (Training_parameters.inital_eps - Training_parameters.final_eps)
                    * (episode / self.train_episodes))
                action = self.epsilon_greedy_policy(Q_values, epsilon)
                next_state, reward, done, _ = self.env.step(action)
                self.memory.add_to_memory((next_state, reward, state, action, done))

                # Sample batch from memory and train model
                batch = self.memory.sample_from_memory()
                batch_next_state, batch_reward, batch_state, batch_action, check_if_terminal = map(
                    np.array, zip(*batch))
                check_if_not_terminal = np.invert(check_if_terminal)
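                # Double DQN (standard formulation): the online Q-network selects the next
                # action while the target network evaluates it, reducing overestimation bias;
                # the vanilla branch instead takes the max over the target network's Q-values.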
                if self.double:
                    Q_next = self.Q_net.predict(batch_next_state.reshape(-1, self.state_dim))
                    next_actions = np.argmax(Q_next, axis=1)
                    next_actions_indices = np.vstack([np.arange(Network_parameters.batch_size), next_actions]).T
                    target_Q_next_all_actions = self.target_Q_net.predict(batch_next_state.reshape(-1, self.state_dim))
                    targets = batch_reward + check_if_not_terminal * self.discount * tf.gather_nd(
                        target_Q_next_all_actions, next_actions_indices)
                else:
                    target_Q_next = self.target_Q_net.predict(batch_next_state.reshape(-1, self.state_dim))
                    targets = batch_reward + check_if_not_terminal * self.discount * np.max(target_Q_next, axis=1)
                actions_selected = np.vstack([np.arange(Network_parameters.batch_size), batch_action]).T
                self.Q_net.fit(batch_state, targets, actions_selected)

            # Update target model as per update frequency
            if episode % self.update_target_frequency == 0:
                self.Q_net.update_target_model(self.target_Q_net)

            # Test policy as per test frequency
            if episode % self.test_frequency == 0:
                test_rewards, test_std = self.test()
                print(f'After {episode} episodes, mean test reward is {test_rewards} with std of {test_std}')
                performance.append((test_rewards, test_std))
        return performance

    def test(self):
        rewards = []
        for test_episode in range(self.test_episodes):
            curr_episode_reward = 0
            state = self.env.reset()
            done = False
            while not done:
                action = self.greedy_policy(self.Q_net.predict(state.reshape(1, -1)))
                next_state, reward, done, _ = self.env.step(action)
                curr_episode_reward += reward
                if done:
                    state = self.env.reset()
                else:
                    state = next_state
            rewards.append(curr_episode_reward)
        rewards = np.array(rewards)
        return np.mean(rewards), np.std(rewards)
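Both examples depend on a Replay class that is not shown on this page. The sketch below only illustrates the interface implied by the tests and the agent (a capacity argument, a store attribute, burn_memory, add_to_memory, sample_from_memory); the default values and the deque-based storage are assumptions, not the project's actual implementation.

import random
from collections import deque


class Replay:
    ''' Illustrative fixed-capacity replay memory (assumed interface, not the project code) '''

    def __init__(self, capacity=50000):
        self.capacity = capacity
        # deque drops the oldest transition once capacity is exceeded
        self.store = deque(maxlen=capacity)

    def burn_memory(self, env, burn_in=10000):
        ''' Pre-fill memory with transitions collected by a random policy '''
        state = env.reset()
        for _ in range(burn_in):
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            self.add_to_memory((next_state, reward, state, action, done))
            state = env.reset() if done else next_state

    def add_to_memory(self, transition):
        ''' Append a transition tuple, evicting the oldest one when full '''
        self.store.append(transition)

    def sample_from_memory(self, batch_size=32):
        ''' Return a uniformly sampled batch of transitions '''
        return random.sample(list(self.store), batch_size)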