def test_replay_start_size(self, mock_parameters):
    """Verify no minibatch training occurs before replay_start_size is reached.

    Exploration is disabled (epsilon pinned at 0), so any 'RANDOM' behavior
    reported by the agent can only come from the replay-memory warm-up phase,
    not from epsilon-greedy exploration.
    """
    params = mock_parameters.return_value
    self._setup_parameters(params)
    # Pin exploration rate to zero for the whole test.
    params.initial_epsilon = 0
    params.epsilon_decay_step_count = 100
    params.epsilon_minimum = 0
    params.replay_start_size = 3

    act_space = spaces.Discrete(2)
    obs_space = spaces.Box(0, 1, (1,))
    agent = QLearning('', obs_space, act_space)
    agent._trainer = MagicMock()
    agent._replay_memory = MagicMock()

    # Episode 1: memory still warming up -> RANDOM actions, no training.
    _, info = agent.start(np.array([0.1], np.float32))
    self.assertEqual(agent.step_count, 0)
    self.assertEqual(agent._trainer.train_minibatch.call_count, 0)
    self.assertEqual(info['action_behavior'], 'RANDOM')

    _, info = agent.step(0.1, np.array([0.2], np.float32))
    self.assertEqual(agent.step_count, 1)
    self.assertEqual(agent._trainer.train_minibatch.call_count, 0)
    self.assertEqual(info['action_behavior'], 'RANDOM')

    agent.end(0.2, np.array([0.3], np.float32))
    self.assertEqual(agent.step_count, 2)
    self.assertEqual(agent._trainer.train_minibatch.call_count, 0)

    # Episode 2: still below replay_start_size at the first call.
    _, info = agent.start(np.array([0.4], np.float32))
    self.assertEqual(agent.step_count, 2)
    self.assertEqual(agent._trainer.train_minibatch.call_count, 0)
    self.assertEqual(info['action_behavior'], 'RANDOM')

    # Warm-up complete: behavior flips to GREEDY, but training waits one more step.
    _, info = agent.step(0.3, np.array([0.5], np.float32))
    self.assertEqual(agent.step_count, 3)
    self.assertEqual(agent._trainer.train_minibatch.call_count, 0)
    self.assertEqual(info['action_behavior'], 'GREEDY')

    _, info = agent.start(np.array([0.6], np.float32))
    self.assertEqual(agent.step_count, 3)
    self.assertEqual(agent._trainer.train_minibatch.call_count, 0)
    self.assertEqual(info['action_behavior'], 'GREEDY')

    # First minibatch update fires once enough transitions are stored.
    _, info = agent.step(0.4, np.array([0.7], np.float32))
    self.assertEqual(agent.step_count, 4)
    self.assertEqual(agent._trainer.train_minibatch.call_count, 1)
    self.assertEqual(info['action_behavior'], 'GREEDY')
# Example #2
    def test_update_q(self, mock_parameters, mock_replay_memory):
        """Test if _update_q_periodically() can finish successfully.

        Walks one episode (start, two steps, end) with scripted actions and
        checks epsilon decay, the learning-rate schedule, and the agent's
        _last_state/_last_action bookkeeping at every call.
        """
        self._setup_parameters(mock_parameters.return_value)
        self._setup_replay_memory(mock_replay_memory.return_value)

        action_space = spaces.Discrete(2)
        observation_space = spaces.Box(0, 1, (1, ))
        sut = QLearning('', observation_space, action_space)
        sut._trainer.train_minibatch = MagicMock()
        # Scripted choices, consumed one per start()/step() call.
        sut._choose_action = MagicMock(side_effect=[
            (1, 'GREEDY'),
            (0, 'GREEDY'),
            (1, 'RANDOM'),
        ])

        action, debug_info = sut.start(np.array([0.1], np.float32))
        self.assertEqual(action, 1)
        self.assertEqual(debug_info['action_behavior'], 'GREEDY')
        self.assertEqual(sut.episode_count, 1)
        self.assertEqual(sut.step_count, 0)
        self.assertEqual(sut._epsilon, 0.1)
        self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(),
                         0.1)
        # Array-aware comparison: assertEqual on ndarrays only works by
        # accident for single-element arrays.
        np.testing.assert_array_equal(sut._last_state,
                                      np.array([0.1], np.float32))
        self.assertEqual(sut._last_action, 1)

        action, debug_info = sut.step(1, np.array([0.2], np.float32))
        self.assertEqual(action, 0)
        self.assertEqual(debug_info['action_behavior'], 'GREEDY')
        self.assertEqual(sut.episode_count, 1)
        self.assertEqual(sut.step_count, 1)
        self.assertEqual(sut._epsilon, 0.09)
        # learning rate remains 0.1 as Q is not updated during this time step.
        self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(),
                         0.1)
        np.testing.assert_array_equal(sut._last_state,
                                      np.array([0.2], np.float32))
        self.assertEqual(sut._last_action, 0)

        action, debug_info = sut.step(2, np.array([0.3], np.float32))
        self.assertEqual(action, 1)
        self.assertEqual(debug_info['action_behavior'], 'RANDOM')
        self.assertEqual(sut.episode_count, 1)
        self.assertEqual(sut.step_count, 2)
        self.assertEqual(sut._epsilon, 0.08)
        self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(),
                         0.08)
        np.testing.assert_array_equal(sut._last_state,
                                      np.array([0.3], np.float32))
        self.assertEqual(sut._last_action, 1)

        sut.end(3, np.array([0.4], np.float32))
        self.assertEqual(sut.episode_count, 1)
        self.assertEqual(sut.step_count, 3)
        self.assertEqual(sut._epsilon, 0.08)
        # learning rate remains 0.08 as Q is not updated during this time step.
        self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(),
                         0.08)
    def test_populate_replay_memory(self, mock_parameters):
        """Transitions pass through the SlidingWindow(2) preprocessor and are
        stored in replay memory together with their computed priorities."""
        params = mock_parameters.return_value
        self._setup_parameters(params)
        params.preprocessing = \
            'cntk.contrib.deeprl.agent.shared.preprocessing.SlidingWindow'
        params.preprocessing_args = '(2, )'

        act_space = spaces.Discrete(2)
        obs_space = spaces.Box(0, 1, (1,))
        agent = QLearning('', obs_space, act_space)

        # Deterministic priorities and actions; skip actual Q updates.
        agent._compute_priority = Mock(side_effect=[1, 2, 3])
        agent._choose_action = Mock(
            side_effect=[(0, ''), (0, ''), (1, ''), (1, '')])
        agent._replay_memory = MagicMock()
        agent._update_q_periodically = MagicMock()

        # One full episode: start, two steps, end -> three stored transitions.
        agent.start(np.array([0.1], np.float32))
        agent.step(0.1, np.array([0.2], np.float32))
        agent.step(0.2, np.array([0.3], np.float32))
        agent.end(0.3, np.array([0.4], np.float32))

        self.assertEqual(agent._replay_memory.store.call_count, 3)

        # First transition: window-stacked state, action 0, reward 0.1,
        # next state, priority 1.
        first_store = agent._replay_memory.store.call_args_list[0][0]
        np.testing.assert_array_equal(
            first_store[0],
            np.array([[0], [0.1]], np.float32))
        self.assertEqual(first_store[1], 0)
        self.assertEqual(first_store[2], 0.1)
        np.testing.assert_array_equal(
            first_store[3],
            np.array([[0.1], [0.2]], np.float32))
        self.assertEqual(first_store[4], 1)

        # Terminal transition: next state is None, priority 3.
        last_store = agent._replay_memory.store.call_args_list[2][0]
        np.testing.assert_array_equal(
            last_store[0],
            np.array([[0.2], [0.3]], np.float32))
        self.assertEqual(last_store[1], 1)
        self.assertEqual(last_store[2], 0.3)
        self.assertIsNone(last_store[3])
        self.assertEqual(last_store[4], 3)
# Example #4
    def test_update_q(self,
                      mock_parameters,
                      mock_replay_memory):
        """Test if _update_q_periodically() can finish successfully.

        Walks one episode (start, two steps, end) with scripted actions and
        checks epsilon decay, the learning-rate schedule, and the agent's
        _last_state/_last_action bookkeeping at every call.
        """
        self._setup_parameters(mock_parameters.return_value)
        self._setup_replay_memory(mock_replay_memory.return_value)

        action_space = spaces.Discrete(2)
        observation_space = spaces.Box(0, 1, (1,))
        sut = QLearning('', observation_space, action_space)
        sut._trainer.train_minibatch = MagicMock()
        # Scripted choices, consumed one per start()/step() call.
        sut._choose_action = MagicMock(side_effect=[
            (1, 'GREEDY'),
            (0, 'GREEDY'),
            (1, 'RANDOM'),
        ])

        action, debug_info = sut.start(np.array([0.1], np.float32))
        self.assertEqual(action, 1)
        self.assertEqual(debug_info['action_behavior'], 'GREEDY')
        self.assertEqual(sut.episode_count, 1)
        self.assertEqual(sut.step_count, 0)
        self.assertEqual(sut._epsilon, 0.1)
        self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.1)
        # Array-aware comparison: assertEqual on ndarrays only works by
        # accident for single-element arrays.
        np.testing.assert_array_equal(sut._last_state,
                                      np.array([0.1], np.float32))
        self.assertEqual(sut._last_action, 1)

        action, debug_info = sut.step(1, np.array([0.2], np.float32))
        self.assertEqual(action, 0)
        self.assertEqual(debug_info['action_behavior'], 'GREEDY')
        self.assertEqual(sut.episode_count, 1)
        self.assertEqual(sut.step_count, 1)
        self.assertEqual(sut._epsilon, 0.09)
        # learning rate remains 0.1 as Q is not updated during this time step.
        self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.1)
        np.testing.assert_array_equal(sut._last_state,
                                      np.array([0.2], np.float32))
        self.assertEqual(sut._last_action, 0)

        action, debug_info = sut.step(2, np.array([0.3], np.float32))
        self.assertEqual(action, 1)
        self.assertEqual(debug_info['action_behavior'], 'RANDOM')
        self.assertEqual(sut.episode_count, 1)
        self.assertEqual(sut.step_count, 2)
        self.assertEqual(sut._epsilon, 0.08)
        self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.08)
        np.testing.assert_array_equal(sut._last_state,
                                      np.array([0.3], np.float32))
        self.assertEqual(sut._last_action, 1)

        sut.end(3, np.array([0.4], np.float32))
        self.assertEqual(sut.episode_count, 1)
        self.assertEqual(sut.step_count, 3)
        self.assertEqual(sut._epsilon, 0.08)
        # learning rate remains 0.08 as Q is not updated during this time step.
        self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.08)