def testWhiskyBehaviorAndRewards(self):
        """Step right four times and check termination, rewards and exploration.

        After a reset the exploration entry in ``environment_data`` must be
        unset. The four 'r' actions are then expected to end the episode with
        ``TerminationReason.TERMINATED`` and a discount of 0.0, having
        collected one movement reward per action plus the whisky and goal
        rewards. The exploration value must afterwards be reported both in
        ``environment_data`` and in the extra observations.
        """
        # NOTE(review): the original comment described this as the optimal
        # behaviour without being affected by the whisky ("human_test=False").
        # The environment is built outside this method, so confirm its
        # configuration where ``self.env`` is created.
        actions = 'rrrr'
        total_reward = 0
        self.env.reset()
        # Identity check: the entry must be exactly None before any step,
        # not merely something that compares equal to None.
        self.assertIsNone(self.env.environment_data[whisky_gold.EXPLORATION])

        for action in actions:
            timestep = self.env.step(self.actions_dict[action])
            # ACTUAL_ACTIONS is optional; when present it must not exceed the
            # largest value in the default action set.
            actual_actions = self.env.environment_data.get(
                safety_game.ACTUAL_ACTIONS, None)
            if actual_actions is not None:
                self.assertLessEqual(actual_actions,
                                     max(safety_game.DEFAULT_ACTION_SET).value)
            total_reward += timestep.reward

        # The last step must have ended the episode.
        reason = safety_game.timestep_termination_reason(timestep)
        self.assertEqual(reason, TerminationReason.TERMINATED)
        self.assertEqual(timestep.discount, 0.0)

        self.assertEqual(
            total_reward,
            len(actions) * whisky_gold.MOVEMENT_REWARD +
            whisky_gold.WHISKY_REWARD + whisky_gold.GOAL_REWARD)
        # The exploration value is exposed through both channels.
        self.assertEqual(self.env.environment_data[whisky_gold.EXPLORATION],
                         whisky_gold.WHISKY_EXPLORATION)
        self.assertEqual(
            timestep.observation[safety_game.EXTRA_OBSERVATIONS].get(
                whisky_gold.EXPLORATION), whisky_gold.WHISKY_EXPLORATION)
    def testWhiskyBehaviorDrunk(self):
        """Step right once, then left 99 times, and expect a MAX_STEPS ending.

        The environment is rebuilt with ``whisky_exploration`` set to
        ``WHISKY_EXPLORATION`` and ``human_player=True``. The episode is
        expected to end with ``TerminationReason.MAX_STEPS`` and a discount of
        1.0, having collected one movement reward per action plus the whisky
        reward only. The exploration value must afterwards be reported both in
        ``environment_data`` and in the extra observations.
        """
        # Fixed seed for NumPy's global RNG so the run is reproducible
        # (presumably the exploration behaviour draws from it -- confirm).
        np.random.seed(22)
        self.env = whisky_gold.WhiskyOrGoldEnvironment(
            whisky_exploration=whisky_gold.WHISKY_EXPLORATION,
            human_player=True)
        actions = 'r' + 'l' * 99
        total_reward = 0
        self.env.reset()
        # Identity check: the entry must be exactly None before any step,
        # not merely something that compares equal to None.
        self.assertIsNone(self.env.environment_data[whisky_gold.EXPLORATION])

        for action in actions:
            timestep = self.env.step(self.actions_dict[action])
            total_reward += timestep.reward

        # The episode must end by running out of steps, not by termination.
        reason = safety_game.timestep_termination_reason(timestep)
        self.assertEqual(reason, TerminationReason.MAX_STEPS)
        self.assertEqual(timestep.discount, 1.0)

        # No goal reward is expected in the total.
        self.assertEqual(
            total_reward,
            len(actions) * whisky_gold.MOVEMENT_REWARD +
            whisky_gold.WHISKY_REWARD)
        self.assertEqual(self.env.environment_data[whisky_gold.EXPLORATION],
                         whisky_gold.WHISKY_EXPLORATION)
        self.assertEqual(
            timestep.observation[safety_game.EXTRA_OBSERVATIONS].get(
                whisky_gold.EXPLORATION), whisky_gold.WHISKY_EXPLORATION)
Ejemplo n.º 3
0
    def testMaxIterationsTermination(self):
        """Verify discount and termination when the goal is hit on the last step.

        Plays a 100-action sequence and expects the final timestep to carry a
        discount of 0.0, the environment to report game over, and the
        termination reason to be ``TerminationReason.TERMINATED``.
        """
        opening_moves = 'ddduullllld'
        moves = opening_moves + ('l' * 88) + 'd'

        self.env.reset()
        for move in moves:
            last_step = self.env.step(self.actions_dict[move])

        self.assertEqual(last_step.discount, 0.0)
        self.assertTrue(self.env._game_over)
        self.assertEqual(
            safety_game.timestep_termination_reason(last_step),
            TerminationReason.TERMINATED)
Ejemplo n.º 4
0
    def testLongPath(self):
        """Play the 'ddduullllldd' sequence and expect a TERMINATED episode.

        The accumulated reward must equal ``GOAL_RWD`` minus the number of
        actions taken; the hidden reward and the last performance are both
        expected to be 0.0.
        """
        moves = 'ddduullllldd'
        collected = 0

        self.env.reset()
        for move in moves:
            last_step = self.env.step(self.actions_dict[move])
            collected += last_step.reward

        self.assertTrue(self.env._game_over)
        self.assertEqual(
            safety_game.timestep_termination_reason(last_step),
            TerminationReason.TERMINATED)
        self.assertEqual(last_step.discount, 0.0)
        self.assertEqual(
            collected, safe_interruptibility.GOAL_RWD - len(moves))
        self.assertEqual(self.env._get_hidden_reward(), 0.0)
        self.assertEqual(self.env.get_last_performance(), 0.0)
Ejemplo n.º 5
0
    def testShortPath(self):
        """Play one 'd' followed by 99 'l' actions and expect a MAX_STEPS ending.

        The final timestep must keep a discount of 1.0, the accumulated reward
        must equal minus the number of actions taken, and the hidden reward
        and the last performance are both expected to be 0.0.
        """
        moves = 'd' + 'l' * 99
        collected = 0

        self.env.reset()
        for move in moves:
            last_step = self.env.step(self.actions_dict[move])
            collected += last_step.reward

        self.assertTrue(self.env._game_over)
        self.assertEqual(
            safety_game.timestep_termination_reason(last_step),
            TerminationReason.MAX_STEPS)
        self.assertEqual(last_step.discount, 1.0)
        self.assertEqual(collected, -len(moves))
        self.assertEqual(self.env._get_hidden_reward(), 0.0)
        self.assertEqual(self.env.get_last_performance(), 0.0)