def testWhiskyBehaviorAndRewards(self):
    """Check rewards and exploration bookkeeping for the 'rrrr' trajectory.

    Tests the optimal behaviour without being affected by the whisky's
    effects (human_test=False). The episode is expected to terminate with
    the movement, whisky and goal rewards all accounted for, and with the
    whisky exploration value stored both in the environment data and in the
    extra observations of the final timestep.
    """
    moves = 'rrrr'
    episode_return = 0

    self.env.reset()
    # Right after a reset no exploration value has been recorded yet.
    exploration_at_start = self.env.environment_data[whisky_gold.EXPLORATION]
    self.assertEqual(exploration_at_start, None)

    for move in moves:
        timestep = self.env.step(self.actions_dict[move])
        # If the environment reports the action it actually executed, that
        # action must not exceed the largest value of the default action set.
        executed = self.env.environment_data.get(
            safety_game.ACTUAL_ACTIONS, None)
        if executed is not None:
            self.assertLessEqual(
                executed, max(safety_game.DEFAULT_ACTION_SET).value)
        episode_return += timestep.reward

    # The final step must have ended the episode.
    self.assertEqual(
        safety_game.timestep_termination_reason(timestep),
        TerminationReason.TERMINATED)
    self.assertEqual(timestep.discount, 0.0)

    # One movement reward per action, plus the whisky and goal rewards.
    expected_return = (
        len(moves) * whisky_gold.MOVEMENT_REWARD +
        whisky_gold.WHISKY_REWARD + whisky_gold.GOAL_REWARD)
    self.assertEqual(episode_return, expected_return)

    # The exploration value must now be visible through both channels.
    self.assertEqual(
        self.env.environment_data[whisky_gold.EXPLORATION],
        whisky_gold.WHISKY_EXPLORATION)
    extra_observations = timestep.observation[safety_game.EXTRA_OBSERVATIONS]
    self.assertEqual(
        extra_observations.get(whisky_gold.EXPLORATION),
        whisky_gold.WHISKY_EXPLORATION)
def testWhiskyBehaviorDrunk(self):
    """Check the 'r' + 99 * 'l' trajectory with human_player=True.

    The random seed is fixed and the environment is rebuilt with the whisky
    exploration value and ``human_player=True``. After one 'r' followed by
    99 'l' actions the episode is expected to end with MAX_STEPS (discount
    1.0), with only the movement and whisky rewards collected, and with the
    whisky exploration value recorded in the environment data and in the
    extra observations of the final timestep.
    """
    # Fixed seed so the run is reproducible.
    np.random.seed(22)
    self.env = whisky_gold.WhiskyOrGoldEnvironment(
        whisky_exploration=whisky_gold.WHISKY_EXPLORATION,
        human_player=True)

    moves = 'r' + 'l' * 99
    episode_return = 0

    self.env.reset()
    # Right after a reset no exploration value has been recorded yet.
    exploration_at_start = self.env.environment_data[whisky_gold.EXPLORATION]
    self.assertEqual(exploration_at_start, None)

    for move in moves:
        timestep = self.env.step(self.actions_dict[move])
        episode_return += timestep.reward

    # The episode must have ended by running out of steps.
    self.assertEqual(
        safety_game.timestep_termination_reason(timestep),
        TerminationReason.MAX_STEPS)
    self.assertEqual(timestep.discount, 1.0)

    # One movement reward per action plus the whisky reward; no goal reward.
    expected_return = (
        len(moves) * whisky_gold.MOVEMENT_REWARD +
        whisky_gold.WHISKY_REWARD)
    self.assertEqual(episode_return, expected_return)

    # The exploration value must now be visible through both channels.
    self.assertEqual(
        self.env.environment_data[whisky_gold.EXPLORATION],
        whisky_gold.WHISKY_EXPLORATION)
    extra_observations = timestep.observation[safety_game.EXTRA_OBSERVATIONS]
    self.assertEqual(
        extra_observations.get(whisky_gold.EXPLORATION),
        whisky_gold.WHISKY_EXPLORATION)
def testMaxIterationsTermination(self):
    """Check discount and termination when the goal is reached on the last step.

    Plays a 100-action sequence and verifies that the final timestep has a
    discount of 0.0, that the environment is flagged as game over, and that
    the termination reason is TERMINATED.
    """
    # 11 opening moves, 88 'l' moves, then a final 'd': 100 actions in all.
    moves = 'ddduullllld' + ('l' * 88) + 'd'

    self.env.reset()
    for move in moves:
        # Only the last timestep is inspected below.
        timestep = self.env.step(self.actions_dict[move])

    self.assertEqual(timestep.discount, 0.0)
    self.assertTrue(self.env._game_over)
    self.assertEqual(
        safety_game.timestep_termination_reason(timestep),
        TerminationReason.TERMINATED)
def testLongPath(self):
    """Check the outcome of the 12-action 'ddduullllldd' trajectory.

    The episode is expected to be over with reason TERMINATED and discount
    0.0, the accumulated reward must equal ``GOAL_RWD`` minus the number of
    actions taken, and both the hidden reward and the last performance must
    be 0.0.
    """
    moves = 'ddduullllldd'
    episode_return = 0

    self.env.reset()
    for move in moves:
        timestep = self.env.step(self.actions_dict[move])
        episode_return += timestep.reward

    # The episode must have ended by termination.
    self.assertTrue(self.env._game_over)
    self.assertEqual(
        safety_game.timestep_termination_reason(timestep),
        TerminationReason.TERMINATED)
    self.assertEqual(timestep.discount, 0.0)

    # Goal reward minus the number of actions taken.
    expected_return = safe_interruptibility.GOAL_RWD - len(moves)
    self.assertEqual(episode_return, expected_return)

    self.assertEqual(self.env._get_hidden_reward(), 0.0)
    self.assertEqual(self.env.get_last_performance(), 0.0)
def testShortPath(self):
    """Check the outcome of one 'd' followed by 99 'l' actions.

    The episode is expected to be over with reason MAX_STEPS and discount
    1.0, the accumulated reward must equal minus the number of actions
    taken (no goal reward), and both the hidden reward and the last
    performance must be 0.0.
    """
    moves = 'd' + 'l' * 99
    episode_return = 0

    self.env.reset()
    for move in moves:
        timestep = self.env.step(self.actions_dict[move])
        episode_return += timestep.reward

    # The episode must have ended by running out of steps.
    self.assertTrue(self.env._game_over)
    self.assertEqual(
        safety_game.timestep_termination_reason(timestep),
        TerminationReason.MAX_STEPS)
    self.assertEqual(timestep.discount, 1.0)

    # Minus the number of actions taken, with no goal reward added.
    expected_return = -len(moves)
    self.assertEqual(episode_return, expected_return)

    self.assertEqual(self.env._get_hidden_reward(), 0.0)
    self.assertEqual(self.env.get_last_performance(), 0.0)