import collections
import time
import unittest

import tensorflow as tf

# NOTE: the import paths for these project-local modules are assumed from the
# names used in the tests below; adjust them to match the actual repo layout.
from mcts.env import DOWN, RIGHT, FrozenLakeDetEnv, PacmanDetEnv, TicTacToeEnv
from mcts.model import BasicMctsModel
from mcts.policy import MctsPolicy
class MctsPolicyFrozenLakeTest(unittest.TestCase):

    def setUp(self):
        self.env = FrozenLakeDetEnv()
        self.model = BasicMctsModel(self.env, discount=1.)
        self.policy = MctsPolicy(self.env, self.model, num_simulations=100,
                                 discount=1.)

    def test_second_to_final(self):
        # Move to the square left of and above the Goal state.
        self.env.set_states([RIGHT, RIGHT, DOWN, DOWN])
        logits = self.policy.get_policy_logits()
        # RIGHT is also ok.
        self.assertEqual(DOWN, self.policy.choose_action(logits))
        self.assertEqual(DOWN, self.policy.action())

    def test_final(self):
        # Move to the square left of the Goal state.
        self.env.set_states([RIGHT, RIGHT, DOWN, DOWN, DOWN])
        self.assertEqual(RIGHT, self.policy.action())

    def test_game_deterministic(self):
        # Play at most 100 steps; the deterministic game should reach the
        # goal well before then.
        for _ in range(100):
            action = self.policy.action()
            states, is_final, reward = self.env.step(action)
            if is_final:
                break
        self.assertEqual(1.0, reward)
class MctsPolicyPacmanTest(unittest.TestCase):
    # RUN NOTES: (max_depth, num_simulations) and compute time.
    # 25, 25 worked pretty well, ~.5s per step.
    # 10, 10 pretty fast, ~.1s per step; 1.0 discount achieves ~2200 score.

    def setUp(self):
        self.env = PacmanDetEnv()
        self.model = BasicMctsModel(self.env, discount=.9, max_depth=50)
        self.policy = MctsPolicy(self.env, self.model, num_simulations=50,
                                 discount=.9)

    def test_game_deterministic(self):
        idx = 0
        total_reward = 0
        while True:
            start_time = time.time()
            print('Starting action calculation')
            action = self.policy.action()
            end_time = time.time()
            states, is_final, reward = self.env.step(action)
            total_reward += reward
            print('Action at iter %s: %s\nReward: %s\n'
                  'TotalReward: %s\nCalc time: %s\n\n'
                  % (idx, action, reward, total_reward,
                     end_time - start_time))
            # Render the underlying gym environment.
            self.env.env.render()
            if is_final:
                print('Hit is_final!')
                break
            idx += 1
        # With (max_depth=10, num_simulations=10) and discount 1.0, this
        # scored over 2200. With discount .9 Pacman seems to not die, but
        # runs longer, and the game is cut off at iter 900.
        self.assertTrue(total_reward > 2200)
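# The RUN NOTES above trade (max_depth, num_simulations) off against per-step
# compute time. The helper below is a minimal sketch, not part of the test
# suite, for re-measuring that trade-off on the current machine; the name
# `sweep_pacman_configs` and the default config list are hypothetical, and it
# uses only the classes already exercised by these tests.
def sweep_pacman_configs(configs=((10, 10), (25, 25), (50, 50))):
    for max_depth, num_simulations in configs:
        env = PacmanDetEnv()
        model = BasicMctsModel(env, discount=.9, max_depth=max_depth)
        policy = MctsPolicy(env, model, num_simulations=num_simulations,
                            discount=.9)
        start_time = time.time()
        policy.action()  # Time a single action calculation.
        print('max_depth=%s, num_simulations=%s: %.2fs per step' %
              (max_depth, num_simulations, time.time() - start_time))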
class MctsPolicyTicTacToeTest(unittest.TestCase):

    def setUp(self):
        self.env = TicTacToeEnv()
        self.model = BasicMctsModel(self.env)
        self.policy = MctsPolicy(self.env, self.model, num_simulations=100)

    def test_action_start(self):
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        self.assertEqual(0, action)
        self.assertEqual(([1, 4, 0, 0, 0, 0, 0, 0, 0], False, 0.0),
                         states_isfinal_reward)

    def test_action_win(self):
        self.env.set_states([1, 0, 1, 1, 0, 4, 4, 4, 0])
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        self.assertEqual(1, action)
        self.assertEqual(([1, 1, 1, 1, 0, 4, 4, 4, 0], True, 1.0),
                         states_isfinal_reward)

    def test_action_win_2(self):
        self.env.set_states([1, 1, 4, 0, 0, 4, 1, 4, 0])
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        self.assertEqual(3, action)
        self.assertEqual(([1, 1, 4, 1, 0, 4, 1, 4, 0], True, 1.0),
                         states_isfinal_reward)

    def test_policy_logits(self):
        logits = self.policy.get_policy_logits()
        tf.assert_equal(
            tf.constant([0.14, 0.09, 0.13, 0.09, 0.13, 0.11, 0.09, 0.11, 0.11],
                        dtype=tf.float64), logits)

    def test_choose_action(self):
        self.assertEqual(
            1,
            self.policy.choose_action(
                tf.constant([
                    0.11, 0.116, 0.11, 0.11, 0.11, 0.111, 0.111, 0.111, 0.111
                ])))

    def test_game_deterministic(self):
        while True:
            action = self.policy.action()
            states, is_final, reward = self.env.step(action)
            if is_final:
                break
        self.assertEqual(1.0, reward)

    def play_game_once(self, r_seed):
        self.env = TicTacToeEnv(use_random=True, r_seed=r_seed)
        self.model = BasicMctsModel(self.env, r_seed=r_seed)
        self.policy = MctsPolicy(self.env, self.model, num_simulations=100,
                                 r_seed=r_seed)
        while True:
            action = self.policy.action()
            states, is_final, reward = self.env.step(action)
            if is_final:
                return states, is_final, reward

    def test_game_random(self):
        reward_dict = collections.defaultdict(int)
        for r_seed in range(100):
            _, _, reward = self.play_game_once(r_seed)
            reward_dict[reward] += 1
        print('reward distribution: ', reward_dict)
        # 96% winning ratio.
        self.assertEqual({1.0: 96, 0.0: 1, -1.0: 3}, reward_dict)
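# Standard unittest entry point so the file can be executed directly with
# `python`, in addition to being picked up by a test runner.
if __name__ == '__main__':
    unittest.main()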