Code example #1
def setUp(self):
    self.env = PacmanDetEnv()
    self.model = BasicMctsModel(self.env, discount=.9, max_depth=50)
    self.policy = MctsPolicy(self.env,
                             self.model,
                             num_simulations=50,
                             discount=.9)
Code example #2
def setUp(self):
    self.env = FrozenLakeDetEnv()
    self.model = BasicMctsModel(self.env, discount=1.)
    self.policy = MctsPolicy(self.env,
                             self.model,
                             num_simulations=100,
                             discount=1.)
Code example #3
class MctsPolicyFrozenLakeTest(unittest.TestCase):
    def setUp(self):
        self.env = FrozenLakeDetEnv()
        self.model = BasicMctsModel(self.env, discount=1.)
        self.policy = MctsPolicy(self.env,
                                 self.model,
                                 num_simulations=100,
                                 discount=1.)

    def test_second_to_final(self):
        # Move to the square above and to the left of the goal.
        self.env.set_states([RIGHT, RIGHT, DOWN, DOWN])
        logits = self.policy.get_policy_logits()
        # RIGHT is also ok.
        self.assertEqual(DOWN, self.policy.choose_action(logits))
        self.assertEqual(DOWN, self.policy.action())

    def test_final(self):
        # Move to the square immediately left of the goal.
        self.env.set_states([RIGHT, RIGHT, DOWN, DOWN, DOWN])
        self.assertEqual(RIGHT, self.policy.action())

    def test_game_deterministic(self):
        for _ in range(100):
            action = self.policy.action()
            states, is_final, reward = self.env.step(action)
            if is_final:
                break
        self.assertEqual(1.0, reward)
Code example #4
def play_game_once(self, r_seed):
    self.env = TicTacToeEnv(use_random=True, r_seed=r_seed)
    self.model = BasicMctsModel(self.env, r_seed=r_seed)
    self.policy = MctsPolicy(self.env,
                             self.model,
                             num_simulations=100,
                             r_seed=r_seed)
    while True:
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        states, is_final, reward = states_isfinal_reward
        if is_final:
            return states, is_final, reward
Code example #5
class MctsPolicyPacmanTest(unittest.TestCase):

    # RUN NOTES: (max_depth, num_simulations) and compute time.
    # 25, 25 worked pretty well, ~.5s per step
    # 10, 10 pretty fast, ~.1s per step, 1.0 discount achieves ~2200 score.
    def setUp(self):
        self.env = PacmanDetEnv()
        self.model = BasicMctsModel(self.env, discount=.9, max_depth=50)
        self.policy = MctsPolicy(self.env,
                                 self.model,
                                 num_simulations=50,
                                 discount=.9)

    def test_game_deterministic(self):
        idx = 0
        total_reward = 0
        while True:
            start_time = time.time()
            print('Starting action calculation')
            action = self.policy.action()
            end_time = time.time()
            states, is_final, reward = self.env.step(action)
            total_reward += reward
            print('Action at iter %s: %s\nReward: %s\n'
                  'TotalReward: %s\nCalc time: %s\n\n' %
                  (idx, action, reward, total_reward, end_time - start_time))
            self.env.env.render()
            if is_final:
                print("Hit is_final!")
                break
            idx += 1
        # With (10, 10) and discount 1.0 the score exceeded 2200. With
        # discount .9 Pac-Man tends not to die, but the run takes longer
        # and the environment cuts the game off around iteration 900.
        self.assertGreater(total_reward, 2200)
Code example #6
class MctsPolicyTicTacToeTest(unittest.TestCase):
    def setUp(self):
        self.env = TicTacToeEnv()
        self.model = BasicMctsModel(self.env)
        self.policy = MctsPolicy(self.env, self.model, num_simulations=100)

    def test_action_start(self):
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        self.assertEqual(0, action)
        self.assertEqual(([1, 4, 0, 0, 0, 0, 0, 0, 0], False, 0.0),
                         states_isfinal_reward)

    def test_action_win(self):
        self.env.set_states([1, 0, 1, 1, 0, 4, 4, 4, 0])
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        self.assertEqual(1, action)
        self.assertEqual(([1, 1, 1, 1, 0, 4, 4, 4, 0], True, 1.0),
                         states_isfinal_reward)

    def test_action_win_2(self):
        self.env.set_states([1, 1, 4, 0, 0, 4, 1, 4, 0])
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        self.assertEqual(3, action)
        self.assertEqual(([1, 1, 4, 1, 0, 4, 1, 4, 0], True, 1.0),
                         states_isfinal_reward)

    def test_policy_logits(self):
        logits = self.policy.get_policy_logits()
        tf.assert_equal(
            tf.constant([0.14, 0.09, 0.13, 0.09, 0.13, 0.11, 0.09, 0.11, 0.11],
                        dtype=tf.float64), logits)

    def test_choose_action(self):
        self.assertEqual(
            1,
            self.policy.choose_action(
                tf.constant([
                    0.11, 0.116, 0.11, 0.11, 0.11, 0.111, 0.111, 0.111, 0.111
                ])))

    def test_game_deterministic(self):
        while True:
            action = self.policy.action()
            states_isfinal_reward = self.env.step(action)
            states, is_final, reward = states_isfinal_reward
            if is_final:
                break
        self.assertEqual(1.0, reward)

    def play_game_once(self, r_seed):
        self.env = TicTacToeEnv(use_random=True, r_seed=r_seed)
        self.model = BasicMctsModel(self.env, r_seed=r_seed)
        self.policy = MctsPolicy(self.env,
                                 self.model,
                                 num_simulations=100,
                                 r_seed=r_seed)
        while True:
            action = self.policy.action()
            states_isfinal_reward = self.env.step(action)
            states, is_final, reward = states_isfinal_reward
            if is_final:
                return states, is_final, reward

    def test_game_random(self):
        reward_dict = collections.defaultdict(int)
        for r_seed in range(100):
            _, _, reward = self.play_game_once(r_seed)
            reward_dict[reward] += 1
        print('reward distribution: ', reward_dict)
        # 96% winning ratio.
        self.assertEqual({1.0: 96, 0.0: 1, -1.0: 3}, reward_dict)
Code example #7
def setUp(self):
    self.env = TicTacToeEnv()
    self.model = BasicMctsModel(self.env)
    self.policy = MctsPolicy(self.env, self.model, num_simulations=100)
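
The pattern shared by all of these examples is a simple agent loop: MctsPolicy wraps an environment and a model, policy.action() runs the configured number of MCTS simulations to pick a move, and env.step(action) returns a (states, is_final, reward) tuple. Below is a minimal standalone sketch of that loop, assuming the FrozenLakeDetEnv interface from Code example #2; play_episode is an illustrative name, not part of the tested API.

# Sketch only: assumes policy.action() and env.step(action) behave as
# exercised in the tests above; the constructor arguments mirror Code
# example #2. play_episode is a hypothetical helper.
def play_episode():
    env = FrozenLakeDetEnv()
    model = BasicMctsModel(env, discount=1.)
    policy = MctsPolicy(env, model, num_simulations=100, discount=1.)
    total_reward = 0.
    while True:
        action = policy.action()
        states, is_final, reward = env.step(action)
        total_reward += reward
        if is_final:
            return states, total_reward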