Example #1
 def test_evaluate_episode_delta(self):
     MC = MonteCarlo()
     deltas = []
     for _ in range(100):
         deltas.append(MC.evaluate_episode_delta(MC.generate_episode()))
         self.assertTrue(-1 <= deltas[-1] <= 1)
     # at least one of the sampled episodes should produce a non-zero delta
     self.assertTrue(any(deltas))
     print('sample path value max deltas:', deltas)
Example #2
 def test_generate_episode(self):
     MC = MonteCarlo()
     paths = []
     for _ in range(10):
         states = MC.generate_episode()
         self.assertLess(states[-1], hash(MC.board))
         paths.append(states)
     print('sample state paths:', paths)
Example #3
 def test_evaluate_episode(self):
     MC = MonteCarlo()
     for _ in range(10):
         states = MC.generate_episode()
         pre_visits = np.array([MC.visits[s] for s in [0] + states])
         pre_values = np.array([MC.values[s] for s in [0] + states])
         MC.evaluate_episode(states)
         post_visits = np.array([MC.visits[s] for s in [0] + states])
         post_values = np.array([MC.values[s] for s in [0] + states])
         diff_values = post_values - pre_values
         diff_visits = post_visits - pre_visits
         print('sample path value deltas:', diff_values)
         # every state on the path should have been visited exactly once more
         self.assertTrue(np.all(diff_visits == 1))
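The test above expects evaluate_episode to increment the visit count of every state on the path and to nudge each state's value toward the episode outcome. A minimal sketch of such an every-visit, incremental-mean update is shown below; the attribute names visits/values and the single scalar outcome are assumptions read off the tests, not the project's actual implementation.

    from collections import defaultdict

    class MonteCarloSketch:
        """Illustrative only: every-visit Monte Carlo value updates."""

        def __init__(self):
            self.visits = defaultdict(int)    # state hash -> visit count
            self.values = defaultdict(float)  # state hash -> running mean return

        def evaluate_episode(self, states, outcome=1.0):
            # Credit every state on the path with the episode outcome; the
            # incremental mean keeps each value equal to the average observed return.
            for s in states:
                self.visits[s] += 1
                self.values[s] += (outcome - self.values[s]) / self.visits[s]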
Example #4
    def test_eg_policy(self):
        MC = MonteCarlo()
        keys = np.zeros(9, int)
        for _ in range(10**3):
            keys[MC.epsilon_greedy_policy(hash(MC.board))] += 1
        print('eg policy random?', keys)

        # register visits for the start state and give the state reached
        # by move 1 a clearly higher value estimate
        MC.visits[hash(MC.board)] = MC.epsilon_constant
        keys[:] = 0
        MC.board.push(1)
        MC.values[hash(MC.board)] = 10
        MC.board.pop()

        for _ in range(10**4):
            keys[MC.epsilon_greedy_policy(hash(MC.board))] += 1
        print('eg policy tripled 1,3,5,7?', keys)
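The first loop above should come out roughly uniform over all nine moves, while the second should be skewed toward move 1 once its child state carries a high value. An epsilon-greedy selector consistent with that behaviour might look like the sketch below; the decay of epsilon with the visit count of the current state (via epsilon_constant) and the child_state helper are assumptions inferred from the test setup, not the library's API.

    import numpy as np

    def epsilon_greedy_sketch(state, values, visits, legal_moves, child_state,
                              epsilon_constant=1.0, rng=np.random.default_rng()):
        # Illustrative only: exploration probability shrinks as the current
        # state accumulates visits.
        eps = epsilon_constant / (epsilon_constant + visits.get(state, 0))
        if rng.random() < eps:
            return int(rng.choice(legal_moves))   # explore: uniform random legal move
        # Exploit: pick the move whose resulting state has the highest value estimate.
        return max(legal_moves, key=lambda m: values.get(child_state(m), 0.0))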
Example #5
    def test_policy(self):
        MC = MonteCarlo()
        keys = np.zeros(9, int)
        for _ in range(10**3):
            keys[MC.policy()] += 1
        print('random policy?', keys)

        keys[:] = 0
        MC.board.push(4)
        state = hash(MC.board)
        MC.values[state] = -10
        MC.board.pop()

        for _ in range(10**3):
            keys[MC.policy()] += 1
        self.assertEqual(keys[4], 0)
        print('policy null 4', keys)
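The second half of this test pins down one property of MC.policy: a move whose resulting state has a clearly worse value estimate is never selected, while the remaining (tied) moves still appear at random. A greedy selector with uniform tie-breaking that satisfies exactly those checks is sketched below; as with the epsilon-greedy sketch above, it is an assumption inferred from the assertions, not the project's implementation.

    import numpy as np

    def greedy_policy_sketch(values, legal_moves, child_state,
                             rng=np.random.default_rng()):
        # Illustrative only: score each legal move by the value of the state it
        # leads to, then choose uniformly among the best-scoring moves.
        scores = np.array([values.get(child_state(m), 0.0) for m in legal_moves])
        best = np.flatnonzero(scores == scores.max())
        return legal_moves[int(rng.choice(best))]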
Example #6
    def get_forward_vf(self, pol: SAf) -> Mapping[S, float]:
        sa_dict = self.mdp_rep.state_action_dict
        vf_dict = {s: 0. for s in sa_dict.keys()}
        episodes = 0
        monte = MonteCarlo(self.mdp_rep, True,
                           self.num_episodes, self.max_steps)

        while episodes < self.num_episodes:
            start_state = self.mdp_rep.init_state_gen()
            mc_path = monte.get_mc_path(pol, start_state)
            rew_arr = np.array([x for _, _, x, _ in mc_path[:-1]])
            state_list = [x for x, _, _, _ in mc_path[:-1]]
            val_arr = np.array([vf_dict[s] for s in state_list])
            if mc_path[-1][0] in self.mdp_rep.terminal_states:
                returns = self.get_returns(rew_arr, val_arr)
            else:
                raise RuntimeError('episode exceeded max_steps without reaching a terminal state')
            for i, r in enumerate(returns):
                s, _, _, _ = mc_path[i]
                vf_dict[s] += self.learning_rate * (r - vf_dict[s])

            episodes += 1

        return vf_dict
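get_forward_vf folds each sampled path into the value table with a constant learning rate; the actual return computation is delegated to self.get_returns, which mixes the reward sequence with the current value estimates. A plausible sketch of such a forward-view (lambda-return) helper is given below; the gamma and lambd parameters are assumptions, since the original signature only shows the reward and value arrays.

    import numpy as np

    def get_returns_sketch(rew_arr: np.ndarray, val_arr: np.ndarray,
                           gamma: float = 1.0, lambd: float = 0.9) -> np.ndarray:
        # Illustrative forward-view lambda-returns, built backwards from the end:
        #   G_t = r_t + gamma * ((1 - lambd) * V(s_{t+1}) + lambd * G_{t+1})
        n = len(rew_arr)
        returns = np.empty(n)
        returns[-1] = rew_arr[-1]          # last transition enters the terminal state
        for t in range(n - 2, -1, -1):
            returns[t] = rew_arr[t] + gamma * (
                (1.0 - lambd) * val_arr[t + 1] + lambd * returns[t + 1]
            )
        return returns

With lambd = 1 and gamma = 1 this reduces to the plain Monte Carlo return, which is the behaviour a pure MC predictor would use.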
Example #7
 def test_run(self):
     MC = MonteCarlo()
     MC.run(max_episodes=10**5, threshold=.01, interval=100, checks=30)
Example #8
 def test_run_cutoff(self):
     MC = MonteCarlo()
     MC.run(max_episodes=1000, threshold=.01)
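Both run tests only exercise the stopping behaviour: the first is given room to converge, the second is cut off after 1000 episodes. A convergence loop matching those keyword arguments could look like the sketch below; interpreting threshold, interval, and checks as "stop once the largest per-episode value delta stays below threshold for checks consecutive intervals" is an assumption based on the parameter names and the delta test in Example #1.

    def run_sketch(mc, max_episodes=10**5, threshold=.01, interval=100, checks=30):
        # Illustrative driver (assumed semantics): play episodes until value
        # updates stay small for `checks` consecutive intervals.
        quiet, max_delta = 0, 0.0
        for episode in range(1, max_episodes + 1):
            delta = mc.evaluate_episode_delta(mc.generate_episode())
            max_delta = max(max_delta, abs(delta))
            if episode % interval == 0:
                quiet = quiet + 1 if max_delta < threshold else 0
                if quiet >= checks:
                    break              # considered converged
                max_delta = 0.0        # start a fresh measurement window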