コード例 #1
0
def initialize(
    mdp: FiniteMarkovDecisionProcess
) -> Tuple[V[S], FinitePolicy]:
    """Initialize value function and policy.

    Initialize the value function to zeros at each state, and initialize the
    policy to a random choice of the action space at each non-terminal state.

    :param mdp: Object representation of a finite Markov decision process
    :returns: Value function initialized at zeros for each state
    :returns: Random Initial policy
    """
    # Set value function at each state equal to zero
    v_0: V[S] = {s: 0 for s in mdp.states()}
    # Set the policy to be a random choice of the action space at each state
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states}
    )
    return v_0, pi_0
コード例 #2
0
ファイル: test_td.py プロジェクト: matteosantama/RL-book
class TestEvaluate(unittest.TestCase):
    def setUp(self):
        random.seed(42)

        self.finite_flip_flop = FlipFlop(0.7)

        self.finite_mdp = FiniteMarkovDecisionProcess({
            True: {
                True: Categorical({
                    (True, 1.0): 0.7,
                    (False, 2.0): 0.3
                }),
                False: Categorical({
                    (True, 1.0): 0.3,
                    (False, 2.0): 0.7
                }),
            },
            False: {
                True: Categorical({
                    (False, 1.0): 0.7,
                    (True, 2.0): 0.3
                }),
                False: Categorical({
                    (False, 1.0): 0.3,
                    (True, 2.0): 0.7
                }),
            },
        })

    def test_evaluate_finite_mrp(self) -> None:
        start = Tabular(
            {s: 0.0
             for s in self.finite_flip_flop.states()},
            count_to_weight_func=lambda _: 0.1,
        )

        episode_length = 20
        episodes: Iterable[Iterable[
            mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
                Choose({True, False}))
        transitions: Iterable[
            mp.TransitionStep[bool]] = itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes)

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        v: Optional[Tabular[bool]] = iterate.last(
            itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

        if v is not None:
            self.assertEqual(len(v.values_map), 2)

            for s in v.values_map:
                # Intentionally loose bound—otherwise test is too slow.
                # Takes >1s on my machine otherwise.
                self.assertLess(abs(v(s) - 170), 3.0)
        else:
            assert False

    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[bool, bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.states()
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1,
        )

        uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
            s: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.states()
        })

        transitions: Iterable[mdp.TransitionStep[
            bool, bool]] = self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.states()), uniform_policy)

        qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

        q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
            cast(Iterator[Tabular[Tuple[bool, bool]]],
                 itertools.islice(qs, 20000)))

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [True, False]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False