Example #1
class TestFiniteDistribution(unittest.TestCase):
    def setUp(self):
        self.die = Choose({1, 2, 3, 4, 5, 6})

        self.ragged = Categorical({0: 0.9, 1: 0.05, 2: 0.025, 3: 0.025})

    def test_map(self):
        plusOne = self.die.map(lambda x: x + 1)
        assert_almost_equal(self, plusOne, Choose({2, 3, 4, 5, 6, 7}))

        evenOdd = self.die.map(lambda x: x % 2 == 0)
        assert_almost_equal(self, evenOdd, Choose({True, False}))

        greaterThan4 = self.die.map(lambda x: x > 4)
        assert_almost_equal(self, greaterThan4,
                            Categorical({
                                True: 1 / 3,
                                False: 2 / 3
                            }))

    def test_expectation(self):
        self.assertAlmostEqual(self.die.expectation(float), 3.5)

        even = self.die.map(lambda n: n % 2 == 0)
        self.assertAlmostEqual(even.expectation(float), 0.5)

        self.assertAlmostEqual(self.ragged.expectation(float), 0.175)
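For a finite distribution, expectation(f) is just a probability-weighted sum over the outcomes, so the values asserted above can be checked by hand. The snippet below is a minimal standalone sketch with plain dictionaries standing in for Choose and Categorical; it is not the library's API, only the arithmetic behind the test:

# Hand-check of the expectations asserted above, without the rl library.
die = {face: 1 / 6 for face in range(1, 7)}
ragged = {0: 0.9, 1: 0.05, 2: 0.025, 3: 0.025}

def expectation(table, f=float):
    # E[f(X)] = sum over outcomes x of p(x) * f(x)
    return sum(p * f(x) for x, p in table.items())

print(expectation(die))                        # 3.5
print(expectation(die, lambda n: n % 2 == 0))  # 0.5 (True counts as 1.0)
print(expectation(ragged))                     # 0.175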
Example #2
def sarsa_control(start_states: Distribution[S],
                  transition_fcn: Callable[[S, A], Tuple[S, float]],
                  state_action: Mapping[S, List[A]],
                  approx_0: FunctionApprox[Tuple[S, A]], gamma: float,
                  ϵ: float) -> Iterable[FunctionApprox[Tuple[S, A]]]:
    """
    Update Q-value function approximate using SARSA
    Initialize first state by start_states
    """
    q = approx_0
    state = start_states.sample()
    action = Choose(set(state_action[state])).sample()
    while True:
        next_state, reward = transition_fcn(state, action)
        # use ϵ-greedy policy to get next_action
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax(
                [q((next_state, a)) for a in state_action[next_state]])]
        q = q.update([((state, action),
                       reward + gamma * q((next_state, next_action)))])
        state, action = next_state, next_action
        yield q
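The branch on explore.sample() above is the usual ε-greedy rule: with probability ϵ take a uniformly random action, otherwise take the action with the highest current Q-value, and the update target is reward + gamma * q((next_state, next_action)). A standalone sketch of just the action-selection rule, using the standard library's random module and a plain dict-of-dicts Q-table (the names epsilon_greedy and q_table are illustrative, not from the source):

import random
from typing import Dict, TypeVar

S = TypeVar('S')
A = TypeVar('A')

def epsilon_greedy(q_table: Dict[S, Dict[A, float]], state: S,
                   epsilon: float) -> A:
    """With probability epsilon pick a uniformly random action,
    otherwise pick the action with the highest Q-value."""
    actions = list(q_table[state])
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: q_table[state][a])

q_table = {'s0': {'left': 0.1, 'right': 0.4}}
print(epsilon_greedy(q_table, 's0', epsilon=0.1))  # usually 'right'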
Example #3
class TestChoose(unittest.TestCase):
    def setUp(self):
        self.one = Choose({1})
        self.six = Choose({1, 2, 3, 4, 5, 6})
        self.repeated = Choose([1, 1, 1, 2])

    def test_choose(self):
        assert_almost_equal(self, self.one, Constant(1))
        self.assertAlmostEqual(self.one.probability(1), 1.)
        self.assertAlmostEqual(self.one.probability(0), 0.)

        categorical_six = Categorical({x: 1 / 6 for x in range(1, 7)})
        assert_almost_equal(self, self.six, categorical_six)
        self.assertAlmostEqual(self.six.probability(1), 1 / 6)
        self.assertAlmostEqual(self.six.probability(0), 0.)

    def test_repeated(self):
        counts = Counter(self.repeated.sample_n(1000))
        self.assertLess(abs(counts[1] - 750), 50)
        self.assertLess(abs(counts[2] - 250), 50)

        table = self.repeated.table()
        self.assertAlmostEqual(table[1], 0.75)
        self.assertAlmostEqual(table[2], 0.25)

        counts = Counter(self.repeated.sample_n(1000))
        self.assertLess(abs(counts[1] - 750), 50)
        self.assertLess(abs(counts[2] - 250), 50)
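Choose over a list with repeated options behaves like the empirical distribution of that list, which is why table() reports 0.75 and 0.25 above. The same frequencies can be derived directly with collections.Counter; a small sketch independent of the rl library:

from collections import Counter

options = [1, 1, 1, 2]
counts = Counter(options)
table = {x: n / len(options) for x, n in counts.items()}
print(table)  # {1: 0.75, 2: 0.25}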
Example #4
    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[bool, bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.states()
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1,
        )

        uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
            s: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.states()
        })

        transitions: Iterable[mdp.TransitionStep[
            bool, bool]] = self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.states()), uniform_policy)

        qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

        q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
            cast(Iterator[Tabular[Tuple[bool, bool]]],
                 itertools.islice(qs, 20000)))

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [True, False]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False
Example #5
    def lspi_transitions(self) -> Iterator[TransitionStep[int, int]]:
        states_distribution: Choose[NonTerminal[int]] = \
            Choose(self.non_terminal_states)
        while True:
            state: NonTerminal[int] = states_distribution.sample()
            action: int = Choose(range(state.state)).sample()
            next_state, reward = self.step(state, action).sample()
            transition: TransitionStep[int, int] = TransitionStep(
                state=state,
                action=action,
                next_state=next_state,
                reward=reward)
            yield transition
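lspi_transitions never returns: it is an endless generator, so callers have to cut the stream off themselves, just as other tests in this collection do with itertools.islice. A tiny self-contained sketch of that consumption pattern (endless_transitions is a stand-in generator, not the method above):

import itertools
from typing import Iterator

def endless_transitions() -> Iterator[int]:
    # Stand-in for lspi_transitions(): an infinite stream of "transitions".
    i = 0
    while True:
        yield i
        i += 1

# Take a finite training batch from the infinite stream.
training_batch = list(itertools.islice(endless_transitions(), 10000))
print(len(training_batch))  # 10000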
Example #6
    def test_map(self):
        plusOne = self.die.map(lambda x: x + 1)
        assert_almost_equal(self, plusOne, Choose({2, 3, 4, 5, 6, 7}))

        evenOdd = self.die.map(lambda x: x % 2 == 0)
        assert_almost_equal(self, evenOdd, Choose({True, False}))

        greaterThan4 = self.die.map(lambda x: x > 4)
        assert_almost_equal(self, greaterThan4,
                            Categorical({
                                True: 1 / 3,
                                False: 2 / 3
                            }))
Example #7
class TestChoose(unittest.TestCase):
    def setUp(self):
        self.one = Choose({1})
        self.six = Choose({1, 2, 3, 4, 5, 6})

    def test_choose(self):
        assert_almost_equal(self, self.one, Constant(1))
        self.assertAlmostEqual(self.one.probability(1), 1.)
        self.assertAlmostEqual(self.one.probability(0), 0.)

        categorical_six = Categorical({x: 1 / 6 for x in range(1, 7)})
        assert_almost_equal(self, self.six, categorical_six)
        self.assertAlmostEqual(self.six.probability(1), 1 / 6)
        self.assertAlmostEqual(self.six.probability(0), 0.)
Example #8
class TestDistribution(unittest.TestCase):
    def setUp(self):
        self.finite = Choose(range(0, 6))
        self.sampled = SampledDistribution(lambda: self.finite.sample(),
                                           100000)

    def test_expectation(self):
        expected_finite = self.finite.expectation(lambda x: x)
        expected_sampled = self.sampled.expectation(lambda x: x)
        self.assertLess(abs(expected_finite - expected_sampled), 0.02)

    def test_sample_n(self):
        samples = self.sampled.sample_n(10)
        self.assertEqual(len(samples), 10)
        self.assertTrue(all(0 <= s < 6 for s in samples))
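SampledDistribution estimates expectations by Monte Carlo, which is why the test above only requires the sampled and exact means to agree to within 0.02. A standalone sketch of that estimate (sampled_expectation and its default of 100000 draws mirror the constructor argument above but are not the library's code):

import random

def sampled_expectation(sampler, f=float, expectation_samples=100000):
    # Monte Carlo estimate: average f over repeated draws from sampler().
    total = sum(f(sampler()) for _ in range(expectation_samples))
    return total / expectation_samples

# Uniform draw from {0, ..., 5}, mirroring Choose(range(0, 6)) above.
estimate = sampled_expectation(lambda: random.randrange(6))
print(abs(estimate - 2.5) < 0.02)  # True with high probability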
Example #9
    def test_evaluate_mrp(self):
        start = Dynamic({s: 0.0 for s in self.finite_flip_flop.states()})

        v = iterate.converged(
            evaluate_mrp(
                self.finite_flip_flop,
                γ=0.99,
                approx_0=start,
                non_terminal_states_distribution=Choose(
                    set(self.finite_flip_flop.states())),
                num_state_samples=5,
            ),
            done=lambda a, b: a.within(b, 1e-4),
        )

        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            self.assertLess(abs(v(s) - 170), 1.0)

        v_finite = iterate.converged(
            evaluate_finite_mrp(self.finite_flip_flop, γ=0.99, approx_0=start),
            done=lambda a, b: a.within(b, 1e-4),
        )

        assert_allclose(v.evaluate([True, False]),
                        v_finite.evaluate([True, False]),
                        rtol=0.01)
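Several of these tests wrap a stream of successive approximations in iterate.converged(..., done=...), which keeps consuming the stream until two consecutive elements satisfy the done predicate. A standalone sketch of that pattern (converged_sketch is my own name, not the rl.iterate implementation):

from typing import Callable, Iterator, TypeVar

X = TypeVar('X')

def converged_sketch(values: Iterator[X],
                     done: Callable[[X, X], bool]) -> X:
    # Return the first element that changes little enough from its
    # predecessor, according to done(previous, current).
    prev = next(values)
    for cur in values:
        if done(prev, cur):
            return cur
        prev = cur
    return prev

def halvings(x: float) -> Iterator[float]:
    while True:
        yield x
        x /= 2

# Stops once successive halvings differ by less than 1e-4.
print(converged_sketch(halvings(1.0), done=lambda a, b: abs(a - b) < 1e-4))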
Example #10
File: lspi.py Project: GYY7/RL-book
def lspi(memory: List[mdp.TransitionStep[S]], feature_map: Dict[Tuple[S, A],
                                                                List[float]],
         state_action: Dict[S, List[A]], m: int, gamma: float,
         ϵ: float) -> Iterable[Dict[Tuple[S, A], float]]:
    """
    update A and b to get w*= inverse(A)b and update deterministic policy
    feature_map:  key: state, value: phi(s_i) is a vector of dimension m
    """
    # initialize A, b
    A = np.random.rand(m, m)
    b = np.zeros((m, 1))
    w = np.linalg.inv(A) @ b
    while True:
        transition = random.choice(memory)
        state = transition.state
        next_state = transition.next_state
        # column vector phi(s, a), shape (m, 1)
        feature_state = np.array(
            feature_map[(state, transition.action)]).reshape(-1, 1)
        # next_action is derived from ϵ-policy
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax([
                np.array(feature_map[(next_state, action)]) @ w
                for action in state_action[next_state]
            ])]
        # column vector phi(s', a'), shape (m, 1)
        feature_next_state = np.array(
            feature_map[(next_state, next_action)]).reshape(-1, 1)
        A += feature_state @ (feature_state - gamma * feature_next_state).T
        b += feature_state * transition.reward
        w = np.linalg.inv(A) @ b
        yield {
            s_a: (np.array(feature_map[s_a]) @ w).item()
            for s_a in feature_map.keys()
        }
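The heart of the loop above is the LSTD-Q accumulation A += φ(s,a) (φ(s,a) − γ φ(s',a'))ᵀ and b += φ(s,a)·r, followed by w = inverse(A) @ b. Below is a minimal numpy sketch of just that linear-algebra step on made-up feature vectors. Two details differ from the code above and are merely common choices, not the author's: A starts as a small multiple of the identity so it is invertible from the first step, and np.linalg.solve replaces the explicit inverse for numerical stability:

import numpy as np

m, gamma = 3, 0.9
A = 1e-3 * np.eye(m)     # small ridge term keeps A invertible
b = np.zeros((m, 1))

# One made-up transition: features of (s, a) and (s', a'), and a reward.
phi = np.array([[1.0], [0.5], [0.0]])
phi_next = np.array([[0.0], [1.0], [0.5]])
reward = 2.0

A += phi @ (phi - gamma * phi_next).T  # rank-one LSTD-Q update
b += phi * reward
w = np.linalg.solve(A, b)              # same solution as inv(A) @ b
print(w.ravel())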
Example #11
    def test_value_iteration(self):
        mdp_map: Mapping[NonTerminal[InventoryState],
                         float] = value_iteration_result(
                             self.si_mdp, self.gamma)[0]
        # print(mdp_map)
        mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])

        fa = Dynamic({s: 0.0 for s in self.states})
        mdp_finite_fa = iterate.converged(value_iteration_finite(
            self.si_mdp, self.gamma, fa),
                                          done=lambda a, b: a.within(b, 1e-5))
        # print(mdp_finite_fa.values_map)
        mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)

        self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.01)

        mdp_fa = iterate.converged(value_iteration(self.si_mdp,
                                                   self.gamma,
                                                   fa,
                                                   Choose(self.states),
                                                   num_state_samples=30),
                                   done=lambda a, b: a.within(b, 1e-5))
        # print(mdp_fa.values_map)
        mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
        self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 0.01)
Example #12
    def test_value_iteration(self):
        vpstar = optimal_vf_and_policy(self.mdp_seq, 1.)
        states = self.single_step_mdp.states()
        fa_dynamic = Dynamic({s: 0.0 for s in states})
        fa_tabular = Tabular()
        distribution = Choose(set(states))
        approx_vpstar_finite = back_opt_vf_and_policy_finite(
            [(self.mdp_seq[i], fa_dynamic) for i in range(self.steps)],
            1.
        )
        approx_vpstar = back_opt_vf_and_policy(
            [(self.single_step_mdp, fa_tabular, distribution)
             for _ in range(self.steps)],
            1.,
            num_state_samples=120,
            error_tolerance=0.01
        )

        for t, ((v1, _), (v2, _), (v3, _)) in enumerate(zip(
                vpstar,
                approx_vpstar_finite,
                approx_vpstar
        )):
            states = self.mdp_seq[t].keys()
            v1_arr = np.array([v1[s] for s in states])
            v2_arr = v2.evaluate(states)
            v3_arr = v3.evaluate(states)
            self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
            self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
Example #13
    def test_evaluate_mrp(self):
        vf = evaluate(self.mrp_seq, 1.)
        states = self.single_step_mrp.states()
        fa_dynamic = Dynamic({s: 0.0 for s in states})
        fa_tabular = Tabular()
        distribution = Choose(set(states))
        approx_vf_finite = backward_evaluate_finite(
            [(self.mrp_seq[i], fa_dynamic) for i in range(self.steps)],
            1.
        )
        approx_vf = backward_evaluate(
            [(self.single_step_mrp, fa_tabular, distribution)
             for _ in range(self.steps)],
            1.,
            num_state_samples=120,
            error_tolerance=0.01
        )

        for t, (v1, v2, v3) in enumerate(zip(
                vf,
                approx_vf_finite,
                approx_vf
        )):
            states = self.mrp_seq[t].keys()
            v1_arr = np.array([v1[s] for s in states])
            v2_arr = v2.evaluate(states)
            v3_arr = v3.evaluate(states)
            self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
            self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
Example #14
    def test_evaluate_mrp(self):
        mrp_vf1: np.ndarray = self.implied_mrp.get_value_function_vec(
            self.gamma)
        # print({s: mrp_vf1[i] for i, s in enumerate(self.states)})

        fa = Dynamic({s: 0.0 for s in self.states})
        mrp_finite_fa = iterate.converged(
            evaluate_finite_mrp(self.implied_mrp, self.gamma, fa),
            done=lambda a, b: a.within(b, 1e-4),
        )
        # print(mrp_finite_fa.values_map)
        mrp_vf2: np.ndarray = mrp_finite_fa.evaluate(self.states)

        self.assertLess(max(abs(mrp_vf1 - mrp_vf2)), 0.001)

        mrp_fa = iterate.converged(
            evaluate_mrp(
                self.implied_mrp,
                self.gamma,
                fa,
                Choose(self.states),
                num_state_samples=30,
            ),
            done=lambda a, b: a.within(b, 0.1),
        )
        # print(mrp_fa.values_map)
        mrp_vf3: np.ndarray = mrp_fa.evaluate(self.states)
        self.assertLess(max(abs(mrp_vf1 - mrp_vf3)), 1.0)
Example #15
def glie_mc_finite_control_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(values_map=initial_qvf_dict,
                         count_to_weight_func=learning_rate_func),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance)
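learning_rate_schedule is passed to Tabular as count_to_weight_func, i.e. it is called with the number of updates a (state, action) pair has received and returns the step size for the next update. A plausible shape for such a schedule is a polynomial decay controlled by half_life and exponent; the sketch below only illustrates the idea and is not necessarily the library's exact formula:

from typing import Callable

def learning_rate_schedule_sketch(initial_learning_rate: float,
                                  half_life: float,
                                  exponent: float) -> Callable[[int], float]:
    # Step size decays polynomially in the update count n.
    def lr(n: int) -> float:
        return initial_learning_rate * (1 + (n - 1) / half_life) ** -exponent
    return lr

alpha = learning_rate_schedule_sketch(0.1, half_life=1000, exponent=0.5)
print(alpha(1), alpha(1001))  # 0.1, then 0.1 / sqrt(2)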
Example #16
    def test_evaluate_finite_mrp(self) -> None:
        start = Tabular(
            {s: 0.0
             for s in self.finite_flip_flop.states()},
            count_to_weight_func=lambda _: 0.1,
        )

        episode_length = 20
        episodes: Iterable[Iterable[
            mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
                Choose({True, False}))
        transitions: Iterable[
            mp.TransitionStep[bool]] = itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes)

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        v: Optional[Tabular[bool]] = iterate.last(
            itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

        if v is not None:
            self.assertEqual(len(v.values_map), 2)

            for s in v.values_map:
                # Intentionally loose bound; a tighter one makes this
                # test take more than a second on my machine.
                self.assertLess(abs(v(s) - 170), 3.0)
        else:
            assert False
Example #17
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    matrix_method_for_mrp_eval: bool = False
) -> Iterator[Tuple[V[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[V[S], FinitePolicy[S, A]])\
            -> Tuple[V[S], FinitePolicy[S, A]]:

        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        policy_vf: V[S] = {mrp.non_terminal_states[i]: v for i, v in
                           enumerate(mrp.get_value_function_vec(gamma))}\
            if matrix_method_for_mrp_eval else evaluate_mrp_result(mrp, gamma)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_vf(
            mdp, policy_vf, gamma)

        return policy_vf, improved_pi

    v_0: V[S] = {s: 0.0 for s in mdp.non_terminal_states}
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s)))
         for s in mdp.non_terminal_states})
    return iterate(update, (v_0, pi_0))
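policy_iteration returns iterate(update, (v_0, pi_0)): a lazy stream x, update(x), update(update(x)), ... that the caller trims with a convergence check, as the iterate.converged calls elsewhere in these examples do. A standalone sketch of that iteration pattern (iterate_sketch is my own helper, not the rl.iterate module):

import itertools
from typing import Callable, Iterator, TypeVar

X = TypeVar('X')

def iterate_sketch(step: Callable[[X], X], start: X) -> Iterator[X]:
    # Yield start, step(start), step(step(start)), ... forever.
    state = start
    while True:
        yield state
        state = step(state)

# Example: repeated halving; the caller decides how much of the stream to use.
values = list(itertools.islice(iterate_sketch(lambda x: x / 2, 1.0), 5))
print(values)  # [1.0, 0.5, 0.25, 0.125, 0.0625]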
Example #18
    def get_q_learning_vf_and_policy(
            self,
            states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
            sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
            episodes: int = 10000,
            step_size: float = 0.01,
            epsilon: float = 0.1) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
        '''
        states_actions_dict gives us the set of possible moves from
        a non-block cell.
        sample_func is a function with two inputs: state and action,
        and with output as a sampled pair of (next_state, reward).
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items() if actions is not None}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            state: Cell = uniform_states.sample()
            '''
            write your code here
            update the dictionary q initialized above according
            to the Q-learning algorithm's Q-Value Function updates.
            '''

        vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
        policy: FinitePolicy[Cell, Move] = FinitePolicy({
            s: Constant(max(d.items(), key=itemgetter(1))[0])
            for s, d in q.items()
        })
        return (vf_dict, policy)
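The placeholder above ('write your code here') asks for the tabular Q-learning update inside the episode loop. The sketch below is one hedged way to see that update on an invented two-action toy problem; the toy dynamics and cell labels are made up and this is not the author's solution, but the inner loop is the standard recipe the exercise asks for: ε-greedy action, sampled (next_state, reward), then a step of q[state][action] toward reward plus the best next Q-value (zero at a terminal cell). Compare the SARSA loop in Example #25, which uses the sampled next action instead of the max.

import random
from typing import Dict, Tuple

def toy_sample_func(state: str, action: str) -> Tuple[str, float]:
    # Invented dynamics: 'right' reaches the terminal cell 'T' with reward 1,
    # 'left' stays in 'A' with a small penalty.
    if state == 'A' and action == 'right':
        return 'T', 1.0
    return 'A', -0.1

q: Dict[str, Dict[str, float]] = {'A': {'left': 0.0, 'right': 0.0}}
nt_states = set(q)
step_size, epsilon = 0.1, 0.1

for _ in range(1000):
    state = 'A'
    while state in nt_states:
        actions = list(q[state])
        if random.random() < epsilon:
            action = random.choice(actions)
        else:
            action = max(actions, key=lambda a: q[state][a])
        next_state, reward = toy_sample_func(state, action)
        best_next = (max(q[next_state].values())
                     if next_state in nt_states else 0.0)
        # Q-learning step toward the bootstrapped target.
        q[state][action] += step_size * (
            reward + best_next - q[state][action])
        state = next_state

print(q['A'])  # 'right' approaches 1.0 and ends up above 'left'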
Example #19
    def states_sampler_func() -> NonTerminal[PriceAndShares]:
        price: float = self.initial_price_distribution.sample()
        rem: int = self.shares
        for i in range(t):
            sell: int = Choose(range(rem + 1)).sample()
            price = self.price_dynamics[i](PriceAndShares(
                price=price, shares=rem)).sample()
            rem -= sell
        return NonTerminal(PriceAndShares(price=price, shares=rem))
Example #20
        def act(self, s: S) -> Optional[Distribution[A]]:
            if mdp.is_terminal(s):
                return None

            if explore.sample():
                return Choose(set(mdp.actions(s)))

            _, action = q.argmax((s, a) for a in mdp.actions(s))
            return Constant(action)
Example #21
    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[NonTerminal[bool], bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.non_terminal_states
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1
        )

        uniform_policy: FinitePolicy[bool, bool] =\
            FinitePolicy({
                s.state: Choose(self.finite_mdp.actions(s))
                for s in self.finite_mdp.non_terminal_states
            })

        transitions: Iterable[mdp.TransitionStep[bool, bool]] =\
            self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.non_terminal_states),
                uniform_policy
            )

        qs = td.q_learning_external_transitions(
            transitions,
            self.finite_mdp.actions,
            q_0,
            γ=0.99
        )

        q: Optional[Tabular[Tuple[NonTerminal[bool], bool]]] =\
            iterate.last(
                cast(Iterator[Tabular[Tuple[NonTerminal[bool], bool]]],
                     itertools.islice(qs, 20000))
            )

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [NonTerminal(True), NonTerminal(False)]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False
Example #22
    def states_sampler_func() -> PriceAndShares:
        price: float = self.initial_price_distribution.sample()
        rem: int = self.shares
        x: float = self.init_x_distrib.sample()
        new_x: float = x  # fallback when t == 0
        for i in range(t):
            sell: int = Choose(set(range(rem + 1))).sample()
            price = self.price_dynamics[i](
                PriceAndShares(price=price, shares=rem, x=x)).sample()
            rem -= sell
            new_x = self.pho * x + Uniform().sample()
        return PriceAndShares(price=price, shares=rem, x=new_x)
Example #23
    def test_evaluate_finite_mrp(self):
        start = Tabular({s: 0.0 for s in self.finite_flip_flop.states()})
        traces = self.finite_flip_flop.reward_traces(Choose({True, False}))
        v = iterate.converged(
            mc.evaluate_mrp(traces, γ=0.99, approx_0=start),
            # Loose bound of 0.025 to speed up test.
            done=lambda a, b: a.within(b, 0.025))

        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            # Intentionally loose bound; a tighter one makes this
            # test take more than a second on my machine.
            self.assertLess(abs(v(s) - 170), 1.0)
Example #24
def glie_mc_finite_control_equal_wts(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5,
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(values_map=initial_qvf_dict),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance)
Example #25
    def get_sarsa_vf_and_policy(
        self,
        states_actions_dict: Mapping[Cell, Set[Move]],
        sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
        episodes: int = 10000,
        step_size: float = 0.01
    ) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
        '''
        states_actions_dict gives us the set of possible moves from
        a non-terminal cell.
        sample_func is a function with two inputs: state and action,
        and with output as a sampled pair of (next_state, reward).
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items()}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            epsilon: float = 1.0 / (episode_num + 1)
            state: Cell = uniform_states.sample()
            action: Move = WindyGrid.epsilon_greedy_action(state, q, epsilon)
            while state in nt_states:
                next_state, reward = sample_func(state, action)
                if next_state in nt_states:
                    next_action: Move = WindyGrid.epsilon_greedy_action(
                        next_state, q, epsilon)
                    q[state][action] += step_size * \
                        (reward + q[next_state][next_action] -
                         q[state][action])
                    action = next_action
                else:
                    q[state][action] += step_size * (reward - q[state][action])
                state = next_state

        vf_dict: V[Cell] = {
            NonTerminal(s): max(d.values())
            for s, d in q.items()
        }
        policy: FiniteDeterministicPolicy[Cell, Move] = \
            FiniteDeterministicPolicy(
                {s: max(d.items(), key=itemgetter(1))[0] for s, d in q.items()}
            )
        return vf_dict, policy
Example #26
def initialize(
    mdp: FiniteMarkovDecisionProcess
) -> Tuple[V[S], FinitePolicy]:
    """Initialize value function and policy.

    Initialize the value function to zeros at each state, and initialize the
    policy to a random choice of the action space at each non-terminal state.

    :param mdp: Object representation of a finite Markov decision process
    :returns: Value function initialized at zeros for each state
    :returns: Random initial policy
    """
    # Set value function at each state equal to zero
    v_0: V[S] = {s: 0 for s in mdp.states()}
    # Set the policy to be a random choice of the action space at each state
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states}
    )
    return v_0, pi_0
Example #27
def q_learning_finite_learning_rate(
        fmdp: FiniteMarkovDecisionProcess[S, A], initial_learning_rate: float,
        half_life: float, exponent: float, gamma: float, epsilon: float,
        max_episode_length: int) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return td.q_learning(mdp=fmdp,
                         policy_from_q=lambda f, m: mc.epsilon_greedy_policy(
                             q=f, mdp=m, ϵ=epsilon),
                         states=Choose(fmdp.non_terminal_states),
                         approx_0=Tabular(
                             values_map=initial_qvf_dict,
                             count_to_weight_func=learning_rate_func),
                         γ=gamma,
                         max_episode_length=max_episode_length)
Example #28
def glie_sarsa_finite_learning_rate(
        fmdp: FiniteMarkovDecisionProcess[S, A], initial_learning_rate: float,
        half_life: float, exponent: float, gamma: float,
        epsilon_as_func_of_episodes: Callable[[int], float],
        max_episode_length: int) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    initial_qvf_dict: Mapping[Tuple[S, A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return td.glie_sarsa(mdp=fmdp,
                         states=Choose(set(fmdp.non_terminal_states)),
                         approx_0=Tabular(
                             values_map=initial_qvf_dict,
                             count_to_weight_func=learning_rate_func),
                         γ=gamma,
                         ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
                         max_episode_length=max_episode_length)
Example #29
    def test_value_iteration(self):
        mdp_map: Mapping[InventoryState, float] = value_iteration_result(
            self.si_mdp, self.gamma)[0]
        # print(mdp_map)
        mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])

        fa = Dynamic({s: 0.0 for s in self.states})
        mdp_finite_fa = FunctionApprox.converged(
            value_iteration_finite(self.si_mdp, self.gamma, fa))
        # print(mdp_finite_fa.values_map)
        mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)

        self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.001)

        mdp_fa = FunctionApprox.converged(
            value_iteration(self.si_mdp,
                            self.gamma,
                            fa,
                            Choose(self.states),
                            num_state_samples=30), 0.1)
        # print(mdp_fa.values_map)
        mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
        self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 1.0)
Example #30
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A], gamma: float,
    approx0: FunctionApprox[S]
) -> Iterator[Tuple[FunctionApprox[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[FunctionApprox[S], FinitePolicy[S, A]])\
            -> Tuple[FunctionApprox[S], FinitePolicy[S, A]]:

        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        #policy_vf: FunctionApprox[S] = approximate_policy_evaluation_result(mdp,pi,vf)
        policy_vf: FunctionApprox[S] = evaluate_mrp_result(mrp, gamma, vf)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_approx_vf(
            mdp, policy_vf, gamma)
        return policy_vf, improved_pi

    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s)))
         for s in mdp.non_terminal_states})
    return iterate(update, (approx0, pi_0))