Code Example #1
File: control_utils.py  Project: shenoy1/RL-book
def glie_mc_finite_learning_rate_correctness(
        fmdp: FiniteMarkovDecisionProcess[S, A], initial_learning_rate: float,
        half_life: float, exponent: float, gamma: float,
        epsilon_as_func_of_episodes: Callable[[int], float],
        episode_length_tolerance: float, num_episodes: int) -> None:
    qvfs: Iterator[QValueFunctionApprox[S, A]] = \
        glie_mc_finite_control_learning_rate(
            fmdp=fmdp,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            gamma=gamma,
            epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
            episode_length_tolerance=episode_length_tolerance
        )
    final_qvf: QValueFunctionApprox[S, A] = \
        iterate.last(itertools.islice(qvfs, num_episodes))
    opt_vf, opt_policy = get_vf_and_policy_from_qvf(mdp=fmdp, qvf=final_qvf)

    print(f"GLIE MC Optimal Value Function with {num_episodes:d} episodes")
    pprint(opt_vf)
    print(f"GLIE MC Optimal Policy with {num_episodes:d} episodes")
    print(opt_policy)

    true_opt_vf, true_opt_policy = value_iteration_result(fmdp, gamma=gamma)

    print("True Optimal Value Function")
    pprint(true_opt_vf)
    print("True Optimal Policy")
    print(true_opt_policy)
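
A minimal way to drive the check above, sketched here for illustration; si_fmdp stands in for any FiniteMarkovDecisionProcess, and every hyperparameter value below is a placeholder rather than a setting taken from the RL-book sources:

# Hypothetical invocation: si_fmdp and all numeric values are placeholders.
glie_mc_finite_learning_rate_correctness(
    fmdp=si_fmdp,
    initial_learning_rate=0.1,
    half_life=10000.0,
    exponent=1.0,
    gamma=0.9,
    epsilon_as_func_of_episodes=lambda k: 1.0 / k,
    episode_length_tolerance=1e-5,
    num_episodes=10000
)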
Code Example #2
File: test_td.py  Project: matteosantama/RL-book
    def test_evaluate_finite_mrp(self) -> None:
        start = Tabular(
            {s: 0.0
             for s in self.finite_flip_flop.states()},
            count_to_weight_func=lambda _: 0.1,
        )

        episode_length = 20
        episodes: Iterable[Iterable[
            mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
                Choose({True, False}))
        transitions: Iterable[
            mp.TransitionStep[bool]] = itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes)

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        v: Optional[Tabular[bool]] = iterate.last(
            itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

        if v is not None:
            self.assertEqual(len(v.values_map), 2)

            for s in v.values_map:
                # Intentionally loose bound—otherwise test is too slow.
                # Takes >1s on my machine otherwise.
                self.assertLess(abs(v(s) - 170), 3.0)
        else:
            assert False
Code Example #3
File: control_utils.py  Project: shenoy1/RL-book
def q_learning_finite_learning_rate_correctness(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon: float,
    max_episode_length: int,
    num_updates: int,
) -> None:
    qvfs: Iterator[QValueFunctionApprox[S, A]] = \
        q_learning_finite_learning_rate(
            fmdp=fmdp,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            gamma=gamma,
            epsilon=epsilon,
            max_episode_length=max_episode_length
        )
    final_qvf: QValueFunctionApprox[S, A] = \
        iterate.last(itertools.islice(qvfs, num_updates))
    opt_vf, opt_policy = get_vf_and_policy_from_qvf(mdp=fmdp, qvf=final_qvf)

    print(f"Q-Learning ptimal Value Function with {num_updates:d} updates")
    pprint(opt_vf)
    print(f"Q-Learning Optimal Policy with {num_updates:d} updates")
    print(opt_policy)

    true_opt_vf, true_opt_policy = value_iteration_result(fmdp, gamma=gamma)

    print("True Optimal Value Function")
    pprint(true_opt_vf)
    print("True Optimal Policy")
    print(true_opt_policy)
Code Example #4
def mc_prediction(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    approx_0: ValueFunctionApprox[S],
    γ: float,
    episode_length_tolerance: float = 1e-6
) -> Iterator[ValueFunctionApprox[S]]:
    '''Evaluate an MRP using the Monte Carlo method, simulating episodes
    until the discount weight γᵏ falls below the given tolerance.

    Each value this function yields represents the approximated value
    function for the MRP after one additional episode.

    Arguments:
      traces -- an iterator of simulation traces from an MRP
      approx_0 -- initial approximation of value function
      γ -- discount rate (0 < γ ≤ 1)
      episode_length_tolerance -- stop iterating once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated value
    function after each episode.

    '''
    episodes: Iterator[Iterator[mp.ReturnStep[S]]] = \
        (returns(trace, γ, episode_length_tolerance) for trace in traces)
    f = approx_0
    yield f

    for episode in episodes:
        f = last(
            f.iterate_updates([(step.state, step.return_)]
                              for step in episode))
        yield f
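
Code Examples #8 and #22 below consume this generator with itertools.islice and iterate.last; a minimal sketch of that pattern, assuming some FiniteMarkovRewardProcess fmrp and the same imports (Tabular, Choose, iterate, itertools, pprint) used elsewhere in these examples:

# Sketch only: fmrp stands in for any FiniteMarkovRewardProcess[S].
traces = fmrp.reward_traces(Choose(fmrp.non_terminal_states))
vf_iter = mc_prediction(traces=traces, approx_0=Tabular(), γ=0.9)
# The first yield is approx_0 itself; take the approximation after 10000 episodes.
final_vf = iterate.last(itertools.islice(vf_iter, 10000))
pprint(final_vf.values_map)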
Code Example #5
def mc_finite_learning_rate_correctness(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    tolerance: float,
    num_episodes: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[S, float],
) -> None:
    mc_vfs: Iterator[FunctionApprox[S]] = mc_finite_prediction_learning_rate(
        fmrp=fmrp,
        gamma=gamma,
        tolerance=tolerance,
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent,
        initial_vf_dict=initial_vf_dict,
    )
    final_mc_vf: FunctionApprox[S] = iterate.last(
        itertools.islice(mc_vfs, num_episodes)
    )
    print(
        "Decaying-Learning-Rate-MC Value Function with " + f"{num_episodes:d} episodes"
    )
    pprint({s: round(final_mc_vf(s), 3) for s in fmrp.non_terminal_states})
    print("True Value Function")
    fmrp.display_value_function(gamma=gamma)
Code Example #6
def td_lambda_finite_learning_rate_correctness(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    lambd: float,
    episode_length: int,
    num_episodes: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> None:
    td_lambda_vfs: Iterator[ValueFunctionApprox[S]] = \
        td_lambda_finite_prediction_learning_rate(
            fmrp=fmrp,
            gamma=gamma,
            lambd=lambd,
            episode_length=episode_length,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            initial_vf_dict=initial_vf_dict
        )
    final_td_lambda_vf: ValueFunctionApprox[S] = \
        iterate.last(itertools.islice(
            td_lambda_vfs,
            episode_length * num_episodes
        ))
    print("Decaying-Learning-Rate-TD-Lambda Value Function with " +
          f"{num_episodes:d} episodes")
    pprint({s: round(final_td_lambda_vf(s), 3)
            for s in fmrp.non_terminal_states})
    print("True Value Function")
    fmrp.display_value_function(gamma=gamma)
Code Example #7
File: test_td.py  Project: matteosantama/RL-book
    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[bool, bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.states()
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1,
        )

        uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
            s: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.states()
        })

        transitions: Iterable[mdp.TransitionStep[
            bool, bool]] = self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.states()), uniform_policy)

        qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

        q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
            cast(Iterator[Tabular[Tuple[bool, bool]]],
                 itertools.islice(qs, 20000)))

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [True, False]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False
Code Example #8
def mc_prediction(episodes_stream: Iterator[Sequence[TransitionStep[S]]],
                  gamma: float, num_episodes: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            mc.mc_prediction(traces=episodes_stream,
                             approx_0=Tabular(),
                             γ=gamma,
                             tolerance=1e-10), num_episodes)).values_map
Code Example #9
def td_prediction(experiences_stream: Iterator[TransitionStep[S]],
                  gamma: float, num_experiences: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            td.td_prediction(
                transitions=experiences_stream,
                approx_0=Tabular(count_to_weight_func=learning_rate_schedule(
                    initial_learning_rate=0.01, half_life=10000,
                    exponent=0.5)),
                γ=gamma), num_experiences)).values_map
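
The count_to_weight_func built by learning_rate_schedule maps the number of updates a state has received to a step size. The sketch below shows one plausible polynomially decaying schedule of that shape; the name decaying_learning_rate and the exact formula are assumptions for illustration, not the library's implementation:

from typing import Callable


def decaying_learning_rate(
    initial_learning_rate: float,
    half_life: float,
    exponent: float
) -> Callable[[int], float]:
    # n is the number of updates already applied to a state; with exponent=1
    # the step size halves after roughly half_life updates.
    def lr(n: int) -> float:
        return initial_learning_rate * (1.0 + (n - 1) / half_life) ** -exponent
    return lr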
Code Example #10
def mc_finite_equal_wts_correctness(
        fmrp: FiniteMarkovRewardProcess[S], gamma: float, tolerance: float,
        num_episodes: int, initial_vf_dict: Mapping[S, float]) -> None:
    mc_vfs: Iterator[FunctionApprox[S]] = \
        mc_finite_prediction_equal_wts(
            fmrp=fmrp,
            gamma=gamma,
            tolerance=tolerance,
            initial_vf_dict=initial_vf_dict
        )
    final_mc_vf: FunctionApprox[S] = \
        iterate.last(itertools.islice(mc_vfs, num_episodes))
    print(f"Equal-Weights-MC Value Function with {num_episodes:d} episodes")
    pprint({s: round(final_mc_vf(s), 3) for s in fmrp.non_terminal_states})
    print("True Value Function")
    fmrp.display_value_function(gamma=gamma)
Code Example #11
File: windy_grid.py  Project: efpm04013/RL-book
    def get_q_learning_vf_and_policy(
            self, epsilon: float, learning_rate: float,
            num_updates: int) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
        qvfs: Iterator[FunctionApprox[Tuple[Cell, Move]]] = \
            q_learning_finite_learning_rate(
                fmdp=self.get_finite_mdp(),
                initial_learning_rate=learning_rate,
                half_life=1e8,
                exponent=1.0,
                gamma=1.0,
                epsilon=epsilon,
                max_episode_length=int(1e8)
            )
        final_qvf: FunctionApprox[Tuple[Cell, Move]] = \
            iterate.last(itertools.islice(qvfs, num_updates))
        return get_vf_and_policy_from_qvf(mdp=self.get_finite_mdp(),
                                          qvf=final_qvf)
Code Example #12
    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[NonTerminal[bool], bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.non_terminal_states
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1
        )

        uniform_policy: FinitePolicy[bool, bool] =\
            FinitePolicy({
                s.state: Choose(self.finite_mdp.actions(s))
                for s in self.finite_mdp.non_terminal_states
            })

        transitions: Iterable[mdp.TransitionStep[bool, bool]] =\
            self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.non_terminal_states),
                uniform_policy
            )

        qs = td.q_learning_external_transitions(
            transitions,
            self.finite_mdp.actions,
            q_0,
            γ=0.99
        )

        q: Optional[Tabular[Tuple[NonTerminal[bool], bool]]] =\
            iterate.last(
                cast(Iterator[Tabular[Tuple[NonTerminal[bool], bool]]],
                     itertools.islice(qs, 20000))
            )

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [NonTerminal(True), NonTerminal(False)]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False
Code Example #13
File: vampire.py  Project: shenoy1/RL-book
    def lspi_vf_and_policy(self) -> \
            Tuple[V[int], FiniteDeterministicPolicy[int, int]]:
        transitions: Iterable[TransitionStep[int, int]] = itertools.islice(
            self.lspi_transitions(), 50000)
        qvf_iter: Iterator[LinearFunctionApprox[Tuple[
            NonTerminal[int], int]]] = least_squares_policy_iteration(
                transitions=transitions,
                actions=self.actions,
                feature_functions=self.lspi_features(4, 4),
                initial_target_policy=DeterministicPolicy(
                    lambda s: int(s / 2)),
                γ=1.0,
                ε=1e-5)
        qvf: LinearFunctionApprox[Tuple[NonTerminal[int], int]] = \
            iterate.last(
                itertools.islice(
                    qvf_iter,
                    100
                )
            )
        return get_vf_and_policy_from_qvf(self, qvf)
Code Example #14
File: windy_grid.py  Project: shenoy1/RL-book
    def get_glie_sarsa_vf_and_policy(
        self,
        epsilon_as_func_of_episodes: Callable[[int], float],
        learning_rate: float,
        num_updates: int
    ) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
        qvfs: Iterator[QValueFunctionApprox[Cell, Move]] = \
            glie_sarsa_finite_learning_rate(
                fmdp=self.get_finite_mdp(),
                initial_learning_rate=learning_rate,
                half_life=1e8,
                exponent=1.0,
                gamma=1.0,
                epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                max_episode_length=int(1e8)
            )
        final_qvf: QValueFunctionApprox[Cell, Move] = \
            iterate.last(itertools.islice(qvfs, num_updates))
        return get_vf_and_policy_from_qvf(
            mdp=self.get_finite_mdp(),
            qvf=final_qvf
        )
Code Example #15
    def test_last(self):
        self.assertEqual(last(range(0, 5)), 4)
        self.assertEqual(last(range(0, 10)), 9)

        self.assertRaises(Exception, lambda: last([]))
Code Example #16
File: test_lambda_return.py  Project: shenoy1/RL-book
# approx_0 assumed to be built as in the other examples: a Tabular value
# function with a decaying learning rate (the opening line is reconstructed).
approx_0: Tabular[NonTerminal[InventoryState]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
)

episodes: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        (itertools.islice(episode, episode_length) for episode in episodes)

vf_iter: Iterator[Tabular[NonTerminal[InventoryState]]] = \
    lambda_return_prediction(
        traces=traces,
        approx_0=approx_0,
        γ=gamma,
        lambd=lambda_param
    )

vf: Tabular[NonTerminal[InventoryState]] = \
    iterate.last(itertools.islice(vf_iter, num_episodes))

pprint(vf.values_map)
si_mrp.display_value_function(gamma=gamma)
Code Example #17
# q_iter assumed to be the Q-value iterator consumed below via iterate.last
# (the opening assignment is reconstructed).
q_iter: Iterator[QValueFunctionApprox[InventoryState, int]] = \
    q_learning_experience_replay(
        mdp=si_mdp,
        policy_from_q=lambda f, m: epsilon_greedy_policy(
            q=f,
            mdp=m,
            ϵ=epsilon
        ),
        states=Choose(si_mdp.non_terminal_states),
        approx_0=Tabular(
            count_to_weight_func=learning_rate_schedule(
                initial_learning_rate=initial_learning_rate,
                half_life=learning_rate_half_life,
                exponent=learning_rate_exponent
            )
        ),
        γ=gamma,
        max_episode_length=episode_length,
        mini_batch_size=mini_batch_size,
        weights_decay_half_life=time_decay_half_life
    )

qvf: QValueFunctionApprox[InventoryState, int] = iterate.last(
    itertools.islice(q_iter, num_updates))
vf, pol = get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=qvf)
pprint(vf)
print(pol)

true_vf, true_pol = value_iteration_result(mdp=si_mdp, gamma=gamma)
pprint(true_vf)
print(true_pol)
Code Example #18
    def test_last(self):
        self.assertEqual(last(range(0, 5)), 4)
        self.assertEqual(last(range(0, 10)), 9)

        self.assertEqual(last([]), None)
Code Example #19
File: random_walk_lstd.py  Project: shenoy1/RL-book
# td_transitions assumed to be built the same way as lstd_transitions below
# (the opening assignment is reconstructed).
td_transitions: Iterable[TransitionStep[int]] = \
    itertools.islice(transitions, num_transitions)

initial_learning_rate: float = 0.5
half_life: float = 1000
exponent: float = 0.5
approx0: Tabular[NonTerminal[int]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent))

td_func: Tabular[NonTerminal[int]] = \
    iterate.last(itertools.islice(
        td_prediction(
            transitions=td_transitions,
            approx_0=approx0,
            γ=gamma
        ),
        num_transitions
    ))
td_vf: np.ndarray = td_func.evaluate(nt_states)

num_polynomials: int = 5
features: Sequence[Callable[[NonTerminal[int]], float]] = \
    laguerre_state_features(num_polynomials)
lstd_transitions: Iterable[TransitionStep[int]] = \
    itertools.islice(transitions, num_transitions)
epsilon: float = 1e-4

lstd_func: LinearFunctionApprox[NonTerminal[int]] = \
    least_squares_td(
        transitions=lstd_transitions,
Code Example #20
        )
    num_episodes = 100000

    print("Value Function (TD Function Approximation)")
    print("--------------")
    initial_learning_rate: float = 0.03
    half_life: float = 1000.0
    exponent: float = 0.5
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    td_vfs: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
        transitions=unit_experiences_accumulated,
        approx_0=Tabular(count_to_weight_func=learning_rate_func),
        γ=user_gamma)
    final_td_vf: FunctionApprox[InventoryState] = \
        last(itertools.islice(td_vfs, episode_length * num_episodes))
    pprint({s: round(final_td_vf(s), 3) for s in si_mrp.non_terminal_states})
    print()

    print("Value Function (Tabular MC from scratch)")
    print("--------------")
    td_vfs: Iterator[Dict[InventoryState, float]] = evaluate_mrp_dt(
        transitions=unit_experiences_accumulated,
        vf={s: 0
            for s in si_mrp.non_terminal_states},
        γ=user_gamma)
    final_td_vf: Dict[InventoryState, float] = \
        last(itertools.islice(td_vfs, episode_length * num_episodes))
    pprint({s: round(final_td_vf[s], 3) for s in si_mrp.non_terminal_states})
Code Example #21
replay: Iterator[Sequence[TransitionStep[str]]] = \
    exp_replay_memory.replay(fixed_transitions, 1)


def replay_transitions(replay=replay) -> Iterator[TransitionStep[str]]:
    while True:
        yield next(replay)[0]


num_iterations: int = 100000

td1_vf: ValueFunctionApprox[str] = iterate.last(
    itertools.islice(
        td_prediction(
            replay_transitions(),
            td_fa,
            gamma
        ),
        num_iterations
    )
)

print("Result of Batch TD1 Prediction")
print("V[A] = %.3f" % td1_vf(a))
print("V[B] = %.3f" % td1_vf(b))

td2_vf: ValueFunctionApprox[str] = batch_td_prediction(
    fixed_transitions,
    td_fa,
    gamma
)
Code Example #22
File: prob11_3a.py  Project: lkourti/RL-book
                                      holding_cost=user_holding_cost,
                                      stockout_cost=user_stockout_cost)

    print("Value Function (Exact)")
    print("--------------")
    si_mrp.display_value_function(gamma=user_gamma)
    print()

    print("Value Function (MC Function Approximation)")
    print("--------------")
    traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
    it: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
        traces=traces, approx_0=Tabular(), γ=user_gamma)
    num_traces = 10000
    last_vf_mc: FunctionApprox[InventoryState] = last(islice(it, num_traces))
    pprint({
        s: round(last_vf_mc.evaluate([s])[0], 3)
        for s in si_mrp.non_terminal_states
    })
    print()

    print("Value Function (Tabular MC from scratch)")
    print("--------------")
    traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
    it: Iterator[Dict[InventoryState, float]] = evaluate_mrp_mc(
        traces=traces,
        vf={s: 0
            for s in si_mrp.non_terminal_states},
        γ=user_gamma)