Example #1
def glie_mc_finite_control_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(values_map=initial_qvf_dict,
                         count_to_weight_func=learning_rate_func),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance)
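The function above returns an iterator that yields one updated Q-value approximation per completed episode. The sketch below shows one way such an iterator might be consumed, following the islice/last pattern used in the later examples; it assumes the same imports as the surrounding snippets (itertools and the iterate helper from Example #4), and some_fmdp, the numeric settings, and the epsilon schedule are illustrative placeholders.

num_episodes: int = 10000
qvfs: Iterator[QValueFunctionApprox[S, A]] = \
    glie_mc_finite_control_learning_rate(
        fmdp=some_fmdp,
        initial_learning_rate=0.03,
        half_life=1000.0,
        exponent=0.5,
        gamma=0.9,
        epsilon_as_func_of_episodes=lambda k: 1. / k
    )
# Q-value estimate after num_episodes simulated episodes
final_qvf: QValueFunctionApprox[S, A] = iterate.last(
    itertools.islice(qvfs, num_episodes)
)
# greedy action implied by the final Q-values, for each non-terminal state
greedy_actions = {
    s: max(some_fmdp.actions(s), key=lambda a: final_qvf((s, a)))
    for s in some_fmdp.non_terminal_states
}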
def mc_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length_tolerance: float,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return mc.mc_prediction(
        traces=episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        episode_length_tolerance=episode_length_tolerance
    )
def td_lambda_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    lambd: float,
    episode_length: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    curtailed_episodes: Iterable[Iterable[TransitionStep[S]]] = \
        (itertools.islice(episode, episode_length) for episode in episodes)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td_lambda.td_lambda_prediction(
        traces=curtailed_episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        lambd=lambd
    )
Example #4
def td_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[S, float],
) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = fmrp_episodes_stream(fmrp)
    td_experiences: Iterable[TransitionStep[S]] = unit_experiences_from_episodes(
        episodes, episode_length
    )
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent,
    )
    return td.td_prediction(
        transitions=td_experiences,
        approx_0=Tabular(
            values_map=initial_vf_dict, count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
    )
def td_prediction(experiences_stream: Iterator[TransitionStep[S]],
                  gamma: float, num_experiences: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            td.td_prediction(
                transitions=experiences_stream,
                approx_0=Tabular(count_to_weight_func=learning_rate_schedule(
                    initial_learning_rate=0.01, half_life=10000,
                    exponent=0.5)),
                γ=gamma), num_experiences)).values_map
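A short usage sketch for the td_prediction wrapper above, under the same assumptions as the rest of Example #4: fmrp_episodes_stream and unit_experiences_from_episodes are the helpers used there, pprint is imported as in Example #9, and some_fmrp together with the numeric settings are illustrative placeholders.

episode_length: int = 100
num_episodes: int = 10000
episodes: Iterable[Iterable[TransitionStep[S]]] = fmrp_episodes_stream(some_fmrp)
# flatten the episode stream into single TD experiences, capping each episode
td_experiences: Iterable[TransitionStep[S]] = unit_experiences_from_episodes(
    episodes, episode_length
)
vf_map: Mapping[S, float] = td_prediction(
    experiences_stream=iter(td_experiences),
    gamma=0.9,
    num_experiences=episode_length * num_episodes
)
pprint({s: round(v, 3) for s, v in vf_map.items()})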
Example #6
def q_learning_finite_learning_rate(
        fmdp: FiniteMarkovDecisionProcess[S, A], initial_learning_rate: float,
        half_life: float, exponent: float, gamma: float, epsilon: float,
        max_episode_length: int) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return td.q_learning(mdp=fmdp,
                         policy_from_q=lambda f, m: mc.epsilon_greedy_policy(
                             q=f, mdp=m, ϵ=epsilon),
                         states=Choose(fmdp.non_terminal_states),
                         approx_0=Tabular(
                             values_map=initial_qvf_dict,
                             count_to_weight_func=learning_rate_func),
                         γ=gamma,
                         max_episode_length=max_episode_length)
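Unlike the GLIE MC control function in Example #1, which yields once per episode, the Q-learning iterator is typically sliced by atomic time steps, assuming (as with the TD prediction loop in Example #9, which slices by episode_length * num_episodes) one yielded approximation per step. A minimal consumption sketch under that assumption; some_fmdp and the numeric settings are placeholders, and itertools/iterate are assumed to be imported as in the other examples.

num_episodes: int = 10000
max_episode_length: int = 100
qvfs: Iterator[QValueFunctionApprox[S, A]] = q_learning_finite_learning_rate(
    fmdp=some_fmdp,
    initial_learning_rate=0.03,
    half_life=1000.0,
    exponent=0.5,
    gamma=0.9,
    epsilon=0.1,
    max_episode_length=max_episode_length
)
# assuming one yielded approximation per atomic step, slice by steps
final_qvf: QValueFunctionApprox[S, A] = iterate.last(
    itertools.islice(qvfs, num_episodes * max_episode_length)
)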
Example #7
def glie_sarsa_finite_learning_rate(
        fmdp: FiniteMarkovDecisionProcess[S, A], initial_learning_rate: float,
        half_life: float, exponent: float, gamma: float,
        epsilon_as_func_of_episodes: Callable[[int], float],
        max_episode_length: int) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    initial_qvf_dict: Mapping[Tuple[S, A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return td.glie_sarsa(mdp=fmdp,
                         states=Choose(set(fmdp.non_terminal_states)),
                         approx_0=Tabular(
                             values_map=initial_qvf_dict,
                             count_to_weight_func=learning_rate_func),
                         γ=gamma,
                         ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
                         max_episode_length=max_episode_length)
Example #8
initial_vf_dict: Mapping[NonTerminal[InventoryState], float] = \
    {s: 0. for s in si_mrp.non_terminal_states}

gamma: float = 0.9
lambda_param = 0.3
num_episodes = 10000

episode_length: int = 100
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5

approx_0: Tabular[NonTerminal[InventoryState]] = Tabular(
    values_map=initial_vf_dict,
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
)

episodes: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        (itertools.islice(episode, episode_length) for episode in episodes)

vf_iter: Iterator[Tabular[NonTerminal[InventoryState]]] = \
    lambda_return_prediction(
        traces=traces,
        approx_0=approx_0,
        γ=gamma,
        lambd=lambda_param
    )
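The excerpt builds vf_iter but never drains it. Below is a minimal sketch of how it could be consumed, reusing num_episodes from above and the last/islice/pprint pattern of Example #9, and assuming the iterator yields one updated value function per processed trace (iterate and pprint are assumed to be imported as in the other examples).

final_vf: Tabular[NonTerminal[InventoryState]] = iterate.last(
    itertools.islice(vf_iter, num_episodes)
)
pprint({s: round(final_vf(s), 3) for s in si_mrp.non_terminal_states})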
Example #9
    traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
    episode_length: int = 100
    unit_experiences_accumulated: Iterable[TransitionStep[InventoryState]] = \
        itertools.chain.from_iterable(
            itertools.islice(trace, episode_length) for trace in traces
        )
    num_episodes = 100000

    print("Value Function (TD Function Approximation)")
    print("--------------")
    initial_learning_rate: float = 0.03
    half_life: float = 1000.0
    exponent: float = 0.5
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    td_vfs: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
        transitions=unit_experiences_accumulated,
        approx_0=Tabular(count_to_weight_func=learning_rate_func),
        γ=user_gamma)
    final_td_vf: FunctionApprox[InventoryState] = \
        last(itertools.islice(td_vfs, episode_length * num_episodes))
    pprint({s: round(final_td_vf(s), 3) for s in si_mrp.non_terminal_states})
    print()

    print("Value Function (Tabular MC from scratch)")
    print("--------------")
    td_vfs: Iterator[Dict[InventoryState, float]] = evaluate_mrp_dt(
        transitions=unit_experiences_accumulated,
        # assumed completion: zero-initialized value function dict and the
        # same discount factor, mirroring the evaluate_mrp call above
        vf={s: 0. for s in si_mrp.non_terminal_states},
        γ=user_gamma
    )
# assumed head for this call: a batch MC prediction over the fixed traces,
# producing the mc_vf printed below
mc_vf = batch_mc_prediction(
    fixed_traces,
    mc_fa,
    gamma
)

print("Result of Batch MC Prediction")
print("V[A] = %.3f" % mc_vf(a))
print("V[B] = %.3f" % mc_vf(b))

fixed_transitions: Sequence[TransitionStep[str]] = \
    [t for tr in fixed_traces for t in tr]

td_fa: Tabular[NonTerminal[str]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=0.1,
        half_life=10000,
        exponent=0.5
    )
)

exp_replay_memory: ExperienceReplayMemory[TransitionStep[str]] = \
    ExperienceReplayMemory()

replay: Iterator[Sequence[TransitionStep[str]]] = \
    exp_replay_memory.replay(fixed_transitions, 1)


def replay_transitions(replay=replay) -> Iterator[TransitionStep[str]]:
    while True:
        yield next(replay)[0]
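The generator above flattens the size-1 mini-batches produced by the experience-replay memory into a plain stream of transitions. The sketch below shows how that stream might feed the same td.td_prediction call used in Example #4 to obtain a TD value estimate from replayed experience, reusing td_fa, gamma, a and b from above; the iteration count and printed labels are illustrative, and td/iterate/itertools are assumed to be imported as in the earlier examples.

num_replayed_transitions: int = 100000
replay_td_vf: Tabular[NonTerminal[str]] = iterate.last(
    itertools.islice(
        td.td_prediction(
            transitions=replay_transitions(),
            approx_0=td_fa,
            γ=gamma
        ),
        num_replayed_transitions
    )
)
print("Result of TD Prediction with Experience Replay")
print("V[A] = %.3f" % replay_td_vf(a))
print("V[B] = %.3f" % replay_td_vf(b))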