def glie_mc_finite_control_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A], float] = {
        (s, a): 0. for s in fmdp.non_terminal_states for a in fmdp.actions(s)
    }
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(
            values_map=initial_qvf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance
    )
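As a usage sketch (with the book's SimpleInventoryMDPCap standing in for the finite MDP, and a hypothetical episode budget num_episodes), the returned iterator yields one Q-value estimate per episode and can be curtailed with itertools.islice:

import itertools
import rl.iterate as iterate
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap

fmdp = SimpleInventoryMDPCap(
    capacity=2,
    poisson_lambda=1.0,
    holding_cost=1.0,
    stockout_cost=10.0
)
num_episodes = 10000  # hypothetical episode budget
qvfs = glie_mc_finite_control_learning_rate(
    fmdp=fmdp,
    initial_learning_rate=0.03,
    half_life=1000.0,
    exponent=0.5,
    gamma=0.9,
    epsilon_as_func_of_episodes=lambda k: k ** -0.5,  # GLIE: ε decays to 0
    episode_length_tolerance=1e-5
)
final_qvf = iterate.last(itertools.islice(qvfs, num_episodes))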
def mc_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length_tolerance: float,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return mc.mc_prediction(
        traces=episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        episode_length_tolerance=episode_length_tolerance
    )
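A similar sketch for prediction, assuming the book's SimpleInventoryMRPFinite (the si_mrp used in the scripts below); each element of the returned iterator is the value-function estimate after one more episode:

from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite

si_mrp = SimpleInventoryMRPFinite(
    capacity=2,
    poisson_lambda=1.0,
    holding_cost=1.0,
    stockout_cost=10.0
)
vfs = mc_finite_prediction_learning_rate(
    fmrp=si_mrp,
    gamma=0.9,
    episode_length_tolerance=1e-5,
    initial_learning_rate=0.03,
    half_life=1000.0,
    exponent=0.5,
    initial_vf_dict={s: 0. for s in si_mrp.non_terminal_states}
)
final_vf = iterate.last(itertools.islice(vfs, 10000))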
def td_lambda_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    lambd: float,
    episode_length: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    curtailed_episodes: Iterable[Iterable[TransitionStep[S]]] = \
        (itertools.islice(episode, episode_length) for episode in episodes)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td_lambda.td_lambda_prediction(
        traces=curtailed_episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        lambd=lambd
    )
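The lambd argument interpolates between TD(0) at lambd=0 and Monte-Carlo-like targets as lambd approaches 1. A sketch of invoking it on the si_mrp constructed above (parameter values are illustrative):

td_lambda_vfs = td_lambda_finite_prediction_learning_rate(
    fmrp=si_mrp,
    gamma=0.9,
    lambd=0.3,
    episode_length=100,  # each episode curtailed to 100 steps
    initial_learning_rate=0.03,
    half_life=1000.0,
    exponent=0.5,
    initial_vf_dict={s: 0. for s in si_mrp.non_terminal_states}
)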
def td_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    td_experiences: Iterable[TransitionStep[S]] = \
        unit_experiences_from_episodes(
            episodes,
            episode_length
        )
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td.td_prediction(
        transitions=td_experiences,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma
    )
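All of the wrappers above delegate their step-size decay to learning_rate_schedule. The actual implementation lives in rl.function_approx; the sketch below (a hypothetical stand-in named learning_rate_schedule_sketch) shows the polynomial-decay form consistent with how it is parameterized here. With exponent = 1, the learning rate halves after half_life updates, which is where the parameter name comes from:

def learning_rate_schedule_sketch(
    initial_learning_rate: float,
    half_life: float,
    exponent: float
) -> Callable[[int], float]:
    # Polynomially decaying step size: α_n = α_0 * (1 + (n - 1)/half_life)^(-exponent)
    def lr_func(n: int) -> float:
        return initial_learning_rate * (1 + (n - 1) / half_life) ** -exponent
    return lr_func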
def td_prediction(
    experiences_stream: Iterator[TransitionStep[S]],
    gamma: float,
    num_experiences: int
) -> Mapping[NonTerminal[S], float]:
    return iterate.last(
        itertools.islice(
            td.td_prediction(
                transitions=experiences_stream,
                approx_0=Tabular(
                    count_to_weight_func=learning_rate_schedule(
                        initial_learning_rate=0.01,
                        half_life=10000,
                        exponent=0.5
                    )
                ),
                γ=gamma
            ),
            num_experiences
        )
    ).values_map
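A sketch of driving this wrapper, building a flattened stream of atomic transitions from the si_mrp constructed above via the same curtail-and-chain pattern that appears in the scripts below; num_experiences here is an illustrative budget:

experiences: Iterator[TransitionStep[InventoryState]] = \
    itertools.chain.from_iterable(
        itertools.islice(episode, 100)
        for episode in si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
    )
vf_map = td_prediction(
    experiences_stream=experiences,
    gamma=0.9,
    num_experiences=1000000  # hypothetical number of atomic transitions
)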
def q_learning_finite_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon: float,
    max_episode_length: int
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A], float] = {
        (s, a): 0. for s in fmdp.non_terminal_states for a in fmdp.actions(s)
    }
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td.q_learning(
        mdp=fmdp,
        policy_from_q=lambda f, m: mc.epsilon_greedy_policy(
            q=f,
            mdp=m,
            ϵ=epsilon
        ),
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(
            values_map=initial_qvf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        max_episode_length=max_episode_length
    )
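Usage mirrors the GLIE MC control wrapper, except exploration uses a fixed ϵ and each episode is capped at max_episode_length. A sketch on the fmdp constructed above; note that Q-Learning yields one Q-value estimate per atomic experience rather than per episode, so the islice count below is an illustrative update budget:

q_iters = q_learning_finite_learning_rate(
    fmdp=fmdp,
    initial_learning_rate=0.03,
    half_life=1000.0,
    exponent=0.5,
    gamma=0.9,
    epsilon=0.3,
    max_episode_length=100
)
final_q = iterate.last(itertools.islice(q_iters, 10000))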
def glie_sarsa_finite_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    max_episode_length: int
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A], float] = {
        (s, a): 0. for s in fmdp.non_terminal_states for a in fmdp.actions(s)
    }
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td.glie_sarsa(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(
            values_map=initial_qvf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        max_episode_length=max_episode_length
    )
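Once the Q-value estimates settle, a greedy deterministic policy can be read off the final approximation. greedy_from_qvf below is a hypothetical helper, not part of the book's library:

def greedy_from_qvf(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    qvf: QValueFunctionApprox[S, A]
) -> Mapping[NonTerminal[S], A]:
    # For each non-terminal state, pick the action maximizing the learned Q-value
    return {
        s: max(fmdp.actions(s), key=lambda a: qvf((s, a)))
        for s in fmdp.non_terminal_states
    }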
initial_vf_dict: Mapping[NonTerminal[InventoryState], float] = \
    {s: 0. for s in si_mrp.non_terminal_states}
gamma: float = 0.9
lambda_param = 0.3
num_episodes = 10000
episode_length: int = 100
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5
approx_0: Tabular[NonTerminal[InventoryState]] = Tabular(
    values_map=initial_vf_dict,
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
)
episodes: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    (itertools.islice(episode, episode_length) for episode in episodes)
vf_iter: Iterator[Tabular[NonTerminal[InventoryState]]] = \
    lambda_return_prediction(
        traces=traces,
        approx_0=approx_0,
        γ=gamma,
        lambd=lambda_param
    )
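The iterator yields one updated value-function estimate per episode; consuming num_episodes of them gives the final λ-return estimate. A sketch, assuming iterate.last and pprint are in scope as in the surrounding scripts:

final_lambda_vf: Tabular[NonTerminal[InventoryState]] = \
    iterate.last(itertools.islice(vf_iter, num_episodes))
pprint({s: round(final_lambda_vf(s), 3) for s in si_mrp.non_terminal_states})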
traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
episode_length: int = 100
unit_experiences_accumulated: Iterable[TransitionStep[InventoryState]] = \
    itertools.chain.from_iterable(
        itertools.islice(trace, episode_length) for trace in traces
    )
num_episodes = 100000

print("Value Function (TD Function Approximation)")
print("--------------")
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5
learning_rate_func: Callable[[int], float] = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=half_life,
    exponent=exponent
)
td_vfs: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
    transitions=unit_experiences_accumulated,
    approx_0=Tabular(count_to_weight_func=learning_rate_func),
    γ=user_gamma
)
final_td_vf: FunctionApprox[InventoryState] = \
    last(itertools.islice(td_vfs, episode_length * num_episodes))
pprint({s: round(final_td_vf(s), 3) for s in si_mrp.non_terminal_states})
print()

print("Value Function (Tabular TD from scratch)")
print("--------------")
td_vfs_scratch: Iterator[Dict[InventoryState, float]] = evaluate_mrp_dt(
    transitions=unit_experiences_accumulated,
    vf={s: 0. for s in si_mrp.non_terminal_states},
    γ=user_gamma
)
mc_vf: ValueFunctionApprox[str] = batch_mc_prediction(
    fixed_traces,
    mc_fa,
    gamma
)
print("Result of Batch MC Prediction")
print("V[A] = %.3f" % mc_vf(a))
print("V[B] = %.3f" % mc_vf(b))

fixed_transitions: Sequence[TransitionStep[str]] = \
    [t for tr in fixed_traces for t in tr]
td_fa: Tabular[NonTerminal[str]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=0.1,
        half_life=10000,
        exponent=0.5
    )
)
exp_replay_memory: ExperienceReplayMemory[TransitionStep[str]] = \
    ExperienceReplayMemory()
replay: Iterator[Sequence[TransitionStep[str]]] = \
    exp_replay_memory.replay(fixed_transitions, 1)


def replay_transitions(replay=replay) -> Iterator[TransitionStep[str]]:
    # Draw mini-batches of size 1 from the replay memory forever,
    # yielding one atomic transition at a time
    while True:
        yield next(replay)[0]
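From here, the replayed transitions can drive tabular TD prediction with td_fa as the starting approximation. A sketch using td.td_prediction as in the wrappers above, with a hypothetical update budget num_updates:

num_updates: int = 100000  # hypothetical number of replayed transitions to process
td_vf: Tabular[NonTerminal[str]] = iterate.last(itertools.islice(
    td.td_prediction(
        transitions=replay_transitions(),
        approx_0=td_fa,
        γ=gamma
    ),
    num_updates
))
print("Result of TD Prediction with Experience Replay")
print("V[A] = %.3f" % td_vf(a))
print("V[B] = %.3f" % td_vf(b))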