def get_q_learning_vf_and_policy(
    self,
    epsilon: float,
    learning_rate: float,
    num_updates: int
) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
    # Stream of Q-value function approximations, one per Q-learning update
    qvfs: Iterator[QValueFunctionApprox[Cell, Move]] = \
        q_learning_finite_learning_rate(
            fmdp=self.get_finite_mdp(),
            initial_learning_rate=learning_rate,
            half_life=1e8,
            exponent=1.0,
            gamma=1.0,
            epsilon=epsilon,
            max_episode_length=int(1e8)
        )
    # Take the Q-value function after num_updates updates, then extract
    # the greedy value function and deterministic policy from it
    final_qvf: QValueFunctionApprox[Cell, Move] = \
        iterate.last(itertools.islice(qvfs, num_updates))
    return get_vf_and_policy_from_qvf(
        mdp=self.get_finite_mdp(),
        qvf=final_qvf
    )
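The final step relies on get_vf_and_policy_from_qvf to turn the learned Q-value function into a value function and a deterministic policy. Conceptually this is just greedy extraction; below is a minimal tabular sketch of that idea, assuming a plain dict of Q-values rather than the library's QValueFunctionApprox (greedy_from_q is a hypothetical name, not part of the library):

from typing import Dict, Tuple, TypeVar

S = TypeVar('S')
A = TypeVar('A')

def greedy_from_q(
    q: Dict[Tuple[S, A], float]
) -> Tuple[Dict[S, float], Dict[S, A]]:
    # Hypothetical sketch: for each state, keep the action with the
    # highest Q-value; the value function is that maximum Q-value.
    vf: Dict[S, float] = {}
    policy: Dict[S, A] = {}
    for (s, a), val in q.items():
        if s not in vf or val > vf[s]:
            vf[s] = val
            policy[s] = a
    return vf, policy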
def lspi_vf_and_policy(self) -> \
        Tuple[V[int], FiniteDeterministicPolicy[int, int]]:
    # Fixed batch of 50,000 atomic experience transitions for LSPI
    transitions: Iterable[TransitionStep[int, int]] = itertools.islice(
        self.lspi_transitions(),
        50000
    )
    # Each LSPI iteration produces a new linear Q-value approximation
    qvf_iter: Iterator[LinearFunctionApprox[Tuple[
        NonTerminal[int], int]]] = least_squares_policy_iteration(
            transitions=transitions,
            actions=self.actions,
            feature_functions=self.lspi_features(4, 4),
            initial_target_policy=DeterministicPolicy(
                lambda s: int(s / 2)
            ),
            γ=1.0,
            ε=1e-5
        )
    # Stop after 100 LSPI iterations and extract the greedy policy
    qvf: LinearFunctionApprox[Tuple[NonTerminal[int], int]] = \
        iterate.last(itertools.islice(qvf_iter, 100))
    return get_vf_and_policy_from_qvf(self, qvf)
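least_squares_policy_iteration fits a linear architecture over (state, action) pairs, so the choice of feature_functions matters. The library's lspi_features is not reproduced here; the following is only a hedged stand-in showing the general shape of such a feature set, crossed monomials in state and action (poly_features is hypothetical, and it assumes raw numeric states rather than the library's NonTerminal wrapper):

from typing import Callable, Sequence, Tuple

def poly_features(state_degree: int, action_degree: int) -> \
        Sequence[Callable[[Tuple[float, float]], float]]:
    # Hypothetical stand-in: monomials s^i * a^j over a (state, action)
    # pair; i and j are bound as default arguments so each lambda keeps
    # its own exponents.
    return [
        (lambda pair, i=i, j=j: pair[0] ** i * pair[1] ** j)
        for i in range(state_degree)
        for j in range(action_degree)
    ]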
def get_glie_sarsa_vf_and_policy(
    self,
    epsilon_as_func_of_episodes: Callable[[int], float],
    learning_rate: float,
    num_updates: int
) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
    # Stream of Q-value function approximations, one per SARSA update
    qvfs: Iterator[QValueFunctionApprox[Cell, Move]] = \
        glie_sarsa_finite_learning_rate(
            fmdp=self.get_finite_mdp(),
            initial_learning_rate=learning_rate,
            half_life=1e8,
            exponent=1.0,
            gamma=1.0,
            epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
            max_episode_length=int(1e8)
        )
    # Take the Q-value function after num_updates updates, then extract
    # the greedy value function and deterministic policy from it
    final_qvf: QValueFunctionApprox[Cell, Move] = \
        iterate.last(itertools.islice(qvfs, num_updates))
    return get_vf_and_policy_from_qvf(
        mdp=self.get_finite_mdp(),
        qvf=final_qvf
    )
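GLIE ("Greedy in the Limit with Infinite Exploration") requires the exploration probability to decay to zero as episodes accumulate. A typical schedule to pass as epsilon_as_func_of_episodes is 1/k:

def epsilon_schedule(k: int) -> float:
    # Exploration probability for episode k: decays to 0 as k grows,
    # satisfying the GLIE condition.
    return 1.0 / k

With this, the call becomes get_glie_sarsa_vf_and_policy(epsilon_as_func_of_episodes=epsilon_schedule, ...).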
# Iterator of Q-value approximations from experience-replay Q-learning
q_iter: Iterator[QValueFunctionApprox[InventoryState, int]] = \
    q_learning_experience_replay(
        mdp=si_mdp,
        policy_from_q=lambda f, m: epsilon_greedy_policy(
            q=f,
            mdp=m,
            ϵ=epsilon
        ),
        states=Choose(si_mdp.non_terminal_states),
        approx_0=Tabular(
            count_to_weight_func=learning_rate_schedule(
                initial_learning_rate=initial_learning_rate,
                half_life=learning_rate_half_life,
                exponent=learning_rate_exponent
            )
        ),
        γ=gamma,
        max_episode_length=episode_length,
        mini_batch_size=mini_batch_size,
        weights_decay_half_life=time_decay_half_life
    )
qvf: QValueFunctionApprox[InventoryState, int] = iterate.last(
    itertools.islice(q_iter, num_updates)
)
vf, pol = get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=qvf)
pprint(vf)
print(pol)
# Benchmark against the exact solution from value iteration
true_vf, true_pol = value_iteration_result(mdp=si_mdp, gamma=gamma)
pprint(true_vf)
print(true_pol)
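Since both the learned and the exact value functions are printed above, the comparison can be made quantitative with a one-liner (assuming, as the two pprint calls suggest, that vf and true_vf are dicts keyed by the same non-terminal states):

max_diff: float = max(abs(vf[s] - true_vf[s]) for s in true_vf)
print(f"Max absolute VF deviation from value iteration: {max_diff:.4f}")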