def td_finite_prediction_learning_rate(
        fmrp: FiniteMarkovRewardProcess[S],
        gamma: float,
        episode_length: int,
        initial_learning_rate: float,
        half_life: float,
        exponent: float,
        initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    """Tabular TD(0) prediction on a finite MRP with a decaying step size.

    Episodes are sampled from `fmrp`, truncated to `episode_length` steps,
    flattened into a single stream of atomic transitions, and fed to
    `td.td_prediction`.  The Tabular approximation starts from
    `initial_vf_dict` and updates with the learning-rate schedule defined by
    (`initial_learning_rate`, `half_life`, `exponent`).

    Returns an iterator of successive value-function approximations.
    """
    # Endless stream of sampled episodes from the finite MRP.
    episode_stream: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    # Flatten truncated episodes into one stream of single transitions.
    experiences: Iterable[TransitionStep[S]] = unit_experiences_from_episodes(
        episode_stream,
        episode_length
    )
    # Step size decays with the per-state update count.
    lr_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td.td_prediction(
        transitions=experiences,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=lr_func
        ),
        γ=gamma
    )
def test_evaluate_finite_mrp(self) -> None:
    """TD prediction on the flip-flop MRP should land near the true value."""
    approx_init = Tabular(
        {s: 0.0 for s in self.finite_flip_flop.states()},
        count_to_weight_func=lambda _: 0.1,
    )
    max_steps = 20
    traces: Iterable[Iterable[
        mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
            Choose({True, False}))
    # Truncate each trace and flatten into one stream of transitions.
    steps: Iterable[
        mp.TransitionStep[bool]] = itertools.chain.from_iterable(
            itertools.islice(trace, max_steps) for trace in traces)
    approximations = td.td_prediction(steps, γ=0.99, approx_0=approx_init)
    final: Optional[Tabular[bool]] = iterate.last(
        itertools.islice(cast(Iterator[Tabular[bool]], approximations),
                         10000))
    assert final is not None
    self.assertEqual(len(final.values_map), 2)
    for s in final.values_map:
        # Intentionally loose bound—otherwise test is too slow.
        # Takes >1s on my machine otherwise.
        self.assertLess(abs(final(s) - 170), 3.0)
def td_prediction(experiences_stream: Iterator[TransitionStep[S]],
                  gamma: float,
                  num_experiences: int) -> Mapping[S, float]:
    """Run tabular TD(0) for a fixed number of experiences.

    Consumes `num_experiences` transitions from `experiences_stream`,
    updating a fresh Tabular approximation with a decaying learning-rate
    schedule (initial rate 0.01, half-life 10000, exponent 0.5), and
    returns the final state -> value mapping.
    """
    schedule = learning_rate_schedule(
        initial_learning_rate=0.01,
        half_life=10000,
        exponent=0.5
    )
    approximations = td.td_prediction(
        transitions=experiences_stream,
        approx_0=Tabular(count_to_weight_func=schedule),
        γ=gamma
    )
    # Keep only the last of the first `num_experiences` approximations.
    final = iterate.last(itertools.islice(approximations, num_experiences))
    return final.values_map
def td_prediction_learning_rate(
        mrp: MarkovRewardProcess[S],
        start_state_distribution: Distribution[S],
        gamma: float,
        episode_length: int,
        initial_func_approx: FunctionApprox[S]) -> Iterator[FunctionApprox[S]]:
    """TD(0) prediction on a (possibly non-finite) MRP.

    Samples episodes starting from `start_state_distribution`, truncates
    them to `episode_length` steps, flattens them into a stream of single
    transitions, and iterates TD updates on `initial_func_approx`.

    Returns an iterator of successive function-approximation estimates.
    """
    traces: Iterable[Iterable[TransitionStep[S]]] = mrp_episodes_stream(
        mrp,
        start_state_distribution
    )
    experiences: Iterable[TransitionStep[S]] = \
        unit_experiences_from_episodes(traces, episode_length)
    return td.td_prediction(
        transitions=experiences,
        approx_0=initial_func_approx,
        γ=gamma
    )
    itertools.islice(transitions, num_transitions)
# TD prediction with a count-decaying learning-rate schedule.
initial_learning_rate: float = 0.5
half_life: float = 1000
exponent: float = 0.5
approx0: Tabular[NonTerminal[int]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent))
# Final tabular value function after num_transitions TD updates.
td_func: Tabular[NonTerminal[int]] = \
    iterate.last(itertools.islice(
        td_prediction(
            transitions=td_transitions,
            approx_0=approx0,
            γ=gamma
        ),
        num_transitions
    ))
td_vf: np.ndarray = td_func.evaluate(nt_states)
# LSTD comparison: linear approximation over Laguerre polynomial features.
num_polynomials: int = 5
features: Sequence[Callable[[NonTerminal[int]], float]] = \
    laguerre_state_features(num_polynomials)
lstd_transitions: Iterable[TransitionStep[int]] = \
    itertools.islice(transitions, num_transitions)
# Regularization term for the least-squares solve.
epsilon: float = 1e-4
lstd_func: LinearFunctionApprox[NonTerminal[int]] = \
    least_squares_td(
        break
# Compare the from-scratch TD implementation against the library's
# function-approximation version on the same transition stream.
value_td = td_prediction_scratch(
    transitions = simulation_transitions,
    states = states,
    γ = user_gamma,
    num_transitions = 100000,
    learning_rate = 0.1
)
print("Value Function with our implementation of TD")
print(value_td)
value_td_other = td_prediction(
    transitions = simulation_transitions,
    approx_0 = approx_0,
    γ = user_gamma,
)
# Manually advance the iterator of successive approximations and print
# the 100000th estimate.
count = 0
for i in value_td_other:
    count+=1
    if count==100000:
        print("Value Function with Function Approximation Version")
        print(i)
        break
print("Solving Problem 4")
from rl.chapter10.prediction_utils import compare_td_and_mc
this_barrier_x: int = 10
this_barrier_y: int = 10
# Replay memory yields mini-batches of size 1 from the fixed transitions.
replay: Iterator[Sequence[TransitionStep[str]]] = \
    exp_replay_memory.replay(fixed_transitions, 1)

def replay_transitions(replay=replay) -> Iterator[TransitionStep[str]]:
    """Unwrap the size-1 replay batches into a flat stream of transitions."""
    while True:
        yield next(replay)[0]

num_iterations: int = 100000
# "TD1": ordinary incremental TD run over the replayed transitions;
# keep only the approximation after num_iterations updates.
td1_vf: ValueFunctionApprox[str] = iterate.last(
    itertools.islice(
        td_prediction(
            replay_transitions(),
            td_fa,
            gamma
        ),
        num_iterations
    )
)
print("Result of Batch TD1 Prediction")
print("V[A] = %.3f" % td1_vf(a))
print("V[B] = %.3f" % td1_vf(b))
# "TD2": batch TD solved directly on the fixed set of transitions.
td2_vf: ValueFunctionApprox[str] = batch_td_prediction(
    fixed_transitions,
    td_fa,
    gamma
)