# Assumed imports, following the RL-book `rl` package layout:
import itertools
from typing import Callable, Iterable, Iterator, Mapping, Sequence, TypeVar

import rl.iterate as iterate
import rl.monte_carlo as mc
from rl.approximate_dynamic_programming import ValueFunctionApprox
from rl.distribution import Choose, Distribution
from rl.function_approx import FunctionApprox, Tabular
from rl.markov_process import (FiniteMarkovRewardProcess,
                               MarkovRewardProcess, NonTerminal,
                               TransitionStep)

S = TypeVar('S')


def mc_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length_tolerance: float,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return mc.mc_prediction(
        traces=episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        episode_length_tolerance=episode_length_tolerance
    )
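learning_rate_schedule is referenced above but not shown in this section. A minimal sketch, assuming the usual polynomial-decay schedule in which the step size after n updates is initial_learning_rate * (1 + (n - 1) / half_life) ** -exponent:

def learning_rate_schedule(
    initial_learning_rate: float,
    half_life: float,
    exponent: float
) -> Callable[[int], float]:
    def lr_func(n: int) -> float:
        # with exponent == 1, the step size halves after half_life updates
        return initial_learning_rate * (1 + (n - 1) / half_life) ** -exponent
    return lr_func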
def mc_prediction(
    episodes_stream: Iterator[Sequence[TransitionStep[S]]],
    gamma: float,
    num_episodes: int
) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            mc.mc_prediction(
                traces=episodes_stream,
                approx_0=Tabular(),
                γ=gamma,
                tolerance=1e-10
            ),
            num_episodes
        )
    ).values_map
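A hypothetical call (some_fmrp is a placeholder for whatever FiniteMarkovRewardProcess is under study). Since Tabular() is constructed with no arguments, its default 1/n count-to-weight function makes this the plain sample mean of returns per state:

vf = mc_prediction(
    episodes_stream=fmrp_episodes_stream(some_fmrp),  # some_fmrp: hypothetical
    gamma=0.9,
    num_episodes=10000
)
print(vf)  # tabular value function after 10000 episodes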
def mc_finite_prediction_equal_wts(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    tolerance: float,
    initial_vf_dict: Mapping[S, float]
) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    return mc.mc_prediction(
        traces=episodes,
        approx_0=Tabular(values_map=initial_vf_dict),
        γ=gamma,
        tolerance=tolerance
    )
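fmrp_episodes_stream, used by both finite-MRP functions above, is not shown here. A plausible one-line sketch, assuming episodes start from a uniform choice over the non-terminal states:

def fmrp_episodes_stream(
    fmrp: FiniteMarkovRewardProcess[S]
) -> Iterable[Iterable[TransitionStep[S]]]:
    # uniform start-state distribution is an assumption of this sketch
    return fmrp.reward_traces(Choose(fmrp.non_terminal_states))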
def mc_prediction_learning_rate(
    mrp: MarkovRewardProcess[S],
    start_state_distribution: Distribution[S],
    gamma: float,
    tolerance: float,
    initial_func_approx: FunctionApprox[S]
) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        mrp_episodes_stream(mrp, start_state_distribution)
    return mc.mc_prediction(
        traces=episodes,
        approx_0=initial_func_approx,
        γ=gamma,
        tolerance=tolerance
    )
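mrp_episodes_stream is likewise not shown; under the same assumptions it simply forwards the given start-state distribution to reward_traces:

def mrp_episodes_stream(
    mrp: MarkovRewardProcess[S],
    start_state_distribution: Distribution[S]
) -> Iterable[Iterable[TransitionStep[S]]]:
    return mrp.reward_traces(start_state_distribution)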
def test_evaluate_finite_mrp(self):
    start = Tabular({s: 0.0 for s in self.finite_flip_flop.states()})
    traces = self.finite_flip_flop.reward_traces(Choose({True, False}))
    v = iterate.converged(
        mc.mc_prediction(traces, γ=0.99, approx_0=start),
        # Loose bound of 0.01 to speed up the test.
        done=lambda a, b: a.within(b, 0.01)
    )

    self.assertEqual(len(v.values_map), 2)

    for s in v.values_map:
        # Intentionally loose bound; otherwise the test takes >1s
        # on my machine.
        self.assertLess(abs(v(s) - 170), 1.0)
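The driver script below calls mc_prediction_scratch, which is not shown in this section. A minimal every-visit tabular MC sketch consistent with that call, assuming each TransitionStep's state field can serve directly as a dictionary key and that γ < 1, so each episode can be truncated once γ**t drops below tolerance:

import itertools
import math

def mc_prediction_scratch(traces, states, γ, tolerance, num_episodes):
    counts = {s: 0 for s in states}
    values = {s: 0.0 for s in states}
    # number of steps after which discounted rewards fall below tolerance
    max_steps = round(math.log(tolerance) / math.log(γ)) if γ < 1 else None
    for trace in itertools.islice(traces, num_episodes):
        steps = list(itertools.islice(trace, max_steps))
        g = 0.0
        for step in reversed(steps):
            g = step.reward + γ * g  # return following this step
            counts[step.state] += 1
            # incremental sample mean: equal weight on every visit
            values[step.state] += (g - values[step.state]) / counts[step.state]
    return values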
simulation_episodes = si_mrp.reward_traces(start_state_distrib)
simulation_transitions = si_mrp.simulate_reward(start_state_distrib)

approx_0 = Tabular({i: 0 for i in states})

value_mc = mc_prediction_scratch(
    traces=simulation_episodes,
    states=states,
    γ=user_gamma,
    tolerance=1e-6,
    num_episodes=10000
)
print("Value Function with our implementation of MC")
print(value_mc)

value_mc_other = mc_prediction(
    traces=simulation_episodes,
    approx_0=approx_0,
    γ=user_gamma
)
count = 0
for episode in value_mc_other:
    count += 1
    if count % 1000 == 0:
        print(f"{count} episodes processed")
    if count == 10000:
        print("Value Function with Function Approximation Version")
        print(episode)
        break

value_td = td_prediction_scratch(
    transitions=simulation_transitions,
    states=states,