def mc_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length_tolerance: float,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return mc.mc_prediction(
        traces=episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        episode_length_tolerance=episode_length_tolerance
    )
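A minimal usage sketch for the function above, not a definitive recipe: it assumes some_finite_mrp is a FiniteMarkovRewardProcess[S] instance defined elsewhere, and that it exposes a non_terminal_states attribute as in the RL-book rl library these examples appear to use. It simply takes the value function after a fixed number of episode updates.

import itertools
import rl.iterate as iterate

# Hypothetical usage: some_finite_mrp is assumed to be defined elsewhere.
vf_iterator = mc_finite_prediction_learning_rate(
    fmrp=some_finite_mrp,
    gamma=0.9,
    episode_length_tolerance=1e-6,
    initial_learning_rate=0.03,
    half_life=1000.0,
    exponent=0.5,
    initial_vf_dict={s: 0.0 for s in some_finite_mrp.non_terminal_states}
)
# Value function approximation after 10,000 simulated episodes.
final_vf = iterate.last(itertools.islice(vf_iterator, 10000))
print(final_vf.values_map)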
def mc_prediction(episodes_stream: Iterator[Sequence[TransitionStep[S]]],
                  gamma: float, num_episodes: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            mc.mc_prediction(traces=episodes_stream,
                             approx_0=Tabular(),
                             γ=gamma,
                             tolerance=1e-10), num_episodes)).values_map
def mc_finite_prediction_equal_wts(
        fmrp: FiniteMarkovRewardProcess[S], gamma: float, tolerance: float,
        initial_vf_dict: Mapping[S, float]) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    return mc.mc_prediction(traces=episodes,
                            approx_0=Tabular(values_map=initial_vf_dict),
                            γ=gamma,
                            tolerance=tolerance)
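The "equal weights" in the name refers to Tabular's behaviour when no count_to_weight_func is supplied: in the RL-book library the default appears to be a weight of 1/n on the n-th visit, i.e. a plain running average of observed returns per state. A minimal sketch of that incremental-average update, under that assumption:

def incremental_average(old_estimate: float, new_return: float, n: int) -> float:
    # With weight 1/n on the n-th observed return, the estimate is the
    # plain average of all returns seen so far for that state.
    return old_estimate + (new_return - old_estimate) / n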
def mc_prediction_learning_rate(
        mrp: MarkovRewardProcess[S], start_state_distribution: Distribution[S],
        gamma: float, tolerance: float,
        initial_func_approx: FunctionApprox[S]) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        mrp_episodes_stream(mrp, start_state_distribution)
    return mc.mc_prediction(traces=episodes,
                            approx_0=initial_func_approx,
                            γ=gamma,
                            tolerance=tolerance)
Example #5
    def test_evaluate_finite_mrp(self):
        start = Tabular({s: 0.0 for s in self.finite_flip_flop.states()})
        traces = self.finite_flip_flop.reward_traces(Choose({True, False}))
        v = iterate.converged(
            mc.mc_prediction(traces, γ=0.99, approx_0=start),
            # Loose bound of 0.01 to speed up test.
            done=lambda a, b: a.within(b, 0.01))

        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            # Intentionally loose bound—otherwise test is too slow.
            # Takes >1s on my machine otherwise.
            self.assertLess(abs(v(s) - 170), 1.0)
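For reference, a minimal sketch of the convergence pattern used in the test above. It assumes, as the test suggests, that iterate.converged walks consecutive iterates and stops once the done predicate holds; treat the exact semantics of the library function as an assumption.

from typing import Callable, Iterator, TypeVar

X = TypeVar('X')

def converged_sketch(values: Iterator[X], done: Callable[[X, X], bool]) -> X:
    # Compare consecutive value function approximations and return the
    # first iterate that is "close enough" to its predecessor.
    a = next(values)
    for b in values:
        if done(a, b):
            return b
        a = b
    return a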
Example #6
simulation_episodes = si_mrp.reward_traces(start_state_distrib)
simulation_transitions = si_mrp.simulate_reward(start_state_distrib)
approx_0 = Tabular({i: 0.0 for i in states})

value_mc = mc_prediction_scratch(
    traces=simulation_episodes,
    states=states,
    γ=user_gamma,
    tolerance=1e-6,
    num_episodes=10000
)
print("Value Function with our implementation of MC")
print(value_mc)

# mc_prediction yields an updated value function approximation after each
# episode, so take the approximation produced after 10,000 episodes.
value_mc_other = mc_prediction(
    traces=simulation_episodes,
    approx_0=approx_0,
    γ=user_gamma
)
count = 0
for vf in value_mc_other:
    count += 1
    if count % 1000 == 0:
        print(f"{count} episodes processed")
    if count == 10000:
        print("Value Function with Function Approximation Version")
        print(vf)
        break
 
value_td = td_prediction_scratch(
    transitions=simulation_transitions,
    states=states,