Code example #1
def td_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    td_experiences: Iterable[TransitionStep[S]] = \
        unit_experiences_from_episodes(
            episodes,
            episode_length
        )
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td.td_prediction(
        transitions=td_experiences,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma
    )
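A quick way to consume the returned iterator, shown as a hedged sketch: my_finite_mrp, the import paths, and every hyperparameter value below are assumptions, not taken from the source.

import itertools

import rl.iterate as iterate

# Hypothetical usage: run 10000 TD updates on an assumed finite MRP and
# keep the final value-function estimate.
vf_stream = td_finite_prediction_learning_rate(
    fmrp=my_finite_mrp,
    gamma=0.9,
    episode_length=100,
    initial_learning_rate=0.03,
    half_life=1000.0,
    exponent=0.5,
    initial_vf_dict={s: 0.0 for s in my_finite_mrp.non_terminal_states}
)
final_vf = iterate.last(itertools.islice(vf_stream, 10000))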
Code example #2
File: test_td.py  Project: matteosantama/RL-book
    def test_evaluate_finite_mrp(self) -> None:
        start = Tabular(
            {s: 0.0
             for s in self.finite_flip_flop.states()},
            count_to_weight_func=lambda _: 0.1,
        )

        episode_length = 20
        episodes: Iterable[Iterable[
            mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
                Choose({True, False}))
        transitions: Iterable[
            mp.TransitionStep[bool]] = itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes)

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        v: Optional[Tabular[bool]] = iterate.last(
            itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

        if v is not None:
            self.assertEqual(len(v.values_map), 2)

            for s in v.values_map:
                # Intentionally loose bound; a tighter one makes this test
                # take more than 1s on my machine.
                self.assertLess(abs(v(s) - 170), 3.0)
        else:
            self.fail("td_prediction produced no value function")
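All of these examples delegate the learning itself to td.td_prediction; with a Tabular approximation and the constant count_to_weight_func above (fixed at 0.1), each transition applies the classic TD(0) update. A minimal dictionary-based sketch of that rule (illustrative only, not the library's implementation):

from typing import Any, Dict, Iterable

def tabular_td0_sketch(transitions: Iterable[Any], gamma: float,
                       alpha: float) -> Dict[Any, float]:
    # TD(0) rule: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)).
    # Field names (state, reward, next_state) follow TransitionStep above;
    # unseen states (including terminal ones) default to a value of 0.0.
    v: Dict[Any, float] = {}
    for tr in transitions:
        v_s = v.get(tr.state, 0.0)
        target = tr.reward + gamma * v.get(tr.next_state, 0.0)
        v[tr.state] = v_s + alpha * (target - v_s)
    return v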
Code example #3
def td_prediction(experiences_stream: Iterator[TransitionStep[S]],
                  gamma: float, num_experiences: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            td.td_prediction(
                transitions=experiences_stream,
                approx_0=Tabular(
                    count_to_weight_func=learning_rate_schedule(
                        initial_learning_rate=0.01,
                        half_life=10000,
                        exponent=0.5
                    )
                ),
                γ=gamma
            ),
            num_experiences
        )
    ).values_map
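Unlike the wrappers in examples #1 and #4, this helper materializes the stream and hands back the final value dictionary directly. A hedged usage sketch, with my_experiences standing in for an assumed experience iterator:

# Hypothetical call: my_experiences is an assumed Iterator[TransitionStep[S]].
vf_map = td_prediction(experiences_stream=my_experiences,
                       gamma=0.9, num_experiences=100000)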
Code example #4
def td_prediction_learning_rate(
        mrp: MarkovRewardProcess[S], start_state_distribution: Distribution[S],
        gamma: float, episode_length: int,
        initial_func_approx: FunctionApprox[S]) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        mrp_episodes_stream(mrp, start_state_distribution)
    td_experiences: Iterable[TransitionStep[S]] = \
        unit_experiences_from_episodes(
            episodes,
            episode_length
        )
    return td.td_prediction(transitions=td_experiences,
                            approx_0=initial_func_approx,
                            γ=gamma)
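This variant drops the finite-MRP restriction: any MarkovRewardProcess together with an explicit start-state distribution and any FunctionApprox works. A hedged wiring sketch (my_mrp and my_start_states are assumptions):

# Hypothetical usage with a default tabular approximation and an assumed MRP.
vf_iter = td_prediction_learning_rate(
    mrp=my_mrp,
    start_state_distribution=Choose(my_start_states),
    gamma=0.9,
    episode_length=100,
    initial_func_approx=Tabular()
)
final_vf = iterate.last(itertools.islice(vf_iter, 10000))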
Code example #5
File: random_walk_lstd.py  Project: shenoy1/RL-book
td_transitions: Iterable[TransitionStep[int]] = \
    itertools.islice(transitions, num_transitions)

initial_learning_rate: float = 0.5
half_life: float = 1000
exponent: float = 0.5
approx0: Tabular[NonTerminal[int]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent))

td_func: Tabular[NonTerminal[int]] = \
    iterate.last(itertools.islice(
        td_prediction(
            transitions=td_transitions,
            approx_0=approx0,
            γ=gamma
        ),
        num_transitions
    ))
td_vf: np.ndarray = td_func.evaluate(nt_states)

num_polynomials: int = 5
features: Sequence[Callable[[NonTerminal[int]], float]] = \
    laguerre_state_features(num_polynomials)
lstd_transitions: Iterable[TransitionStep[int]] = \
    itertools.islice(transitions, num_transitions)
epsilon: float = 1e-4

lstd_func: LinearFunctionApprox[NonTerminal[int]] = \
    least_squares_td(
        transitions=lstd_transitions,
        feature_functions=features,
        γ=gamma,
        ε=epsilon
    )
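The learning_rate_schedule factory used here and in examples #1 and #3 is not shown in these snippets; the sketch below assumes a polynomially decaying schedule consistent with its parameter names (the exact formula is an assumption, not confirmed by the source):

from typing import Callable

def learning_rate_schedule_sketch(initial_learning_rate: float,
                                  half_life: float,
                                  exponent: float) -> Callable[[int], float]:
    # Assumed form: alpha_n = alpha_0 * (1 + (n - 1) / half_life) ** -exponent,
    # i.e. the step size decays polynomially in the update count n; with
    # exponent = 1.0 it halves after roughly half_life updates.
    def lr(n: int) -> float:
        return initial_learning_rate * (1 + (n - 1) / half_life) ** -exponent
    return lr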
Code example #6
    value_td = td_prediction_scratch(
        transitions=simulation_transitions,
        states=states,
        γ=user_gamma,
        num_transitions=100000,
        learning_rate=0.1
    )
    print("Value Function with our implementation of TD")
    print(value_td)

    value_td_other = td_prediction(
        transitions=simulation_transitions,
        approx_0=approx_0,
        γ=user_gamma
    )
    count = 0
    for vf in value_td_other:
        count += 1
        if count == 100000:
            print("Value Function with Function Approximation Version")
            print(vf)
            break

    print("Solving Problem 4")
    from rl.chapter10.prediction_utils import compare_td_and_mc

    this_barrier_x: int = 10
    this_barrier_y: int = 10
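As an aside, the manual counting loop in this example can be collapsed into the itertools/iterate idiom the other examples use; a sketch reusing the same (assumed) variable names:

# Equivalent to the counting loop: keep the 100000th estimate, then print it.
value_td_other_final = iterate.last(
    itertools.islice(
        td_prediction(
            transitions=simulation_transitions,
            approx_0=approx_0,
            γ=user_gamma
        ),
        100000
    )
)
print("Value Function with Function Approximation Version")
print(value_td_other_final)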
Code example #7
replay: Iterator[Sequence[TransitionStep[str]]] = \
    exp_replay_memory.replay(fixed_transitions, 1)


def replay_transitions(replay=replay) -> Iterator[TransitionStep[str]]:
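    # Unpack the size-1 mini-batches sampled from the experience-replay memory.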
    while True:
        yield next(replay)[0]


num_iterations: int = 100000

td1_vf: ValueFunctionApprox[str] = iterate.last(
    itertools.islice(
        td_prediction(
            replay_transitions(),
            td_fa,
            gamma
        ),
        num_iterations
    )
)

print("Result of Batch TD1 Prediction")
print("V[A] = %.3f" % td1_vf(a))
print("V[B] = %.3f" % td1_vf(b))

td2_vf: ValueFunctionApprox[str] = batch_td_prediction(
    fixed_transitions,
    td_fa,
    gamma
)
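For comparison, the single-sample replay loop above can be approximated by cycling deterministically over the fixed transitions; a rough sketch under the same names (the replay memory samples randomly, so this is not an exact equivalent):

# Deterministic replay: cycle the fixed transitions instead of sampling them.
td1_vf_cycled: ValueFunctionApprox[str] = iterate.last(
    itertools.islice(
        td_prediction(
            itertools.cycle(fixed_transitions),
            td_fa,
            gamma
        ),
        num_iterations
    )
)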