Example No. 1
def evaluate_mrp_tabular_bootstrap(transitions: Iterable[mp.TransitionStep[S]],
                                   vf: Dict[S, float], γ: float,
                                   n: int) -> Iterator[Dict[S, float]]:
    '''
    n-Step Bootstrapping Prediction
    for the Tabular case
    '''
    initial_learning_rate: float = 0.03
    half_life: float = 1000.0
    exponent: float = 0.5
    occurrence: Dict[S, int] = {}
    tolerance: float = γ**n  # so that each truncated return contains n rewards
    bootstrap_return_steps: Sequence[mp.ReturnStep] = list(returns(
        transitions, γ, tolerance))

    for i, step in enumerate(bootstrap_return_steps):
        state = step.state
        bootstr_return = step.return_
        # state reached n steps later; the steps are materialized in a list
        # above because islicing the iterator being consumed would skip items
        step_n = (bootstrap_return_steps[i + n]
                  if i + n < len(bootstrap_return_steps) else None)
        if state in occurrence:
            occurrence[state] += 1
        else:
            occurrence[state] = 1
        lr = initial_learning_rate / (1 + (
            (occurrence[state] - 1) / half_life)**exponent)
        bootstrap_value = γ**n * vf[step_n.state] if step_n is not None else 0.
        vf[state] += lr * (bootstr_return + bootstrap_value - vf[state])
        yield vf
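The update above drives vf[state] toward the n-step bootstrapped target G_{t,n} = R_{t+1} + γ·R_{t+2} + ... + γ^(n-1)·R_{t+n} + γ^n·V(S_{t+n}). A minimal self-contained sketch of that target with made-up rewards and values (no rl-library types involved):

# Hypothetical toy data: rewards observed from time t onward, and the current
# value estimate of the state reached after n steps.
rewards = [1.0, 0.0, 2.0, 1.0]            # R_{t+1}, R_{t+2}, ...
v_s_t_plus_n = 0.5                        # current estimate of V(S_{t+n})
gamma, n = 0.9, 2

# n-step target: discounted sum of the first n rewards plus the
# discounted bootstrap value of the state reached after n steps.
g_t_n = sum(gamma**k * rewards[k] for k in range(n)) + gamma**n * v_s_t_plus_n
print(g_t_n)    # 1.0 + 0.9*0.0 + 0.81*0.5 = 1.405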
Example No. 2
def reinforce_gaussian(
    mdp: MarkovDecisionProcess[S, float],
    policy_mean_approx0: FunctionApprox[NonTerminal[S]],
    start_states_distribution: NTStateDistribution[S], policy_stdev: float,
    gamma: float, episode_length_tolerance: float
) -> Iterator[FunctionApprox[NonTerminal[S]]]:
    policy_mean_approx: FunctionApprox[NonTerminal[S]] = policy_mean_approx0
    yield policy_mean_approx
    while True:
        policy: Policy[S, float] = GaussianPolicyFromApprox(
            function_approx=policy_mean_approx, stdev=policy_stdev)
        trace: Iterable[TransitionStep[S, float]] = mdp.simulate_actions(
            start_states=start_states_distribution, policy=policy)
        gamma_prod: float = 1.0
        for step in returns(trace, gamma, episode_length_tolerance):

            def obj_deriv_out(states: Sequence[NonTerminal[S]],
                              actions: Sequence[float]) -> np.ndarray:
                return (policy_mean_approx.evaluate(states) -
                        np.array(actions)) / (policy_stdev * policy_stdev)
            grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                policy_mean_approx.objective_gradient(
                    xy_vals_seq=[(step.state, step.action)],
                    obj_deriv_out_fun=obj_deriv_out
            )
            scaled_grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                grad * gamma_prod * step.return_
            policy_mean_approx = \
                policy_mean_approx.update_with_gradient(scaled_grad)
            gamma_prod *= gamma
        yield policy_mean_approx
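For a Gaussian policy with fixed standard deviation σ, the score with respect to the mean is ∂ log π(a|s)/∂μ = (a − μ)/σ². The (μ − a)/σ² returned by obj_deriv_out above appears to be the negative of this score, consistent with a gradient-descent-style update ascending the REINFORCE objective. A standalone numerical check of the score itself, with made-up numbers:

import numpy as np

mu, sigma, a = 0.3, 0.5, 0.8      # hypothetical mean, stdev and sampled action

def log_pi(m: float) -> float:
    # log-density of the Gaussian policy N(m, sigma^2) evaluated at action a
    return -0.5 * np.log(2 * np.pi * sigma**2) - (a - m)**2 / (2 * sigma**2)

eps = 1e-6
numeric = (log_pi(mu + eps) - log_pi(mu - eps)) / (2 * eps)
analytic = (a - mu) / sigma**2
print(numeric, analytic)          # both ≈ 2.0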
Example No. 3
def mc_prediction(traces: Iterable[Iterable[mp.TransitionStep[S]]],
                  approx_0: FunctionApprox[S],
                  γ: float,
                  tolerance: float = 1e-6) -> Iterator[FunctionApprox[S]]:
    '''Evaluate an MRP using the Monte Carlo method, truncating each
    simulated episode once the accumulated discount γᵏ drops below
    `tolerance`.

    Each value this function yields represents the approximated value
    function for the MRP after one additional episode.

    Arguments:
      traces -- an iterable of simulation traces from an MRP
      approx_0 -- initial approximation of the value function
      γ -- discount rate (0 < γ ≤ 1)
      tolerance -- a small value; episodes are truncated once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated value
    function after each episode.

    '''
    episodes = (returns(trace, γ, tolerance) for trace in traces)

    return approx_0.iterate_updates(((step.state, step.return_)
                                     for step in episode)
                                    for episode in episodes)
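A usage sketch under the assumption that the surrounding rl-style library is available: Tabular (constructed as in Example No. 5 below, and assumed to default to empty maps with 1/n weights) plays the role of the FunctionApprox, and reward_traces is the trace generator used in Example No. 4. my_mrp and start_states are hypothetical placeholders.

import itertools

# Hypothetical wiring: my_mrp is assumed to be a MarkovRewardProcess and
# start_states a Distribution over its states.
approx_0 = Tabular()                      # tabular value-function approximation
traces = my_mrp.reward_traces(start_states)
value_functions = mc_prediction(traces, approx_0, 0.9, tolerance=1e-6)

# advance the iterator 1000 updates and keep that value-function estimate
v_est = next(itertools.islice(value_functions, 999, None))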
Example No. 4
def evaluate_mrp(
        mrp: MarkovRewardProcess[S],
        states: Distribution[S],
        approx_0: FunctionApprox[S],
        γ: float,
        tolerance: float = 1e-6
) -> Iterator[FunctionApprox[S]]:
    '''Evaluate an MRP using the Monte Carlo method, simulating episodes
    that are truncated once the accumulated discount γᵏ drops below
    `tolerance`.

    Each value this function yields represents the approximated value
    function for the MRP after one additional episode.

    Arguments:
      mrp -- the Markov Reward Process to evaluate
      states -- distribution of states to start episodes from
      approx_0 -- initial approximation of the value function
      γ -- discount rate (0 < γ ≤ 1)
      tolerance -- a small value; episodes are truncated once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated value
    function after each episode.

    '''
    v = approx_0

    for trace in mrp.reward_traces(states):
        steps = returns(trace, γ, tolerance)
        v = v.update((step.state, step.return_) for step in steps)
        yield v
Example No. 5
def mc_prediction(transitions: Iterable[mp.TransitionStep[S]],
                  count_to_weight_func: Callable[[int], float],
                  gamma: float,
                  tolerance: float = 1e-200) -> Tabular[S]:
    '''
    Tabular MC Prediction over a discrete domain of states S, without any
    interpolation. The value estimate for each state is maintained as a
    weighted mean of the observed returns, with the weight of each update
    given by `count_to_weight_func` (i.e. a learning rate α(n) as a function
    of the number of updates n that the state has received).

    Arguments:
      transitions -- stream of transition steps from an MRP
      count_to_weight_func -- how much weight to give an update to a state,
        based on the number of times that state has been updated
      gamma -- discount rate (0 < gamma ≤ 1)
      tolerance -- the trace is truncated once gamma**k ≤ tolerance

    Returns a Tabular approximation of the value function built from a
    single truncated trace.
    '''
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    trace = []
    count = 0
    diff = {}
    max_steps = round(math.log(tolerance) / math.log(gamma))
    print('max steps: ', max_steps)
    # get trace
    for transition in transitions:
        trace.append(transition)
        count += 1
        if count >= max_steps:
            break
    # get corresponding return
    transitions_returns = returns(trace, gamma, tolerance)
    trace_returns = [return_ for return_ in transitions_returns]

    for i in range(len(trace)):
        # x: state; y: return for first n occurrences of x
        x = trace[i].state
        y = trace_returns[i].return_
        if x not in diff:
            diff[x] = 100
        diff[x] = min(abs(y - values_map.get(x, 0.)), diff[x])
        if max(diff.values()) < 1e-4:
            break
        counts_map[x] = counts_map.get(x, 0) + 1
        weight: float = count_to_weight_func(counts_map.get(x, 0))
        values_map[x] = weight * y + (1 - weight) * values_map.get(x, 0.)
    print(max(diff.values()))
    return Tabular(values_map, counts_map, count_to_weight_func)
Example No. 6
def mc_control_scratch(
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        tolerance: float = 1e-6,
        num_episodes: int = 10000) -> Mapping[Tuple[S, A], float]:

    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    start_state_distrib = Categorical({state: 1 for state in states})
    for i in range(num_episodes):
        trace: Iterable[TransitionStep[S, A]] = mdp_to_sample.simulate_actions(
            start_state_distrib, Pi)
        episode = returns(trace, γ, tolerance)
        for step in episode:
            state = step.state
            action = step.action
            return_ = step.return_
            counts_per_state_act[(state, action)] += 1
            q[(state, action)] += 1 / counts_per_state_act[
                (state, action)] * (return_ - q[(state, action)])
        eps = 1 / (i + 1)
        new_pol: Mapping[S, Optional[Categorical[A]]] = {}
        for state in states:
            if actions[state] is None:
                new_pol[state] = None
                continue
            # ε-greedy improvement: ε/|A| to every action, 1-ε extra to the greedy one
            action_probs = {
                action: eps / len(actions[state])
                for action in actions[state]
            }
            best_action = max(actions[state], key=lambda a: q[(state, a)])
            action_probs[best_action] += 1 - eps
            new_pol[state] = Categorical(action_probs)
        Pi = FinitePolicy(new_pol)

    return q
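The policy-improvement step above spreads probability ε/|A| over all actions and gives the greedy action an extra 1 − ε. A tiny standalone version of that construction, using a plain dict instead of the library's Categorical:

def epsilon_greedy_probs(q_row, eps):
    # q_row: mapping from action to its current Q-value estimate
    n = len(q_row)
    greedy = max(q_row, key=q_row.get)
    return {a: eps / n + (1 - eps if a == greedy else 0.0) for a in q_row}

# hypothetical Q-values for a single state
print(epsilon_greedy_probs({'hold': 1.2, 'buy': 0.7, 'sell': 0.9}, eps=0.1))
# {'hold': 0.933..., 'buy': 0.033..., 'sell': 0.033...}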
Example No. 7
def td_lambda_tabular_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        count_to_weight_func: Callable[[int], float],
        gamma: float,
        lambd: float,
        max_steps: int = 2000,
        tolerance: float = 1e-200) -> Tuple[Tabular[S], int]:
    """
    Similar to TD Scratch except replacing use G_{t,n} for updates
    """
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    trace = []
    count = 0
    diff = {}  # dict: state and its value error
    for transition in transitions:
        count += 1
        trace.append(transition)
        if count > max_steps:
            break

    # get the corresponding (full) return for each step of the trace
    transitions_returns = returns(trace, gamma, tolerance)
    trace_returns = [return_ for return_ in transitions_returns]
    T = len(trace_returns)

    for i in range(T):
        transition = trace[i]
        state = transition.state
        if state not in diff:
            diff[state] = 100
        counts_map[state] = counts_map.get(state, 0) + 1
        weight: float = count_to_weight_func(counts_map.get(state, 0))
        if transition.next_state not in values_map:
            values_map[transition.next_state] = -30
        y = lambd**(T - i - 1) * trace_returns[i].return_
        if lambd == 0:
            y = 0
        for n in range(1, T - i):
            g_tn = 0
            for j in range(i, i + n):
                next_transition = trace[j]
                g_tn += gamma**(j - i) * next_transition.reward
                if j == i + n - 1:
                    g_tn += gamma**n * values_map.get(
                        next_transition.next_state, 0)
            y += (1 - lambd) * lambd**(n - 1) * g_tn
        diff[state] = min(abs(y - values_map.get(state, 0.)), diff[state])
        values_map[state] = weight * y + (1 - weight) * values_map.get(
            state, 0.)
        if diff[max(diff.items(), key=operator.itemgetter(1))[0]] < 0.1:
            break
    print(diff[max(diff.items(), key=operator.itemgetter(1))[0]])
    return Tabular(values_map, counts_map, count_to_weight_func), i
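The target y assembled in the loops above is the (offline) λ-return G_t^λ = (1 − λ)·Σ_{n=1}^{T−t−1} λ^(n−1)·G_{t,n} + λ^(T−t−1)·G_t. A small self-contained function computing it directly for one episode, with made-up rewards and value estimates:

def lambda_return(rewards, next_values, gamma, lam):
    # Offline lambda-return from time 0 of a finite episode.
    # rewards:     R_1, ..., R_T
    # next_values: V(S_1), ..., V(S_T), the bootstrap values of successor states
    T = len(rewards)
    g_n = [sum(gamma**k * rewards[k] for k in range(n)) + gamma**n * next_values[n - 1]
           for n in range(1, T + 1)]                     # n-step returns G_{0,n}
    full_return = sum(gamma**k * rewards[k] for k in range(T))
    return ((1 - lam) * sum(lam**(n - 1) * g_n[n - 1] for n in range(1, T))
            + lam**(T - 1) * full_return)

print(lambda_return([1.0, 0.0, 2.0], [0.5, 0.2, 0.0], gamma=0.9, lam=0.8))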
Example No. 8
def reinforce(
    num_episodes: int,
    features_funcs: Sequence[Callable[[Tuple[S, A]], float]],
    actions: Callable[[S], Iterable[A]],
    init_wealth_distr: Gaussian,
    get_episode: Callable[[Distribution[S], Policy[S, A]],
                          Iterable[TransitionStep[S, A]]],
    γ: float,
    alpha: float,
    # softmax: bool = True
):
    def get_phi_sa(s: S, a: A):
        return np.array([f((s, a)) for f in features_funcs])

    class SoftMaxPolicy(Policy[S, A]):
        def __init__(self, theta: np.ndarray):
            self.theta = theta

        def act(self, s: S) -> Optional[Distribution[A]]:
            probs_dict = {}
            for a in actions(s):
                numerator = np.exp(np.dot(get_phi_sa(s, a), self.theta))
                if numerator < 0.0001:
                    continue
                denominator = np.sum([
                    np.exp(np.dot(get_phi_sa(s, b), self.theta))
                    for b in actions(s)
                    if np.exp(np.dot(get_phi_sa(s, b), self.theta)) >= 0.0001
                ])
                probs_dict[a] = numerator / denominator
            return Categorical(probs_dict)

    num_features = len(features_funcs)
    theta = np.zeros(num_features)
    for k in range(num_episodes):
        ep = get_episode(init_wealth_distr, SoftMaxPolicy(theta))
        episode = list(returns(ep, γ, γ**30))
        for t in range(len(episode)):
            s = episode[t].state
            a = episode[t].action
            phi_sa = get_phi_sa(s, a)
            normalization = sum([
                np.exp(np.dot(get_phi_sa(s, b), theta)) for b in actions(s)
                if np.exp(np.dot(get_phi_sa(s, b), theta)) >= 0.0001
            ])
            sum_pi = sum([
                np.exp(np.dot(get_phi_sa(s, b), theta)) * get_phi_sa(s, b)
                for b in actions(s)
                if np.exp(np.dot(get_phi_sa(s, b), theta)) >= 0.0001
            ])
            derivative = phi_sa - sum_pi / normalization
            theta += alpha * γ**t * derivative * episode[t].return_
    return theta
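For a linear softmax policy π_θ(a|s) ∝ exp(φ(s,a)ᵀθ), the score is ∇_θ log π_θ(a|s) = φ(s,a) − Σ_b π_θ(b|s)·φ(s,b), which is what derivative computes above (the small-probability cutoff aside). A standalone numerical check with made-up features:

import numpy as np

phi = np.array([[1.0, 0.0],        # φ(s, a0)
                [0.0, 1.0],        # φ(s, a1)
                [1.0, 1.0]])       # φ(s, a2)
theta = np.array([0.2, -0.1])
a = 2                              # action whose score we check

def log_pi(th, action):
    logits = phi @ th
    return logits[action] - np.log(np.sum(np.exp(logits)))

probs = np.exp(phi @ theta) / np.sum(np.exp(phi @ theta))
analytic = phi[a] - probs @ phi    # φ(s,a) − Σ_b π_θ(b|s)·φ(s,b)

eps = 1e-6
numeric = np.array([(log_pi(theta + eps * e, a) - log_pi(theta - eps * e, a)) / (2 * eps)
                    for e in np.eye(2)])
print(analytic, numeric)           # the two gradients agree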
Example No. 9
def batch_mc_prediction(
        traces: Iterable[Iterable[mp.TransitionStep[S]]],
        approx: ValueFunctionApprox[S],
        γ: float,
        episode_length_tolerance: float = 1e-6,
        convergence_tolerance: float = 1e-5) -> ValueFunctionApprox[S]:
    '''traces is a finite iterable'''
    return_steps: Iterable[mp.ReturnStep[S]] = \
        itertools.chain.from_iterable(
            returns(trace, γ, episode_length_tolerance) for trace in traces
        )
    return approx.solve([(step.state, step.return_) for step in return_steps],
                        convergence_tolerance)
Example No. 10
def get_episode(mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A],
                start_state_dist: dist.Categorical[S],
                policy: AssetAllocPolicy[S, A], gamma: float,
                tolerance: float) -> Sequence[mdp.TransitionStep[S, A]]:
    """Generate an episode from the asset allocation MDP.

    :param mdp_obj: MDP representation of the asset allocation problem
    :param start_state_dist: Starting state distribution
    :param policy: The policy with which to simulate the episode
    :param gamma: Discount factor
    :param tolerance: Accumulated discount factor below which simulation
        terminates
    :returns: Sequence of transition steps from the episode
    """
    episode_iterator = mdp_obj.simulate_actions(start_state_dist, policy)
    return list(returns(episode_iterator, gamma, tolerance))
Example No. 11
def mc_control_fapprox(
        mdp_to_sample: MarkovDecisionProcess[S, A],
        states: Distribution[S],
        approx_0: FunctionApprox[Tuple[S, A]],
        γ: float,
        ϵ: float,
        tolerance: float = 1e-6) -> Iterator[FunctionApprox[Tuple[S, A]]]:

    q = approx_0
    Pi = policy_from_q(q, mdp_to_sample)

    while True:
        trace: Iterable[TransitionStep[S, A]] = mdp_to_sample.simulate_actions(
            states, Pi)
        q = q.update(((step.state, step.action), step.return_)
                     for step in returns(trace, γ, tolerance))
        Pi = policy_from_q(q, mdp_to_sample, ϵ)
        yield q
Example No. 12
def mc_prediction_scratch(
        traces: Iterable[Iterable[mp.TransitionStep[S]]],
        states: List[S],
        γ: float,
        tolerance: float = 1e-6,
        num_episodes: int = 10000
) -> Mapping[S, float]:
    
    '''
    Evaluate an MRP using the Monte Carlo method, simulating up to
    `num_episodes` episodes, each truncated once γᵏ ≤ tolerance.

    Arguments:
      traces -- an iterable of simulation traces from an MRP
      states -- list of all possible states
      γ -- discount rate (0 < γ ≤ 1)
      tolerance -- a small value; episodes are truncated once γᵏ ≤ tolerance
      num_episodes -- number of episodes to process

    Returns a mapping from each state to its estimated value (the mean of
    the returns observed for that state).
    '''
    v: Mapping[S, float] = {}
    counts_per_state: Mapping[S, int] = {}
    for state in states:
        v[state] = 0.
        counts_per_state[state] = 0
    episodes = (returns(trace, γ, tolerance) for trace in traces)
    count_episodes = 0
    for episode in episodes:
        count_episodes += 1
        if count_episodes > num_episodes:
            break
        if count_episodes % 1000 == 0:
            print(f"{count_episodes} episodes processed")
        for step in episode:
            count: int = counts_per_state[step.state]
            v[step.state] = v[step.state] * (count / (count + 1)) \
                + step.return_ / (count + 1)
            counts_per_state[step.state] = count + 1

    return v
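The per-state update above is the standard incremental mean: after the k-th visit, V ← V + (G − V)/k, which equals the plain average of all returns seen so far for that state. A quick check with made-up returns:

observed_returns = [4.0, 2.0, 6.0, 0.0]    # hypothetical returns for one state

v, count = 0.0, 0
for g in observed_returns:
    v = v * (count / (count + 1)) + g / (count + 1)    # incremental mean update
    count += 1

print(v, sum(observed_returns) / len(observed_returns))    # both 3.0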
Example No. 13
def glie_mc_control(
    mdp: MarkovDecisionProcess[S, A],
    states: NTStateDistribution[S],
    approx_0: QValueFunctionApprox[S, A],
    γ: float,
    ϵ_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-6
) -> Iterator[QValueFunctionApprox[S, A]]:
    '''GLIE Monte Carlo Control: approximate the optimal Q-value function of
    an MDP by simulating episodes with an ε-greedy policy whose ε decays
    with the number of episodes.

    Each value this function yields represents the approximated Q-value
    function after one additional episode.

    Arguments:
      mdp -- the Markov Decision Process to solve
      states -- distribution of states to start episodes from
      approx_0 -- initial approximation of the Q-value function
      γ -- discount rate (0 ≤ γ ≤ 1)
      ϵ_as_func_of_episodes -- a function from the number of episodes
      to epsilon. epsilon is the fraction of the actions where we explore
      rather than following the greedy policy
      episode_length_tolerance -- episodes are truncated once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated Q function
    after each episode.

    '''
    q: QValueFunctionApprox[S, A] = approx_0
    p: Policy[S, A] = epsilon_greedy_policy(q, mdp, 1.0)
    yield q

    num_episodes: int = 0
    while True:
        trace: Iterable[TransitionStep[S, A]] = \
            mdp.simulate_actions(states, p)
        num_episodes += 1
        for step in returns(trace, γ, episode_length_tolerance):
            q = q.update([((step.state, step.action), step.return_)])
        p = epsilon_greedy_policy(q, mdp, ϵ_as_func_of_episodes(num_episodes))
        yield q
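GLIE requires ε to decay to zero while every state-action pair continues to be explored; a common choice for ϵ_as_func_of_episodes (assumed here purely for illustration) is ε_k = 1/k:

# Hypothetical epsilon schedule satisfying the GLIE decay condition
def epsilon_as_func_of_episodes(k: int) -> float:
    return 1.0 / k

print([epsilon_as_func_of_episodes(k) for k in (1, 2, 5, 10, 100)])
# [1.0, 0.5, 0.2, 0.1, 0.01]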
Example No. 14
def evaluate_mdp(
        mdp: MarkovDecisionProcess[S, A],
        states: Distribution[S],
        approx_0: FunctionApprox[Tuple[S, A]],
        γ: float,
        ϵ: float,
        tolerance: float = 1e-6
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    '''Evaluate an MDP using the Monte Carlo method with an ε-greedy policy,
    simulating episodes that are truncated once γᵏ ≤ tolerance.

    Each value this function yields represents the approximated Q-value
    function after one additional episode.

    Arguments:
      mdp -- the Markov Decision Process to evaluate
      states -- distribution of states to start episodes from
      approx_0 -- initial approximation of the Q-value function
      γ -- discount rate (0 < γ ≤ 1)
      ϵ -- the fraction of the actions where we explore rather
      than following the greedy policy
      tolerance -- a small value; episodes are truncated once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated Q function
    after each episode.

    '''
    q = approx_0
    p = markov_decision_process.policy_from_q(q, mdp)

    while True:
        trace: Iterable[markov_decision_process.TransitionStep[S, A]] =\
            mdp.simulate_actions(states, p)
        q = q.update(
            ((step.state, step.action), step.return_)
            for step in returns(trace, γ, tolerance)
        )
        p = markov_decision_process.policy_from_q(q, mdp, ϵ)
        yield q
Example No. 15
def evaluate_mrp_mc(
        traces: Iterable[Iterable[mp.TransitionStep[S]]],
        vf: Dict[S, float],
        γ: float,
        tolerance: float = 1e-6
) -> Iterator[Dict[S, float]]:

    episodes: Iterator[Iterator[mp.ReturnStep]] = \
        (returns(trace, γ, tolerance) for trace in traces)
    occurrence: Dict[S, int] = {}

    for episode in episodes:
        for return_step in episode:
            state = return_step.state
            if state in occurrence:
                occurrence[state] += 1
            else:
                occurrence[state] = 1
            weight_f: float = 1 / occurrence[state]
            # incremental mean of the returns observed for this state
            vf[state] = (1 - weight_f) * vf[state] + weight_f * return_step.return_
        yield vf
Example No. 16
def evaluate_mrp_funapprox_bootstrap(transitions: Iterable[
    mp.TransitionStep[S]], approx_0: FunctionApprox[S], γ: float,
                                     n: int) -> Iterator[FunctionApprox[S]]:
    '''
    n-Step Bootstrapping Prediction
    for the Function Approximation case
    '''
    tolerance: float = γ**n  # so that each truncated return contains n rewards
    bootstrap_return_steps: Sequence[mp.ReturnStep] = list(returns(
        transitions, γ, tolerance))
    bootstr_return_steps_indexed = enumerate(bootstrap_return_steps)

    def step(v, indexed_return_step):
        index, ret_step = indexed_return_step
        # the state reached n steps later supplies the bootstrap value
        # (0 once fewer than n steps remain in the trace)
        bootstrap_value = (γ**n * v(bootstrap_return_steps[index + n].state)
                           if index + n < len(bootstrap_return_steps) else 0.)
        return v.update([(ret_step.state, ret_step.return_ + bootstrap_value)])

    return iterate.accumulate(bootstr_return_steps_indexed,
                              step,
                              initial=approx_0)
Example No. 17
def get_return_steps_from_fixed_episodes(
        fixed_episodes: Sequence[Sequence[TransitionStep[S]]],
        gamma: float) -> Sequence[ReturnStep[S]]:
    return list(
        itertools.chain.from_iterable(
            returns(episode, gamma, 1e-8) for episode in fixed_episodes))
Example No. 18
def compare_mc_sarsa_ql(fmdp: FiniteMarkovDecisionProcess[S, A],
                        method_mask: Tuple[bool, bool, bool],
                        learning_rates: Sequence[Tuple[float, float,
                                                       float]], gamma: float,
                        epsilon_as_func_of_episodes: Callable[[int], float],
                        q_learning_epsilon: float,
                        mc_episode_length_tol: float, num_episodes: int,
                        plot_batch: int, plot_start: int) -> None:
    true_vf: V[S] = value_iteration_result(fmdp, gamma)[0]
    states: Sequence[NonTerminal[S]] = fmdp.non_terminal_states
    colors: Sequence[str] = ['b', 'g', 'r', 'k', 'c', 'm', 'y']

    import matplotlib.pyplot as plt
    plt.figure(figsize=(11, 7))

    if method_mask[0]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            mc_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                glie_mc_finite_control_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    episode_length_tolerance=mc_episode_length_tol
                )
            mc_errors = []
            batch_mc_errs = []
            for i, mc_qvf in enumerate(
                    itertools.islice(mc_funcs_it, num_episodes)):
                mc_vf: V[S] = {
                    s: max(mc_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_mc_errs.append(
                    sqrt(
                        sum((mc_vf[s] - true_vf[s])**2
                            for s in states) / len(states)))
                if i % plot_batch == plot_batch - 1:
                    mc_errors.append(sum(batch_mc_errs) / plot_batch)
                    batch_mc_errs = []
            mc_plot = mc_errors[plot_start:]
            label = f"MC InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(range(len(mc_plot)),
                     mc_plot,
                     color=colors[k],
                     linestyle='-',
                     label=label)

    sample_episodes: int = 1000
    uniform_policy: FinitePolicy[S, A] = \
        FinitePolicy(
            {s.state: Choose(fmdp.actions(s)) for s in states}
    )
    fmrp: FiniteMarkovRewardProcess[S] = \
        fmdp.apply_finite_policy(uniform_policy)
    td_episode_length: int = int(
        round(
            sum(
                len(
                    list(
                        returns(trace=fmrp.simulate_reward(Choose(states)),
                                γ=gamma,
                                tolerance=mc_episode_length_tol)))
                for _ in range(sample_episodes)) / sample_episodes))

    if method_mask[1]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            sarsa_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                glie_sarsa_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    max_episode_length=td_episode_length,
                )
            sarsa_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_sarsa_errs = []

            for i, sarsa_qvf in enumerate(
                    itertools.islice(sarsa_funcs_it,
                                     num_episodes * td_episode_length)):
                sarsa_vf: V[S] = {
                    s: max(sarsa_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_sarsa_errs.append(
                    sqrt(
                        sum((sarsa_vf[s] - true_vf[s])**2
                            for s in states) / len(states)))
                if i % transitions_batch == transitions_batch - 1:
                    sarsa_errors.append(
                        sum(batch_sarsa_errs) / transitions_batch)
                    batch_sarsa_errs = []
            sarsa_plot = sarsa_errors[plot_start:]
            label = f"SARSA InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(range(len(sarsa_plot)),
                     sarsa_plot,
                     color=colors[k],
                     linestyle='--',
                     label=label)

    if method_mask[2]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            ql_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                q_learning_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon=q_learning_epsilon,
                    max_episode_length=td_episode_length,
                )
            ql_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_ql_errs = []

            for i, ql_qvf in enumerate(
                    itertools.islice(ql_funcs_it,
                                     num_episodes * td_episode_length)):
                ql_vf: V[S] = {
                    s: max(ql_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_ql_errs.append(
                    sqrt(
                        sum((ql_vf[s] - true_vf[s])**2
                            for s in states) / len(states)))
                if i % transitions_batch == transitions_batch - 1:
                    ql_errors.append(sum(batch_ql_errs) / transitions_batch)
                    batch_ql_errs = []
            ql_plot = ql_errors[plot_start:]
            label = f"Q-Learning InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(range(len(ql_plot)),
                     ql_plot,
                     color=colors[k],
                     linestyle=':',
                     label=label)

    plt.xlabel("Episode Batches", fontsize=20)
    plt.ylabel("Optimal Value Function RMSE", fontsize=20)
    plt.title("RMSE as function of episode batches", fontsize=20)
    plt.grid(True)
    plt.legend(fontsize=10)
    plt.show()
Example No. 19
def compare_td_and_mc(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    mc_episode_length_tol: float,
    num_episodes: int,
    learning_rates: Sequence[Tuple[float, float, float]],
    initial_vf_dict: Mapping[NonTerminal[S], float],
    plot_batch: int,
    plot_start: int
) -> None:
    true_vf: np.ndarray = fmrp.get_value_function_vec(gamma)
    states: Sequence[NonTerminal[S]] = fmrp.non_terminal_states
    colors: Sequence[str] = ['r', 'y', 'm', 'g', 'c', 'k', 'b']

    import matplotlib.pyplot as plt
    plt.figure(figsize=(11, 7))

    for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
        mc_funcs_it: Iterator[ValueFunctionApprox[S]] = \
            mc_finite_prediction_learning_rate(
                fmrp=fmrp,
                gamma=gamma,
                episode_length_tolerance=mc_episode_length_tol,
                initial_learning_rate=init_lr,
                half_life=half_life,
                exponent=exponent,
                initial_vf_dict=initial_vf_dict
            )
        mc_errors = []
        batch_mc_errs = []
        for i, mc_f in enumerate(itertools.islice(mc_funcs_it, num_episodes)):
            batch_mc_errs.append(sqrt(sum(
                (mc_f(s) - true_vf[j]) ** 2 for j, s in enumerate(states)
            ) / len(states)))
            if i % plot_batch == plot_batch - 1:
                mc_errors.append(sum(batch_mc_errs) / plot_batch)
                batch_mc_errs = []
        mc_plot = mc_errors[plot_start:]
        label = f"MC InitRate={init_lr:.3f},HalfLife" + \
            f"={half_life:.0f},Exp={exponent:.1f}"
        plt.plot(
            range(len(mc_plot)),
            mc_plot,
            color=colors[k],
            linestyle='-',
            label=label
        )

    sample_episodes: int = 1000
    td_episode_length: int = int(round(sum(
        len(list(returns(
            trace=fmrp.simulate_reward(Choose(states)),
            γ=gamma,
            tolerance=mc_episode_length_tol
        ))) for _ in range(sample_episodes)
    ) / sample_episodes))

    for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
        td_funcs_it: Iterator[ValueFunctionApprox[S]] = \
            td_finite_prediction_learning_rate(
                fmrp=fmrp,
                gamma=gamma,
                episode_length=td_episode_length,
                initial_learning_rate=init_lr,
                half_life=half_life,
                exponent=exponent,
                initial_vf_dict=initial_vf_dict
            )
        td_errors = []
        transitions_batch = plot_batch * td_episode_length
        batch_td_errs = []

        for i, td_f in enumerate(
                itertools.islice(td_funcs_it, num_episodes * td_episode_length)
        ):
            batch_td_errs.append(sqrt(sum(
                (td_f(s) - true_vf[j]) ** 2 for j, s in enumerate(states)
            ) / len(states)))
            if i % transitions_batch == transitions_batch - 1:
                td_errors.append(sum(batch_td_errs) / transitions_batch)
                batch_td_errs = []
        td_plot = td_errors[plot_start:]
        label = f"TD InitRate={init_lr:.3f},HalfLife" + \
            f"={half_life:.0f},Exp={exponent:.1f}"
        plt.plot(
            range(len(td_plot)),
            td_plot,
            color=colors[k],
            linestyle='--',
            label=label
        )

    plt.xlabel("Episode Batches", fontsize=20)
    plt.ylabel("Value Function RMSE", fontsize=20)
    plt.title(
        "RMSE of MC and TD as function of episode batches",
        fontsize=25
    )
    plt.grid(True)
    plt.legend(fontsize=10)
    plt.show()