def evaluate_mrp_tabular_bootstrap(
    transitions: Iterable[mp.TransitionStep[S]],
    vf: Dict[S, float],
    γ: float,
    n: int
) -> Iterator[Dict[S, float]]:
    '''n-Step Bootstrapping Prediction for the Tabular case.'''
    initial_learning_rate: float = 0.03
    half_life: float = 1000.0
    exponent: float = 0.5
    occurrence: Dict[S, int] = {}
    tolerance: float = γ**n  # so each return includes (roughly) n rewards

    # Two views of the same return-step stream: `steps_ahead` runs n steps
    # ahead of `steps`, supplying the state whose value we bootstrap from.
    # (Slicing the iterator being looped over, as before, would consume it.)
    steps, steps_ahead = itertools.tee(returns(transitions, γ, tolerance), 2)
    steps_ahead = itertools.islice(steps_ahead, n, None)

    # Note: the final n return steps of a finite trace are not updated,
    # since no state n transitions ahead exists for them.
    for step, step_n in zip(steps, steps_ahead):
        state = step.state
        occurrence[state] = occurrence.get(state, 0) + 1
        lr = initial_learning_rate / (
            1 + ((occurrence[state] - 1) / half_life)**exponent)
        vf[state] += lr * (
            step.return_ + γ**n * vf.get(step_n.state, 0.) - vf[state])
        yield vf
def reinforce_gaussian(
    mdp: MarkovDecisionProcess[S, float],
    policy_mean_approx0: FunctionApprox[NonTerminal[S]],
    start_states_distribution: NTStateDistribution[S],
    policy_stdev: float,
    gamma: float,
    episode_length_tolerance: float
) -> Iterator[FunctionApprox[NonTerminal[S]]]:
    policy_mean_approx: FunctionApprox[NonTerminal[S]] = policy_mean_approx0
    yield policy_mean_approx

    while True:
        policy: Policy[S, float] = GaussianPolicyFromApprox(
            function_approx=policy_mean_approx,
            stdev=policy_stdev
        )
        trace: Iterable[TransitionStep[S, float]] = mdp.simulate_actions(
            start_states=start_states_distribution,
            policy=policy
        )
        gamma_prod: float = 1.0

        for step in returns(trace, gamma, episode_length_tolerance):

            def obj_deriv_out(
                states: Sequence[NonTerminal[S]],
                actions: Sequence[float]
            ) -> np.ndarray:
                return (policy_mean_approx.evaluate(states) -
                        np.array(actions)) / (policy_stdev * policy_stdev)

            grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                policy_mean_approx.objective_gradient(
                    xy_vals_seq=[(step.state, step.action)],
                    obj_deriv_out_fun=obj_deriv_out
                )
            scaled_grad: Gradient[FunctionApprox[NonTerminal[S]]] = \
                grad * gamma_prod * step.return_
            policy_mean_approx = \
                policy_mean_approx.update_with_gradient(scaled_grad)
            gamma_prod *= gamma

        yield policy_mean_approx
def mc_prediction(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    approx_0: FunctionApprox[S],
    γ: float,
    tolerance: float = 1e-6
) -> Iterator[FunctionApprox[S]]:
    '''Evaluate an MRP using the Monte Carlo method, simulating episodes
    of the given number of steps.

    Each value this function yields represents the approximated value
    function for the MRP after one additional episode.

    Arguments:
      traces -- an iterator of simulation traces from an MRP
      approx_0 -- initial approximation of value function
      γ -- discount rate (0 < γ ≤ 1)
      tolerance -- a small value—we stop iterating once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated value function
    after each episode.
    '''
    episodes = (returns(trace, γ, tolerance) for trace in traces)
    return approx_0.iterate_updates(
        ((step.state, step.return_) for step in episode)
        for episode in episodes
    )
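# A minimal usage sketch for the function-approximation mc_prediction above.
# Everything in this helper is an assumption, not part of the code being
# documented: it presumes the RL-book `rl` package is importable, including
# its chapter-2 SimpleInventoryMRPFinite example and a Tabular approximation
# with default arguments; adjust names and parameters to your environment.
def _example_mc_prediction() -> None:
    import itertools
    from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
    from rl.distribution import Choose
    from rl.function_approx import Tabular

    si_mrp = SimpleInventoryMRPFinite(
        capacity=2, poisson_lambda=1.0,
        holding_cost=1.0, stockout_cost=10.0
    )
    traces = si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
    vf_iter = mc_prediction(traces, Tabular(), γ=0.9)
    # Value-function approximation after 1,000 episodes.
    *_, vf = itertools.islice(vf_iter, 1000)
    print({s: vf(s) for s in si_mrp.non_terminal_states})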
def evaluate_mrp(
    mrp: MarkovRewardProcess[S],
    states: Distribution[S],
    approx_0: FunctionApprox[S],
    γ: float,
    tolerance: float = 1e-6
) -> Iterator[FunctionApprox[S]]:
    '''Evaluate an MRP using the Monte Carlo method, simulating episodes
    of the given number of steps.

    Each value this function yields represents the approximated value
    function for the MRP after one additional episode.

    Arguments:
      mrp -- the Markov Reward Process to evaluate
      states -- distribution of states to start episodes from
      approx_0 -- initial approximation of value function
      γ -- discount rate (0 < γ ≤ 1)
      tolerance -- a small value—we stop iterating once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated value function
    after each episode.
    '''
    v = approx_0
    for trace in mrp.reward_traces(states):
        steps = returns(trace, γ, tolerance)
        v = v.update((step.state, step.return_) for step in steps)
        yield v
def mc_prediction(
    transitions: Iterable[mp.TransitionStep[S]],
    count_to_weight_func: Callable[[int], float],
    gamma: float,
    tolerance: float = 1e-200
) -> Tabular[S]:
    '''Tabular MC Prediction over a discrete state space, without any
    interpolation.

    The value estimate of each state is maintained as a recency-weighted
    mean of the returns observed for it, where the weight of each new
    observation is given by `count_to_weight_func` -- i.e. a learning
    rate α(n) as a function of the number of updates n made to that state.

    Returns a Tabular value-function approximation built from a single
    (truncated) trace of transitions.
    '''
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    diff: Dict[S, float] = {}  # per-state change in the latest update

    # Truncate the trace so that the discarded tail contributes less than
    # `tolerance` to any return.
    max_steps = round(math.log(tolerance) / math.log(gamma))
    print('max steps: ', max_steps)
    trace = list(itertools.islice(transitions, max_steps))

    # Compute the return associated with every step of the truncated trace.
    trace_returns = list(returns(trace, gamma, tolerance))

    for i in range(len(trace)):
        x = trace[i].state             # state
        y = trace_returns[i].return_   # observed return from that state
        diff[x] = min(abs(y - values_map.get(x, 0.)), diff.get(x, 100.))
        # Stop once even the largest per-state change is tiny.
        if max(diff.values()) < 1e-4:
            break
        counts_map[x] = counts_map.get(x, 0) + 1
        weight: float = count_to_weight_func(counts_map[x])
        values_map[x] = weight * y + (1 - weight) * values_map.get(x, 0.)

    print(max(diff.values()))
    return Tabular(values_map, counts_map, count_to_weight_func)
def mc_control_scratch(
    mdp_to_sample: FiniteMarkovDecisionProcess,
    states: List[S],
    actions: Mapping[S, List[A]],
    γ: float,
    tolerance: float = 1e-6,
    num_episodes: int = 10000
) -> Mapping[Tuple[S, A], float]:
    q: Dict[Tuple[S, A], float] = {}
    counts_per_state_act: Dict[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0

    # Start from a uniformly random policy.
    policy_map: Dict[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    start_state_distrib = Categorical({state: 1 for state in states})

    for i in range(num_episodes):
        trace: Iterable[TransitionStep[S, A]] = \
            mdp_to_sample.simulate_actions(start_state_distrib, Pi)
        episode = returns(trace, γ, tolerance)

        # Every-visit MC update of Q with a 1/n learning rate.
        for step in episode:
            state = step.state
            action = step.action
            return_ = step.return_
            counts_per_state_act[(state, action)] += 1
            q[(state, action)] += (return_ - q[(state, action)]) / \
                counts_per_state_act[(state, action)]

        # GLIE policy improvement: ε-greedy with ε = 1 / (episode count).
        eps = 1 / (i + 1)
        new_pol: Dict[S, Optional[Categorical[A]]] = {}
        for state in states:
            if actions[state] is None:
                new_pol[state] = None
            else:
                policy_map = {
                    action: eps / len(actions[state])
                    for action in actions[state]
                }
                best_action = actions[state][0]
                for action in actions[state]:
                    if q[(state, best_action)] <= q[(state, action)]:
                        best_action = action
                policy_map[best_action] += 1 - eps
                new_pol[state] = Categorical(policy_map)
        Pi = FinitePolicy(new_pol)
    return q
def td_lambda_tabular_prediction(
    transitions: Iterable[mp.TransitionStep[S]],
    count_to_weight_func: Callable[[int], float],
    gamma: float,
    lambd: float,
    max_steps: int = 2000,
    tolerance: float = 1e-200
) -> Tuple[Tabular[S], int]:
    """Tabular λ-return prediction: like the TD-from-scratch version,
    except that the update target is the λ-return built from the n-step
    returns G_{t,n}.
    """
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    diff: Dict[S, float] = {}  # per-state change in the latest update

    # Truncate the trace to at most max_steps transitions.
    trace = list(itertools.islice(transitions, max_steps))
    T = len(trace)

    # Full (Monte Carlo) return for every step of the truncated trace.
    trace_returns = list(returns(trace, gamma, tolerance))

    for i in range(T):
        transition = trace[i]
        state = transition.state
        if state not in diff:
            diff[state] = 100
        counts_map[state] = counts_map.get(state, 0) + 1
        weight: float = count_to_weight_func(counts_map[state])
        if transition.next_state not in values_map:
            values_map[transition.next_state] = -30  # problem-specific initial guess

        # λ-return target:
        #   G_t^λ = (1 - λ) Σ_{n=1}^{T-t-1} λ^(n-1) G_{t,n} + λ^(T-t-1) G_t
        y = lambd**(T - i - 1) * trace_returns[i].return_
        if lambd == 0:
            y = 0
        for n in range(1, T - i):
            g_tn = 0.
            for j in range(i, i + n):
                next_transition = trace[j]
                g_tn += gamma**(j - i) * next_transition.reward
                if j == i + n - 1:
                    g_tn += gamma**n * values_map.get(
                        next_transition.next_state, 0)
            y += (1 - lambd) * lambd**(n - 1) * g_tn

        diff[state] = min(abs(y - values_map.get(state, 0.)), diff[state])
        values_map[state] = weight * y + (1 - weight) * values_map.get(
            state, 0.)
        # Stop early once even the largest per-state change is small.
        if max(diff.values()) < 0.1:
            break

    print(max(diff.values()))
    return Tabular(values_map, counts_map, count_to_weight_func), i
def reinforce(
    num_episodes: int,
    features_funcs: Sequence[Callable[[Tuple[S, A]], float]],
    actions: Callable[[S], Iterable[A]],
    init_wealth_distr: Gaussian,
    get_episode: Callable[[Distribution[S], Policy[S, A]],
                          Iterable[TransitionStep[S, A]]],
    γ: float,
    alpha: float,
) -> np.ndarray:

    def get_phi_sa(s: S, a: A) -> np.ndarray:
        return np.array([f((s, a)) for f in features_funcs])

    class SoftMaxPolicy(Policy[S, A]):
        def __init__(self, theta: np.ndarray):
            self.theta = theta

        def act(self, s: S) -> Optional[Distribution[A]]:
            probs_dict = {}
            for a in actions(s):
                numerator = np.exp(np.dot(get_phi_sa(s, a), self.theta))
                # Drop actions with negligible probability mass for
                # numerical stability.
                if numerator < 0.0001:
                    continue
                denominator = np.sum([
                    np.exp(np.dot(get_phi_sa(s, b), self.theta))
                    for b in actions(s)
                    if np.exp(np.dot(get_phi_sa(s, b), self.theta)) >= 0.0001
                ])
                probs_dict[a] = numerator / denominator
            return Categorical(probs_dict)

    num_features = len(features_funcs)
    theta = np.zeros(num_features)
    for k in range(num_episodes):
        ep = get_episode(init_wealth_distr, SoftMaxPolicy(theta))
        episode = list(returns(ep, γ, γ**30))
        for t in range(len(episode)):
            s = episode[t].state
            a = episode[t].action
            phi_sa = get_phi_sa(s, a)
            # Score function of the softmax policy:
            #   ∇_θ log π(a|s) = φ(s,a) − Σ_b π(b|s) φ(s,b)
            normalization = sum([
                np.exp(np.dot(get_phi_sa(s, b), theta))
                for b in actions(s)
                if np.exp(np.dot(get_phi_sa(s, b), theta)) >= 0.0001
            ])
            sum_pi = sum([
                np.exp(np.dot(get_phi_sa(s, b), theta)) * get_phi_sa(s, b)
                for b in actions(s)
                if np.exp(np.dot(get_phi_sa(s, b), theta)) >= 0.0001
            ])
            derivative = phi_sa - sum_pi / normalization
            theta += alpha * γ**t * derivative * episode[t].return_
    return theta
def batch_mc_prediction(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    approx: ValueFunctionApprox[S],
    γ: float,
    episode_length_tolerance: float = 1e-6,
    convergence_tolerance: float = 1e-5
) -> ValueFunctionApprox[S]:
    '''traces is a finite iterable'''
    return_steps: Iterable[mp.ReturnStep[S]] = \
        itertools.chain.from_iterable(
            returns(trace, γ, episode_length_tolerance) for trace in traces
        )
    return approx.solve(
        [(step.state, step.return_) for step in return_steps],
        convergence_tolerance
    )
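# A minimal usage sketch for batch_mc_prediction above: gather a finite batch
# of sampled episodes, then solve once over all return observations. The
# imports, SimpleInventoryMRPFinite constructor arguments, and use of a
# default Tabular approximation are assumptions about the accompanying
# RL-book `rl` package, not part of the code being documented.
def _example_batch_mc_prediction() -> None:
    import itertools
    from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
    from rl.distribution import Choose
    from rl.function_approx import Tabular

    si_mrp = SimpleInventoryMRPFinite(
        capacity=2, poisson_lambda=1.0,
        holding_cost=1.0, stockout_cost=10.0
    )
    start = Choose(si_mrp.non_terminal_states)
    # batch_mc_prediction needs a *finite* collection of traces.
    episodes = list(itertools.islice(si_mrp.reward_traces(start), 200))
    vf = batch_mc_prediction(episodes, Tabular(), γ=0.9)
    print({s: vf(s) for s in si_mrp.non_terminal_states})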
def get_episode(mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A],
                start_state_dist: dist.Categorical[S],
                policy: AssetAllocPolicy[S, A],
                gamma: float,
                tolerance: float) -> Sequence[mdp.TransitionStep[S, A]]:
    """Generate an episode from the asset allocation MDP.

    :param mdp_obj: MDP representation of the asset allocation problem
    :param start_state_dist: Starting state distribution
    :param policy: The policy with which to simulate the episode
    :param gamma: Discount factor
    :param tolerance: Accumulated discount factor below which simulation
        terminates
    :returns: Sequence of transition steps from the episode
    """
    episode_iterator = mdp_obj.simulate_actions(start_state_dist, policy)
    return list(returns(episode_iterator, gamma, tolerance))
def mc_control_fapprox(
    mdp_to_sample: MarkovDecisionProcess[S, A],
    states: Distribution[S],
    approx_0: FunctionApprox[Tuple[S, A]],
    γ: float,
    ϵ: float,
    tolerance: float = 1e-6
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    q = approx_0
    Pi = policy_from_q(q, mdp_to_sample)
    while True:
        trace: Iterable[TransitionStep[S, A]] = \
            mdp_to_sample.simulate_actions(states, Pi)
        q = q.update(
            ((step.state, step.action), step.return_)
            for step in returns(trace, γ, tolerance)
        )
        Pi = policy_from_q(q, mdp_to_sample, ϵ)
        yield q
def mc_prediction_scratch(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    states: List[S],
    γ: float,
    tolerance: float = 1e-6,
    num_episodes: int = 10000
) -> Mapping[S, float]:
    '''Evaluate an MRP using the Monte Carlo method from scratch (tabular,
    every-visit), processing up to num_episodes simulated episodes.

    Arguments:
      traces -- an iterator of simulation traces from an MRP
      states -- list of all possible states
      γ -- discount rate (0 < γ ≤ 1)
      tolerance -- a small value—we stop accumulating a return once
      γᵏ ≤ tolerance
      num_episodes -- maximum number of episodes to process

    Returns a mapping from each state to its estimated value.
    '''
    v: Dict[S, float] = {}
    counts_per_state: Dict[S, int] = {}
    for state in states:
        v[state] = 0.
        counts_per_state[state] = 0

    episodes = (returns(trace, γ, tolerance) for trace in traces)
    count_episodes = 0
    for episode in episodes:
        count_episodes += 1
        if count_episodes > num_episodes:
            break
        if count_episodes % 1000 == 0:
            print(f"{count_episodes} episodes processed")
        for step in episode:
            # Incremental (every-visit) update of the running mean return.
            count: int = counts_per_state[step.state]
            v[step.state] = v[step.state] * (count / (count + 1)) \
                + step.return_ / (count + 1)
            counts_per_state[step.state] = count + 1
    return v
def glie_mc_control(
    mdp: MarkovDecisionProcess[S, A],
    states: NTStateDistribution[S],
    approx_0: QValueFunctionApprox[S, A],
    γ: float,
    ϵ_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-6
) -> Iterator[QValueFunctionApprox[S, A]]:
    '''Estimate the optimal action-value function of an MDP using GLIE
    Monte Carlo Control, simulating episodes with an ε-greedy policy.

    Each value this function yields represents the approximated Q-value
    function after one additional episode.

    Arguments:
      mdp -- the Markov Decision Process to solve
      states -- distribution of states to start episodes from
      approx_0 -- initial approximation of the Q-value function
      γ -- discount rate (0 ≤ γ ≤ 1)
      ϵ_as_func_of_episodes -- a function from the number of episodes
      to epsilon. epsilon is the fraction of the actions where we explore
      rather than following the optimal policy
      episode_length_tolerance -- stop iterating once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated Q function
    after each episode.
    '''
    q: QValueFunctionApprox[S, A] = approx_0
    p: Policy[S, A] = epsilon_greedy_policy(q, mdp, 1.0)
    yield q

    num_episodes: int = 0
    while True:
        trace: Iterable[TransitionStep[S, A]] = \
            mdp.simulate_actions(states, p)
        num_episodes += 1
        for step in returns(trace, γ, episode_length_tolerance):
            q = q.update([((step.state, step.action), step.return_)])
        p = epsilon_greedy_policy(q, mdp, ϵ_as_func_of_episodes(num_episodes))
        yield q
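# A minimal usage sketch for glie_mc_control above. The imports and the
# SimpleInventoryMDPCap constructor arguments are assumptions about the
# RL-book `rl` package (chapter-3 example); the episode count and the
# ε-schedule 1/√k are illustrative choices only.
def _example_glie_mc_control() -> None:
    import itertools
    from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap
    from rl.distribution import Choose
    from rl.function_approx import Tabular

    si_mdp = SimpleInventoryMDPCap(
        capacity=2, poisson_lambda=1.0,
        holding_cost=1.0, stockout_cost=10.0
    )
    qvfs = glie_mc_control(
        mdp=si_mdp,
        states=Choose(si_mdp.non_terminal_states),
        approx_0=Tabular(),
        γ=0.9,
        ϵ_as_func_of_episodes=lambda k: k ** -0.5
    )
    # Q-value estimate after 1,000 episodes, and the greedy policy it implies.
    *_, q = itertools.islice(qvfs, 1000)
    greedy = {
        s.state: max(si_mdp.actions(s), key=lambda a: q((s, a)))
        for s in si_mdp.non_terminal_states
    }
    print(greedy)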
def evaluate_mdp(
    mdp: MarkovDecisionProcess[S, A],
    states: Distribution[S],
    approx_0: FunctionApprox[Tuple[S, A]],
    γ: float,
    ϵ: float,
    tolerance: float = 1e-6
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    '''Evaluate an MDP's Q-value function using the Monte Carlo method,
    simulating episodes with an ε-greedy policy derived from the current
    estimate.

    Each value this function yields represents the approximated Q-value
    function for the MDP after one additional episode.

    Arguments:
      mdp -- the Markov Decision Process to evaluate
      states -- distribution of states to start episodes from
      approx_0 -- initial approximation of the Q-value function
      γ -- discount rate (0 < γ ≤ 1)
      ϵ -- the fraction of the actions where we explore rather
      than following the optimal policy
      tolerance -- a small value—we stop iterating once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated Q function
    after each episode.
    '''
    q = approx_0
    p = markov_decision_process.policy_from_q(q, mdp)
    while True:
        trace: Iterable[markov_decision_process.TransitionStep[S, A]] = \
            mdp.simulate_actions(states, p)
        q = q.update(
            ((step.state, step.action), step.return_)
            for step in returns(trace, γ, tolerance)
        )
        p = markov_decision_process.policy_from_q(q, mdp, ϵ)
        yield q
def evaluate_mrp_mc(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    vf: Dict[S, float],
    γ: float,
    tolerance: float = 1e-6
) -> Iterator[Dict[S, float]]:
    episodes: Iterator[Iterator[mp.ReturnStep]] = \
        (returns(trace, γ, tolerance) for trace in traces)
    occurrence: Dict[S, int] = {}
    for episode in episodes:
        for return_step in episode:
            state = return_step.state
            occurrence[state] = occurrence.get(state, 0) + 1
            # Running mean of observed returns: weight 1/n on the n-th visit.
            weight_f: float = 1 / occurrence[state]
            vf[state] = (1 - weight_f) * vf[state] \
                + weight_f * return_step.return_
            yield vf
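# A minimal usage sketch for the tabular evaluate_mrp_mc above. The imports
# and the SimpleInventoryMRPFinite constructor arguments are assumptions
# about the RL-book `rl` package (chapter-2 example); the number of yielded
# snapshots taken is an illustrative choice.
def _example_evaluate_mrp_mc() -> None:
    import itertools
    from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
    from rl.distribution import Choose

    si_mrp = SimpleInventoryMRPFinite(
        capacity=2, poisson_lambda=1.0,
        holding_cost=1.0, stockout_cost=10.0
    )
    traces = si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
    vf0 = {s: 0.0 for s in si_mrp.non_terminal_states}
    # Value-function snapshot after 10,000 yielded updates.
    *_, vf_est = itertools.islice(evaluate_mrp_mc(traces, vf0, γ=0.9), 10000)
    print(vf_est)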
def evaluate_mrp_funapprox_bootstrap(
    transitions: Iterable[mp.TransitionStep[S]],
    approx_0: FunctionApprox[S],
    γ: float,
    n: int
) -> Iterator[FunctionApprox[S]]:
    '''n-Step Bootstrapping Prediction for the Function Approximation case.'''
    tolerance: float = γ**n  # so each return includes (roughly) n rewards

    # Two views of the same return-step stream: `steps_ahead` runs n steps
    # ahead of `steps`, supplying the state whose value we bootstrap from.
    # (Slicing the iterator already being consumed would skip elements.)
    steps, steps_ahead = itertools.tee(returns(transitions, γ, tolerance), 2)
    steps_ahead = itertools.islice(steps_ahead, n, None)

    def step(v, pair):
        ret_step, ret_step_n = pair
        # The state reached n transitions later is the *state* of the return
        # step n positions ahead (not its next_state).
        return v.update([(ret_step.state,
                          ret_step.return_ + γ**n * v(ret_step_n.state))])

    return iterate.accumulate(zip(steps, steps_ahead), step, initial=approx_0)
def get_return_steps_from_fixed_episodes(
    fixed_episodes: Sequence[Sequence[TransitionStep[S]]],
    gamma: float
) -> Sequence[ReturnStep[S]]:
    return list(itertools.chain.from_iterable(
        returns(episode, gamma, 1e-8) for episode in fixed_episodes
    ))
def compare_mc_sarsa_ql(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    method_mask: Tuple[bool, bool, bool],
    learning_rates: Sequence[Tuple[float, float, float]],
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    q_learning_epsilon: float,
    mc_episode_length_tol: float,
    num_episodes: int,
    plot_batch: int,
    plot_start: int
) -> None:
    true_vf: V[S] = value_iteration_result(fmdp, gamma)[0]
    states: Sequence[NonTerminal[S]] = fmdp.non_terminal_states
    colors: Sequence[str] = ['b', 'g', 'r', 'k', 'c', 'm', 'y']

    import matplotlib.pyplot as plt
    plt.figure(figsize=(11, 7))

    if method_mask[0]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            mc_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                glie_mc_finite_control_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    episode_length_tolerance=mc_episode_length_tol
                )
            mc_errors = []
            batch_mc_errs = []
            for i, mc_qvf in enumerate(
                    itertools.islice(mc_funcs_it, num_episodes)):
                mc_vf: V[S] = {
                    s: max(mc_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_mc_errs.append(sqrt(
                    sum((mc_vf[s] - true_vf[s])**2 for s in states) /
                    len(states)))
                if i % plot_batch == plot_batch - 1:
                    mc_errors.append(sum(batch_mc_errs) / plot_batch)
                    batch_mc_errs = []
            mc_plot = mc_errors[plot_start:]
            label = f"MC InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(
                range(len(mc_plot)),
                mc_plot,
                color=colors[k],
                linestyle='-',
                label=label
            )

    sample_episodes: int = 1000
    uniform_policy: FinitePolicy[S, A] = \
        FinitePolicy(
            {s.state: Choose(fmdp.actions(s)) for s in states}
        )
    fmrp: FiniteMarkovRewardProcess[S] = \
        fmdp.apply_finite_policy(uniform_policy)
    td_episode_length: int = int(round(
        sum(
            len(list(returns(
                trace=fmrp.simulate_reward(Choose(states)),
                γ=gamma,
                tolerance=mc_episode_length_tol
            ))) for _ in range(sample_episodes)
        ) / sample_episodes
    ))

    if method_mask[1]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            sarsa_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                glie_sarsa_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    max_episode_length=td_episode_length,
                )
            sarsa_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_sarsa_errs = []
            for i, sarsa_qvf in enumerate(
                    itertools.islice(
                        sarsa_funcs_it,
                        num_episodes * td_episode_length)):
                sarsa_vf: V[S] = {
                    s: max(sarsa_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_sarsa_errs.append(sqrt(
                    sum((sarsa_vf[s] - true_vf[s])**2 for s in states) /
                    len(states)))
                if i % transitions_batch == transitions_batch - 1:
                    sarsa_errors.append(
                        sum(batch_sarsa_errs) / transitions_batch)
                    batch_sarsa_errs = []
            sarsa_plot = sarsa_errors[plot_start:]
            label = f"SARSA InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(
                range(len(sarsa_plot)),
                sarsa_plot,
                color=colors[k],
                linestyle='--',
                label=label
            )

    if method_mask[2]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            ql_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                q_learning_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon=q_learning_epsilon,
                    max_episode_length=td_episode_length,
                )
            ql_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_ql_errs = []
            for i, ql_qvf in enumerate(
                    itertools.islice(
                        ql_funcs_it,
                        num_episodes * td_episode_length)):
                ql_vf: V[S] = {
                    s: max(ql_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_ql_errs.append(sqrt(
                    sum((ql_vf[s] - true_vf[s])**2 for s in states) /
                    len(states)))
                if i % transitions_batch == transitions_batch - 1:
                    ql_errors.append(sum(batch_ql_errs) / transitions_batch)
                    batch_ql_errs = []
            ql_plot = ql_errors[plot_start:]
            label = f"Q-Learning InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(
                range(len(ql_plot)),
                ql_plot,
                color=colors[k],
                linestyle=':',
                label=label
            )

    plt.xlabel("Episode Batches", fontsize=20)
    plt.ylabel("Optimal Value Function RMSE", fontsize=20)
    plt.title("RMSE as function of episode batches", fontsize=20)
    plt.grid(True)
    plt.legend(fontsize=10)
    plt.show()
def compare_td_and_mc(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    mc_episode_length_tol: float,
    num_episodes: int,
    learning_rates: Sequence[Tuple[float, float, float]],
    initial_vf_dict: Mapping[NonTerminal[S], float],
    plot_batch: int,
    plot_start: int
) -> None:
    true_vf: np.ndarray = fmrp.get_value_function_vec(gamma)
    states: Sequence[NonTerminal[S]] = fmrp.non_terminal_states
    colors: Sequence[str] = ['r', 'y', 'm', 'g', 'c', 'k', 'b']

    import matplotlib.pyplot as plt
    plt.figure(figsize=(11, 7))

    for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
        mc_funcs_it: Iterator[ValueFunctionApprox[S]] = \
            mc_finite_prediction_learning_rate(
                fmrp=fmrp,
                gamma=gamma,
                episode_length_tolerance=mc_episode_length_tol,
                initial_learning_rate=init_lr,
                half_life=half_life,
                exponent=exponent,
                initial_vf_dict=initial_vf_dict
            )
        mc_errors = []
        batch_mc_errs = []
        for i, mc_f in enumerate(itertools.islice(mc_funcs_it, num_episodes)):
            batch_mc_errs.append(sqrt(sum(
                (mc_f(s) - true_vf[j]) ** 2 for j, s in enumerate(states)
            ) / len(states)))
            if i % plot_batch == plot_batch - 1:
                mc_errors.append(sum(batch_mc_errs) / plot_batch)
                batch_mc_errs = []
        mc_plot = mc_errors[plot_start:]
        label = f"MC InitRate={init_lr:.3f},HalfLife" + \
            f"={half_life:.0f},Exp={exponent:.1f}"
        plt.plot(
            range(len(mc_plot)),
            mc_plot,
            color=colors[k],
            linestyle='-',
            label=label
        )

    sample_episodes: int = 1000
    td_episode_length: int = int(round(sum(
        len(list(returns(
            trace=fmrp.simulate_reward(Choose(states)),
            γ=gamma,
            tolerance=mc_episode_length_tol
        ))) for _ in range(sample_episodes)
    ) / sample_episodes))

    for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
        td_funcs_it: Iterator[ValueFunctionApprox[S]] = \
            td_finite_prediction_learning_rate(
                fmrp=fmrp,
                gamma=gamma,
                episode_length=td_episode_length,
                initial_learning_rate=init_lr,
                half_life=half_life,
                exponent=exponent,
                initial_vf_dict=initial_vf_dict
            )
        td_errors = []
        transitions_batch = plot_batch * td_episode_length
        batch_td_errs = []
        for i, td_f in enumerate(
            itertools.islice(td_funcs_it, num_episodes * td_episode_length)
        ):
            batch_td_errs.append(sqrt(sum(
                (td_f(s) - true_vf[j]) ** 2 for j, s in enumerate(states)
            ) / len(states)))
            if i % transitions_batch == transitions_batch - 1:
                td_errors.append(sum(batch_td_errs) / transitions_batch)
                batch_td_errs = []
        td_plot = td_errors[plot_start:]
        label = f"TD InitRate={init_lr:.3f},HalfLife" + \
            f"={half_life:.0f},Exp={exponent:.1f}"
        plt.plot(
            range(len(td_plot)),
            td_plot,
            color=colors[k],
            linestyle='--',
            label=label
        )

    plt.xlabel("Episode Batches", fontsize=20)
    plt.ylabel("Value Function RMSE", fontsize=20)
    plt.title(
        "RMSE of MC and TD as function of episode batches",
        fontsize=25
    )
    plt.grid(True)
    plt.legend(fontsize=10)
    plt.show()
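# A minimal sketch of driving compare_td_and_mc above. It assumes the
# RL-book `rl` package is importable (chapter-2 SimpleInventoryMRPFinite,
# plus the prediction helpers compare_td_and_mc itself calls) and that
# matplotlib can display a window; the learning-rate schedules and episode
# counts are illustrative choices, not recommendations.
def _example_compare_td_and_mc() -> None:
    from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite

    si_mrp = SimpleInventoryMRPFinite(
        capacity=2, poisson_lambda=1.0,
        holding_cost=1.0, stockout_cost=10.0
    )
    compare_td_and_mc(
        fmrp=si_mrp,
        gamma=0.9,
        mc_episode_length_tol=1e-6,
        num_episodes=500,
        learning_rates=[(0.01, 100000.0, 0.5), (0.05, 1000.0, 0.5)],
        initial_vf_dict={s: 0.0 for s in si_mrp.non_terminal_states},
        plot_batch=10,
        plot_start=0
    )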