def glie_mc_finite_control_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A], float] = {
        (s, a): 0. for s in fmdp.non_terminal_states
        for a in fmdp.actions(s)
    }
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(
            values_map=initial_qvf_dict,
            count_to_weight_func=learning_rate_func),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance)
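# A minimal usage sketch for the control function above: drive the iterator
# for a fixed number of episodes and keep the final Q-value approximation.
# `si_mdp` is assumed to be a FiniteMarkovDecisionProcess like the
# simple-inventory MDP used in the scripts further below; the 1/k epsilon
# schedule is the standard GLIE choice.
qvfs = glie_mc_finite_control_learning_rate(
    fmdp=si_mdp,
    initial_learning_rate=0.03,
    half_life=1000.0,
    exponent=0.5,
    gamma=0.9,
    epsilon_as_func_of_episodes=lambda k: 1. / k
)
final_qvf = iterate.last(itertools.islice(qvfs, 10000))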
def td_prediction(transitions: Iterable[mp.TransitionStep[S]],
                  count_to_weight_func: Callable[[int], float],
                  gamma: float,
                  max_steps: int = 5000) -> Tabular[S]:
    """
    Similar to the Monte Carlo scratch implementation, except that the
    MC return y is replaced by the TD target R_{t+1} + gamma * V(S_{t+1})
    for each update.
    """
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    count = 0
    diff: Dict[S, float] = {}  # per-state running minimum of the update error
    for transition in transitions:
        state = transition.state
        if state not in diff:
            diff[state] = 100
        counts_map[state] = counts_map.get(state, 0) + 1
        weight: float = count_to_weight_func(counts_map[state])
        if transition.next_state not in values_map:
            # heuristic initial value for previously unseen next-states
            values_map[transition.next_state] = -30
        # TD target: bootstrap from the current estimate of the next state
        y = transition.reward + gamma * values_map[transition.next_state]
        diff[state] = min(abs(y - values_map.get(state, 0.)), diff[state])
        values_map[state] = weight * y + (1 - weight) * values_map.get(
            state, 0.)
        count += 1
        # stop after max_steps, or earlier once the largest per-state
        # error falls below the threshold
        if count >= max_steps or max(diff.values()) < 1e-4:
            print(max(diff.values()))
            break
    return Tabular(values_map, counts_map, count_to_weight_func)
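# A usage sketch for the scratch TD function above, assuming a
# FiniteMarkovRewardProcess `si_mrp` (as in the inventory scripts below):
# a single long transition stream with a constant learning rate of 0.1.
td_vf_scratch = td_prediction(
    transitions=si_mrp.simulate_reward(Choose(si_mrp.non_terminal_states)),
    count_to_weight_func=lambda m: 0.1,  # constant learning rate
    gamma=0.9,
    max_steps=5000
)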
def test_evaluate_finite_mdp(self) -> None:
    q_0: Tabular[Tuple[bool, bool]] = Tabular(
        {(s, a): 0.0
         for s in self.finite_mdp.states()
         for a in self.finite_mdp.actions(s)},
        count_to_weight_func=lambda _: 0.1,
    )
    uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
        s: Choose(self.finite_mdp.actions(s))
        for s in self.finite_mdp.states()
    })
    transitions: Iterable[mdp.TransitionStep[bool, bool]] = \
        self.finite_mdp.simulate_actions(
            Choose(self.finite_mdp.states()), uniform_policy)
    qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)
    q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
        cast(Iterator[Tabular[Tuple[bool, bool]]],
             itertools.islice(qs, 20000)))
    if q is not None:
        self.assertEqual(len(q.values_map), 4)
        for s in [True, False]:
            self.assertLess(abs(q((s, False)) - 170.0), 2)
            self.assertGreater(q((s, False)), q((s, True)))
    else:
        assert False
def td_lambda_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    lambd: float,
    episode_length: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    curtailed_episodes: Iterable[Iterable[TransitionStep[S]]] = \
        (itertools.islice(episode, episode_length) for episode in episodes)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td_lambda.td_lambda_prediction(
        traces=curtailed_episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        lambd=lambd
    )
def test_evaluate_finite_mrp(self) -> None:
    start = Tabular(
        {s: 0.0 for s in self.finite_flip_flop.states()},
        count_to_weight_func=lambda _: 0.1,
    )
    episode_length = 20
    episodes: Iterable[Iterable[mp.TransitionStep[bool]]] = \
        self.finite_flip_flop.reward_traces(Choose({True, False}))
    transitions: Iterable[mp.TransitionStep[bool]] = \
        itertools.chain.from_iterable(
            itertools.islice(episode, episode_length)
            for episode in episodes)
    vs = td.td_prediction(transitions, γ=0.99, approx_0=start)
    v: Optional[Tabular[bool]] = iterate.last(
        itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))
    if v is not None:
        self.assertEqual(len(v.values_map), 2)
        for s in v.values_map:
            # Intentionally loose bound; a tighter one makes this test
            # take >1s on my machine.
            self.assertLess(abs(v(s) - 170), 3.0)
    else:
        assert False
def td_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[S, float],
) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    td_experiences: Iterable[TransitionStep[S]] = \
        unit_experiences_from_episodes(episodes, episode_length)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent,
    )
    return td.td_prediction(
        transitions=td_experiences,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
    )
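# For reference: `learning_rate_schedule`, used throughout this file, maps an
# update count n to a learning rate. A minimal sketch of the schedule it is
# expected to produce (an assumption about the imported helper, not its
# actual source): the rate starts at initial_learning_rate and decays
# polynomially, reaching half its initial value after `half_life` updates
# when exponent = 1.
def learning_rate_schedule_sketch(
    initial_learning_rate: float,
    half_life: float,
    exponent: float
) -> Callable[[int], float]:
    def lr_func(n: int) -> float:
        # alpha(n) = alpha_0 * (1 + (n - 1) / H) ** (-beta)
        return initial_learning_rate * (1 + (n - 1) / half_life) ** -exponent
    return lr_func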
def test_evaluate_mrp(self):
    vf = evaluate(self.mrp_seq, 1.)
    states = self.single_step_mrp.states()
    fa_dynamic = Dynamic({s: 0.0 for s in states})
    fa_tabular = Tabular()
    distribution = Choose(set(states))
    approx_vf_finite = backward_evaluate_finite(
        [(self.mrp_seq[i], fa_dynamic) for i in range(self.steps)],
        1.
    )
    approx_vf = backward_evaluate(
        [(self.single_step_mrp, fa_tabular, distribution)
         for _ in range(self.steps)],
        1.,
        num_state_samples=120,
        error_tolerance=0.01
    )
    for t, (v1, v2, v3) in enumerate(zip(vf, approx_vf_finite, approx_vf)):
        states = self.mrp_seq[t].keys()
        v1_arr = np.array([v1[s] for s in states])
        v2_arr = v2.evaluate(states)
        v3_arr = v3.evaluate(states)
        self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
        self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
def test_value_iteration(self):
    vpstar = optimal_vf_and_policy(self.mdp_seq, 1.)
    states = self.single_step_mdp.states()
    fa_dynamic = Dynamic({s: 0.0 for s in states})
    fa_tabular = Tabular()
    distribution = Choose(set(states))
    approx_vpstar_finite = back_opt_vf_and_policy_finite(
        [(self.mdp_seq[i], fa_dynamic) for i in range(self.steps)],
        1.
    )
    approx_vpstar = back_opt_vf_and_policy(
        [(self.single_step_mdp, fa_tabular, distribution)
         for _ in range(self.steps)],
        1.,
        num_state_samples=120,
        error_tolerance=0.01
    )
    for t, ((v1, _), (v2, _), (v3, _)) in enumerate(zip(
            vpstar, approx_vpstar_finite, approx_vpstar)):
        states = self.mdp_seq[t].keys()
        v1_arr = np.array([v1[s] for s in states])
        v2_arr = v2.evaluate(states)
        v3_arr = v3.evaluate(states)
        self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
        self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
def mc_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length_tolerance: float,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return mc.mc_prediction(
        traces=episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        episode_length_tolerance=episode_length_tolerance
    )
def mc_prediction(episodes_stream: Iterator[Sequence[TransitionStep[S]]],
                  gamma: float,
                  num_episodes: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            mc.mc_prediction(traces=episodes_stream,
                             approx_0=Tabular(),
                             γ=gamma,
                             tolerance=1e-10),
            num_episodes)).values_map
def mc_finite_prediction_equal_wts(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    tolerance: float,
    initial_vf_dict: Mapping[S, float]
) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    return mc.mc_prediction(traces=episodes,
                            approx_0=Tabular(values_map=initial_vf_dict),
                            γ=gamma,
                            tolerance=tolerance)
def mc_prediction(transitions: Iterable[mp.TransitionStep[S]],
                  count_to_weight_func: Callable[[int], float],
                  gamma: float,
                  tolerance: float = 1e-200) -> Tabular[S]:
    '''Tabular MC prediction over a discrete state space S, implemented
    from scratch (no interpolation).

    The value estimate for each state is a weighted average of the episode
    returns observed for that state, with the weight of the n-th
    observation given by `count_to_weight_func(n)`. In particular, this
    lets you emulate a learning rate alpha(n) via count_to_weight_func.

    Returns a Tabular function approximation holding values_map
    (state -> estimated value), counts_map (state -> number of updates)
    and count_to_weight_func.
    '''
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    trace = []
    count = 0
    diff: Dict[S, float] = {}  # per-state running minimum of the update error
    # number of steps after which discounting pushes returns below tolerance
    max_steps = round(math.log(tolerance) / math.log(gamma))
    print('max steps: ', max_steps)
    # collect a single trace of at most max_steps transitions
    for transition in transitions:
        trace.append(transition)
        count += 1
        if count >= max_steps:
            break
    # compute the return following each step of the trace
    trace_returns = list(returns(trace, gamma, tolerance))
    for i in range(len(trace_returns)):
        # x: state; y: return observed from that occurrence of x
        x = trace[i].state
        y = trace_returns[i].return_
        if x not in diff:
            diff[x] = 100
        diff[x] = min(abs(y - values_map.get(x, 0.)), diff[x])
        # stop once the largest per-state error is below the threshold
        if max(diff.values()) < 1e-4:
            break
        counts_map[x] = counts_map.get(x, 0) + 1
        weight: float = count_to_weight_func(counts_map[x])
        values_map[x] = weight * y + (1 - weight) * values_map.get(x, 0.)
    print(max(diff.values()))
    return Tabular(values_map, counts_map, count_to_weight_func)
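# Two common choices for count_to_weight_func with the function above
# (a sketch, mirroring how these schedules appear elsewhere in this file):
# weights of 1/n give a plain sample average of the observed returns, while
# a constant alpha gives an exponentially recency-weighted average.
equal_weights = lambda n: 1. / n
constant_alpha = lambda n: 0.1
# e.g. mc_prediction(transitions, equal_weights, gamma=0.9)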
def td_prediction(experiences_stream: Iterator[TransitionStep[S]],
                  gamma: float,
                  num_experiences: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            td.td_prediction(
                transitions=experiences_stream,
                approx_0=Tabular(count_to_weight_func=learning_rate_schedule(
                    initial_learning_rate=0.01,
                    half_life=10000,
                    exponent=0.5)),
                γ=gamma),
            num_experiences)).values_map
def td_lambda_tabular_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        count_to_weight_func: Callable[[int], float],
        gamma: float,
        lambd: float,
        max_steps: int = 2000,
        tolerance: float = 1e-200) -> Tuple[Tabular[S], int]:
    """
    Similar to the TD scratch implementation, except that the update
    target is the lambda-return, i.e. the weighted mixture of n-step
    returns

        G_t^lambda = (1 - lambd) * sum_{n >= 1} lambd^(n-1) * G_{t,n}
                     + lambd^(T-t-1) * G_t

    where G_{t,n} is the n-step bootstrapped return and G_t is the full
    Monte Carlo return of the (length-T) trace.
    """
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    trace = []
    count = 0
    diff: Dict[S, float] = {}  # per-state running minimum of the update error
    # collect a single trace of at most max_steps transitions
    for transition in transitions:
        count += 1
        trace.append(transition)
        if count > max_steps:
            break
    # compute the full Monte Carlo return following each step of the trace
    trace_returns = list(returns(trace, gamma, tolerance))
    i = 0
    for i in range(len(trace_returns)):
        transition = trace[i]
        state = transition.state
        if state not in diff:
            diff[state] = 100
        counts_map[state] = counts_map.get(state, 0) + 1
        weight: float = count_to_weight_func(counts_map[state])
        if transition.next_state not in values_map:
            # heuristic initial value for previously unseen next-states
            values_map[transition.next_state] = -30
        # final term of the lambda-return: the full Monte Carlo return
        y = lambd ** (max_steps - i - 1) * trace_returns[i].return_
        if lambd == 0:
            y = 0
        # mix in the n-step returns G_{t,n} with weights (1-lambd)*lambd^(n-1)
        for n in range(1, min(max_steps - i, len(trace) - i)):
            g_tn = 0
            for j in range(i, i + n):
                g_tn += gamma ** (j - i) * trace[j].reward
                if j == i + n - 1:
                    g_tn += gamma ** n * values_map.get(
                        trace[j].next_state, 0)
            y += (1 - lambd) * lambd ** (n - 1) * g_tn
        diff[state] = min(abs(y - values_map.get(state, 0.)), diff[state])
        values_map[state] = weight * y + (1 - weight) * values_map.get(
            state, 0.)
        # print(y, values_map[state])
        if max(diff.values()) < 0.1:
            break
    print(max(diff.values()))
    return Tabular(values_map, counts_map, count_to_weight_func), i
def test_evaluate_finite_mrp(self):
    start = Tabular({s: 0.0 for s in self.finite_flip_flop.states()})
    traces = self.finite_flip_flop.reward_traces(Choose({True, False}))
    v = iterate.converged(
        mc.evaluate_mrp(traces, γ=0.99, approx_0=start),
        # Loose bound of 0.025 to speed up the test.
        done=lambda a, b: a.within(b, 0.025))
    self.assertEqual(len(v.values_map), 2)
    for s in v.values_map:
        # Intentionally loose bound; a tighter one makes this test
        # take >1s on my machine.
        self.assertLess(abs(v(s) - 170), 1.0)
def glie_mc_finite_control_equal_wts(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5,
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A], float] = {
        (s, a): 0. for s in fmdp.non_terminal_states
        for a in fmdp.actions(s)
    }
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(values_map=initial_qvf_dict),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance)
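# Note: with no count_to_weight_func passed, Tabular defaults to weights of
# 1/n, so each state-action value is a plain sample average of its observed
# returns ("equal weights"), in contrast to the learning-rate variant above.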
def td_nbootstrap_tabular_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        count_to_weight_func: Callable[[int], float],
        gamma: float,
        n: int,
        max_steps: int = 5000,
        tolerance: float = 1e-10) -> Tabular[S]:
    """
    Similar to the TD scratch implementation, except that the update
    target is the n-step bootstrapped return

        G_{t,n} = R_{t+1} + gamma * R_{t+2} + ... + gamma^(n-1) * R_{t+n}
                  + gamma^n * V(S_{t+n})
    """
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    trace = []
    count = 0
    diff: Dict[S, float] = {}  # per-state running minimum of the update error
    # collect enough transitions to form n-step returns for max_steps states
    for transition in transitions:
        count += 1
        trace.append(transition)
        if count > max_steps + n:
            break
    for i in range(min(max_steps, len(trace) - n + 1)):
        transition = trace[i]
        state = transition.state
        if state not in diff:
            diff[state] = 100
        counts_map[state] = counts_map.get(state, 0) + 1
        weight: float = count_to_weight_func(counts_map[state])
        if transition.next_state not in values_map:
            # heuristic initial value for previously unseen next-states
            values_map[transition.next_state] = -10
        # n-step return: n discounted rewards, then bootstrap from the
        # current estimate of the state n steps ahead (this also covers
        # n=1, where the target reduces to the TD(0) target)
        y = 0.
        for j in range(i, i + n):
            y += gamma ** (j - i) * trace[j].reward
        y += gamma ** n * values_map.get(trace[i + n - 1].next_state, 0)
        diff[state] = min(abs(y - values_map.get(state, 0.)), diff[state])
        values_map[state] = weight * y + (1 - weight) * values_map.get(
            state, 0.)
        if max(diff.values()) < 1e-4:
            break
    print(max(diff.values()))
    return Tabular(values_map, counts_map, count_to_weight_func)
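# A usage sketch for the n-step TD function above, reusing the transition
# stream pattern from the scripts below (`si_mrp` assumed): n=1 recovers the
# TD(0) target, while large n approaches the Monte Carlo return.
td_4step_vf = td_nbootstrap_tabular_prediction(
    transitions=si_mrp.simulate_reward(Choose(si_mrp.non_terminal_states)),
    count_to_weight_func=lambda m: 0.1,
    gamma=0.9,
    n=4
)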
def test_evaluate_finite_mdp(self) -> None:
    q_0: Tabular[Tuple[NonTerminal[bool], bool]] = Tabular(
        {(s, a): 0.0
         for s in self.finite_mdp.non_terminal_states
         for a in self.finite_mdp.actions(s)},
        count_to_weight_func=lambda _: 0.1
    )
    uniform_policy: FinitePolicy[bool, bool] = FinitePolicy({
        s.state: Choose(self.finite_mdp.actions(s))
        for s in self.finite_mdp.non_terminal_states
    })
    transitions: Iterable[mdp.TransitionStep[bool, bool]] = \
        self.finite_mdp.simulate_actions(
            Choose(self.finite_mdp.non_terminal_states),
            uniform_policy
        )
    qs = td.q_learning_external_transitions(
        transitions,
        self.finite_mdp.actions,
        q_0,
        γ=0.99
    )
    q: Optional[Tabular[Tuple[NonTerminal[bool], bool]]] = \
        iterate.last(
            cast(Iterator[Tabular[Tuple[NonTerminal[bool], bool]]],
                 itertools.islice(qs, 20000))
        )
    if q is not None:
        self.assertEqual(len(q.values_map), 4)
        for s in [NonTerminal(True), NonTerminal(False)]:
            self.assertLess(abs(q((s, False)) - 170.0), 2)
            self.assertGreater(q((s, False)), q((s, True)))
    else:
        assert False
def glie_sarsa_finite_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    max_episode_length: int
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    initial_qvf_dict: Mapping[Tuple[S, A], float] = {
        (s, a): 0. for s in fmdp.non_terminal_states
        for a in fmdp.actions(s)
    }
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return td.glie_sarsa(
        mdp=fmdp,
        states=Choose(set(fmdp.non_terminal_states)),
        approx_0=Tabular(
            values_map=initial_qvf_dict,
            count_to_weight_func=learning_rate_func),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        max_episode_length=max_episode_length)
def q_learning_finite_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon: float,
    max_episode_length: int
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A], float] = {
        (s, a): 0. for s in fmdp.non_terminal_states
        for a in fmdp.actions(s)
    }
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return td.q_learning(
        mdp=fmdp,
        policy_from_q=lambda f, m: mc.epsilon_greedy_policy(
            q=f, mdp=m, ϵ=epsilon),
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(
            values_map=initial_qvf_dict,
            count_to_weight_func=learning_rate_func),
        γ=gamma,
        max_episode_length=max_episode_length)
def example_model_data_generator() -> Iterator[DataSeq]:
    coeffs: Aug_Triple = (2., 10., 4., -6.)
    values = np.linspace(-10.0, 10.0, 21)
    pts: Sequence[Triple] = [(x, y, z)
                             for x in values
                             for y in values
                             for z in values]
    d = norm(loc=0., scale=2.0)
    while True:
        res: List[Tuple[Triple, float]] = []
        for pt in pts:
            x_val: Triple = (pt[0], pt[1], pt[2])
            y_val: float = coeffs[0] + np.dot(coeffs[1:], pt) + \
                d.rvs(size=1)[0]
            res.append((x_val, y_val))
        yield res


if __name__ == '__main__':
    training_iterations: int = 30
    data_gen: Iterator[DataSeq] = example_model_data_generator()
    test_data: DataSeq = list(next(data_gen))
    tabular: Tabular[Triple] = Tabular()
    for xy_seq in islice(data_gen, training_iterations):
        tabular = tabular.update(xy_seq)
        this_rmse: float = tabular.rmse(test_data)
        print(f"RMSE = {this_rmse:.3f}")
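# Note: Tabular.update is functional in the rl library's FunctionApprox
# interface: it returns a new Tabular with the (x, y) observations folded
# into the running per-x weighted averages, which is why the loop above
# rebinds `tabular` on each iteration.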
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost)

# initialize values_map and counts_map for Tabular
start_map = {}
for state in si_mdp.mapping.keys():
    for action in si_mdp.actions(state):
        start_map[(state, action)] = 0

# start state distribution: every non-terminal state is equally likely
# to be the start state
start_states = Categorical({
    state: 1 / len(si_mdp.non_terminal_states)
    for state in si_mdp.non_terminal_states
})
# separate copies so values_map and counts_map don't alias the same dict
mc_tabular_control = mc_control(si_mdp, start_states,
                                Tabular(dict(start_map), dict(start_map)),
                                user_gamma, 800)
values_map = mc_tabular_control.values_map
opt_vf, opt_pi = get_optimal_policy(values_map)
print('opt_vf mc control: \n', opt_vf, '\nopt_pi mc control: \n', opt_pi)

fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
    InventoryState(alpha, beta): Constant(user_capacity - (alpha + beta))
    for alpha in range(user_capacity + 1)
    for beta in range(user_capacity + 1 - alpha)
})
implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
    si_mdp.apply_finite_policy(fdp)
print("MDP Value Iteration Optimal Value Function and Optimal Policy")
print("--------------")
si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

print("Value Function")
print("--------------")
si_mrp.display_value_function(gamma=user_gamma)
print()

states: List[InventoryState] = si_mrp.non_terminal_states
start_state_distrib: Categorical[InventoryState] = \
    Categorical({i: 1 for i in states})
simulation_episodes = si_mrp.reward_traces(start_state_distrib)
simulation_transitions = si_mrp.simulate_reward(start_state_distrib)
approx_0 = Tabular({i: 0 for i in states})
value_mc = mc_prediction_scratch(
    traces=simulation_episodes,
    states=states,
    γ=user_gamma,
    tolerance=1e-6,
    num_episodes=10000
)
print("Value Function with our implementation of MC")
print(value_mc)
value_mc_other = mc_prediction(
    traces=simulation_episodes,
    approx_0=approx_0,
    γ=user_gamma
)
initial_vf_dict: Mapping[NonTerminal[InventoryState], float] = \
    {s: 0. for s in si_mrp.non_terminal_states}
gamma: float = 0.9
lambda_param = 0.3
num_episodes = 10000
episode_length: int = 100
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5
approx_0: Tabular[NonTerminal[InventoryState]] = Tabular(
    values_map=initial_vf_dict,
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
)
episodes: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    (itertools.islice(episode, episode_length) for episode in episodes)
vf_iter: Iterator[Tabular[NonTerminal[InventoryState]]] = \
    lambda_return_prediction(
        traces=traces,
        approx_0=approx_0,
        γ=gamma,
        lambd=lambda_param
    )
time_decay_half_life: float = 3000
num_updates: int = 10000
q_iter: Iterator[QValueFunctionApprox[InventoryState, int]] = \
    q_learning_experience_replay(
        mdp=si_mdp,
        policy_from_q=lambda f, m: epsilon_greedy_policy(
            q=f,
            mdp=m,
            ϵ=epsilon
        ),
        states=Choose(si_mdp.non_terminal_states),
        approx_0=Tabular(
            count_to_weight_func=learning_rate_schedule(
                initial_learning_rate=initial_learning_rate,
                half_life=learning_rate_half_life,
                exponent=learning_rate_exponent
            )
        ),
        γ=gamma,
        max_episode_length=episode_length,
        mini_batch_size=mini_batch_size,
        weights_decay_half_life=time_decay_half_life
    )
qvf: QValueFunctionApprox[InventoryState, int] = iterate.last(
    itertools.islice(q_iter, num_updates))
vf, pol = get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=qvf)
pprint(vf)
print(pol)
nt_states: Sequence[NonTerminal[int]] = random_walk.non_terminal_states
start_distribution: NTStateDistribution[int] = Choose(nt_states)
traces: Iterable[Iterable[TransitionStep[int]]] = \
    random_walk.reward_traces(start_distribution)
transitions: Iterable[TransitionStep[int]] = \
    itertools.chain.from_iterable(traces)
td_transitions: Iterable[TransitionStep[int]] = \
    itertools.islice(transitions, num_transitions)
initial_learning_rate: float = 0.5
half_life: float = 1000
exponent: float = 0.5
approx0: Tabular[NonTerminal[int]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent))
td_func: Tabular[NonTerminal[int]] = \
    iterate.last(itertools.islice(
        td_prediction(
            transitions=td_transitions,
            approx_0=approx0,
            γ=gamma
        ),
        num_transitions
    ))
td_vf: np.ndarray = td_func.evaluate(nt_states)

num_polynomials: int = 5
transition_map = si_mdp.get_action_transition_reward_map()
# fdp: markov_decision_process.FinitePolicy[InventoryState, int] = \
#     markov_decision_process.FinitePolicy(
#         {InventoryState(alpha, beta):
#          Constant(user_capacity - (alpha + beta))
#          for alpha in range(user_capacity + 1)
#          for beta in range(user_capacity + 1 - alpha)}
#     )

# initialize values_map and counts_map for Tabular
start_map = {}
state_action = {}
for state in si_mdp.mapping.keys():
    state_action[state] = []
    for action in si_mdp.actions(state):
        start_map[(state, action)] = 0
        state_action[state].append(action)
# separate copies so values_map and counts_map don't alias the same dict
q = Tabular(dict(start_map), dict(start_map))
start_states = Categorical({
    state: 1 / len(si_mdp.non_terminal_states)
    for state in si_mdp.non_terminal_states
})
# transitions = si_mdp.simulate_actions(start_states, fdp)
sarsa_tabular_control = sarsa_control(start_states, transition_map,
                                      state_action, q, user_gamma, 0.1)
diff = {}
prev = q.values_map
count = 0
for fcn_approx in sarsa_tabular_control:
    next_vals = fcn_approx.values_map  # renamed from `next`, which shadows the builtin
    print(fcn_approx.values_map)
    count += 1
xy_vals_seq = [(x, f(x) + n.rvs(size=1)[0]) for x in x_pts]

ff = lambda x: x
BSApprox = BSplineApprox(feature_function=ff, degree=3)
solved = BSApprox.solve(xy_vals_seq)
errors: np.ndarray = solved.evaluate(x_pts) - np.array(
    [y for _, y in xy_vals_seq])
print("Mean Squared Error")
print(np.mean(errors * errors))

print("Indirect Solve")
BSApprox_ind = BSplineApprox(feature_function=ff, degree=3,
                             knots=np.array([0, 1, 2, 3]),
                             direct_solve=False)
solved_ind = BSApprox_ind.solve(xy_vals_seq)
errors = solved_ind.evaluate(x_pts) - np.array(
    [y for _, y in xy_vals_seq])
print("Mean Squared Error")
print(np.mean(errors * errors))
# The second (indirect) method takes far longer to converge.

# PROBLEM 2
print("Solving Problem 2")
n = 10
model = LilypadModel(n)
approx0 = Tabular(values_map={s: 0.0 for s in model.non_terminal_states})
result = approx_policy_iteration_result(model, 0.9, approx0)
# Results consistent with what we had before.
    itertools.islice(trace, episode_length) for trace in traces
)
num_episodes = 100000

print("Value Function (TD Function Approximation)")
print("--------------")
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5
learning_rate_func: Callable[[int], float] = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=half_life,
    exponent=exponent)
td_vfs: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
    transitions=unit_experiences_accumulated,
    approx_0=Tabular(count_to_weight_func=learning_rate_func),
    γ=user_gamma)
final_td_vf: FunctionApprox[InventoryState] = \
    last(itertools.islice(td_vfs, episode_length * num_episodes))
pprint({s: round(final_td_vf(s), 3) for s in si_mrp.non_terminal_states})
print()

print("Value Function (Tabular TD from scratch)")
print("--------------")
td_vfs = evaluate_mrp_dt(
    transitions=unit_experiences_accumulated,
    vf={s: 0 for s in si_mrp.non_terminal_states},
    γ=user_gamma)
final_td_vf = last(itertools.islice(td_vfs, episode_length * num_episodes))
si_mrp = SimpleInventoryMRPFinite(capacity=user_capacity,
                                  poisson_lambda=user_poisson_lambda,
                                  holding_cost=user_holding_cost,
                                  stockout_cost=user_stockout_cost)

print("Value Function (Exact)")
print("--------------")
si_mrp.display_value_function(gamma=user_gamma)
print()

print("Value Function (MC Function Approximation)")
print("--------------")
traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
it: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
    traces=traces,
    approx_0=Tabular(),
    γ=user_gamma)
num_traces = 10000
last_vf_mc: FunctionApprox[InventoryState] = last(islice(it, num_traces))
pprint({
    s: round(last_vf_mc.evaluate([s])[0], 3)
    for s in si_mrp.non_terminal_states
})
print()

print("Value Function (Tabular MC from scratch)")
print("--------------")
traces = si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
it = evaluate_mrp_mc(
    traces=traces,
    vf={s: 0