def glie_mc_finite_learning_rate_correctness(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float,
    num_episodes: int
) -> None:
    qvfs: Iterator[QValueFunctionApprox[S, A]] = \
        glie_mc_finite_control_learning_rate(
            fmdp=fmdp,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            gamma=gamma,
            epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
            episode_length_tolerance=episode_length_tolerance
        )
    final_qvf: QValueFunctionApprox[S, A] = \
        iterate.last(itertools.islice(qvfs, num_episodes))
    opt_vf, opt_policy = get_vf_and_policy_from_qvf(mdp=fmdp, qvf=final_qvf)
    print(f"GLIE MC Optimal Value Function with {num_episodes:d} episodes")
    pprint(opt_vf)
    print(f"GLIE MC Optimal Policy with {num_episodes:d} episodes")
    print(opt_policy)
    true_opt_vf, true_opt_policy = value_iteration_result(fmdp, gamma=gamma)
    print("True Optimal Value Function")
    pprint(true_opt_vf)
    print("True Optimal Policy")
    print(true_opt_policy)
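# ---------------------------------------------------------------------------
# A minimal usage sketch for the correctness check above. The MDP choice and
# every hyperparameter value here are illustrative assumptions, not taken
# from the function itself; SimpleInventoryMDPCap is the capacity-limited
# inventory MDP used elsewhere in this code.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap

    si_mdp = SimpleInventoryMDPCap(
        capacity=2,
        poisson_lambda=1.0,
        holding_cost=1.0,
        stockout_cost=10.0
    )
    glie_mc_finite_learning_rate_correctness(
        fmdp=si_mdp,
        initial_learning_rate=0.1,
        half_life=10000.0,
        exponent=1.0,
        gamma=0.9,
        epsilon_as_func_of_episodes=lambda k: k ** -0.5,  # GLIE schedule
        episode_length_tolerance=1e-5,
        num_episodes=10000
    )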
def get_vi_vf_and_policy(self) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
    '''
    Performs the Value Iteration DP algorithm returning the
    Optimal Value Function (as a V[Cell]) and the Optimal Policy
    (as a FinitePolicy[Cell, Move])
    '''
    return value_iteration_result(self.get_finite_mdp(), gamma=1.)
def q_learning_finite_learning_rate_correctness(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon: float,
    max_episode_length: int,
    num_updates: int,
) -> None:
    qvfs: Iterator[QValueFunctionApprox[S, A]] = \
        q_learning_finite_learning_rate(
            fmdp=fmdp,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            gamma=gamma,
            epsilon=epsilon,
            max_episode_length=max_episode_length
        )
    final_qvf: QValueFunctionApprox[S, A] = \
        iterate.last(itertools.islice(qvfs, num_updates))
    opt_vf, opt_policy = get_vf_and_policy_from_qvf(mdp=fmdp, qvf=final_qvf)
    print(f"Q-Learning Optimal Value Function with {num_updates:d} updates")
    pprint(opt_vf)
    print(f"Q-Learning Optimal Policy with {num_updates:d} updates")
    print(opt_policy)
    true_opt_vf, true_opt_policy = value_iteration_result(fmdp, gamma=gamma)
    print("True Optimal Value Function")
    pprint(true_opt_vf)
    print("True Optimal Policy")
    print(true_opt_policy)
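# A minimal usage sketch for the Q-Learning correctness check above. Note
# that num_updates counts atomic (state, action, reward, next-state) updates
# rather than episodes as in the GLIE MC check. All values below are
# illustrative assumptions; SimpleInventoryMDPCap is the inventory MDP used
# elsewhere in this code.
if __name__ == '__main__':
    from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap

    si_mdp = SimpleInventoryMDPCap(capacity=2, poisson_lambda=1.0,
                                   holding_cost=1.0, stockout_cost=10.0)
    q_learning_finite_learning_rate_correctness(
        fmdp=si_mdp,
        initial_learning_rate=0.1,
        half_life=10000.0,
        exponent=1.0,
        gamma=0.9,
        epsilon=0.2,
        max_episode_length=100,
        num_updates=100000
    )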
def test_value_iteration(self):
    mdp_map: Mapping[NonTerminal[InventoryState], float] = \
        value_iteration_result(self.si_mdp, self.gamma)[0]
    # print(mdp_map)
    mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])
    fa = Dynamic({s: 0.0 for s in self.states})
    mdp_finite_fa = iterate.converged(
        value_iteration_finite(self.si_mdp, self.gamma, fa),
        done=lambda a, b: a.within(b, 1e-5)
    )
    # print(mdp_finite_fa.values_map)
    mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.01)
    mdp_fa = iterate.converged(
        value_iteration(self.si_mdp, self.gamma, fa, Choose(self.states),
                        num_state_samples=30),
        done=lambda a, b: a.within(b, 1e-5)
    )
    # print(mdp_fa.values_map)
    mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 0.01)
def process_time(n, gamma=1) -> Tuple[float, float, float]:
    print(f"n={n}")
    model = LilypadModel(n)

    # Brute-force search over all deterministic policies
    start = time.time()
    list_policies = get_policies(n)
    optimal_policy, list_sum, list_values, idx_max = \
        get_optimal_policy(n, model, list_policies, gamma=gamma)
    time_brute = time.time() - start

    # Value Iteration (same gamma as the brute-force search)
    start_2 = time.time()
    value_iter = value_iteration_result(model, gamma)
    time_value_iter = time.time() - start_2

    # Policy Iteration (same gamma as the brute-force search)
    start_3 = time.time()
    policy_iter = policy_iteration_result(model, gamma)
    time_policy_iter = time.time() - start_3

    return time_brute, time_value_iter, time_policy_iter
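# A small driver sketch for process_time above: sweep the number of lilypads
# and print how brute-force policy enumeration scales next to value iteration
# and policy iteration. The chosen values of n are illustrative assumptions;
# the number of deterministic policies grows exponentially in n, so keep n
# small for the brute-force run.
if __name__ == '__main__':
    for n in [3, 6, 9]:
        t_brute, t_vi, t_pi = process_time(n)
        print(f"n={n}: brute force={t_brute:.4f}s, "
              f"value iteration={t_vi:.4f}s, "
              f"policy iteration={t_pi:.4f}s")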
def test_value_iteration(self):
    mdp_map: Mapping[InventoryState, float] = value_iteration_result(
        self.si_mdp, self.gamma)[0]
    # print(mdp_map)
    mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])
    fa = Dynamic({s: 0.0 for s in self.states})
    mdp_finite_fa = FunctionApprox.converged(
        value_iteration_finite(self.si_mdp, self.gamma, fa))
    # print(mdp_finite_fa.values_map)
    mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.001)
    mdp_fa = FunctionApprox.converged(
        value_iteration(self.si_mdp, self.gamma, fa, Choose(self.states),
                        num_state_samples=30),
        0.1
    )
    # print(mdp_fa.values_map)
    mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 1.0)
    for state in si_mdp.non_terminal_states
})
mc_tabular_control = mc_control(
    si_mdp,
    start_states,
    Tabular(start_map, start_map),
    user_gamma,
    800
)
values_map = mc_tabular_control.values_map
opt_vf, opt_pi = get_optimal_policy(values_map)
print('opt_vf mc control: \n', opt_vf, '\nopt_pi mc control: \n', opt_pi)

fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
    InventoryState(alpha, beta): Constant(user_capacity - (alpha + beta))
    for alpha in range(user_capacity + 1)
    for beta in range(user_capacity + 1 - alpha)
})
implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
    si_mdp.apply_finite_policy(fdp)

print("MDP Value Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
print(opt_vf_vi, '\n')
print(opt_policy_vi)

print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_pi, opt_policy_pi = policy_iteration_result(si_mdp, gamma=user_gamma)
print(opt_vf_pi, '\n')
print(opt_policy_pi)
t2 = time.time()
time_brute_force = t2 - t1
y_brute.append(time_brute_force)

# Policy Iteration
t1 = time.time()
opt_vf_pi, opt_policy_pi = policy_iteration_result(frog_mdp, gamma=1)
t2 = time.time()
time_policy_iter = t2 - t1
y_pi.append(time_policy_iter)
# pprint(opt_vf_pi)
# print(opt_policy_pi)

# Value Iteration
t1 = time.time()
opt_vf_vi, opt_policy_vi = value_iteration_result(frog_mdp, gamma=1)
t2 = time.time()
time_value_iter = t2 - t1
y_vi.append(time_value_iter)
# pprint(opt_vf_vi)
# print(opt_policy_vi)

plt.plot(x, y_brute, c='r', label='Brute Force')
plt.plot(x, y_pi, c='b', label='Policy Iteration')
plt.plot(x, y_vi, c='g', label='Value Iteration')
plt.xlabel('Number of Lilypads')
plt.ylabel('Time till Convergence')
plt.legend()
plt.show()
def compare_mc_sarsa_ql(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    method_mask: Tuple[bool, bool, bool],
    learning_rates: Sequence[Tuple[float, float, float]],
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    q_learning_epsilon: float,
    mc_episode_length_tol: float,
    num_episodes: int,
    plot_batch: int,
    plot_start: int
) -> None:
    true_vf: V[S] = value_iteration_result(fmdp, gamma)[0]
    states: Sequence[NonTerminal[S]] = fmdp.non_terminal_states
    colors: Sequence[str] = ['b', 'g', 'r', 'k', 'c', 'm', 'y']

    import matplotlib.pyplot as plt
    plt.figure(figsize=(11, 7))

    if method_mask[0]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            mc_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                glie_mc_finite_control_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    episode_length_tolerance=mc_episode_length_tol
                )
            mc_errors = []
            batch_mc_errs = []
            for i, mc_qvf in enumerate(
                    itertools.islice(mc_funcs_it, num_episodes)):
                mc_vf: V[S] = {
                    s: max(mc_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_mc_errs.append(sqrt(
                    sum((mc_vf[s] - true_vf[s]) ** 2 for s in states) /
                    len(states)
                ))
                if i % plot_batch == plot_batch - 1:
                    mc_errors.append(sum(batch_mc_errs) / plot_batch)
                    batch_mc_errs = []
            mc_plot = mc_errors[plot_start:]
            label = f"MC InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(
                range(len(mc_plot)),
                mc_plot,
                color=colors[k],
                linestyle='-',
                label=label
            )

    sample_episodes: int = 1000
    uniform_policy: FinitePolicy[S, A] = FinitePolicy(
        {s.state: Choose(fmdp.actions(s)) for s in states}
    )
    fmrp: FiniteMarkovRewardProcess[S] = \
        fmdp.apply_finite_policy(uniform_policy)
    td_episode_length: int = int(round(
        sum(
            len(list(returns(
                trace=fmrp.simulate_reward(Choose(states)),
                γ=gamma,
                tolerance=mc_episode_length_tol
            ))) for _ in range(sample_episodes)
        ) / sample_episodes
    ))

    if method_mask[1]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            sarsa_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                glie_sarsa_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    max_episode_length=td_episode_length,
                )
            sarsa_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_sarsa_errs = []
            for i, sarsa_qvf in enumerate(
                    itertools.islice(
                        sarsa_funcs_it,
                        num_episodes * td_episode_length
                    )):
                sarsa_vf: V[S] = {
                    s: max(sarsa_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_sarsa_errs.append(sqrt(
                    sum((sarsa_vf[s] - true_vf[s]) ** 2 for s in states) /
                    len(states)
                ))
                if i % transitions_batch == transitions_batch - 1:
                    sarsa_errors.append(
                        sum(batch_sarsa_errs) / transitions_batch
                    )
                    batch_sarsa_errs = []
            sarsa_plot = sarsa_errors[plot_start:]
            label = f"SARSA InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(
                range(len(sarsa_plot)),
                sarsa_plot,
                color=colors[k],
                linestyle='--',
                label=label
            )

    if method_mask[2]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            ql_funcs_it: Iterator[QValueFunctionApprox[S, A]] = \
                q_learning_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon=q_learning_epsilon,
                    max_episode_length=td_episode_length,
                )
            ql_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_ql_errs = []
            for i, ql_qvf in enumerate(
                    itertools.islice(
                        ql_funcs_it,
                        num_episodes * td_episode_length
                    )):
                ql_vf: V[S] = {
                    s: max(ql_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_ql_errs.append(sqrt(
                    sum((ql_vf[s] - true_vf[s]) ** 2 for s in states) /
                    len(states)
                ))
                if i % transitions_batch == transitions_batch - 1:
                    ql_errors.append(sum(batch_ql_errs) / transitions_batch)
                    batch_ql_errs = []
            ql_plot = ql_errors[plot_start:]
            label = f"Q-Learning InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(
                range(len(ql_plot)),
                ql_plot,
                color=colors[k],
                linestyle=':',
                label=label
            )

    plt.xlabel("Episode Batches", fontsize=20)
    plt.ylabel("Optimal Value Function RMSE", fontsize=20)
    plt.title("RMSE as function of episode batches", fontsize=20)
    plt.grid(True)
    plt.legend(fontsize=10)
    plt.show()
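# A minimal invocation sketch for compare_mc_sarsa_ql above. Every argument
# value is an illustrative assumption; SimpleInventoryMDPCap stands in for
# any FiniteMarkovDecisionProcess, as used elsewhere in this code.
if __name__ == '__main__':
    from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap

    si_mdp = SimpleInventoryMDPCap(capacity=2, poisson_lambda=1.0,
                                   holding_cost=1.0, stockout_cost=10.0)
    compare_mc_sarsa_ql(
        fmdp=si_mdp,
        method_mask=(True, True, True),  # plot MC, SARSA and Q-Learning
        learning_rates=[(0.05, 1000000.0, 0.5)],
        gamma=0.9,
        epsilon_as_func_of_episodes=lambda k: k ** -0.5,
        q_learning_epsilon=0.2,
        mc_episode_length_tol=1e-5,
        num_episodes=500,
        plot_batch=10,
        plot_start=0
    )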
user_stockout_cost2 = 15.0

store1: FiniteMarkovDecisionProcess[InventoryState, int] = \
    SimpleInventoryMDPCap(
        capacity=user_capacity,
        poisson_lambda=user_poisson_lambda,
        holding_cost=user_holding_cost,
        stockout_cost=user_stockout_cost
    )
store2: FiniteMarkovDecisionProcess[InventoryState, int] = \
    SimpleInventoryMDPCap(
        capacity=user_capacity2,
        poisson_lambda=user_poisson_lambda2,
        holding_cost=user_holding_cost2,
        stockout_cost=user_stockout_cost2
    )

K1 = 1
K2 = 1
problem4 = ComplexMDP(store1=store1, store2=store2, K1=K1, K2=K2)

value_opt = value_iteration_result(problem4, user_gamma)
policy_opt = policy_iteration_result(problem4, user_gamma)
        b = next(v, None)
        if max(abs(a[s] - b[s]) for s in a) < TOLERANCE:
            break
        a = b
        count += 1
    opt_policy: FinitePolicy[Coordinate, Action] = greedy_policy_from_vf(
        model, b, gamma)
    return count, b, opt_policy


start = time.time()
count1, opt_vf1, opt_pol1 = solution(model1, 0.8)
print(f"Method 1 took {time.time() - start} to converge")
start = time.time()
count2, opt_vf2, opt_pol2 = solution(model2, 1)
print(f"Method 2 took {time.time() - start} to converge")
print(f"Solution 1 took {count1} iterations to converge")
print(f"Solution 2 took {count2} iterations to converge")
print(opt_pol1)
print(opt_pol2)

# This is a fast solution where we don't track
# the number of iterations to converge.
# We're using a built-in function of rl.dynamic_programming here.
start = time.time()
opt_vf1, opt_pol1 = value_iteration_result(model1, 0.8)
print(f"Method 1 took {time.time() - start} to converge")
start = time.time()
opt_vf2, opt_pol2 = value_iteration_result(model2, 1)
print(f"Method 2 took {time.time() - start} to converge")
q_iter: Iterator[QValueFunctionApprox[InventoryState, int]] = \
    q_learning_experience_replay(
        mdp=si_mdp,
        policy_from_q=lambda f, m: epsilon_greedy_policy(
            q=f,
            mdp=m,
            ϵ=epsilon
        ),
        states=Choose(si_mdp.non_terminal_states),
        approx_0=Tabular(
            count_to_weight_func=learning_rate_schedule(
                initial_learning_rate=initial_learning_rate,
                half_life=learning_rate_half_life,
                exponent=learning_rate_exponent
            )
        ),
        γ=gamma,
        max_episode_length=episode_length,
        mini_batch_size=mini_batch_size,
        weights_decay_half_life=time_decay_half_life
    )
qvf: QValueFunctionApprox[InventoryState, int] = iterate.last(
    itertools.islice(q_iter, num_updates)
)
vf, pol = get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=qvf)
pprint(vf)
print(pol)

true_vf, true_pol = value_iteration_result(mdp=si_mdp, gamma=gamma)
pprint(true_vf)
print(true_pol)
def vi_vf_and_policy(self) -> \
        Tuple[V[int], FiniteDeterministicPolicy[int, int]]:
    return value_iteration_result(self, 1.0)
                    self.W - state.wage - 1,
                    pois_mean
                )
                submapping[action] = Categorical(dic_distrib)
            mapping[state] = submapping
        return mapping


if __name__ == '__main__':
    H = 10
    W = 30
    alpha = 0.08
    beta = 0.82
    gamma = 0.95
    print("Defining the model")
    model = Problem3(H, W, alpha, beta)
    print("Value iteration algorithm")
    opt_val, opt_pol = value_iteration_result(model, gamma)
    print(opt_pol)

"""
if state.wage == self.W:
    for action in list_actions:
        # If you're in state W, you stay in state W with constant
        # probability. The reward only depends on the action
        # you have chosen.
        submapping[action] = Constant((State(state.wage), state.wage *
                                       (self.H - action.l - action.s)))
elif state.wage == self.W - 1:
    for action in list_actions:
        s: int = action.s
        l: int = action.l
        # If you're in state W-1, you can either stay in your state
if __name__ == '__main__':
    import matplotlib.pyplot as plt
    from pprint import pprint

    hours: int = 10
    wage_cap: int = 30
    alpha: float = 0.08
    beta: float = 0.82
    gamma: float = 0.95
    co: CareerOptimization = CareerOptimization(
        hours=hours,
        wage_cap=wage_cap,
        alpha=alpha,
        beta=beta
    )
    _, opt_policy = value_iteration_result(co, gamma=gamma)
    wages: Iterable[int] = range(1, co.wage_cap + 1)
    opt_actions: Mapping[int, Tuple[int, int]] = \
        {w: opt_policy.act(w).value for w in wages}
    searching: Sequence[int] = [s for _, (s, _) in opt_actions.items()]
    learning: Sequence[int] = [l for _, (_, l) in opt_actions.items()]
    working: Sequence[int] = [
        co.hours - s - l for _, (s, l) in opt_actions.items()
    ]
    pprint(opt_actions)
    plt.xticks(wages)
    p1 = plt.bar(wages, searching, color='red')
    p2 = plt.bar(wages, learning, color='blue')
    p3 = plt.bar(wages, working, color='green')
    plt.legend(
        (p1[0], p2[0], p3[0]),
        ('Job-Searching', 'Learning', 'Working')
    )
    plt.grid(axis='y')
    plt.show()