def sarsa_control_scratch(
        # traces: Iterable[Iterable[mp.TransitionStep[S]]],
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        num_episodes: int = 10000,
        eps: float = 0.1,
        base_lr: float = 0.03,
        half_life: float = 1000.0,
        exponent: float = 0.5) -> Mapping[Tuple[S, A], float]:
    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    state = Categorical({state: 1 for state in states}).sample()
    for i in range(num_episodes):
        action_distribution = Pi.act(state)
        action = action_distribution.sample()
        next_distribution = mdp_to_sample.step(state, action)
        next_state, reward = next_distribution.sample()
        # We choose the next action based on the epsilon-greedy policy
        next_action = Pi.act(next_state).sample()
        counts_per_state_act[(state, action)] += 1
        alpha = base_lr / (1 + (
            (counts_per_state_act[(state, action)] - 1) / half_life)**exponent)
        q[(state, action)] += alpha * (
            reward + γ * q[(next_state, next_action)] - q[(state, action)])
        # Improve the policy at the current state (epsilon-greedy w.r.t. q)
        new_pol: Mapping[S, Optional[Categorical[A]]] = Pi.policy_map
        if actions[state] is None:
            new_pol[state] = None
        else:
            policy_map = {
                action: eps / len(actions[state])
                for action in actions[state]
            }
            best_action = actions[state][0]
            for action in actions[state]:
                if q[(state, best_action)] <= q[(state, action)]:
                    best_action = action
            policy_map[best_action] += 1 - eps
            new_pol[state] = Categorical(policy_map)
        Pi = FinitePolicy(new_pol)
        state = next_state
        if next_state is None:
            state = Categorical({state: 1 for state in states}).sample()
    return q
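# A minimal usage sketch for sarsa_control_scratch (illustrative, not part of the
# original code). It assumes an already-constructed FiniteMarkovDecisionProcess,
# here the SimpleInventoryMDPCap used elsewhere in this file, and builds the
# `states` and `actions` arguments from it.
si_mdp = SimpleInventoryMDPCap(capacity=2, poisson_lambda=1.0,
                               holding_cost=1.0, stockout_cost=10.0)
states = list(si_mdp.non_terminal_states)
actions = {s: list(si_mdp.actions(s)) for s in states}
q_sarsa = sarsa_control_scratch(si_mdp, states, actions, γ=0.9,
                                num_episodes=100000)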
def mc_control_scratch(
        # traces: Iterable[Iterable[mp.TransitionStep[S]]],
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        tolerance: float = 1e-6,
        num_episodes: int = 10000) -> Mapping[Tuple[S, A], float]:
    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    start_state_distrib = Categorical({state: 1 for state in states})
    for i in range(num_episodes):
        trace: Iterable[TransitionStep[S, A]] = mdp_to_sample.simulate_actions(
            start_state_distrib, Pi)
        episode = returns(trace, γ, tolerance)
        for step in episode:
            state = step.state
            action = step.action
            return_ = step.return_
            counts_per_state_act[(state, action)] += 1
            q[(state, action)] += 1 / counts_per_state_act[
                (state, action)] * (return_ - q[(state, action)])
        # GLIE: decay epsilon and make the policy epsilon-greedy w.r.t. q
        eps = 1 / (i + 1)
        new_pol: Mapping[S, Optional[Categorical[A]]] = {}
        for state in states:
            if actions[state] is None:
                new_pol[state] = None
            else:
                policy_map = {
                    action: eps / len(actions[state])
                    for action in actions[state]
                }
                best_action = actions[state][0]
                for action in actions[state]:
                    if q[(state, best_action)] <= q[(state, action)]:
                        best_action = action
                policy_map[best_action] += 1 - eps
                new_pol[state] = Categorical(policy_map)
        Pi = FinitePolicy(new_pol)
    return q
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    matrix_method_for_mrp_eval: bool = False
) -> Iterator[Tuple[V[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[V[S], FinitePolicy[S, A]])\
            -> Tuple[V[S], FinitePolicy[S, A]]:
        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        policy_vf: V[S] = {mrp.non_terminal_states[i]: v for i, v in
                           enumerate(mrp.get_value_function_vec(gamma))}\
            if matrix_method_for_mrp_eval else evaluate_mrp_result(mrp, gamma)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_vf(
            mdp, policy_vf, gamma)
        return policy_vf, improved_pi

    v_0: V[S] = {s: 0.0 for s in mdp.non_terminal_states}
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states})
    return iterate(update, (v_0, pi_0))
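# A minimal usage sketch (illustrative; the inventory MDP si_mdp and the iteration
# count of 20 are assumptions): run a fixed number of policy-iteration sweeps and
# inspect the last (value function, policy) pair produced.
import itertools

*_, (pi_vf, pi_policy) = itertools.islice(policy_iteration(si_mdp, gamma=0.9), 20)
print(pi_vf)
print(pi_policy)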
def get_q_learning_vf_and_policy(
    self,
    states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    episodes: int = 10000,
    step_size: float = 0.01,
    epsilon: float = 0.1
) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
    '''
    states_actions_dict gives us the set of possible moves from
    a non-block cell.
    sample_func is a function with two inputs: state and action,
    and with output as a sampled pair of (next_state, reward).
    '''
    q: Dict[Cell, Dict[Move, float]] = \
        {s: {a: 0. for a in actions} for s, actions in
         states_actions_dict.items() if actions is not None}
    nt_states: CellSet = {s for s in q}
    uniform_states: Choose[Cell] = Choose(nt_states)
    for episode_num in range(episodes):
        state: Cell = uniform_states.sample()
        '''
        write your code here
        update the dictionary q initialized above according
        to the Q-learning algorithm's Q-Value Function updates.
        '''

    vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
    policy: FinitePolicy[Cell, Move] = FinitePolicy({
        s: Constant(max(d.items(), key=itemgetter(1))[0])
        for s, d in q.items()
    })
    return (vf_dict, policy)
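# A minimal sketch of what could fill the placeholder above, mirroring the SARSA
# implementation in get_sarsa_vf_and_policy (an assumption about the intended
# solution, not the original code). The key difference from SARSA is that the
# update bootstraps from the greedy (max-valued) action at the next state, which
# makes Q-learning off-policy. Indented to sit inside the episode loop:
#
#     while state in nt_states:
#         action: Move = WindyGrid.epsilon_greedy_action(state, q, epsilon)
#         next_state, reward = sample_func(state, action)
#         if next_state in nt_states:
#             q[state][action] += step_size * \
#                 (reward + max(q[next_state].values()) - q[state][action])
#         else:
#             q[state][action] += step_size * (reward - q[state][action])
#         state = next_state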
def main(num_pads):
    # there are 2^(num_pads-2) deterministic policies
    fc_mdp: FiniteMarkovDecisionProcess[FrogState, Any] = FrogCroak(num_pads + 1)
    all_fp = list(itertools.product(['A', 'B'], repeat=fc_mdp.num_pads - 2))
    all_mrp_value = []
    for fp in all_fp:
        fdp: FinitePolicy[FrogState, Any] = FinitePolicy(
            {FrogState(i + 1): Constant(fp[i]) for i in range(len(fp))})
        implied_mrp: FiniteMarkovRewardProcess[
            FrogState] = fc_mdp.apply_finite_policy(fdp)
        all_mrp_value.append(implied_mrp.get_value_function_vec(1))
    # find the optimal policy
    max_indices = []
    value_matrix = np.array(all_mrp_value)
    for i in range(num_pads - 1):
        max_indices.append(np.argmax(value_matrix[:, i]))
    max_index = list(set(max_indices))[0]
    print(value_matrix[max_index, :])
    print(all_fp[max_index])
    plt.plot([
        'State' + str(i + 1) + ',' + all_fp[max_index][i]
        for i in range(num_pads - 1)
    ], value_matrix[max_index, :], 'o')
    plt.xlabel('Frog State')
    plt.ylabel('Probability')
    plt.title('n = ' + str(num_pads - 1))
    plt.show()
def setUp(self):
    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0

    self.gamma = 0.9

    self.si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] = \
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        )

    self.fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
        InventoryState(alpha, beta):
        Constant(user_capacity - (alpha + beta))
        for alpha in range(user_capacity + 1)
        for beta in range(user_capacity + 1 - alpha)
    })

    self.implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
        self.si_mdp.apply_finite_policy(self.fdp)

    self.states: Sequence[InventoryState] = \
        self.implied_mrp.non_terminal_states
def setUp(self):
    ii = 12
    self.steps = 8
    pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
    self.cp: ClearancePricingMDP = ClearancePricingMDP(
        initial_inventory=ii,
        time_steps=self.steps,
        price_lambda_pairs=pairs
    )

    def policy_func(x: int) -> int:
        return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))

    stationary_policy: FinitePolicy[int, int] = FinitePolicy(
        {s: Constant(policy_func(s)) for s in range(ii + 1)}
    )
    self.single_step_mrp: FiniteMarkovRewardProcess[int] = \
        self.cp.single_step_mdp.apply_finite_policy(stationary_policy)
    self.mrp_seq = unwrap_finite_horizon_MRP(
        finite_horizon_MRP(self.single_step_mrp, self.steps)
    )
    self.single_step_mdp: FiniteMarkovDecisionProcess[int, int] = \
        self.cp.single_step_mdp
    self.mdp_seq = unwrap_finite_horizon_MDP(
        finite_horizon_MDP(self.single_step_mdp, self.steps)
    )
def optimal_vf_and_policy(
    steps: Sequence[StateActionMapping[S, A]],
    gamma: float
) -> Iterator[Tuple[V[S], FinitePolicy[S, A]]]:
    """Use backwards induction to find the optimal value function and optimal
    policy at each time step
    """
    v_p: List[Tuple[Dict[S, float], FinitePolicy[S, A]]] = []

    for step in reversed(steps):
        this_v: Dict[S, float] = {}
        this_a: Dict[S, FiniteDistribution[A]] = {}
        for s, actions_map in step.items():
            if actions_map is not None:
                action_values = ((res.expectation(
                    lambda s_r: s_r[1] + gamma * (
                        v_p[-1][0][s_r[0]]
                        if len(v_p) > 0 and s_r[0] in v_p[-1][0] else 0.0
                    )
                ), a) for a, res in actions_map.items())
                v_star, a_star = max(action_values, key=itemgetter(0))
                this_v[s] = v_star
                this_a[s] = Constant(a_star)
        v_p.append((this_v, FinitePolicy(this_a)))

    return reversed(v_p)
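# A minimal usage sketch (illustrative): run backwards induction on an unwrapped
# finite-horizon MDP. Here mdp_seq is assumed to be built the same way as
# self.mdp_seq in the clearance-pricing setUp above, via
# unwrap_finite_horizon_MDP(finite_horizon_MDP(...)).
for t, (step_vf, step_pi) in enumerate(optimal_vf_and_policy(mdp_seq, gamma=1.0)):
    print(f"Time Step {t}")
    print(step_vf)
    print(step_pi)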
def get_all_deterministic_policies(
    self
) -> Sequence[FinitePolicy[LilypadState, str]]:
    bin_to_act = {'0': 'A', '1': 'B'}
    all_action_comb = self.get_all_action_combinations()
    all_policies = []
    for action_comb in all_action_comb:
        policy: FinitePolicy[LilypadState, str] = FinitePolicy(
            {LilypadState(i + 1): Constant(bin_to_act[a])
             for i, a in enumerate(action_comb)}
        )
        all_policies.append(policy)
    return all_policies
def get_policies(n) -> Iterable[FinitePolicy[StatePond, Action]]:
    list_policies: List[FinitePolicy[StatePond, Action]] = []
    liste_actions: list = list(itertools.product(['A', 'B'], repeat=n - 1))
    for i in liste_actions:
        policy_map: Mapping[StatePond, Optional[FiniteDistribution[Action]]] = {}
        policy_map[StatePond(0)] = None
        policy_map[StatePond(n)] = None
        for j in range(0, n - 1):
            policy_map[StatePond(j + 1)] = Constant(Action(i[j]))
        list_policies.append(FinitePolicy(policy_map))
    return list_policies
def get_opt_vf_from_q(
    q_value: Mapping[Tuple[S, A], float]
) -> Tuple[Mapping[S, float], FinitePolicy[S, A]]:
    v: Mapping[S, float] = {}
    policy_map: Mapping[S, Optional[Constant[A]]] = {}
    for i in q_value:
        state, action = i
        if state not in v.keys() or q_value[i] > v[state]:
            v[state] = q_value[i]
            policy_map[state] = Constant(action)
    Pi = FinitePolicy(policy_map)
    return (v, Pi)
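# A minimal usage sketch (illustrative; assumes si_mdp, states and actions as
# constructed in the sketch after sarsa_control_scratch above): extract the optimal
# value function and greedy policy from the Q-value dictionary returned by
# mc_control_scratch or sarsa_control_scratch.
q_mc = mc_control_scratch(si_mdp, states, actions, γ=0.9)
opt_vf_from_q, opt_pi_from_q = get_opt_vf_from_q(q_mc)
print(opt_vf_from_q)
print(opt_pi_from_q)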
def get_vf_and_policy_from_qvf(
    mdp: FiniteMarkovDecisionProcess[S, A],
    qvf: FunctionApprox[Tuple[S, A]]
) -> Tuple[V[S], FinitePolicy[S, A]]:
    opt_vf: V[S] = {
        s: max(qvf((s, a)) for a in mdp.actions(s))
        for s in mdp.non_terminal_states
    }
    opt_policy: FinitePolicy[S, A] = FinitePolicy({
        s: Constant(qvf.argmax((s, a) for a in mdp.actions(s))[1])
        for s in mdp.non_terminal_states
    })
    return opt_vf, opt_policy
def greedy_policy_from_vf(
    mdp: FiniteMarkovDecisionProcess[S, A],
    vf: V[S],
    gamma: float
) -> FinitePolicy[S, A]:
    greedy_policy_dict: Dict[S, FiniteDistribution[A]] = {}

    for s in mdp.non_terminal_states:
        q_values: Iterator[Tuple[A, float]] = \
            ((a, mdp.mapping[s][a].expectation(
                lambda s_r: s_r[1] + gamma * vf.get(s_r[0], 0.)
            )) for a in mdp.actions(s))
        greedy_policy_dict[s] = \
            Constant(max(q_values, key=operator.itemgetter(1))[0])

    return FinitePolicy(greedy_policy_dict)
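# A minimal usage sketch (illustrative; si_mdp and gamma=0.9 are assumptions taken
# from the inventory examples elsewhere in this file): given a value function
# dictionary, e.g. the one returned by value_iteration_result, recover the
# corresponding greedy deterministic policy.
vi_vf, _ = value_iteration_result(si_mdp, gamma=0.9)
greedy_pi = greedy_policy_from_vf(si_mdp, vi_vf, gamma=0.9)
print(greedy_pi)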
def initialize(
    mdp: FiniteMarkovDecisionProcess
) -> Tuple[V[S], FinitePolicy]:
    """Initialize value function and policy.

    Initialize the value function to zeros at each state, and initialize
    the policy to a random choice of the action space at each
    non-terminal state.

    :param mdp: Object representation of a finite Markov decision process
    :returns: Value function initialized at zeros for each state
    :returns: Random initial policy
    """
    # Set the value function at each state equal to zero
    v_0: V[S] = {s: 0 for s in mdp.states()}

    # Set the policy to be a random choice of the action space at each
    # non-terminal state
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states}
    )
    return v_0, pi_0
def get_sarsa_vf_and_policy(
    self,
    states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    episodes: int = 10000,
    step_size: float = 0.01
) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
    '''
    states_actions_dict gives us the set of possible moves from
    a non-block cell.
    sample_func is a function with two inputs: state and action,
    and with output as a sampled pair of (next_state, reward).
    '''
    q: Dict[Cell, Dict[Move, float]] = \
        {s: {a: 0. for a in actions} for s, actions in
         states_actions_dict.items() if actions is not None}
    nt_states: CellSet = {s for s in q}
    uniform_states: Choose[Cell] = Choose(nt_states)
    for episode_num in range(episodes):
        epsilon: float = 1.0 / (episode_num + 1)
        state: Cell = uniform_states.sample()
        action: Move = WindyGrid.epsilon_greedy_action(state, q, epsilon)
        while state in nt_states:
            next_state, reward = sample_func(state, action)
            if next_state in nt_states:
                next_action: Move = WindyGrid.epsilon_greedy_action(
                    next_state, q, epsilon)
                q[state][action] += step_size * \
                    (reward + q[next_state][next_action] - q[state][action])
                action = next_action
            else:
                q[state][action] += step_size * (reward - q[state][action])
            state = next_state

    vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
    policy: FinitePolicy[Cell, Move] = FinitePolicy({
        s: Constant(max(d.items(), key=itemgetter(1))[0])
        for s, d in q.items()
    })
    return (vf_dict, policy)
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    approx0: FunctionApprox[S]
) -> Iterator[Tuple[FunctionApprox[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[FunctionApprox[S], FinitePolicy[S, A]])\
            -> Tuple[FunctionApprox[S], FinitePolicy[S, A]]:
        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        # policy_vf: FunctionApprox[S] = \
        #     approximate_policy_evaluation_result(mdp, pi, vf)
        policy_vf: FunctionApprox[S] = evaluate_mrp_result(mrp, gamma, vf)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_approx_vf(
            mdp, policy_vf, gamma)
        return policy_vf, improved_pi

    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states})
    return iterate(update, (approx0, pi_0))
def get_optimality(n: int) -> Tuple[Tuple[str, ...], np.ndarray]:
    fl_mdp = FrogAndLilypadsMDP(n)
    print(fl_mdp.get_action_transition_reward_map())
    deterministic_policies = product("AB", repeat=n - 1)
    odp = None
    odp_keys = None
    ovf = None
    for prod in deterministic_policies:
        policy_map = {0: None, n: None}
        for i in range(1, n):
            policy_map[i] = Categorical({prod[i - 1]: 1})
        policy = FinitePolicy(policy_map)
        fl_mrp = fl_mdp.apply_finite_policy(policy)
        value_function = fl_mrp.get_value_function_vec(1)
        if odp is None:
            odp = policy
            odp_keys = prod
            ovf = value_function
        else:
            comparison = [(value_function[i] > ovf[i]) for i in range(n - 1)]
            if all(comparison):
                odp = policy
                odp_keys = prod
                ovf = value_function
    return (odp_keys, ovf)
# start state distribution: every non-terminal state has equal probability
# of being the start state
start_states = Categorical({
    state: 1 / len(si_mdp.non_terminal_states)
    for state in si_mdp.non_terminal_states
})
mc_tabular_control = mc_control(si_mdp, start_states,
                                Tabular(start_map, start_map), user_gamma, 800)
values_map = mc_tabular_control.values_map
opt_vf, opt_pi = get_optimal_policy(values_map)
print('opt_vf mc control: \n', opt_vf, '\nopt_pi mc control: \n', opt_pi)

fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
    InventoryState(alpha, beta): Constant(user_capacity - (alpha + beta))
    for alpha in range(user_capacity + 1)
    for beta in range(user_capacity + 1 - alpha)
})
implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
    si_mdp.apply_finite_policy(fdp)

print("MDP Value Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
print(opt_vf_vi, '\n')
print(opt_policy_vi)

print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_pi, opt_policy_pi = policy_iteration_result(si_mdp, gamma=user_gamma)
def compare_mc_sarsa_ql(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    method_mask: Tuple[bool, bool, bool],
    learning_rates: Sequence[Tuple[float, float, float]],
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    q_learning_epsilon: float,
    mc_episode_length_tol: float,
    num_episodes: int,
    plot_batch: int,
    plot_start: int
) -> None:
    true_vf: V[S] = value_iteration_result(fmdp, gamma)[0]
    states: Sequence[S] = fmdp.non_terminal_states
    colors: Sequence[str] = ['b', 'g', 'r', 'k', 'c', 'm', 'y']

    import matplotlib.pyplot as plt
    plt.figure(figsize=(11, 7))

    if method_mask[0]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            mc_funcs_it: Iterator[FunctionApprox[Tuple[S, A]]] = \
                glie_mc_finite_control_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    episode_length_tolerance=mc_episode_length_tol
                )
            mc_errors = []
            batch_mc_errs = []
            for i, mc_qvf in enumerate(
                    itertools.islice(mc_funcs_it, num_episodes)):
                mc_vf: V[S] = {
                    s: max(mc_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_mc_errs.append(sqrt(
                    sum((mc_vf[s] - true_vf[s])**2 for s in states) /
                    len(states)
                ))
                if i % plot_batch == plot_batch - 1:
                    mc_errors.append(sum(batch_mc_errs) / plot_batch)
                    batch_mc_errs = []
            mc_plot = mc_errors[plot_start:]
            label = f"MC InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(
                range(len(mc_plot)),
                mc_plot,
                color=colors[k],
                linestyle='-',
                label=label
            )

    sample_episodes: int = 1000
    uniform_policy: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(fmdp.actions(s))) for s in states}
    )
    fmrp: FiniteMarkovRewardProcess[S] = \
        fmdp.apply_finite_policy(uniform_policy)
    td_episode_length: int = int(round(sum(
        len(list(returns(
            trace=fmrp.simulate_reward(Choose(set(states))),
            γ=gamma,
            tolerance=mc_episode_length_tol
        ))) for _ in range(sample_episodes)
    ) / sample_episodes))

    if method_mask[1]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            sarsa_funcs_it: Iterator[FunctionApprox[Tuple[S, A]]] = \
                glie_sarsa_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    max_episode_length=td_episode_length,
                )
            sarsa_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_sarsa_errs = []
            for i, sarsa_qvf in enumerate(
                    itertools.islice(sarsa_funcs_it,
                                     num_episodes * td_episode_length)):
                sarsa_vf: V[S] = {
                    s: max(sarsa_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_sarsa_errs.append(sqrt(
                    sum((sarsa_vf[s] - true_vf[s])**2 for s in states) /
                    len(states)
                ))
                if i % transitions_batch == transitions_batch - 1:
                    sarsa_errors.append(
                        sum(batch_sarsa_errs) / transitions_batch)
                    batch_sarsa_errs = []
            sarsa_plot = sarsa_errors[plot_start:]
            label = f"SARSA InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(
                range(len(sarsa_plot)),
                sarsa_plot,
                color=colors[k],
                linestyle='--',
                label=label
            )

    if method_mask[2]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            ql_funcs_it: Iterator[FunctionApprox[Tuple[S, A]]] = \
                q_learning_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon=q_learning_epsilon,
                    max_episode_length=td_episode_length,
                )
            ql_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_ql_errs = []
            for i, ql_qvf in enumerate(
                    itertools.islice(ql_funcs_it,
                                     num_episodes * td_episode_length)):
                ql_vf: V[S] = {
                    s: max(ql_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_ql_errs.append(sqrt(
                    sum((ql_vf[s] - true_vf[s])**2 for s in states) /
                    len(states)
                ))
                if i % transitions_batch == transitions_batch - 1:
                    ql_errors.append(sum(batch_ql_errs) / transitions_batch)
                    batch_ql_errs = []
            ql_plot = ql_errors[plot_start:]
            label = f"Q-Learning InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(
                range(len(ql_plot)),
                ql_plot,
                color=colors[k],
                linestyle=':',
                label=label
            )

    plt.xlabel("Episode Batches", fontsize=20)
    plt.ylabel("Optimal Value Function RMSE", fontsize=20)
    plt.title("RMSE as function of episode batches", fontsize=20)
    plt.grid(True)
    plt.legend(fontsize=10)
    plt.show()
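# A minimal usage sketch (illustrative; all parameter values below are placeholder
# assumptions): compare the three methods on the inventory MDP with a single
# learning-rate schedule.
compare_mc_sarsa_ql(
    fmdp=si_mdp,
    method_mask=(True, True, True),
    learning_rates=[(0.03, 1000.0, 0.5)],
    gamma=0.9,
    epsilon_as_func_of_episodes=lambda k: 1. / k,
    q_learning_epsilon=0.2,
    mc_episode_length_tol=1e-5,
    num_episodes=500,
    plot_batch=10,
    plot_start=0
)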
fe_mdp = FrogEscapeMDP(
    n=6,
    initial_pad=1
)

print("MDP Transition Map")
print("------------------")
print(fe_mdp)

# set up a policy that croaks A or B with equal probability at every
# non-terminal lilypad
fdp: FinitePolicy[FrogEscapeState, int] = FinitePolicy(
    {FrogEscapeState(i): Categorical({0: 0.5, 1: 0.5})
     for i in range(1, fe_mdp.n)}
)

print("Policy Map")
print("----------")
print(fdp)

implied_mrp: FiniteMarkovRewardProcess[FrogEscapeState] = \
    fe_mdp.apply_finite_policy(fdp)

print("Implied MP Transition Map")
print("--------------")
print(FiniteMarkovProcess(implied_mrp.transition_map))
steps = 8
pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
cp: ClearancePricingMDP = ClearancePricingMDP(
    initial_inventory=ii,
    time_steps=steps,
    price_lambda_pairs=pairs
)

print("Clearance Pricing MDP")
print("---------------------")
print(cp.mdp)


def policy_func(x: int) -> int:
    return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))


stationary_policy: FinitePolicy[int, int] = FinitePolicy(
    {s: Constant(policy_func(s)) for s in range(ii + 1)}
)

single_step_mrp: FiniteMarkovRewardProcess[int] = \
    cp.single_step_mdp.apply_finite_policy(stationary_policy)

vf_for_policy: Iterator[V[int]] = evaluate(
    unwrap_finite_horizon_MRP(finite_horizon_MRP(single_step_mrp, steps)),
    1.
)

print("Value Function for Stationary Policy")
print("------------------------------------")
for t, vf in enumerate(vf_for_policy):
    print(f"Time Step {t:d}")
    print("---------------")