def get_q_learning_vf_and_policy(
    self,
    states_actions_dict: Mapping[Cell, Set[Move]],
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    episodes: int = 10000,
    step_size: float = 0.01,
    epsilon: float = 0.1
) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
    '''
    states_actions_dict gives us the set of possible moves from
    a non-block cell.
    sample_func is a function with two inputs: state and action,
    and with output as a sampled pair of (next_state, reward).
    '''
    q: Dict[Cell, Dict[Move, float]] = \
        {s: {a: 0. for a in actions}
         for s, actions in states_actions_dict.items()}
    nt_states: CellSet = {s for s in q}
    uniform_states: Choose[Cell] = Choose(nt_states)
    for episode_num in range(episodes):
        state: Cell = uniform_states.sample()
        '''
        write your code here
        update the dictionary q initialized above according
        to the Q-learning algorithm's Q-Value Function updates.
        '''
    vf_dict: V[Cell] = {NonTerminal(s): max(d.values())
                        for s, d in q.items()}
    policy: FiniteDeterministicPolicy[Cell, Move] = \
        FiniteDeterministicPolicy(
            {s: max(d.items(), key=itemgetter(1))[0]
             for s, d in q.items()}
        )
    return (vf_dict, policy)

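# A self-contained sketch of the Q-learning updates that the placeholder above
# asks for, on a made-up two-state toy problem (toy_actions and toy_sample are
# hypothetical names, not from the book). Each sampled transition moves
# q[state][action] toward reward + max over next-state actions; bootstrapping
# off the greedy action rather than the action actually taken next is what
# makes Q-learning off-policy.
import random

toy_actions = {0: {'stay', 'go'}, 1: {'stay', 'go'}}  # state 2 is terminal


def toy_sample(state, action):
    # 'go' moves one cell right for reward -1; 'stay' stays put for reward -2
    return (state + 1, -1.0) if action == 'go' else (state, -2.0)


q = {s: {a: 0. for a in acts} for s, acts in toy_actions.items()}
step_size, epsilon = 0.1, 0.1
for _ in range(10000):
    state = random.choice(list(q))
    while state in q:  # episode ends once a terminal state (not in q) is hit
        greedy = max(q[state], key=q[state].get)
        action = greedy if random.random() > epsilon \
            else random.choice(list(q[state]))
        next_state, reward = toy_sample(state, action)
        target = reward + \
            (max(q[next_state].values()) if next_state in q else 0.)
        q[state][action] += step_size * (target - q[state][action])
        state = next_state
print(q)  # 'go' should dominate 'stay' in both states
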
def setUp(self):
    ii = 10
    self.steps = 6
    pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
    self.cp: ClearancePricingMDP = ClearancePricingMDP(
        initial_inventory=ii,
        time_steps=self.steps,
        price_lambda_pairs=pairs
    )

    def policy_func(x: int) -> int:
        return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))

    stationary_policy: FiniteDeterministicPolicy[int, int] = \
        FiniteDeterministicPolicy(
            {s: policy_func(s) for s in range(ii + 1)}
        )

    self.single_step_mrp: FiniteMarkovRewardProcess[int] = \
        self.cp.single_step_mdp.apply_finite_policy(stationary_policy)
    self.mrp_seq = unwrap_finite_horizon_MRP(
        finite_horizon_MRP(self.single_step_mrp, self.steps)
    )

    self.single_step_mdp: FiniteMarkovDecisionProcess[int, int] = \
        self.cp.single_step_mdp
    self.mdp_seq = unwrap_finite_horizon_MDP(
        finite_horizon_MDP(self.single_step_mdp, self.steps)
    )

def setUp(self):
    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0

    self.gamma = 0.9

    self.si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] = \
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        )

    self.fdp: FiniteDeterministicPolicy[InventoryState, int] = \
        FiniteDeterministicPolicy(
            {InventoryState(alpha, beta): user_capacity - (alpha + beta)
             for alpha in range(user_capacity + 1)
             for beta in range(user_capacity + 1 - alpha)}
        )

    self.implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
        self.si_mdp.apply_finite_policy(self.fdp)

    self.states: Sequence[NonTerminal[InventoryState]] = \
        self.implied_mrp.non_terminal_states

def get_vf_and_policy_from_qvf(
    mdp: FiniteMarkovDecisionProcess[S, A],
    qvf: QValueFunctionApprox[S, A]
) -> Tuple[V[S], FiniteDeterministicPolicy[S, A]]:
    opt_vf: V[S] = {
        s: max(qvf((s, a)) for a in mdp.actions(s))
        for s in mdp.non_terminal_states
    }
    opt_policy: FiniteDeterministicPolicy[S, A] = \
        FiniteDeterministicPolicy({
            s.state: qvf.argmax((s, a) for a in mdp.actions(s))[1]
            for s in mdp.non_terminal_states
        })
    return opt_vf, opt_policy

def greedy_policy_from_vf(
    mdp: FiniteMarkovDecisionProcess[S, A],
    vf: V[S],
    gamma: float
) -> FiniteDeterministicPolicy[S, A]:
    greedy_policy_dict: Dict[S, A] = {}

    for s in mdp.non_terminal_states:
        q_values: Iterator[Tuple[A, float]] = \
            ((a, mdp.mapping[s][a].expectation(
                lambda s_r: s_r[1] + gamma * extended_vf(vf, s_r[0])
            )) for a in mdp.actions(s))

        greedy_policy_dict[s.state] = \
            max(q_values, key=operator.itemgetter(1))[0]

    return FiniteDeterministicPolicy(greedy_policy_dict)

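# A minimal numerical illustration (toy values, not from the book) of the
# one-step lookahead performed by greedy_policy_from_vf: score each action by
# E[reward + gamma * vf(next_state)] under its transition distribution, then
# pick the argmax action. The MDP here is a plain dict of
# (next_state, reward, probability) triples rather than the library's types.
vf_toy = {'low': 5.0, 'high': 10.0}
actions_toy = {
    'wait':  [('low', 0.0, 1.0)],
    'order': [('high', -2.0, 0.9), ('low', -2.0, 0.1)],
}
gamma_toy = 0.9
q_toy = {
    a: sum(p * (r + gamma_toy * vf_toy[s1]) for s1, r, p in outcomes)
    for a, outcomes in actions_toy.items()
}
print(q_toy, max(q_toy, key=q_toy.get))  # 'order' wins: 6.55 > 4.5
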
def get_sarsa_vf_and_policy(
    self,
    states_actions_dict: Mapping[Cell, Set[Move]],
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    episodes: int = 10000,
    step_size: float = 0.01
) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
    '''
    states_actions_dict gives us the set of possible moves from
    a non-terminal cell.
    sample_func is a function with two inputs: state and action,
    and with output as a sampled pair of (next_state, reward).
    '''
    q: Dict[Cell, Dict[Move, float]] = \
        {s: {a: 0. for a in actions}
         for s, actions in states_actions_dict.items()}
    nt_states: CellSet = {s for s in q}
    uniform_states: Choose[Cell] = Choose(nt_states)
    for episode_num in range(episodes):
        epsilon: float = 1.0 / (episode_num + 1)
        state: Cell = uniform_states.sample()
        action: Move = WindyGrid.epsilon_greedy_action(state, q, epsilon)
        while state in nt_states:
            next_state, reward = sample_func(state, action)
            if next_state in nt_states:
                next_action: Move = WindyGrid.epsilon_greedy_action(
                    next_state, q, epsilon)
                q[state][action] += step_size * \
                    (reward + q[next_state][next_action] -
                     q[state][action])
                action = next_action
            else:
                q[state][action] += step_size * \
                    (reward - q[state][action])
            state = next_state
    vf_dict: V[Cell] = {
        NonTerminal(s): max(d.values()) for s, d in q.items()
    }
    policy: FiniteDeterministicPolicy[Cell, Move] = \
        FiniteDeterministicPolicy(
            {s: max(d.items(), key=itemgetter(1))[0]
             for s, d in q.items()}
        )
    return vf_dict, policy

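# Side-by-side view of the two TD targets (illustrative toy numbers, not from
# the book): SARSA above bootstraps off the action the epsilon-greedy behavior
# policy actually takes at next_state (on-policy), whereas the Q-learning
# exercise earlier bootstraps off the best action at next_state (off-policy).
q_next = {'up': 1.0, 'down': 3.0}   # hypothetical q values at next_state
reward, next_action = -1.0, 'up'    # suppose epsilon-greedy explored 'up'
sarsa_target = reward + q_next[next_action]        # -1 + 1 = 0.0
q_learning_target = reward + max(q_next.values())  # -1 + 3 = 2.0
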
def optimal_vf_and_policy(
    steps: Sequence[StateActionMapping[S, A]],
    gamma: float
) -> Iterator[Tuple[V[S], FiniteDeterministicPolicy[S, A]]]:
    '''Use backwards induction to find the optimal value function
    and optimal policy at each time step
    '''
    v_p: List[Tuple[V[S], FiniteDeterministicPolicy[S, A]]] = []

    for step in reversed(steps):
        this_v: Dict[NonTerminal[S], float] = {}
        this_a: Dict[S, A] = {}
        for s, actions_map in step.items():
            action_values = ((res.expectation(lambda s_r: s_r[1] + gamma * (
                extended_vf(v_p[-1][0], s_r[0]) if len(v_p) > 0 else 0.
            )), a) for a, res in actions_map.items())
            v_star, a_star = max(action_values, key=itemgetter(0))
            this_v[s] = v_star
            this_a[s.state] = a_star
        v_p.append((this_v, FiniteDeterministicPolicy(this_a)))

    return reversed(v_p)

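# A self-contained numerical sketch (toy problem, not from the book) of the
# backward-induction recursion implemented above: starting from the final time
# step, each step's optimal value max over actions of
# E[reward + gamma * V_{t+1}(next_state)] feeds into the step before it, with
# v_p[-1] playing the role of V_{t+1}. Plain dicts of
# (next_state, reward, probability) triples stand in for the library's types;
# a next_state of None denotes termination after the last step.
steps_toy = [
    {'s': {'a': [('s', 1.0, 1.0)], 'b': [('s', 0.0, 1.0)]}},   # t = 0
    {'s': {'a': [(None, 2.0, 1.0)], 'b': [(None, 5.0, 1.0)]}}  # t = 1
]
gamma_toy = 1.0
v_p = []
for step in reversed(steps_toy):
    this_v, this_a = {}, {}
    for s, actions_map in step.items():
        action_values = [
            (sum(p * (r + gamma_toy * (v_p[-1][0].get(s1, 0.) if v_p else 0.))
                 for s1, r, p in outcomes), a)
            for a, outcomes in actions_map.items()
        ]
        this_v[s], this_a[s] = max(action_values)
    v_p.append((this_v, this_a))
print(list(reversed(v_p)))  # t=0: value 6.0 via 'a'; t=1: value 5.0 via 'b'
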
from pprint import pprint

ii = 12
steps = 8
pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
cp: ClearancePricingMDP = ClearancePricingMDP(
    initial_inventory=ii,
    time_steps=steps,
    price_lambda_pairs=pairs
)

print("Clearance Pricing MDP")
print("---------------------")
print(cp.mdp)


def policy_func(x: int) -> int:
    return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))


stationary_policy: FiniteDeterministicPolicy[int, int] = \
    FiniteDeterministicPolicy({s: policy_func(s) for s in range(ii + 1)})

single_step_mrp: FiniteMarkovRewardProcess[int] = \
    cp.single_step_mdp.apply_finite_policy(stationary_policy)

vf_for_policy: Iterator[V[int]] = evaluate(
    unwrap_finite_horizon_MRP(finite_horizon_MRP(single_step_mrp, steps)),
    1.
)

print("Value Function for Stationary Policy")
print("------------------------------------")
for t, vf in enumerate(vf_for_policy):
    print(f"Time Step {t:d}")
    print("---------------")
    pprint(vf)

si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] = \
    SimpleInventoryMDPCap(
        capacity=user_capacity,
        poisson_lambda=user_poisson_lambda,
        holding_cost=user_holding_cost,
        stockout_cost=user_stockout_cost
    )

print("MDP Transition Map")
print("------------------")
print(si_mdp)

fdp: FiniteDeterministicPolicy[InventoryState, int] = \
    FiniteDeterministicPolicy(
        {InventoryState(alpha, beta): user_capacity - (alpha + beta)
         for alpha in range(user_capacity + 1)
         for beta in range(user_capacity + 1 - alpha)}
    )

print("Deterministic Policy Map")
print("------------------------")
print(fdp)

implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
    si_mdp.apply_finite_policy(fdp)

print("Implied MP Transition Map")
print("-------------------------")
print(FiniteMarkovProcess(
    {s.state: Categorical({s1.state: p for s1, p in v.table().items()})
     for s, v in implied_mrp.transition_map.items()}
))