def test_converge(self):
    def close(a, b):
        return abs(a - b) < 0.1

    ns = (1.0 / n for n in iterate(lambda x: x + 1, start=1))
    self.assertAlmostEqual(converged(ns, close), 0.33, places=2)

    ns = (1.0 / n for n in iterate(lambda x: x + 1, start=1))
    all_ns = [1.0, 0.5, 0.33]
    for got, expected in zip(converge(ns, close), all_ns):
        self.assertAlmostEqual(got, expected, places=2)
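# A minimal sketch (not the library's source) of what `converge` and
# `converged` could look like, consistent with test_converge above:
# `converge` yields values until two successive ones satisfy `done`,
# stopping before yielding the second of the close pair, and `converged`
# returns the last yielded value. The actual implementations in the
# library's iterate module may differ in detail.
from typing import Callable, Iterator, Optional, TypeVar

X = TypeVar('X')


def converge(values: Iterator[X], done: Callable[[X, X], bool]) -> Iterator[X]:
    a: Optional[X] = next(values, None)
    if a is None:
        return
    yield a
    for b in values:
        if done(a, b):
            return
        a = b
        yield b


def converged(values: Iterator[X], done: Callable[[X, X], bool]) -> X:
    result: Optional[X] = None
    for result in converge(values, done):
        pass
    if result is None:
        raise ValueError("converged called on an empty iterator")
    return result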
def policy_iteration(
    mdp: MarkovDecisionProcess[S, A],
    γ: float,
    approx_v_0: FunctionApprox[S],
    non_terminal_states_distribution: Distribution[S],
    num_state_samples: int
) -> Iterator[Tuple[FunctionApprox[S], ThisPolicy[S, A]]]:
    def update(vf_policy: Tuple[FunctionApprox[S], ThisPolicy[S, A]]) \
            -> Tuple[FunctionApprox[S], ThisPolicy[S, A]]:
        nt_states: Sequence[S] = non_terminal_states_distribution\
            .sample_n(num_state_samples)
        vf, pi = vf_policy
        mrp: MarkovRewardProcess[S] = mdp.apply_policy(pi)
        new_vf: FunctionApprox[S] = converged(
            evaluate_mrp(mrp, γ, vf, non_terminal_states_distribution,
                         num_state_samples),
            done=lambda a, b: a.within(b, 1e-4)
        )

        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + γ * new_vf.evaluate([s1]).item()

        return (new_vf.update(
            [(s, max(mdp.step(s, a).expectation(return_)
                     for a in mdp.actions(s)))
             for s in nt_states]
        ), ThisPolicy(mdp, return_))

    def return_(s_r: Tuple[S, float]) -> float:
        s1, r = s_r
        return r + γ * approx_v_0.evaluate([s1]).item()

    return iterate(update, (approx_v_0, ThisPolicy(mdp, return_)))
def value_iteration(
    mdp: MarkovDecisionProcess[S, A],
    γ: float,
    approx_0: FunctionApprox[S],
    non_terminal_states_distribution: Distribution[S],
    num_state_samples: int,
) -> Iterator[FunctionApprox[S]]:
    """Iteratively calculate the Optimal Value function for the given
    Markov Decision Process, using the given FunctionApprox to approximate
    the Optimal Value function at each step for a random sample of the
    process' non-terminal states.
    """
    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:
        nt_states: Sequence[S] = non_terminal_states_distribution.sample_n(
            num_state_samples
        )

        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + γ * v.evaluate([s1]).item()

        return v.update(
            [(s, max(mdp.step(s, a).expectation(return_)
                     for a in mdp.actions(s)))
             for s in nt_states]
        )

    return iterate(update, approx_0)
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    matrix_method_for_mrp_eval: bool = False
) -> Iterator[Tuple[V[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''
    def update(vf_policy: Tuple[V[S], FinitePolicy[S, A]])\
            -> Tuple[V[S], FinitePolicy[S, A]]:
        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        policy_vf: V[S] = {mrp.non_terminal_states[i]: v for i, v in
                           enumerate(mrp.get_value_function_vec(gamma))}\
            if matrix_method_for_mrp_eval else evaluate_mrp_result(mrp, gamma)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_vf(
            mdp,
            policy_vf,
            gamma
        )
        return policy_vf, improved_pi

    v_0: V[S] = {s: 0.0 for s in mdp.non_terminal_states}
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states}
    )
    return iterate(update, (v_0, pi_0))
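# A usage sketch (illustrative, not part of the source) of running the
# tabular policy_iteration above to approximate convergence with the
# converge/converged helpers used elsewhere in this section. The helper name
# `policy_iteration_result_sketch`, the tolerance, and the stopping criterion
# (successive value functions agreeing within the tolerance) are assumptions.
def policy_iteration_result_sketch(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    tolerance: float = 1e-5
) -> Tuple[V[S], FinitePolicy[S, A]]:
    def almost_equal(
        x1: Tuple[V[S], FinitePolicy[S, A]],
        x2: Tuple[V[S], FinitePolicy[S, A]]
    ) -> bool:
        # Compare only the value functions of successive (vf, policy) pairs.
        return max(abs(x1[0][s] - x2[0][s]) for s in x1[0]) < tolerance

    return converged(policy_iteration(mdp, gamma), done=almost_equal)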
def evaluate_mrp(
    mrp: MarkovRewardProcess[S],
    γ: float,
    approx_0: FunctionApprox[S],
    non_terminal_states_distribution: Distribution[S],
    num_state_samples: int,
) -> Iterator[FunctionApprox[S]]:
    """Iteratively calculate the value function for the given Markov Reward
    Process, using the given FunctionApprox to approximate the value function
    at each step for a random sample of the process' non-terminal states.
    """
    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:
        nt_states: Sequence[S] = non_terminal_states_distribution.sample_n(
            num_state_samples
        )

        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + γ * v.evaluate([s1]).item()

        return v.update(
            [(s, mrp.transition_reward(s).expectation(return_))
             for s in nt_states]
        )

    return iterate(update, approx_0)
def value_iteration_finite(
    mdp: FiniteMarkovDecisionProcess[S, A],
    γ: float,
    approx_0: FunctionApprox[S]
) -> Iterator[FunctionApprox[S]]:
    '''Iteratively calculate the Optimal Value function for the given finite
    Markov Decision Process, using the given FunctionApprox to approximate the
    Optimal Value function at each step
    '''
    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:
        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + γ * v.evaluate([s1]).item()

        return v.update(
            [(
                s,
                max(mdp.mapping[s][a].expectation(return_)
                    for a in mdp.actions(s))
            ) for s in mdp.non_terminal_states]
        )

    return iterate(update, approx_0)
def value_iteration(
    mdp: MarkovDecisionProcess[S, A],
    γ: float,
    approx_0: ValueFunctionApprox[S],
    non_terminal_states_distribution: NTStateDistribution[S],
    num_state_samples: int
) -> Iterator[ValueFunctionApprox[S]]:
    '''Iteratively calculate the Optimal Value function for the given
    Markov Decision Process, using the given FunctionApprox to approximate the
    Optimal Value function at each step for a random sample of the process'
    non-terminal states.
    '''
    def update(v: ValueFunctionApprox[S]) -> ValueFunctionApprox[S]:
        nt_states: Sequence[NonTerminal[S]] = \
            non_terminal_states_distribution.sample_n(num_state_samples)

        def return_(s_r: Tuple[State[S], float]) -> float:
            s1, r = s_r
            return r + γ * extended_vf(v, s1)

        return v.update([
            (s, max(mdp.step(s, a).expectation(return_)
                    for a in mdp.actions(s)))
            for s in nt_states
        ])

    return iterate(update, approx_0)
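# A usage sketch (illustrative) of extracting a converged ValueFunctionApprox
# from the approximate value_iteration above. The stopping criterion reuses
# the FunctionApprox.within pattern that the approximate policy_iteration in
# this section applies (with tolerance 1e-4 there); the helper name and the
# default tolerance here are assumptions.
def value_iteration_finish_sketch(
    mdp: MarkovDecisionProcess[S, A],
    γ: float,
    approx_0: ValueFunctionApprox[S],
    non_terminal_states_distribution: NTStateDistribution[S],
    num_state_samples: int,
    tolerance: float = 1e-4
) -> ValueFunctionApprox[S]:
    return converged(
        value_iteration(mdp, γ, approx_0,
                        non_terminal_states_distribution, num_state_samples),
        done=lambda a, b: a.within(b, tolerance)
    )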
def evaluate_mrp(
    mrp: FiniteMarkovRewardProcess[S],
    gamma: float
) -> Iterator[np.ndarray]:
    """Iteratively calculate the value function for the given
    Markov Reward Process.
    """
    def update(v: np.ndarray) -> np.ndarray:
        return mrp.reward_function_vec + gamma * \
            mrp.get_transition_matrix().dot(v)

    v_0: np.ndarray = np.zeros(len(mrp.non_terminal_states))
    return iterate(update, v_0)
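# A usage sketch (illustrative) of consuming the evaluate_mrp iterator above:
# stop once successive value vectors agree within a tolerance and return the
# final values keyed by non-terminal state. The helper name and tolerance are
# assumptions; the pairing with mrp.non_terminal_states mirrors how v_0 is
# built above.
def evaluate_mrp_result_sketch(
    mrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    tolerance: float = 1e-5
) -> V[S]:
    v_star: np.ndarray = converged(
        evaluate_mrp(mrp, gamma),
        done=lambda v1, v2: np.max(np.abs(v1 - v2)) < tolerance
    )
    return {s: v_star[i] for i, s in enumerate(mrp.non_terminal_states)}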
def evaluate_finite_mrp(
    mrp: FiniteMarkovRewardProcess[S],
    γ: float,
    approx_0: FunctionApprox[S]
) -> Iterator[FunctionApprox[S]]:
    '''Iteratively calculate the value function for the given finite
    Markov Reward Process, using the given FunctionApprox to approximate the
    value function at each step.
    '''
    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:
        vs: np.ndarray = v.evaluate(mrp.non_terminal_states)
        updated: np.ndarray = mrp.reward_function_vec + γ * \
            mrp.get_transition_matrix().dot(vs)
        # Pair each non-terminal state with its updated value: the reward
        # vector and transition matrix are indexed by non-terminal states.
        return v.update(zip(mrp.non_terminal_states, updated))

    return iterate(update, approx_0)
def value_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float
) -> Iterator[V[S]]:
    """Calculate the value function (V*) of the given MDP by applying the
    update function repeatedly until the values converge.
    """
    def update(v: V[S]) -> V[S]:
        return {
            s: max(mdp.mapping[s][a].expectation(
                lambda s_r: s_r[1] + gamma * v.get(s_r[0], 0.0)
            ) for a in mdp.actions(s))
            for s in v
        }

    v_0: V[S] = {s: 0.0 for s in mdp.non_terminal_states}
    return iterate(update, v_0)
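# A usage sketch (illustrative) of turning the tabular value_iteration
# iterator above into an optimal value function and a greedy policy.
# greedy_policy_from_vf is the helper already referenced by policy_iteration
# in this section; the helper name, tolerance, and stopping criterion here
# are assumptions.
def value_iteration_result_sketch(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    tolerance: float = 1e-5
) -> Tuple[V[S], FinitePolicy[S, A]]:
    opt_vf: V[S] = converged(
        value_iteration(mdp, gamma),
        done=lambda v1, v2: max(abs(v1[s] - v2[s]) for s in v1) < tolerance
    )
    opt_policy: FinitePolicy[S, A] = greedy_policy_from_vf(mdp, opt_vf, gamma)
    return opt_vf, opt_policy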
def approximate_policy_evaluation(
    mdp: FiniteMarkovDecisionProcess[S, A],
    policy: FinitePolicy[S, A],
    vf: FunctionApprox[S],
    gamma: float
) -> Iterator[FunctionApprox[S]]:
    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:
        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + gamma * v.evaluate([s1]).item()

        return v.update([
            (s, mdp.mapping[s][policy.policy_map[s]].expectation(return_))
            for s in mdp.non_terminal_states
        ])

    return iterate(update, vf)
def evaluate_mrp(
    mrp: MarkovRewardProcess[S],
    γ: float,
    approx_0: ValueFunctionApprox[S],
    non_terminal_states_distribution: NTStateDistribution[S],
    num_state_samples: int
) -> Iterator[ValueFunctionApprox[S]]:
    '''Iteratively calculate the value function for the given Markov Reward
    Process, using the given FunctionApprox to approximate the value function
    at each step for a random sample of the process' non-terminal states.
    '''
    def update(v: ValueFunctionApprox[S]) -> ValueFunctionApprox[S]:
        nt_states: Sequence[NonTerminal[S]] = \
            non_terminal_states_distribution.sample_n(num_state_samples)

        def return_(s_r: Tuple[State[S], float]) -> float:
            s1, r = s_r
            return r + γ * extended_vf(v, s1)

        return v.update([(s, mrp.transition_reward(s).expectation(return_))
                         for s in nt_states])

    return iterate(update, approx_0)
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    approx0: FunctionApprox[S]
) -> Iterator[Tuple[FunctionApprox[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''
    def update(vf_policy: Tuple[FunctionApprox[S], FinitePolicy[S, A]])\
            -> Tuple[FunctionApprox[S], FinitePolicy[S, A]]:
        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        # Alternative:
        # policy_vf = approximate_policy_evaluation_result(mdp, pi, vf)
        policy_vf: FunctionApprox[S] = evaluate_mrp_result(mrp, gamma, vf)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_approx_vf(
            mdp,
            policy_vf,
            gamma
        )
        return policy_vf, improved_pi

    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states}
    )
    return iterate(update, (approx0, pi_0))
def test_iterate(self):
    ns = iterate(lambda x: x + 1, start=0)
    self.assertEqual(list(itertools.islice(ns, 5)), list(range(0, 5)))
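# A minimal sketch (not the library's source) of `iterate`, consistent with
# test_iterate above and with test_converge: lazily yield `start`, then keep
# applying `step` to the previous value. The actual implementation may differ
# in detail.
from typing import Callable, Iterator, TypeVar

X = TypeVar('X')


def iterate_sketch(step: Callable[[X], X], start: X) -> Iterator[X]:
    state = start
    while True:
        yield state
        state = step(state)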