class TestFiniteDistribution(unittest.TestCase):
    def setUp(self):
        self.die = Choose({1, 2, 3, 4, 5, 6})
        self.ragged = Categorical({0: 0.9, 1: 0.05, 2: 0.025, 3: 0.025})

    def test_map(self):
        plusOne = self.die.map(lambda x: x + 1)
        assert_almost_equal(self, plusOne, Choose({2, 3, 4, 5, 6, 7}))

        evenOdd = self.die.map(lambda x: x % 2 == 0)
        assert_almost_equal(self, evenOdd, Choose({True, False}))

        greaterThan4 = self.die.map(lambda x: x > 4)
        assert_almost_equal(self, greaterThan4, Categorical({
            True: 1 / 3,
            False: 2 / 3
        }))

    def test_expectation(self):
        self.assertAlmostEqual(self.die.expectation(float), 3.5)

        even = self.die.map(lambda n: n % 2 == 0)
        self.assertAlmostEqual(even.expectation(float), 0.5)

        self.assertAlmostEqual(self.ragged.expectation(float), 0.175)
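# Quick sanity check of the expectation tested above (assumes only the
# rl.distribution API already exercised in this file): the 'ragged'
# expectation is 0*0.9 + 1*0.05 + 2*0.025 + 3*0.025 = 0.175.
from rl.distribution import Categorical

ragged = Categorical({0: 0.9, 1: 0.05, 2: 0.025, 3: 0.025})
assert abs(ragged.expectation(float) - 0.175) < 1e-9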
def sarsa_control(start_states: Distribution[S],
                  transition_fcn: Callable[[S, A], Tuple[S, float]],
                  state_action: Mapping[S, List[A]],
                  approx_0: FunctionApprox[Tuple[S, A]],
                  gamma: float,
                  ϵ: float) -> Iterable[FunctionApprox[Tuple[S, A]]]:
    """Update the Q-value function approximation using SARSA.

    The first state of each trajectory is drawn from start_states.
    """
    q = approx_0
    state = start_states.sample()
    action = Choose(set(state_action[state])).sample()
    while True:
        next_state, reward = transition_fcn(state, action)
        # Use an ϵ-greedy policy to pick next_action.
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax(
                [q((next_state, a)) for a in state_action[next_state]])]
        q = q.update([((state, action),
                       reward + gamma * q((next_state, next_action)))])
        state, action = next_state, next_action
        yield q
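# A minimal usage sketch for sarsa_control. The two-state MDP below is
# hypothetical, purely for illustration; it assumes the rl library's
# Constant and Tabular as used elsewhere in this file.
import itertools
from rl.distribution import Constant
from rl.function_approx import Tabular

def _toy_transition(state, action):
    # Deterministic toy dynamics: 'stay' earns 1.0, 'move' earns 0.0.
    return (state, 1.0) if action == 'stay' else \
        ('B' if state == 'A' else 'A', 0.0)

_toy_actions = {'A': ['stay', 'move'], 'B': ['stay', 'move']}
_q_0 = Tabular({(s, a): 0.0
                for s, acts in _toy_actions.items() for a in acts})
_q_iter = sarsa_control(Constant('A'), _toy_transition, _toy_actions,
                        _q_0, gamma=0.9, ϵ=0.1)
_q_1000 = next(itertools.islice(_q_iter, 1000, None))  # Q after 1000 steps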
class TestChoose(unittest.TestCase):
    def setUp(self):
        self.one = Choose({1})
        self.six = Choose({1, 2, 3, 4, 5, 6})
        self.repeated = Choose([1, 1, 1, 2])

    def test_choose(self):
        assert_almost_equal(self, self.one, Constant(1))
        self.assertAlmostEqual(self.one.probability(1), 1.)
        self.assertAlmostEqual(self.one.probability(0), 0.)

        categorical_six = Categorical({x: 1 / 6 for x in range(1, 7)})
        assert_almost_equal(self, self.six, categorical_six)
        self.assertAlmostEqual(self.six.probability(1), 1 / 6)
        self.assertAlmostEqual(self.six.probability(0), 0.)

    def test_repeated(self):
        counts = Counter(self.repeated.sample_n(1000))
        self.assertLess(abs(counts[1] - 750), 50)
        self.assertLess(abs(counts[2] - 250), 50)

        table = self.repeated.table()
        self.assertAlmostEqual(table[1], 0.75)
        self.assertAlmostEqual(table[2], 0.25)
def test_evaluate_finite_mdp(self) -> None:
    q_0: Tabular[Tuple[bool, bool]] = Tabular(
        {(s, a): 0.0
         for s in self.finite_mdp.states()
         for a in self.finite_mdp.actions(s)},
        count_to_weight_func=lambda _: 0.1,
    )

    uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
        s: Choose(self.finite_mdp.actions(s))
        for s in self.finite_mdp.states()
    })

    transitions: Iterable[mdp.TransitionStep[bool, bool]] = \
        self.finite_mdp.simulate_actions(
            Choose(self.finite_mdp.states()), uniform_policy)

    qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

    q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
        cast(Iterator[Tabular[Tuple[bool, bool]]],
             itertools.islice(qs, 20000)))

    if q is not None:
        self.assertEqual(len(q.values_map), 4)

        for s in [True, False]:
            self.assertLess(abs(q((s, False)) - 170.0), 2)
            self.assertGreater(q((s, False)), q((s, True)))
    else:
        assert False
def lspi_transitions(self) -> Iterator[TransitionStep[int, int]]:
    states_distribution: Choose[NonTerminal[int]] = \
        Choose(self.non_terminal_states)
    while True:
        state: NonTerminal[int] = states_distribution.sample()
        action: int = Choose(range(state.state)).sample()
        next_state, reward = self.step(state, action).sample()
        transition: TransitionStep[int, int] = TransitionStep(
            state=state,
            action=action,
            next_state=next_state,
            reward=reward
        )
        yield transition
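# Hypothetical driver for lspi_transitions (sketch): `proc` stands for
# whatever object defines the method above; we just batch up a fixed
# amount of experience for LSPI.
import itertools

training_data = list(itertools.islice(proc.lspi_transitions(), 10000))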
class TestChoose(unittest.TestCase):
    def setUp(self):
        self.one = Choose({1})
        self.six = Choose({1, 2, 3, 4, 5, 6})

    def test_choose(self):
        assert_almost_equal(self, self.one, Constant(1))
        self.assertAlmostEqual(self.one.probability(1), 1.)
        self.assertAlmostEqual(self.one.probability(0), 0.)

        categorical_six = Categorical({x: 1 / 6 for x in range(1, 7)})
        assert_almost_equal(self, self.six, categorical_six)
        self.assertAlmostEqual(self.six.probability(1), 1 / 6)
        self.assertAlmostEqual(self.six.probability(0), 0.)
class TestDistribution(unittest.TestCase):
    def setUp(self):
        self.finite = Choose(range(0, 6))
        self.sampled = SampledDistribution(
            lambda: self.finite.sample(), 100000)

    def test_expectation(self):
        expected_finite = self.finite.expectation(lambda x: x)
        expected_sampled = self.sampled.expectation(lambda x: x)
        self.assertLess(abs(expected_finite - expected_sampled), 0.02)

    def test_sample_n(self):
        samples = self.sampled.sample_n(10)
        self.assertEqual(len(samples), 10)
        self.assertTrue(all(0 <= s < 6 for s in samples))
def test_evaluate_mrp(self):
    start = Dynamic({s: 0.0 for s in self.finite_flip_flop.states()})
    v = iterate.converged(
        evaluate_mrp(
            self.finite_flip_flop,
            γ=0.99,
            approx_0=start,
            non_terminal_states_distribution=Choose(
                set(self.finite_flip_flop.states())),
            num_state_samples=5,
        ),
        done=lambda a, b: a.within(b, 1e-4),
    )

    self.assertEqual(len(v.values_map), 2)

    for s in v.values_map:
        self.assertLess(abs(v(s) - 170), 1.0)

    v_finite = iterate.converged(
        evaluate_finite_mrp(self.finite_flip_flop, γ=0.99, approx_0=start),
        done=lambda a, b: a.within(b, 1e-4),
    )

    assert_allclose(v.evaluate([True, False]),
                    v_finite.evaluate([True, False]),
                    rtol=0.01)
def lspi(memory: List[mdp.TransitionStep[S]],
         feature_map: Dict[Tuple[S, A], List[float]],
         state_action: Dict[S, List[A]],
         m: int,
         gamma: float,
         ϵ: float) -> Iterable[Dict[Tuple[S, A], float]]:
    """
    Update A and b to get w* = inverse(A) @ b and update the implied
    deterministic policy.

    feature_map: key: (state, action) pair, value: phi(s, a), a feature
    vector of dimension m
    """
    # Initialize A (random, so it is almost surely invertible) and b.
    A = np.random.rand(m, m)
    b = np.zeros((m, 1))
    w = np.linalg.inv(A) @ b
    while True:
        transition = random.choice(memory)
        state = transition.state
        next_state = transition.next_state
        # Column vector of features for the observed (state, action) pair.
        feature_state = np.array(
            feature_map[(state, transition.action)]).reshape(-1, 1)
        # next_action is chosen by an ϵ-greedy policy.
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax([
                np.array(feature_map[(next_state, action)]) @ w
                for action in state_action[next_state]
            ])]
        feature_next_state = np.array(
            feature_map[(next_state, next_action)]).reshape(-1, 1)
        # Outer-product accumulation:
        # A += phi(s, a) (phi(s, a) - gamma * phi(s', a'))^T
        A += feature_state @ (feature_state - gamma * feature_next_state).T
        b += feature_state * transition.reward
        w = np.linalg.inv(A) @ b
        yield {
            s_a: (np.array(feature_map[s_a]) @ w).item()
            for s_a in feature_map.keys()
        }
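# Hypothetical driver for lspi (sketch): `memory`, `phi` and
# `actions_by_state` are placeholders for recorded TransitionSteps, an
# m-dimensional feature map over (state, action) pairs, and the action
# sets; none of them are defined in this excerpt.
import itertools

q_weights = next(itertools.islice(
    lspi(memory, phi, actions_by_state, m=8, gamma=0.95, ϵ=0.05),
    500, None))
greedy_policy = {s: max(actions, key=lambda a: q_weights[(s, a)])
                 for s, actions in actions_by_state.items()}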
def test_value_iteration(self):
    mdp_map: Mapping[NonTerminal[InventoryState], float] = \
        value_iteration_result(self.si_mdp, self.gamma)[0]
    # print(mdp_map)
    mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])
    fa = Dynamic({s: 0.0 for s in self.states})
    mdp_finite_fa = iterate.converged(
        value_iteration_finite(self.si_mdp, self.gamma, fa),
        done=lambda a, b: a.within(b, 1e-5))
    # print(mdp_finite_fa.values_map)
    mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.01)

    mdp_fa = iterate.converged(
        value_iteration(self.si_mdp, self.gamma, fa,
                        Choose(self.states), num_state_samples=30),
        done=lambda a, b: a.within(b, 1e-5))
    # print(mdp_fa.values_map)
    mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 0.01)
def test_value_iteration(self):
    vpstar = optimal_vf_and_policy(self.mdp_seq, 1.)
    states = self.single_step_mdp.states()
    fa_dynamic = Dynamic({s: 0.0 for s in states})
    fa_tabular = Tabular()
    distribution = Choose(set(states))
    approx_vpstar_finite = back_opt_vf_and_policy_finite(
        [(self.mdp_seq[i], fa_dynamic) for i in range(self.steps)],
        1.
    )
    approx_vpstar = back_opt_vf_and_policy(
        [(self.single_step_mdp, fa_tabular, distribution)
         for _ in range(self.steps)],
        1.,
        num_state_samples=120,
        error_tolerance=0.01
    )
    for t, ((v1, _), (v2, _), (v3, _)) in enumerate(zip(
            vpstar, approx_vpstar_finite, approx_vpstar)):
        states = self.mdp_seq[t].keys()
        v1_arr = np.array([v1[s] for s in states])
        v2_arr = v2.evaluate(states)
        v3_arr = v3.evaluate(states)
        self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
        self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
def test_evaluate_mrp(self):
    vf = evaluate(self.mrp_seq, 1.)
    states = self.single_step_mrp.states()
    fa_dynamic = Dynamic({s: 0.0 for s in states})
    fa_tabular = Tabular()
    distribution = Choose(set(states))
    approx_vf_finite = backward_evaluate_finite(
        [(self.mrp_seq[i], fa_dynamic) for i in range(self.steps)],
        1.
    )
    approx_vf = backward_evaluate(
        [(self.single_step_mrp, fa_tabular, distribution)
         for _ in range(self.steps)],
        1.,
        num_state_samples=120,
        error_tolerance=0.01
    )
    for t, (v1, v2, v3) in enumerate(zip(
            vf, approx_vf_finite, approx_vf)):
        states = self.mrp_seq[t].keys()
        v1_arr = np.array([v1[s] for s in states])
        v2_arr = v2.evaluate(states)
        v3_arr = v3.evaluate(states)
        self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
        self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
def test_evaluate_mrp(self):
    mrp_vf1: np.ndarray = self.implied_mrp.get_value_function_vec(
        self.gamma)
    # print({s: mrp_vf1[i] for i, s in enumerate(self.states)})
    fa = Dynamic({s: 0.0 for s in self.states})
    mrp_finite_fa = iterate.converged(
        evaluate_finite_mrp(self.implied_mrp, self.gamma, fa),
        done=lambda a, b: a.within(b, 1e-4),
    )
    # print(mrp_finite_fa.values_map)
    mrp_vf2: np.ndarray = mrp_finite_fa.evaluate(self.states)
    self.assertLess(max(abs(mrp_vf1 - mrp_vf2)), 0.001)

    mrp_fa = iterate.converged(
        evaluate_mrp(
            self.implied_mrp,
            self.gamma,
            fa,
            Choose(self.states),
            num_state_samples=30,
        ),
        done=lambda a, b: a.within(b, 0.1),
    )
    # print(mrp_fa.values_map)
    mrp_vf3: np.ndarray = mrp_fa.evaluate(self.states)
    self.assertLess(max(abs(mrp_vf1 - mrp_vf3)), 1.0)
def glie_mc_finite_control_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A], float] = {
        (s, a): 0.
        for s in fmdp.non_terminal_states
        for a in fmdp.actions(s)
    }
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(
            values_map=initial_qvf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance
    )
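# For reference, a sketch of the polynomial decay that
# learning_rate_schedule is assumed to produce here (consistent with its
# parameter names; the library's exact code may differ):
def _lr_schedule_sketch(initial_learning_rate: float, half_life: float,
                        exponent: float) -> Callable[[int], float]:
    def lr(n: int) -> float:
        # With exponent=1, the rate halves after half_life updates.
        return initial_learning_rate * \
            (1 + (n - 1) / half_life) ** -exponent
    return lr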
def test_evaluate_finite_mrp(self) -> None:
    start = Tabular(
        {s: 0.0 for s in self.finite_flip_flop.states()},
        count_to_weight_func=lambda _: 0.1,
    )

    episode_length = 20
    episodes: Iterable[Iterable[mp.TransitionStep[bool]]] = \
        self.finite_flip_flop.reward_traces(Choose({True, False}))
    transitions: Iterable[mp.TransitionStep[bool]] = \
        itertools.chain.from_iterable(
            itertools.islice(episode, episode_length)
            for episode in episodes)

    vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

    v: Optional[Tabular[bool]] = iterate.last(
        itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

    if v is not None:
        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            # Intentionally loose bound; a tighter one makes the test
            # take more than a second on my machine.
            self.assertLess(abs(v(s) - 170), 3.0)
    else:
        assert False
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    matrix_method_for_mrp_eval: bool = False
) -> Iterator[Tuple[V[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[V[S], FinitePolicy[S, A]]) \
            -> Tuple[V[S], FinitePolicy[S, A]]:
        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        policy_vf: V[S] = {
            mrp.non_terminal_states[i]: v
            for i, v in enumerate(mrp.get_value_function_vec(gamma))
        } if matrix_method_for_mrp_eval else evaluate_mrp_result(mrp, gamma)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_vf(
            mdp, policy_vf, gamma)
        return policy_vf, improved_pi

    v_0: V[S] = {s: 0.0 for s in mdp.non_terminal_states}
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states})
    return iterate(update, (v_0, pi_0))
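# A hedged usage sketch: run a fixed number of policy-iteration sweeps
# and keep the last (vf, policy) pair. `my_fmdp` is a placeholder
# FiniteMarkovDecisionProcess, not defined in this excerpt.
import itertools

vf_star, pi_star = next(itertools.islice(
    policy_iteration(my_fmdp, gamma=0.9), 100, None))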
def get_q_learning_vf_and_policy(
    self,
    states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    episodes: int = 10000,
    step_size: float = 0.01,
    epsilon: float = 0.1
) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
    '''
    states_actions_dict gives us the set of possible moves from
    a non-blocked cell.
    sample_func is a function with two inputs: state and action,
    and with output as a sampled pair of (next_state, reward).
    '''
    q: Dict[Cell, Dict[Move, float]] = \
        {s: {a: 0. for a in actions} for s, actions in
         states_actions_dict.items() if actions is not None}
    nt_states: CellSet = {s for s in q}
    uniform_states: Choose[Cell] = Choose(nt_states)
    for episode_num in range(episodes):
        state: Cell = uniform_states.sample()
        # Q-learning episode: behave ϵ-greedily, but bootstrap the update
        # target off the greedy max over next-state actions.
        while state in nt_states:
            if Bernoulli(epsilon).sample():
                action: Move = Choose(set(q[state].keys())).sample()
            else:
                action = max(q[state].items(), key=itemgetter(1))[0]
            next_state, reward = sample_func(state, action)
            next_max: float = max(q[next_state].values()) \
                if next_state in nt_states else 0.
            q[state][action] += step_size * \
                (reward + next_max - q[state][action])
            state = next_state
    vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
    policy: FinitePolicy[Cell, Move] = FinitePolicy({
        s: Constant(max(d.items(), key=itemgetter(1))[0])
        for s, d in q.items()
    })
    return (vf_dict, policy)
def states_sampler_func() -> NonTerminal[PriceAndShares]:
    price: float = self.initial_price_distribution.sample()
    rem: int = self.shares
    for i in range(t):
        sell: int = Choose(range(rem + 1)).sample()
        price = self.price_dynamics[i](PriceAndShares(
            price=price,
            shares=rem
        )).sample()
        rem -= sell
    return NonTerminal(PriceAndShares(price=price, shares=rem))
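# In context, a sampler like the one above is typically wrapped into a
# state distribution for approximate backward induction (a sketch,
# assuming rl.distribution.SampledDistribution):
from rl.distribution import SampledDistribution

states_distribution: SampledDistribution[NonTerminal[PriceAndShares]] = \
    SampledDistribution(states_sampler_func)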
def act(self, s: S) -> Optional[Distribution[A]]:
    if mdp.is_terminal(s):
        return None

    if explore.sample():
        return Choose(set(mdp.actions(s)))

    _, action = q.argmax((s, a) for a in mdp.actions(s))
    return Constant(action)
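# How a caller typically consumes `act` (sketch; `policy` stands for an
# instance of the class defining the method above, `s0` for some state):
dist = policy.act(s0)
action = None if dist is None else dist.sample()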
def test_evaluate_finite_mdp(self) -> None:
    q_0: Tabular[Tuple[NonTerminal[bool], bool]] = Tabular(
        {(s, a): 0.0
         for s in self.finite_mdp.non_terminal_states
         for a in self.finite_mdp.actions(s)},
        count_to_weight_func=lambda _: 0.1
    )

    uniform_policy: FinitePolicy[bool, bool] = FinitePolicy({
        s.state: Choose(self.finite_mdp.actions(s))
        for s in self.finite_mdp.non_terminal_states
    })

    transitions: Iterable[mdp.TransitionStep[bool, bool]] = \
        self.finite_mdp.simulate_actions(
            Choose(self.finite_mdp.non_terminal_states),
            uniform_policy
        )

    qs = td.q_learning_external_transitions(
        transitions,
        self.finite_mdp.actions,
        q_0,
        γ=0.99
    )

    q: Optional[Tabular[Tuple[NonTerminal[bool], bool]]] = iterate.last(
        cast(Iterator[Tabular[Tuple[NonTerminal[bool], bool]]],
             itertools.islice(qs, 20000))
    )

    if q is not None:
        self.assertEqual(len(q.values_map), 4)

        for s in [NonTerminal(True), NonTerminal(False)]:
            self.assertLess(abs(q((s, False)) - 170.0), 2)
            self.assertGreater(q((s, False)), q((s, True)))
    else:
        assert False
def states_sampler_func() -> PriceAndShares:
    price: float = self.initial_price_distribution.sample()
    rem: int = self.shares
    x: float = self.init_x_distrib.sample()
    for i in range(t):
        sell: int = Choose(set(range(rem + 1))).sample()
        price = self.price_dynamics[i](PriceAndShares(
            price=price,
            shares=rem,
            x=x
        )).sample()
        rem -= sell
        # AR(1) update of the signal x (the original assigned a new_x
        # without feeding it back into the recursion).
        x = self.pho * x + Uniform().sample()
    return PriceAndShares(price=price, shares=rem, x=x)
def test_evaluate_finite_mrp(self):
    start = Tabular({s: 0.0 for s in self.finite_flip_flop.states()})
    traces = self.finite_flip_flop.reward_traces(Choose({True, False}))
    v = iterate.converged(
        mc.evaluate_mrp(traces, γ=0.99, approx_0=start),
        # Loose bound of 0.025 to speed up the test.
        done=lambda a, b: a.within(b, 0.025))

    self.assertEqual(len(v.values_map), 2)

    for s in v.values_map:
        # Intentionally loose bound; a tighter one makes the test
        # take more than a second on my machine.
        self.assertLess(abs(v(s) - 170), 1.0)
def glie_mc_finite_control_equal_wts(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5,
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A], float] = {
        (s, a): 0.
        for s in fmdp.non_terminal_states
        for a in fmdp.actions(s)
    }
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(values_map=initial_qvf_dict),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance
    )
def get_sarsa_vf_and_policy(
    self,
    states_actions_dict: Mapping[Cell, Set[Move]],
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    episodes: int = 10000,
    step_size: float = 0.01
) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
    '''
    states_actions_dict gives us the set of possible moves from
    a non-terminal cell.
    sample_func is a function with two inputs: state and action,
    and with output as a sampled pair of (next_state, reward).
    '''
    q: Dict[Cell, Dict[Move, float]] = \
        {s: {a: 0. for a in actions} for s, actions in
         states_actions_dict.items()}
    nt_states: CellSet = {s for s in q}
    uniform_states: Choose[Cell] = Choose(nt_states)
    for episode_num in range(episodes):
        epsilon: float = 1.0 / (episode_num + 1)
        state: Cell = uniform_states.sample()
        action: Move = WindyGrid.epsilon_greedy_action(state, q, epsilon)
        while state in nt_states:
            next_state, reward = sample_func(state, action)
            if next_state in nt_states:
                next_action: Move = WindyGrid.epsilon_greedy_action(
                    next_state, q, epsilon)
                q[state][action] += step_size * \
                    (reward + q[next_state][next_action] -
                     q[state][action])
                action = next_action
            else:
                q[state][action] += step_size * \
                    (reward - q[state][action])
            state = next_state
    vf_dict: V[Cell] = {
        NonTerminal(s): max(d.values()) for s, d in q.items()
    }
    policy: FiniteDeterministicPolicy[Cell, Move] = \
        FiniteDeterministicPolicy(
            {s: max(d.items(), key=itemgetter(1))[0]
             for s, d in q.items()}
        )
    return vf_dict, policy
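# A minimal sketch of the WindyGrid helper referenced above, which this
# excerpt does not show (assumption: ϵ-greedy selection over the
# Q-table's actions; the actual class method may differ):
@staticmethod
def epsilon_greedy_action(
    state: Cell,
    q: Dict[Cell, Dict[Move, float]],
    epsilon: float
) -> Move:
    if Bernoulli(epsilon).sample():
        return Choose(set(q[state].keys())).sample()
    return max(q[state].items(), key=itemgetter(1))[0]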
def initialize(
    mdp: FiniteMarkovDecisionProcess
) -> Tuple[V[S], FinitePolicy]:
    """Initialize the value function and policy.

    The value function starts at zero for each state; the policy is a
    uniform random choice over the action space at each non-terminal
    state.

    :param mdp: Object representation of a finite Markov decision process
    :returns: The zero-initialized value function and the random
        initial policy
    """
    # Set the value function at each state equal to zero
    v_0: V[S] = {s: 0 for s in mdp.states()}

    # Set the policy to be a random choice over the action space
    # at each non-terminal state
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states}
    )
    return v_0, pi_0
def q_learning_finite_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon: float,
    max_episode_length: int
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A], float] = {
        (s, a): 0.
        for s in fmdp.non_terminal_states
        for a in fmdp.actions(s)
    }
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td.q_learning(
        mdp=fmdp,
        policy_from_q=lambda f, m: mc.epsilon_greedy_policy(
            q=f, mdp=m, ϵ=epsilon),
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(
            values_map=initial_qvf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        max_episode_length=max_episode_length
    )
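# Sketch: consume the stream of Q-value iterates and extract a greedy
# policy at the end (`my_fmdp` is a placeholder; the extraction mirrors
# the tests above).
import itertools

q_final = next(itertools.islice(
    q_learning_finite_learning_rate(
        my_fmdp, initial_learning_rate=0.1, half_life=1000.0,
        exponent=0.5, gamma=0.9, epsilon=0.1, max_episode_length=100),
    10000, None))
greedy = {s: max(my_fmdp.actions(s), key=lambda a: q_final((s, a)))
          for s in my_fmdp.non_terminal_states}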
def glie_sarsa_finite_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    max_episode_length: int
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    initial_qvf_dict: Mapping[Tuple[S, A], float] = {
        (s, a): 0.
        for s in fmdp.non_terminal_states
        for a in fmdp.actions(s)
    }
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td.glie_sarsa(
        mdp=fmdp,
        states=Choose(set(fmdp.non_terminal_states)),
        approx_0=Tabular(
            values_map=initial_qvf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        max_episode_length=max_episode_length
    )
def test_value_iteration(self):
    mdp_map: Mapping[InventoryState, float] = value_iteration_result(
        self.si_mdp, self.gamma)[0]
    # print(mdp_map)
    mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])
    fa = Dynamic({s: 0.0 for s in self.states})
    mdp_finite_fa = FunctionApprox.converged(
        value_iteration_finite(self.si_mdp, self.gamma, fa))
    # print(mdp_finite_fa.values_map)
    mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.001)

    mdp_fa = FunctionApprox.converged(
        value_iteration(self.si_mdp, self.gamma, fa,
                        Choose(self.states), num_state_samples=30),
        0.1)
    # print(mdp_fa.values_map)
    mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 1.0)
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    approx0: FunctionApprox[S]
) -> Iterator[Tuple[FunctionApprox[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[FunctionApprox[S], FinitePolicy[S, A]]) \
            -> Tuple[FunctionApprox[S], FinitePolicy[S, A]]:
        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        # policy_vf: FunctionApprox[S] = \
        #     approximate_policy_evaluation_result(mdp, pi, vf)
        policy_vf: FunctionApprox[S] = evaluate_mrp_result(mrp, gamma, vf)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_approx_vf(
            mdp, policy_vf, gamma)
        return policy_vf, improved_pi

    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states})
    return iterate(update, (approx0, pi_0))