def get_opt_vf_and_policy(self) -> \
        Iterator[Tuple[V[int], FinitePolicy[int, bool]]]:
    # Backward induction on the binomial lattice: at time step i the state
    # j in [0, i] counts the up-moves so far, and -1 is the absorbing
    # "already exercised" state (mapped to None, i.e. terminal). Action
    # True exercises, earning the payoff and jumping to -1; action False
    # continues with zero reward.
    dt: float = self.dt()
    up_factor: float = np.exp(self.vol * np.sqrt(dt))
    # Risk-neutral up-move probability: (e^{r*dt} - d) / (u - d) with
    # d = 1/u, written with numerator and denominator multiplied by u
    up_prob: float = (np.exp(self.rate * dt) * up_factor - 1) / \
        (up_factor * up_factor - 1)
    return optimal_vf_and_policy(
        steps=[
            # the range must start at -1 so the absorbing exercised state
            # is present (as a terminal state) in every step's mapping
            {j: None if j == -1 else {
                True: Constant(
                    (
                        -1,
                        self.payoff(i * dt, self.state_price(i, j))
                    )
                ),
                False: Categorical(
                    {
                        (j + 1, 0.): up_prob,
                        (j, 0.): 1 - up_prob
                    }
                )
            } for j in range(-1, i + 1)}
            for i in range(self.num_steps + 1)
        ],
        gamma=np.exp(-self.rate * dt)  # per-step discount factor e^{-r*dt}
    )
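
# The `up_prob` above is the Cox-Ross-Rubinstein risk-neutral up-move
# probability q = (e^{r*dt} - d) / (u - d) with d = 1/u, multiplied through
# by u. A minimal standalone check of that identity (hypothetical parameter
# values, plain NumPy, independent of the class above):
import numpy as np

rate, vol, dt = 0.05, 0.25, 1.0 / 12.0    # 5% rate, 25% vol, monthly steps

u = np.exp(vol * np.sqrt(dt))             # up factor
d = 1.0 / u                               # CRR down factor

q_crr = (np.exp(rate * dt) - d) / (u - d)             # textbook CRR form
q_code = (np.exp(rate * dt) * u - 1) / (u * u - 1)    # form used above

assert abs(q_crr - q_code) < 1e-12
assert 0.0 < q_code < 1.0   # a valid probability: the lattice is arbitrage-free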
def test_value_iteration(self):
    # Exact backward induction over the explicit per-step MDP sequence
    vpstar = optimal_vf_and_policy(self.mdp_seq, 1.)

    states = self.single_step_mdp.states()
    fa_dynamic = Dynamic({s: 0.0 for s in states})
    fa_tabular = Tabular()
    distribution = Choose(set(states))

    # Approximate backward induction with an exact (Dynamic) function
    # approximation over the full finite state space ...
    approx_vpstar_finite = back_opt_vf_and_policy_finite(
        [(self.mdp_seq[i], fa_dynamic) for i in range(self.steps)],
        1.
    )
    # ... and with a Tabular approximation fitted on sampled states
    approx_vpstar = back_opt_vf_and_policy(
        [(self.single_step_mdp, fa_tabular, distribution)
         for _ in range(self.steps)],
        1.,
        num_state_samples=120,
        error_tolerance=0.01
    )

    # At every time step the Dynamic approximation should match the exact
    # values tightly; the sampled Tabular approximation only loosely
    for t, ((v1, _), (v2, _), (v3, _)) in enumerate(zip(
            vpstar, approx_vpstar_finite, approx_vpstar)):
        states = self.mdp_seq[t].keys()
        v1_arr = np.array([v1[s] for s in states])
        v2_arr = v2.evaluate(states)
        v3_arr = v3.evaluate(states)
        self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
        self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
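
# The fixtures above (`self.mdp_seq`, `self.single_step_mdp`, `self.steps`)
# belong to the test class. A plausible sketch of how such fixtures could be
# built, using the finite_horizon_MDP / unwrap_finite_horizon_MDP helpers
# that the tests below rely on; `make_flip_flop` is a hypothetical fixture
# factory and the actual setUp may differ:
def setUp(self):
    self.steps = 10
    self.single_step_mdp = make_flip_flop()   # hypothetical fixture factory
    # flatten the stationary MDP into one state -> action -> distribution
    # mapping per time step
    self.mdp_seq = unwrap_finite_horizon_MDP(
        finite_horizon_MDP(self.single_step_mdp, limit=self.steps)
    )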
def test_optimal_policy(self):
    finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
    steps = unwrap_finite_horizon_MDP(finite)
    # Keep the (vf, policy) pair for every step; p is the optimal policy
    # at the final time step
    *v_ps, (_, p) = optimal_vf_and_policy(steps, gamma=1)

    # The optimal policy picks action False in every state
    for _, a in p.action_for.items():
        self.assertEqual(a, False)

    self.assertAlmostEqual(v_ps[0][0][NonTerminal(True)], 17)
    self.assertAlmostEqual(v_ps[5][0][NonTerminal(False)], 17 / 2)
def test_optimal_policy(self):
    # Same test against the older interface, where a policy maps each state
    # to a Distribution over actions (here Constant(False)) and value
    # functions are keyed by the raw state rather than NonTerminal(state)
    finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
    steps = unwrap_finite_horizon_MDP(finite)
    *v_ps, (_, p) = optimal_vf_and_policy(steps, gamma=1)

    for s in p.states():
        self.assertEqual(p.act(s), Constant(False))

    self.assertAlmostEqual(v_ps[0][0][True], 17)
    self.assertAlmostEqual(v_ps[5][0][False], 17 / 2)
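
# A sketch of a flip-flop fixture consistent with the numbers asserted above,
# assuming the dict-of-dicts FiniteMarkovDecisionProcess constructor used
# elsewhere in this codebase (the actual fixture may differ): under action
# False the state flips with probability 0.7 for reward 2.0 and stays with
# probability 0.3 for reward 1.0, so each remaining step is worth
# 0.7 * 2.0 + 0.3 * 1.0 = 1.7 -- hence 17 over 10 steps and 17 / 2 over the
# 5 steps remaining at t = 5. Action True reverses the flip probability
# (expected reward 1.3 per step) and is therefore never optimal.
from rl.distribution import Categorical
from rl.markov_decision_process import FiniteMarkovDecisionProcess

finite_flip_flop = FiniteMarkovDecisionProcess({
    b: {
        a: Categorical({
            (not b, 2.0): 0.3 if a else 0.7,
            (b, 1.0): 0.7 if a else 0.3
        })
        for a in (True, False)
    }
    for b in (True, False)
})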
def get_optimal_vf_and_policy(self) -> \
        Iterator[Tuple[V[int], FinitePolicy[int, int]]]:
    # Split self.mdp into one state-action mapping per time step, then run
    # undiscounted (gamma = 1) backward induction over the sequence
    return optimal_vf_and_policy(unwrap_finite_horizon_MDP(self.mdp), 1.)
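
# Hypothetical usage, assuming `model` is an instance of the enclosing class
# with `self.mdp` already built as a finite-horizon MDP: the iterator yields
# one (value function, policy) pair per time step, from t = 0 onward.
for t, (vf, policy) in enumerate(model.get_optimal_vf_and_policy()):
    print(f"Time step {t}: optimal VF = {vf}, optimal policy = {policy}")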