def get_vf_for_policy(
    self,
    policy: FinitePolicy[WithTime[int], int]
) -> Iterator[V[int]]:
    # Apply the policy to the finite-horizon MDP and run backward induction,
    # yielding one value function per time step (undiscounted, gamma = 1).
    mrp: FiniteMarkovRewardProcess[WithTime[int]] = \
        self.mdp.apply_finite_policy(policy)
    return evaluate(unwrap_finite_horizon_MRP(mrp), 1.)
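The `evaluate(unwrap_finite_horizon_MRP(...), 1.)` call above performs backward induction over the per-time-step transition maps. The following is a minimal, self-contained sketch of that idea, not the rl library's actual implementation; the `(next_state, reward, probability)` data layout and the function name are assumptions made purely for illustration. Starting from a zero value function at the horizon, each step's value is the expected reward plus the discounted expectation of the next step's value.

from typing import Dict, List, Mapping, Sequence, Tuple

def backward_induction_sketch(
    step_transitions: Sequence[Mapping[str, Sequence[Tuple[str, float, float]]]],
    gamma: float
) -> List[Dict[str, float]]:
    # step_transitions[t][s] is a list of (next_state, reward, probability)
    # triples for taking one step from s at time t (illustrative layout only).
    value_functions: List[Dict[str, float]] = []
    next_v: Dict[str, float] = {}  # value function at the horizon is zero
    for transitions in reversed(step_transitions):
        v = {
            s: sum(p * (r + gamma * next_v.get(s1, 0.0))
                   for s1, r, p in outcomes)
            for s, outcomes in transitions.items()
        }
        value_functions.insert(0, v)
        next_v = v
    return value_functions

With gamma = 1 this produces the same kind of per-time-step value functions that the tests below compare against.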
def test_evaluate_mrp(self):
    # Exact backward induction on the per-step transition maps.
    vf = evaluate(self.mrp_seq, 1.)

    states = self.single_step_mrp.states()
    fa_dynamic = Dynamic({s: 0.0 for s in states})
    fa_tabular = Tabular()
    distribution = Choose(set(states))

    # Backward evaluation with an exact (Dynamic) function approximation.
    approx_vf_finite = backward_evaluate_finite(
        [(self.mrp_seq[i], fa_dynamic) for i in range(self.steps)],
        1.
    )
    # Backward evaluation with a Tabular approximation fit on sampled states,
    # hence the looser tolerance in the final assertion below.
    approx_vf = backward_evaluate(
        [(self.single_step_mrp, fa_tabular, distribution)
         for _ in range(self.steps)],
        1.,
        num_state_samples=120,
        error_tolerance=0.01
    )

    for t, (v1, v2, v3) in enumerate(zip(vf, approx_vf_finite, approx_vf)):
        states = self.mrp_seq[t].keys()
        v1_arr = np.array([v1[s] for s in states])
        v2_arr = v2.evaluate(states)
        v3_arr = v3.evaluate(states)
        self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
        self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
def test_evaluate(self):
    process = finite_horizon_MRP(self.finite_flip_flop, 10)
    vs = list(evaluate(unwrap_finite_horizon_MRP(process), gamma=1))

    self.assertEqual(len(vs), 10)

    self.assertAlmostEqual(vs[0][NonTerminal(True)], 17)
    self.assertAlmostEqual(vs[0][NonTerminal(False)], 17)

    self.assertAlmostEqual(vs[5][NonTerminal(True)], 17 / 2)
    self.assertAlmostEqual(vs[5][NonTerminal(False)], 17 / 2)

    self.assertAlmostEqual(vs[9][NonTerminal(True)], 17 / 10)
    self.assertAlmostEqual(vs[9][NonTerminal(False)], 17 / 10)
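The asserted values are consistent with the flip-flop MRP earning an expected reward of 1.7 per step: with gamma = 1 and k steps remaining, the value is 1.7 * k regardless of the current state. A quick standalone arithmetic check of the numbers used in the assertions (not part of the original test):

# Check the asserted values: 1.7 per remaining step, horizon of 10 steps.
for t, expected in [(0, 17.0), (5, 17 / 2), (9, 17 / 10)]:
    steps_remaining = 10 - t
    assert abs(1.7 * steps_remaining - expected) < 1e-9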
def test_compare_to_backward_induction(self):
    finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

    # Evaluate the time-augmented MRP directly: 2 states x 10 time steps.
    v = evaluate_mrp_result(finite_horizon, gamma=1)
    self.assertEqual(len(v), 20)

    finite_v = \
        list(evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1))

    for time in range(0, 10):
        self.assertAlmostEqual(
            v[WithTime(state=True, time=time)],
            finite_v[time][True]
        )
        self.assertAlmostEqual(
            v[WithTime(state=False, time=time)],
            finite_v[time][False]
        )
def test_compare_to_backward_induction(self):
    finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

    start = Dynamic({s: 0.0 for s in finite_horizon.states()})
    v = FunctionApprox.converged(
        evaluate_finite_mrp(finite_horizon, γ=1, approx_0=start)
    )
    self.assertEqual(len(v.values_map), 22)

    finite_v = \
        list(evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1))

    for time in range(0, 10):
        self.assertAlmostEqual(
            v(WithTime(state=True, time=time)),
            finite_v[time][True]
        )
        self.assertAlmostEqual(
            v(WithTime(state=False, time=time)),
            finite_v[time][False]
        )
print("Clearance Pricing MDP") print("---------------------") print(cp.mdp) def policy_func(x: int) -> int: return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3)) stationary_policy: FinitePolicy[int, int] = FinitePolicy( {s: Constant(policy_func(s)) for s in range(ii + 1)} ) single_step_mrp: FiniteMarkovRewardProcess[int] = \ cp.single_step_mdp.apply_finite_policy(stationary_policy) vf_for_policy: Iterator[V[int]] = evaluate( unwrap_finite_horizon_MRP(finite_horizon_MRP(single_step_mrp, steps)), 1. ) print("Value Function for Stationary Policy") print("------------------------------------") for t, vf in enumerate(vf_for_policy): print(f"Time Step {t:d}") print("---------------") pprint(vf) print("Optimal Value Function and Optimal Policy") print("------------------------------------") prices = [] for t, (vf, policy) in enumerate(cp.get_optimal_vf_and_policy()): print(f"Time Step {t:d}") print("---------------")