def test_evaluate_mrp(self):
    v = evaluate_mrp_result(self.finite_flip_flop, gamma=0.99)

    self.assertEqual(len(v), 2)

    for s in v:
        self.assertLess(abs(v[s] - 170), 0.1)
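# The test above checks evaluate_mrp_result against the flip-flop MRP, whose
# value is roughly 170 in each state at gamma=0.99. As a rough sketch (not the
# library's actual implementation), the exact value function of a finite MRP
# can be obtained by solving the linear Bellman equation V = R + gamma * P V
# directly. The transition matrix and reward vector below are illustrative
# assumptions chosen so that the expected per-step reward is 1.7, giving
# V = 1.7 / (1 - 0.99) = 170, consistent with the test.
import numpy as np

def solve_mrp_values(P: np.ndarray, R: np.ndarray, gamma: float) -> np.ndarray:
    """Solve (I - gamma * P) V = R for the state-value vector V."""
    n = P.shape[0]
    return np.linalg.solve(np.eye(n) - gamma * P, R)

# Illustrative flip-flop-style numbers (assumptions, not the book's exact MRP):
# from either state, flip with probability 0.7; expected per-step reward 1.7.
P_example = np.array([[0.3, 0.7],
                      [0.7, 0.3]])
R_example = np.array([1.7, 1.7])
print(solve_mrp_values(P_example, R_example, gamma=0.99))  # approx. [170., 170.]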
def test_compare_to_backward_induction(self):
    finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

    v = evaluate_mrp_result(finite_horizon, gamma=1)
    self.assertEqual(len(v), 20)

    finite_v = \
        list(evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1))

    for time in range(0, 10):
        self.assertAlmostEqual(
            v[WithTime(state=True, time=time)],
            finite_v[time][True]
        )
        self.assertAlmostEqual(
            v[WithTime(state=False, time=time)],
            finite_v[time][False]
        )
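# The test above compares two routes to the same answer: evaluating the
# 10-step finite-horizon MRP as an ordinary MRP over (state, time) pairs with
# gamma=1, versus backward induction over the horizon. A minimal backward-
# induction sketch in matrix form follows; it is illustrative and does not use
# the library's unwrap_finite_horizon_MRP/evaluate machinery.
import numpy as np

def backward_induction(P: np.ndarray, R: np.ndarray, horizon: int) -> list:
    """Return [V_0, ..., V_{horizon-1}] for an undiscounted finite-horizon MRP."""
    v_next = np.zeros(P.shape[0])  # value after the final step is zero
    values = []
    for _ in range(horizon):
        v = R + P @ v_next         # V_t = R + P V_{t+1}  (gamma = 1)
        values.append(v)
        v_next = v
    return list(reversed(values))  # index 0 corresponds to time 0

# Usage sketch: with the illustrative flip-flop numbers above (expected reward
# 1.7 per step), V_t is about 1.7 * (10 - t) for each state.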
print("---------------") implied_mrp.display_reward_function() print() print("Implied MRP Value Function") print("--------------") implied_mrp.display_value_function(gamma=user_gamma) print() from rl.dynamic_programming import evaluate_mrp_result from rl.dynamic_programming import policy_iteration_result from rl.dynamic_programming import value_iteration_result print("Implied MRP Policy Evaluation Value Function") print("--------------") pprint(evaluate_mrp_result(implied_mrp, gamma=user_gamma)) print() print("MDP Policy Iteration Optimal Value Function and Optimal Policy") print("--------------") opt_vf_pi, opt_policy_pi = policy_iteration_result(fe_mdp, gamma=user_gamma) pprint(opt_vf_pi) print(opt_policy_pi) print() print("MDP Value Iteration Optimal Value Function and Optimal Policy") print("--------------") opt_vf_vi, opt_policy_vi = value_iteration_result(fe_mdp, gamma=user_gamma) pprint(opt_vf_vi) print(opt_policy_vi)