def test_evaluate_mrp(self):
    """Approximate DP evaluation of the flip-flop MRP should agree with
    the exact finite-MRP evaluation on both states."""
    zero_approx = Dynamic({s: 0.0 for s in self.finite_flip_flop.states()})

    def close_enough(a, b):
        # Convergence test: successive approximations within 1e-4.
        return a.within(b, 1e-4)

    sampled_v = iterate.converged(
        evaluate_mrp(
            self.finite_flip_flop,
            γ=0.99,
            approx_0=zero_approx,
            non_terminal_states_distribution=Choose(
                set(self.finite_flip_flop.states())),
            num_state_samples=5,
        ),
        done=close_enough,
    )

    # Both flip-flop states must be represented, each valued near 170.
    self.assertEqual(len(sampled_v.values_map), 2)
    for state in sampled_v.values_map:
        self.assertLess(abs(sampled_v(state) - 170), 1.0)

    exact_v = iterate.converged(
        evaluate_finite_mrp(self.finite_flip_flop, γ=0.99,
                            approx_0=zero_approx),
        done=close_enough,
    )
    assert_allclose(
        sampled_v.evaluate([True, False]),
        exact_v.evaluate([True, False]),
        rtol=0.01,
    )
def test_evaluate_mrp(self):
    """Check that both the exact and the sampled approximate-DP
    evaluations reproduce the implied MRP's value-function vector."""
    reference_vf: np.ndarray = self.implied_mrp.get_value_function_vec(
        self.gamma)

    zero_approx = Dynamic({s: 0.0 for s in self.states})

    # Exact finite evaluation: should match the reference very tightly.
    exact_fa = iterate.converged(
        evaluate_finite_mrp(self.implied_mrp, self.gamma, zero_approx),
        done=lambda a, b: a.within(b, 1e-4),
    )
    exact_vf: np.ndarray = exact_fa.evaluate(self.states)
    self.assertLess(max(abs(reference_vf - exact_vf)), 0.001)

    # Sampled evaluation: looser tolerance because of Monte-Carlo noise.
    sampled_fa = iterate.converged(
        evaluate_mrp(
            self.implied_mrp,
            self.gamma,
            zero_approx,
            Choose(self.states),
            num_state_samples=30,
        ),
        done=lambda a, b: a.within(b, 0.1),
    )
    sampled_vf: np.ndarray = sampled_fa.evaluate(self.states)
    self.assertLess(max(abs(reference_vf - sampled_vf)), 1.0)
def evaluate_mrp_result(
    mrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    approx_0: FunctionApprox[S],
) -> FunctionApprox[S]:
    """Run finite-MRP policy evaluation to convergence.

    Iterates ``evaluate_finite_mrp`` starting from ``approx_0`` until the
    ``almost_equal_vf_approx`` predicate declares successive approximations
    equal, and returns the converged value-function approximation.

    :param mrp: the finite Markov reward process to evaluate
    :param gamma: discount factor
    :param approx_0: initial value-function approximation
    :return: the converged value-function approximation

    Bug fix: the local annotation was ``np.ndarray``, contradicting the
    declared return type; ``converged`` yields a ``FunctionApprox[S]`` here.
    """
    v_star: FunctionApprox[S] = converged(
        evaluate_finite_mrp(mrp, gamma, approx_0),
        done=almost_equal_vf_approx,
    )
    return v_star
def test_evaluate_finite_mrp(self):
    """Exact evaluation of the flip-flop MRP: two states, each with a
    converged value close to 170."""
    zero_start = Dynamic({s: 0.0 for s in self.finite_flip_flop.states()})
    fixed_point = FunctionApprox.converged(
        evaluate_finite_mrp(self.finite_flip_flop, γ=0.99,
                            approx_0=zero_start))

    self.assertEqual(len(fixed_point.values_map), 2)
    for state in fixed_point.values_map:
        deviation = abs(fixed_point(state) - 170)
        self.assertLess(deviation, 0.1)
def test_evaluate_finite_mrp(self):
    """Exact evaluation via iterate.converged: two states, each with a
    converged value close to 170."""
    zero_start = Dynamic({s: 0.0 for s in self.finite_flip_flop.states()})

    def within_tolerance(a, b):
        # Stop once successive approximations agree to 1e-4.
        return a.within(b, 1e-4)

    fixed_point = iterate.converged(
        evaluate_finite_mrp(self.finite_flip_flop, γ=0.99,
                            approx_0=zero_start),
        done=within_tolerance,
    )

    self.assertEqual(len(fixed_point.values_map), 2)
    for state in fixed_point.values_map:
        self.assertLess(abs(fixed_point(state) - 170), 0.1)
def test_compare_to_backward_induction(self):
    """The converged evaluation of a 10-step finite-horizon MRP must match
    backward induction at every (state, time) pair."""
    horizon_mrp = finite_horizon_MRP(self.finite_flip_flop, 10)
    zero_start = Dynamic({s: 0.0 for s in horizon_mrp.states()})

    converged_v = FunctionApprox.converged(
        evaluate_finite_mrp(horizon_mrp, γ=1, approx_0=zero_start))
    # 2 flip-flop states × 11 time steps = 22 augmented states.
    self.assertEqual(len(converged_v.values_map), 22)

    backward_v = list(
        evaluate(unwrap_finite_horizon_MRP(horizon_mrp), gamma=1))

    for t in range(10):
        for flip_state in (True, False):
            self.assertAlmostEqual(
                converged_v(WithTime(state=flip_state, time=t)),
                backward_v[t][flip_state])