def get_vf_for_policy(
    self,
    policy: FinitePolicy[WithTime[int], int]
) -> Iterator[V[int]]:
    mrp: FiniteMarkovRewardProcess[WithTime[int]] \
        = self.mdp.apply_finite_policy(policy)
    return evaluate(unwrap_finite_horizon_MRP(mrp), 1.)
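For context, a call could look like the sketch below, which mirrors the FinitePolicy construction that appears later on this page. Everything here is illustrative: `cp` stands in for an instance of the class defining get_vf_for_policy, and the constant action-0 policy is a placeholder, not the source's policy.

# Hypothetical usage: `cp` is assumed to be an instance of the class that
# defines get_vf_for_policy; always choosing action 0 is a placeholder.
always_zero: FinitePolicy[WithTime[int], int] = FinitePolicy(
    {s: Constant(0) for s in cp.mdp.states()}
)
for t, vf in enumerate(cp.get_vf_for_policy(always_zero)):
    print(f"Time Step {t:d}")
    pprint(vf)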
Example #2
    def test_evaluate_mrp(self):
        # Exact value functions via backward induction on the MRP sequence
        vf = evaluate(self.mrp_seq, 1.)

        states = self.single_step_mrp.states()
        fa_dynamic = Dynamic({s: 0.0 for s in states})
        fa_tabular = Tabular()
        distribution = Choose(set(states))

        # Backward evaluation with an exact (Dynamic) function approximation
        approx_vf_finite = backward_evaluate_finite(
            [(self.mrp_seq[i], fa_dynamic) for i in range(self.steps)],
            1.
        )

        # Sampling-based backward evaluation with a Tabular approximation
        approx_vf = backward_evaluate(
            [(self.single_step_mrp, fa_tabular, distribution)
             for _ in range(self.steps)],
            1.,
            num_state_samples=120,
            error_tolerance=0.01
        )

        # The exact and Dynamic-based results should agree tightly; the
        # sampled Tabular result only within a looser tolerance.
        for t, (v1, v2, v3) in enumerate(zip(
                vf,
                approx_vf_finite,
                approx_vf
        )):
            states = self.mrp_seq[t].keys()
            v1_arr = np.array([v1[s] for s in states])
            v2_arr = v2.evaluate(states)
            v3_arr = v3.evaluate(states)
            self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
            self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
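The fixtures referenced above (self.mrp_seq, self.single_step_mrp, self.steps) are defined outside this excerpt. A minimal setUp along the following lines would make the test self-contained; the two-state transition probabilities and rewards are placeholders, not values from the source.

    def setUp(self):
        # Illustrative two-state MRP; probabilities and rewards are made up
        self.single_step_mrp = FiniteMarkovRewardProcess({
            True: Categorical({(True, 1.0): 0.7, (False, 2.0): 0.3}),
            False: Categorical({(True, 2.0): 0.3, (False, 1.0): 0.7})
        })
        self.steps = 5
        self.mrp_seq = unwrap_finite_horizon_MRP(
            finite_horizon_MRP(self.single_step_mrp, self.steps)
        )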
Example #3
    def test_evaluate(self):
        process = finite_horizon_MRP(self.finite_flip_flop, 10)
        vs = list(evaluate(unwrap_finite_horizon_MRP(process), gamma=1))

        # One value function per time step of the 10-step horizon
        self.assertEqual(len(vs), 10)

        self.assertAlmostEqual(vs[0][NonTerminal(True)], 17)
        self.assertAlmostEqual(vs[0][NonTerminal(False)], 17)

        self.assertAlmostEqual(vs[5][NonTerminal(True)], 17 / 2)
        self.assertAlmostEqual(vs[5][NonTerminal(False)], 17 / 2)

        self.assertAlmostEqual(vs[9][NonTerminal(True)], 17 / 10)
        self.assertAlmostEqual(vs[9][NonTerminal(False)], 17 / 10)
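The asserted values all follow one pattern: with gamma = 1, the value at time t is 1.7 × (10 − t), i.e. 17 at t = 0, 17/2 at t = 5 and 17/10 at t = 9. This is consistent with the flip-flop process earning an expected reward of 1.7 per remaining step, independent of the current state.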
    def test_compare_to_backward_induction(self):
        finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

        # Exact value function over the time-augmented (WithTime) states
        v = evaluate_mrp_result(finite_horizon, gamma=1)
        self.assertEqual(len(v), 20)

        finite_v = list(
            evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1)
        )

        for time in range(10):
            self.assertAlmostEqual(v[WithTime(state=True, time=time)],
                                   finite_v[time][True])
            self.assertAlmostEqual(v[WithTime(state=False, time=time)],
                                   finite_v[time][False])
    def test_compare_to_backward_induction_approx(self):
        # Same comparison, but against the converged result of the
        # iterative finite-MRP evaluation with a Dynamic approximation
        finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

        start = Dynamic({s: 0.0 for s in finite_horizon.states()})
        v = FunctionApprox.converged(
            evaluate_finite_mrp(finite_horizon, γ=1, approx_0=start))
        self.assertEqual(len(v.values_map), 22)

        finite_v = list(
            evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1)
        )

        for time in range(10):
            self.assertAlmostEqual(v(WithTime(state=True, time=time)),
                                   finite_v[time][True])
            self.assertAlmostEqual(v(WithTime(state=False, time=time)),
                                   finite_v[time][False])
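These two comparison tests check the same per-time-step values against backward induction by different routes: the first computes the value function exactly with evaluate_mrp_result, while the second iterates evaluate_finite_mrp from an all-zero Dynamic approximation and takes the converged result. The converged approximation holds 22 entries rather than 20, presumably because finite_horizon.states() also yields the two time-10 states at the end of the horizon.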
    print("Clearance Pricing MDP")
    print("---------------------")
    print(cp.mdp)

    def policy_func(x: int) -> int:
        # Map inventory level to a price index: 0 if x < 2, 1 if x < 5,
        # 2 if x < 8, else 3
        return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))

    stationary_policy: FinitePolicy[int, int] = FinitePolicy(
        {s: Constant(policy_func(s)) for s in range(ii + 1)}
    )

    single_step_mrp: FiniteMarkovRewardProcess[int] = \
        cp.single_step_mdp.apply_finite_policy(stationary_policy)

    vf_for_policy: Iterator[V[int]] = evaluate(
        unwrap_finite_horizon_MRP(finite_horizon_MRP(single_step_mrp, steps)),
        1.
    )
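This script excerpt relies on definitions made earlier in the full program and not shown here: cp (the clearance-pricing wrapper whose mdp, single_step_mdp and get_optimal_vf_and_policy() are used), ii (presumably the initial inventory, so the policy covers states 0 through ii) and steps (the horizon length).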

    print("Value Function for Stationary Policy")
    print("------------------------------------")
    for t, vf in enumerate(vf_for_policy):
        print(f"Time Step {t:d}")
        print("---------------")
        pprint(vf)

    print("Optimal Value Function and Optimal Policy")
    print("------------------------------------")
    prices = []
    for t, (vf, policy) in enumerate(cp.get_optimal_vf_and_policy()):
        print(f"Time Step {t:d}")
        print("---------------")