def test_unwrap_finite_horizon_MDP(self):
    """Unwrapping a 10-step finite-horizon MDP yields one stage per step,
    each stage's action mapping matching the flip-flop dynamics."""
    finite = finite_horizon_MDP(self.finite_flip_flop, 10)
    unwrapped = unwrap_finite_horizon_MDP(finite)
    self.assertEqual(len(unwrapped), 10)

    def expected_mapping(
            s: WithTime[bool]) -> ActionMapping[bool, WithTime[bool]]:
        # "stay" keeps the boolean state, "flip" negates it; both advance time.
        stay = s.step_time()
        flip = dataclasses.replace(s.step_time(), state=not s.state)
        return {
            True: Categorical({(stay, 1.0): 0.7, (flip, 2.0): 0.3}),
            False: Categorical({(stay, 1.0): 0.3, (flip, 2.0): 0.7})
        }

    for time in range(10):
        for state in (True, False):
            timed_state = WithTime(state=state, time=time)
            for action in (True, False):
                distribution.assert_almost_equal(
                    self,
                    finite.action_mapping(timed_state)[action],
                    expected_mapping(timed_state)[action])

    # Past the horizon there is no action mapping at all.
    self.assertEqual(
        finite.action_mapping(WithTime(state=True, time=10)), None)
def setUp(self):
    """Build a clearance-pricing MDP plus the MRP/MDP stage sequences
    induced by a fixed inventory-threshold pricing policy."""
    initial_inventory = 10
    self.steps = 6
    price_lambda_pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
    self.cp: ClearancePricingMDP = ClearancePricingMDP(
        initial_inventory=initial_inventory,
        time_steps=self.steps,
        price_lambda_pairs=price_lambda_pairs)

    def threshold_policy(inventory: int) -> int:
        # Deeper discounts (higher action index) at higher inventory levels.
        if inventory < 2:
            return 0
        if inventory < 5:
            return 1
        if inventory < 8:
            return 2
        return 3

    stationary_policy: FiniteDeterministicPolicy[int, int] = \
        FiniteDeterministicPolicy(
            {s: threshold_policy(s) for s in range(initial_inventory + 1)}
        )

    self.single_step_mrp: FiniteMarkovRewardProcess[int] = \
        self.cp.single_step_mdp.apply_finite_policy(stationary_policy)
    self.mrp_seq = unwrap_finite_horizon_MRP(
        finite_horizon_MRP(self.single_step_mrp, self.steps))

    self.single_step_mdp: FiniteMarkovDecisionProcess[int, int] = \
        self.cp.single_step_mdp
    self.mdp_seq = unwrap_finite_horizon_MDP(
        finite_horizon_MDP(self.single_step_mdp, self.steps))
def test_optimal_policy(self):
    """Backward induction on the flip-flop MDP: the optimal action is
    always False, and the optimal values match the known closed forms."""
    finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
    stages = unwrap_finite_horizon_MDP(finite)
    *vf_policy_pairs, (_, last_policy) = optimal_vf_and_policy(stages, gamma=1)
    for action in last_policy.action_for.values():
        self.assertEqual(action, False)
    self.assertAlmostEqual(vf_policy_pairs[0][0][NonTerminal(True)], 17)
    self.assertAlmostEqual(vf_policy_pairs[5][0][NonTerminal(False)], 17 / 2)
def test_optimal_policy(self):
    """Backward induction on the flip-flop MDP (state-keyed API): the
    optimal action is always False and values match the closed forms."""
    finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
    stages = unwrap_finite_horizon_MDP(finite)
    *vf_policy_pairs, (final_v, final_p) = \
        optimal_vf_and_policy(stages, gamma=1)
    for state in final_p.states():
        self.assertEqual(final_p.act(state), Constant(False))
    self.assertAlmostEqual(vf_policy_pairs[0][0][True], 17)
    self.assertAlmostEqual(vf_policy_pairs[5][0][False], 17 / 2)
def test_finite_horizon_MDP(self):
    """A 10-step horizon over 2 states gives 20 non-terminal time-stamped
    states, each with both boolean actions and flip-flop transitions."""
    finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
    self.assertEqual(len(finite.non_terminal_states), 20)
    for state in finite.non_terminal_states:
        self.assertEqual(set(finite.actions(state)), {False, True})

    start = NonTerminal(WithTime(state=True, time=0))
    expected = Categorical({
        (NonTerminal(WithTime(False, time=1)), 2.0): 0.7,
        (NonTerminal(WithTime(True, time=1)), 1.0): 0.3
    })
    distribution.assert_almost_equal(
        self, finite.mapping[start][False], expected)
def __init__(self, initial_inventory: int, time_steps: int,
             price_lambda_pairs: Sequence[Tuple[float, float]]):
    """Clearance-pricing MDP: state is remaining inventory, action i sets
    price prices[i], and demand is Poisson with the paired lambda.

    :param initial_inventory: starting (maximum) inventory level
    :param time_steps: horizon length for the finite-horizon MDP
    :param price_lambda_pairs: (price, demand-rate) pairs, one per action
    """
    self.initial_inventory = initial_inventory
    self.time_steps = time_steps
    self.price_lambda_pairs = price_lambda_pairs

    demand_dists = [poisson(lam) for _, lam in price_lambda_pairs]
    prices = [price for price, _ in price_lambda_pairs]

    # From inventory s under action i: sell k < s units with Poisson pmf(k);
    # the entire upper tail (demand >= s) collapses onto selling out (k == s).
    self.single_step_mdp: FiniteMarkovDecisionProcess[int, int] = \
        FiniteMarkovDecisionProcess({
            s: {i: Categorical({
                    (s - k, prices[i] * k):
                    (demand_dists[i].pmf(k) if k < s
                     else 1 - demand_dists[i].cdf(s - 1))
                    for k in range(s + 1)})
                for i in range(len(prices))}
            for s in range(initial_inventory + 1)
        })
    self.mdp = finite_horizon_MDP(self.single_step_mdp, time_steps)
def test_finite_horizon_MDP(self):
    """A 10-step horizon over 2 states gives 22 time-stamped states
    (20 non-terminal + 2 terminal); non-terminal states offer both
    boolean actions and transition per the flip-flop dynamics."""
    finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
    self.assertEqual(len(finite.states()), 22)
    for state in finite.states():
        actions = set(finite.actions(state))
        # Terminal states have no actions; all others have both.
        if actions:
            self.assertEqual(actions, {False, True})

    start = WithTime(state=True, time=0)
    expected = Categorical({
        (WithTime(False, time=1), 2.0): 0.7,
        (WithTime(True, time=1), 1.0): 0.3
    })
    distribution.assert_almost_equal(
        self, finite.action_mapping(start)[False], expected)

    # Stepping from a state at the horizon yields no distribution.
    self.assertEqual(finite.step(WithTime(True, 10), True), None)