def test_finite_horizon_MRP(self):
    finite = finite_horizon_MRP(self.finite_flip_flop, 10)

    trues = [NonTerminal(WithTime(True, time)) for time in range(10)]
    falses = [NonTerminal(WithTime(False, time)) for time in range(10)]
    non_terminal_states = set(trues + falses)
    self.assertEqual(set(finite.non_terminal_states), non_terminal_states)

    expected_transition = {}
    for state in non_terminal_states:
        t: int = state.state.time
        st: bool = state.state.state
        if t < 9:
            prob = {
                (NonTerminal(WithTime(st, t + 1)), 1.0): 0.3,
                (NonTerminal(WithTime(not st, t + 1)), 2.0): 0.7
            }
        else:
            prob = {
                (Terminal(WithTime(st, t + 1)), 1.0): 0.3,
                (Terminal(WithTime(not st, t + 1)), 2.0): 0.7
            }
        expected_transition[state] = Categorical(prob)

    for state in non_terminal_states:
        distribution.assert_almost_equal(
            self,
            finite.transition_reward(state),
            expected_transition[state])
def test_unwrap_finite_horizon_MDP(self):
    finite = finite_horizon_MDP(self.finite_flip_flop, 10)
    unwrapped = unwrap_finite_horizon_MDP(finite)
    self.assertEqual(len(unwrapped), 10)

    def action_mapping_for(s: WithTime[bool]) -> \
            ActionMapping[bool, WithTime[bool]]:
        same = NonTerminal(s.step_time())
        different = NonTerminal(dataclasses.replace(
            s.step_time(), state=not s.state
        ))

        return {
            True: Categorical({
                (same, 1.0): 0.7,
                (different, 2.0): 0.3
            }),
            False: Categorical({
                (same, 1.0): 0.3,
                (different, 2.0): 0.7
            })
        }

    for t in range(9):
        for s in True, False:
            s_time = WithTime(state=s, time=t)
            for a in True, False:
                distribution.assert_almost_equal(
                    self,
                    finite.mapping[NonTerminal(s_time)][a],
                    action_mapping_for(s_time)[a]
                )

    for s in True, False:
        s_time = WithTime(state=s, time=9)
        same = Terminal(s_time.step_time())
        different = Terminal(dataclasses.replace(
            s_time.step_time(), state=not s_time.state
        ))
        act_map = {
            True: Categorical({
                (same, 1.0): 0.7,
                (different, 2.0): 0.3
            }),
            False: Categorical({
                (same, 1.0): 0.3,
                (different, 2.0): 0.7
            })
        }
        for a in True, False:
            distribution.assert_almost_equal(
                self,
                finite.mapping[NonTerminal(s_time)][a],
                act_map[a]
            )
def test_optimal_policy(self):
    finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
    steps = unwrap_finite_horizon_MDP(finite)
    *v_ps, (_, p) = optimal_vf_and_policy(steps, gamma=1)

    for _, a in p.action_for.items():
        self.assertEqual(a, False)

    self.assertAlmostEqual(v_ps[0][0][NonTerminal(True)], 17)
    self.assertAlmostEqual(v_ps[5][0][NonTerminal(False)], 17 / 2)
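# Aside (not part of the original test): a minimal sketch of the arithmetic behind the
# asserted values, assuming the flip-flop rewards/probabilities used in these tests
# (action False: stay with prob 0.3 and reward 1.0, flip with prob 0.7 and reward 2.0;
# action True: the reverse mix). The helper name is illustrative only.
def _flip_flop_optimal_value_check() -> None:
    per_step_false = 0.3 * 1.0 + 0.7 * 2.0   # 1.7 expected reward per step under False
    per_step_true = 0.7 * 1.0 + 0.3 * 2.0    # 1.3 expected reward per step under True
    assert per_step_false > per_step_true    # so the optimal policy always chooses False
    assert abs(per_step_false * 10 - 17) < 1e-9       # value with 10 steps remaining
    assert abs(per_step_false * 5 - 17 / 2) < 1e-9    # value with 5 steps remaining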
def test_finite_horizon_MDP(self):
    finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
    self.assertEqual(len(finite.non_terminal_states), 20)

    for s in finite.non_terminal_states:
        self.assertEqual(set(finite.actions(s)), {False, True})

    start = NonTerminal(WithTime(state=True, time=0))
    result = finite.mapping[start][False]
    expected_result = Categorical({
        (NonTerminal(WithTime(False, time=1)), 2.0): 0.7,
        (NonTerminal(WithTime(True, time=1)), 1.0): 0.3
    })
    distribution.assert_almost_equal(self, result, expected_result)
def states_sampler_func() -> NonTerminal[float]:
    # draw a starting price, then sample the time-t price from the
    # lognormal (GBM) distribution with drift (rate - vol^2 / 2) * time
    start: float = np.random.lognormal(log_mean, log_stdev)
    price = np.exp(np.random.normal(
        np.log(start) + (self.rate - self.vol * self.vol / 2) * time,
        self.vol * np.sqrt(time)
    ))
    return NonTerminal(price)
def test_unwrap_finite_horizon_MRP(self):
    finite = finite_horizon_MRP(self.finite_flip_flop, 10)

    def transition_for(_):
        return {
            True: Categorical({
                (NonTerminal(True), 1.0): 0.3,
                (NonTerminal(False), 2.0): 0.7
            }),
            False: Categorical({
                (NonTerminal(True), 2.0): 0.7,
                (NonTerminal(False), 1.0): 0.3
            })
        }

    unwrapped = unwrap_finite_horizon_MRP(finite)
    self.assertEqual(len(unwrapped), 10)

    expected_transitions = [transition_for(n) for n in range(10)]
    for time in range(9):
        got = unwrapped[time]
        expected = expected_transitions[time]
        distribution.assert_almost_equal(
            self, got[NonTerminal(True)], expected[True]
        )
        distribution.assert_almost_equal(
            self, got[NonTerminal(False)], expected[False]
        )

    distribution.assert_almost_equal(
        self,
        unwrapped[9][NonTerminal(True)],
        Categorical({
            (Terminal(True), 1.0): 0.3,
            (Terminal(False), 2.0): 0.7
        })
    )

    distribution.assert_almost_equal(
        self,
        unwrapped[9][NonTerminal(False)],
        Categorical({
            (Terminal(True), 2.0): 0.7,
            (Terminal(False), 1.0): 0.3
        })
    )
def states_sampler_func() -> NonTerminal[PriceAndShares]:
    # roll the price dynamics forward t steps, selling a uniformly random
    # number of the remaining shares at each step, to sample a time-t state
    price: float = self.initial_price_distribution.sample()
    rem: int = self.shares
    for i in range(t):
        sell: int = Choose(range(rem + 1)).sample()
        price = self.price_dynamics[i](PriceAndShares(
            price=price,
            shares=rem
        )).sample()
        rem -= sell
    return NonTerminal(PriceAndShares(price=price, shares=rem))
def states_sampler_func() -> NonTerminal[float]:
    wealth: float = self.initial_wealth_distribution.sample()
    for i in range(t):
        distr: Distribution[float] = self.risky_return_distributions[i]
        rate: float = self.riskless_returns[i]
        alloc: float = actions_distr.sample()
        wealth = alloc * (1 + distr.sample()) + \
            (wealth - alloc) * (1 + rate)
    return NonTerminal(wealth)
def sr_sampler_func(
    wealth=wealth,
    alloc=alloc
) -> Tuple[State[float], float]:
    # the risky allocation earns the sampled risky return, the remainder earns
    # the riskless rate; utility of wealth is rewarded only at the horizon
    next_wealth: float = alloc * (1 + distr.sample()) \
        + (wealth.state - alloc) * (1 + rate)
    reward: float = utility_f(next_wealth) \
        if t == steps - 1 else 0.
    next_state: State[float] = Terminal(next_wealth) \
        if t == steps - 1 else NonTerminal(next_wealth)
    return (next_state, reward)
def sr_sampler_func(
    price=price,
    exer=exer
) -> Tuple[State[float], float]:
    # exercising ends the process with the payoff as reward; otherwise the
    # price takes one risk-neutral lognormal (GBM) step with zero reward
    if exer:
        return Terminal(0.), exer_payoff(price.state)
    else:
        next_price: float = np.exp(np.random.normal(
            np.log(price.state) + (r - s * s / 2) * dt,
            s * np.sqrt(dt)
        ))
        return NonTerminal(next_price), 0.
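# Aside (not part of the original code): a quick Monte Carlo sanity check of the
# log-normal step used above. With drift (r - s * s / 2) * dt the step is risk-neutral,
# i.e. E[next_price] = price * exp(r * dt). The function name and parameter values
# below are illustrative assumptions, not taken from the source.
def _check_risk_neutral_drift(
    price: float = 100.0,
    r: float = 0.05,
    s: float = 0.2,
    dt: float = 1 / 252
) -> None:
    import numpy as np
    z = np.random.default_rng(0).standard_normal(1_000_000)
    next_prices = np.exp(np.log(price) + (r - s * s / 2) * dt + s * np.sqrt(dt) * z)
    # sample mean of the simulated next prices should match price * exp(r * dt)
    assert abs(next_prices.mean() - price * np.exp(r * dt)) < 0.05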
def get_opt_vf_and_policy(self) -> \
        Iterator[Tuple[V[int], FiniteDeterministicPolicy[int, bool]]]:
    dt: float = self.dt()
    up_factor: float = np.exp(self.vol * np.sqrt(dt))
    # risk-neutral probability of an up-move in the binomial tree
    up_prob: float = (np.exp(self.rate * dt) * up_factor - 1) / \
        (up_factor * up_factor - 1)
    # at step i, node j: action True exercises (terminal, payoff as reward);
    # action False continues to node j + 1 with prob up_prob, else stays at j
    return optimal_vf_and_policy(
        steps=[
            {NonTerminal(j): {
                True: Constant((
                    Terminal(-1),
                    self.payoff(i * dt, self.state_price(i, j))
                )),
                False: Categorical({
                    (NonTerminal(j + 1), 0.): up_prob,
                    (NonTerminal(j), 0.): 1 - up_prob
                })
            } for j in range(i + 1)}
            for i in range(self.num_steps + 1)
        ],
        gamma=np.exp(-self.rate * dt)
    )
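# Aside (not part of the original class): a small numerical check, assuming the usual
# CRR setup with down-move factor d = 1 / up_factor, that the up_prob expression above,
# (exp(r * dt) * u - 1) / (u * u - 1), equals the standard risk-neutral probability
# (exp(r * dt) - d) / (u - d). The helper name and sample parameters are illustrative.
def _check_up_prob(rate: float = 0.05, vol: float = 0.25, dt: float = 0.01) -> None:
    import numpy as np
    u = np.exp(vol * np.sqrt(dt))
    d = 1 / u
    p_as_coded = (np.exp(rate * dt) * u - 1) / (u * u - 1)
    p_crr = (np.exp(rate * dt) - d) / (u - d)
    assert abs(p_as_coded - p_crr) < 1e-12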
def sr_sampler_func(
    p_r=p_r,
    sell=sell
) -> Tuple[State[PriceAndShares], float]:
    # sell `sell` shares this step: the reward is the utility of proceeds,
    # i.e. shares sold times (current price less the execution price
    # difference price_diff[t]); then sample the next price from dynamics[t]
    p_s: PriceAndShares = PriceAndShares(
        price=p_r.state.price,
        shares=sell
    )
    next_price: float = dynamics[t](p_s).sample()
    next_rem: int = p_r.state.shares - sell
    next_state: PriceAndShares = PriceAndShares(
        price=next_price,
        shares=next_rem
    )
    reward: float = utility_f(
        sell * (p_r.state.price - price_diff[t](p_s))
    )
    return (NonTerminal(next_state), reward)
def sr_sampler_func(
    state=state,
    action=action
) -> Tuple[State[AssetAllocState], float]:
    # state is a (time, wealth) pair; the action is the risky allocation
    time, wealth = state.state
    next_wealth: float = action * (1 + distrs[time].sample()) \
        + (wealth - action) * (1 + rates[time])
    # utility of terminal wealth is the only reward, paid at the last step
    reward: float = utility_f(next_wealth) \
        if time == steps - 1 else 0.
    next_pair: AssetAllocState = (time + 1, next_wealth)
    next_state: State[AssetAllocState] = \
        Terminal(next_pair) if time == steps - 1 \
        else NonTerminal(next_pair)
    return (next_state, reward)
def test_evaluate(self):
    process = finite_horizon_MRP(self.finite_flip_flop, 10)
    vs = list(evaluate(unwrap_finite_horizon_MRP(process), gamma=1))

    self.assertEqual(len(vs), 10)

    self.assertAlmostEqual(vs[0][NonTerminal(True)], 17)
    self.assertAlmostEqual(vs[0][NonTerminal(False)], 17)

    self.assertAlmostEqual(vs[5][NonTerminal(True)], 17 / 2)
    self.assertAlmostEqual(vs[5][NonTerminal(False)], 17 / 2)

    self.assertAlmostEqual(vs[9][NonTerminal(True)], 17 / 10)
    self.assertAlmostEqual(vs[9][NonTerminal(False)], 17 / 10)
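# Aside (not part of the original test): the MRP counterpart of the check above. In the
# flip-flop MRP every step earns 1.0 with probability 0.3 and 2.0 with probability 0.7,
# i.e. 1.7 in expectation, so with gamma = 1 and a 10-step horizon vs[t] = 1.7 * (10 - t).
# The helper name is illustrative only.
def _flip_flop_mrp_value_check() -> None:
    per_step = 0.3 * 1.0 + 0.7 * 2.0           # 1.7 expected reward per step
    assert abs(per_step * 10 - 17) < 1e-9      # vs[0]
    assert abs(per_step * 5 - 17 / 2) < 1e-9   # vs[5]
    assert abs(per_step * 1 - 17 / 10) < 1e-9  # vs[9]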
def explore(s: S, mdp=mdp) -> Iterable[A]:
    return mdp.actions(NonTerminal(s))
if is_call:
    opt_payoff = lambda _, x: max(x - strike, 0)
else:
    opt_payoff = lambda _, x: max(strike - x, 0)

opt_ex_bin_tree: OptimalExerciseBinTree = OptimalExerciseBinTree(
    spot_price=spot_price_val,
    payoff=opt_payoff,
    expiry=expiry_val,
    rate=rate_val,
    vol=vol_val,
    num_steps=num_steps_val
)

vf_seq, policy_seq = zip(*opt_ex_bin_tree.get_opt_vf_and_policy())
ex_boundary: Sequence[Tuple[float, float]] = \
    opt_ex_bin_tree.option_exercise_boundary(policy_seq, is_call)
time_pts, ex_bound_pts = zip(*ex_boundary)
label = ("Call" if is_call else "Put") + " Option Exercise Boundary"
plot_list_of_curves(
    list_of_x_vals=[time_pts],
    list_of_y_vals=[ex_bound_pts],
    list_of_colors=["b"],
    list_of_curve_labels=[label],
    x_label="Time",
    y_label="Underlying Price",
    title=label
)

european: float = opt_ex_bin_tree.european_price(is_call, strike)
print(f"European Price = {european:.3f}")

am_price: float = vf_seq[0][NonTerminal(0)]
print(f"American Price = {am_price:.3f}")
def start_states_distribution_func() -> NonTerminal[AssetAllocState]:
    wealth: float = self.initial_wealth_distribution.sample()
    return NonTerminal((0, wealth))
actor_critic_adv_policies: Iterator[FunctionApprox[
    NonTerminal[AssetAllocState]]] = aad.actor_critic_advantage(
    q_feature_functions=q_ffs,
    q_dnn_spec=dnn_qvf_spec,
    v_feature_functions=v_ffs,
    v_dnn_spec=dnn_vf_spec
)

actor_critic_error_policies: Iterator[FunctionApprox[
    NonTerminal[AssetAllocState]]] = aad.actor_critic_td_error(
    feature_functions=v_ffs,
    q_value_dnn_spec=dnn_vf_spec
)

num_episodes: int = 20000

x: Sequence[int] = range(num_episodes)
y0: Sequence[float] = [base_alloc * (1 + r) ** (1 - steps)] * num_episodes
y1: Sequence[float] = [
    p(NonTerminal((init_wealth, 0)))
    for p in itertools.islice(reinforce_policies, num_episodes)
]
y2: Sequence[float] = [
    p(NonTerminal((init_wealth, 0)))
    for p in itertools.islice(
        actor_critic_policies, 0, num_episodes * steps, steps
    )
]
y3: Sequence[float] = [
    p(NonTerminal((init_wealth, 0)))
    for p in itertools.islice(
        actor_critic_adv_policies, 0, num_episodes * steps, steps
    )
]
def optimal_action(s: S) -> A:
    # greedy action: argmax over the (state, action) pairs returns the
    # maximizing pair, from which we keep only the action
    _, a = q.argmax((NonTerminal(s), a) for a in actions(NonTerminal(s)))
    return a
    func_approx=fa,
    initial_price_distribution=init_price_distrib
)

it_vf: Iterator[Tuple[ValueFunctionApprox[PriceAndShares],
                      DeterministicPolicy[PriceAndShares, int]]] = \
    ooe.backward_induction_vf_and_pi()

state: PriceAndShares = PriceAndShares(
    price=init_price_mean,
    shares=num_shares
)

print("Backward Induction: VF And Policy")
print("---------------------------------")
print()
for t, (vf, pol) in enumerate(it_vf):
    print(f"Time {t:d}")
    print()
    opt_sale: int = pol.action_for(state)
    val: float = vf(NonTerminal(state))
    print(f"Optimal Sales = {opt_sale:d}, Opt Val = {val:.3f}")
    print()
    print("Optimal Weights below:")
    print(vf.weights.weights)
    print()

print("Analytical Solution")
print("-------------------")
print()
for t in range(num_time_steps):
    print(f"Time {t:d}")
    print()
    left: int = num_time_steps - t
    opt_sale_anal: float = num_shares / num_time_steps
# print("Weights") # for w in v.weights: # print(w.weights) # print() it_qvf: Iterator[QValueFunctionApprox[float, float]] = \ aad.backward_induction_qvf() print("Backward Induction on Q-Value Function") print("--------------------------------------") print() for t, q in enumerate(it_qvf): print(f"Time {t:d}") print() opt_alloc: float = max( ((q((NonTerminal(init_wealth), ac)), ac) for ac in alloc_choices), key=itemgetter(0) )[1] val: float = max(q((NonTerminal(init_wealth), ac)) for ac in alloc_choices) print(f"Opt Risky Allocation = {opt_alloc:.3f}, Opt Val = {val:.3f}") print("Optimal Weights below:") for wts in q.weights: pprint(wts.weights) print() print("Analytical Solution") print("-------------------") print() for t in range(steps):
def optimal_value_curve(
    self,
    func: FunctionApprox[NonTerminal[float]],
    prices: Sequence[float]
) -> np.ndarray:
    return func.evaluate([NonTerminal(p) for p in prices])
# print("Weights") # for w in v.weights: # print(w.weights) # print() it_qvf: Iterator[QValueFunctionApprox[float, float]] = \ aad.backward_induction_qvf() print("Backward Induction on Q-Value Function") print("--------------------------------------") print() for t, q in enumerate(it_qvf): print(f"Time {t:d}") print() opt_alloc: float = max( ((q((NonTerminal(init_wealth), ac)), ac) for ac in alloc_choices), key=itemgetter(0))[1] val: float = max( q((NonTerminal(init_wealth), ac)) for ac in alloc_choices) print(f"Opt Risky Allocation = {opt_alloc:.3f}, Opt Val = {val:.3f}") print("Optimal Weights below:") for wts in q.weights: pprint(wts.weights) print() print("Analytical Solution") print("-------------------") print() for t in range(steps): print(f"Time {t:d}")
for t, (v, p) in enumerate(it_vf):
    print(f"Time {t:d}")
    print()
    if t == 0 or t == int(num_steps_val / 2) or t == num_steps_val - 1:
        exer_curve: np.ndarray = opt_ex_bi.exercise_curve(prices=prices)
        opt_val_curve: np.ndarray = opt_ex_bi.optimal_value_curve(
            func=v,
            prices=prices
        )
        plt.plot(prices, opt_val_curve, "r", prices, exer_curve, "b")
        time: float = t * expiry_val / num_steps_val
        plt.title(f"OptVal and Exercise Curves for Time = {time:.3f}")
        plt.show()
    all_funcs.append(v)
    opt_alloc: float = p.action_for(spot_price_val)
    val: float = v(NonTerminal(spot_price_val))
    print(f"Opt Action = {opt_alloc}, Opt Val = {val:.3f}")
    print()

ex_bound: Sequence[float] = opt_ex_bi.put_option_exercise_boundary(
    all_funcs, strike)
plt.plot(range(num_steps_val + 1), ex_bound)
plt.title("Exercise Boundary")
plt.show()

print("European Put Price")
print("------------------")
print()
print(opt_ex_bi.european_put_price(strike=strike))
actor_critic_adv_policies: Iterator[FunctionApprox[
    NonTerminal[AssetAllocState]]] = aad.actor_critic_advantage(
    q_feature_functions=q_ffs,
    q_dnn_spec=dnn_qvf_spec,
    v_feature_functions=v_ffs,
    v_dnn_spec=dnn_vf_spec
)

actor_critic_error_policies: Iterator[FunctionApprox[
    NonTerminal[AssetAllocState]]] = aad.actor_critic_td_error(
    feature_functions=v_ffs,
    q_value_dnn_spec=dnn_vf_spec
)

num_episodes: int = 50000

x: Sequence[int] = range(num_episodes)
y0: Sequence[float] = [base_alloc * (1 + r) ** (1 - steps)] * num_episodes
y1: Sequence[float] = [
    p(NonTerminal((init_wealth, 0)))
    for p in itertools.islice(reinforce_policies, num_episodes)
]
y2: Sequence[float] = [
    p(NonTerminal((init_wealth, 0)))
    for p in itertools.islice(
        actor_critic_policies, 0, num_episodes * steps, steps
    )
]
y3: Sequence[float] = [
    p(NonTerminal((init_wealth, 0)))
    for p in itertools.islice(
        actor_critic_adv_policies, 0, num_episodes * steps, steps
    )
]
y4: Sequence[float] = [
    p(NonTerminal((init_wealth, 0)))
    for p in itertools.islice(
        actor_critic_error_policies, 0, num_episodes * steps, steps
    )
]