mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp()
num_samples_val = 100
softmax_flag = False
epsilon_val = 0.0
epsilon_half_life_val = 30
tol_val = 1e-4
fa_spec_val = FuncApproxSpec(
    state_feature_funcs=[
        lambda s: 1. if s == 1 else 0.,
        lambda s: 1. if s == 2 else 0.,
        lambda s: 1. if s == 3 else 0.
    ],
    action_feature_funcs=[],
    dnn_spec=DNNSpec(
        neurons=[2, 4],
        hidden_activation=DNNSpec.relu,
        hidden_activation_deriv=DNNSpec.relu_deriv,
        output_activation=DNNSpec.identity,
        output_activation_deriv=DNNSpec.identity_deriv
    )
)
adp_obj = ADP(
    mdp_rep_for_adp=mdp_rep_obj,
    num_samples=num_samples_val,
    softmax=softmax_flag,
    epsilon=epsilon_val,
    epsilon_half_life=epsilon_half_life_val,
    tol=tol_val,
    fa_spec=fa_spec_val
)


def policy_func(i: int) -> Mapping[str, float]:
    if i == 1:
        ret = {'a': 0.4, 'b': 0.6}
    elif i == 2:
        ret = {'a': 0.7, 'c': 0.3}
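The three state_feature_funcs above are simply a one-hot encoding of the states 1, 2 and 3, so the function approximator receives an indicator vector per state. A minimal standalone sketch of what those lambdas compute (illustration only, not library code):

# Illustration only: mirrors the three state feature functions above.
one_hot_features = [
    lambda s: 1. if s == 1 else 0.,
    lambda s: 1. if s == 2 else 0.,
    lambda s: 1. if s == 3 else 0.
]
print([f(2) for f in one_hot_features])  # [0.0, 1.0, 0.0]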
    discount_rate=rho
)
reinforce_val = True
num_state_samples_val = 500
num_next_state_samples_val = 30
num_action_samples_val = 50
num_batches_val = 3000
actor_lambda_val = 0.99
critic_lambda_val = 0.99
actor_mu = FuncApproxSpec(
    state_feature_funcs=[],
    action_feature_funcs=[],
    dnn_spec=DNNSpec(
        neurons=[],
        hidden_activation=DNNSpec.log_squish,
        hidden_activation_deriv=DNNSpec.log_squish_deriv,
        output_activation=DNNSpec.sigmoid,
        output_activation_deriv=DNNSpec.sigmoid_deriv
    )
)
actor_nu = FuncApproxSpec(
    state_feature_funcs=[],
    action_feature_funcs=[],
    dnn_spec=DNNSpec(
        neurons=[],
        hidden_activation=DNNSpec.log_squish,
        hidden_activation_deriv=DNNSpec.log_squish_deriv,
        output_activation=DNNSpec.pos_log_squish,
        output_activation_deriv=DNNSpec.pos_log_squish_deriv
    )
)
actor_mean = FuncApproxSpec(
    state_feature_funcs=[],
    action_feature_funcs=[],
    dnn_spec=None
)
actor_variance = FuncApproxSpec(
    state_feature_funcs=[],
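The naming (actor_mu, actor_nu, actor_mean, actor_variance), together with the sigmoid output for the mean head and the positive pos_log_squish output for the variance head, suggests a Gaussian policy parameterized by its mean and variance. Under that reading, and with made-up values for mu and nu, sampling an action reduces to the following standalone sketch (not library code):

import numpy as np

# Hypothetical illustration: `mu` stands for the output of actor_mu (sigmoid
# head, in (0, 1)) and `nu` for the output of actor_nu (positive head).
rng = np.random.default_rng(0)
mu, nu = 0.6, 0.25
action = rng.normal(loc=mu, scale=np.sqrt(nu))
print(action)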
def get_rl_fa_price(
    self,
    num_dt: int,
    method: str,
    exploring_start: bool,
    algorithm: TDAlgorithm,
    softmax: bool,
    epsilon: float,
    epsilon_half_life: float,
    lambd: float,
    num_paths: int,
    batch_size: int,
    feature_funcs: Sequence[Callable[[Tuple[StateType, ActionType]], float]],
    neurons: Optional[Sequence[int]],
    learning_rate: float,
    learning_rate_decay: float,
    adam: Tuple[bool, float, float],
    offline: bool
) -> float:
    dt = self.expiry / num_dt

    def sa_func(_: StateType) -> Set[ActionType]:
        return {True, False}

    # noinspection PyShadowingNames
    def terminal_state(s: StateType, num_dt=num_dt) -> bool:
        return s[0] > num_dt

    # noinspection PyShadowingNames
    def sr_func(
        s: StateType,
        a: ActionType,
        num_dt=num_dt
    ) -> Tuple[StateType, float]:
        return self.state_reward_gen(s, a, num_dt)

    def init_s() -> StateType:
        return 0, np.array([self.spot_price])

    def init_sa() -> Tuple[StateType, ActionType]:
        return init_s(), choice([True, False])

    # noinspection PyShadowingNames
    mdp_rep_obj = MDPRepForRLFA(
        state_action_func=sa_func,
        gamma=ALMOSTONEGAMMA,
        terminal_state_func=terminal_state,
        state_reward_gen_func=sr_func,
        init_state_gen=init_s,
        init_state_action_gen=init_sa
    )

    fa_spec = FuncApproxSpec(
        state_feature_funcs=[],
        sa_feature_funcs=feature_funcs,
        dnn_spec=(None if neurons is None else DNNSpec(
            neurons=neurons,
            hidden_activation=DNNSpec.log_squish,
            hidden_activation_deriv=DNNSpec.log_squish_deriv,
            output_activation=DNNSpec.pos_log_squish,
            output_activation_deriv=DNNSpec.pos_log_squish_deriv
        )),
        learning_rate=learning_rate,
        adam_params=adam,
        add_unit_feature=False
    )

    if method == "MC":
        rl_fa_obj = MonteCarlo(
            mdp_rep_for_rl=mdp_rep_obj,
            exploring_start=exploring_start,
            softmax=softmax,
            epsilon=epsilon,
            epsilon_half_life=epsilon_half_life,
            num_episodes=num_paths,
            max_steps=num_dt + 2,
            fa_spec=fa_spec
        )
    elif method == "TD0":
        rl_fa_obj = TD0(
            mdp_rep_for_rl=mdp_rep_obj,
            exploring_start=exploring_start,
            algorithm=algorithm,
            softmax=softmax,
            epsilon=epsilon,
            epsilon_half_life=epsilon_half_life,
            num_episodes=num_paths,
            max_steps=num_dt + 2,
            fa_spec=fa_spec
        )
    elif method == "TDL":
        rl_fa_obj = TDLambda(
            mdp_rep_for_rl=mdp_rep_obj,
            exploring_start=exploring_start,
            algorithm=algorithm,
            softmax=softmax,
            epsilon=epsilon,
            epsilon_half_life=epsilon_half_life,
            lambd=lambd,
            num_episodes=num_paths,
            batch_size=batch_size,
            max_steps=num_dt + 2,
            fa_spec=fa_spec,
            offline=offline
        )
    elif method == "TDE":
        rl_fa_obj = TDLambdaExact(
            mdp_rep_for_rl=mdp_rep_obj,
            exploring_start=exploring_start,
            algorithm=algorithm,
            softmax=softmax,
            epsilon=epsilon,
            epsilon_half_life=epsilon_half_life,
            lambd=lambd,
            num_episodes=num_paths,
            batch_size=batch_size,
            max_steps=num_dt + 2,
            state_feature_funcs=[],
            sa_feature_funcs=feature_funcs,
            learning_rate=learning_rate,
            learning_rate_decay=learning_rate_decay
        )
    else:
        rl_fa_obj = LSPI(
            mdp_rep_for_rl=mdp_rep_obj,
            exploring_start=exploring_start,
            softmax=softmax,
            epsilon=epsilon,
            epsilon_half_life=epsilon_half_life,
            num_episodes=num_paths,
            batch_size=batch_size,
            max_steps=num_dt + 2,
            state_feature_funcs=[],
            sa_feature_funcs=feature_funcs
        )

    qvf = rl_fa_obj.get_qv_func_fa(None)
    # init_s = (0, np.array([self.spot_price]))
    # val_exec = qvf(init_s)(True)
    # val_cont = qvf(init_s)(False)
    # true_false_spot_max = max(val_exec, val_cont)

    all_paths = self.get_all_paths(0.0, num_paths, num_dt)
    prices = np.zeros(num_paths)

    for path_num, path in enumerate(all_paths):
        steps = 0
        while steps <= num_dt:
            price_seq = path[:(steps + 1)]
            state = (steps, price_seq)
            exercise_price = np.exp(-self.ir(dt * steps)) * \
                self.payoff(dt * steps, price_seq)
            continue_price = qvf(state)(False)
            steps += 1
            if exercise_price > continue_price:
                prices[path_num] = exercise_price
                steps = num_dt + 1
                # print(state)
                # print(exercise_price)
                # print(continue_price)
                # print(qvf(state)(True))

    return np.average(prices)
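The pricing loop walks each simulated path forward in time, compares the discounted exercise payoff at every step against the learned continuation value qvf(state)(False), and records the exercise payoff the first time it wins (terminating that path early). A runnable toy rendering of that decision rule, with a made-up continuation-value function and a put payoff standing in for the class's qvf and payoff (illustration only, not the class's code):

import numpy as np

# Toy stand-in for the exercise/continue decision in the loop above. `toy_qvf`
# and the put payoff are placeholders, not the pricing class's actual members.
def toy_qvf(state):
    _, price_seq = state
    return lambda a: 0.0 if a else 0.95 * max(100.0 - price_seq[-1], 0.0)

def toy_price_path(path, num_dt, dt, r=0.05, strike=100.0):
    steps = 0
    while steps <= num_dt:
        price_seq = path[:steps + 1]
        state = (steps, price_seq)
        exercise = np.exp(-r * dt * steps) * max(strike - price_seq[-1], 0.0)
        cont = toy_qvf(state)(False)
        steps += 1
        if exercise > cont:
            return exercise    # exercise now; stop walking this path
    return 0.0                 # never exercised

path = np.array([100.0, 96.0, 92.0, 97.0, 90.0])
print(toy_price_path(path, num_dt=4, dt=0.25))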
num_state_samples_val = 100
num_next_state_samples_val = 25
num_action_samples_val = 20
num_batches_val = 100
max_steps_val = 100
actor_lambda_val = 0.95
critic_lambda_val = 0.95
vf_fa_spec_val = FuncApproxSpec(
    state_feature_funcs=[
        lambda s: 1. if s == 1 else 0.,
        lambda s: 1. if s == 2 else 0.,
        lambda s: 1. if s == 3 else 0.
    ],
    action_feature_funcs=[],
    dnn_spec=DNNSpec(
        neurons=[2],
        hidden_activation=DNNSpec.relu,
        hidden_activation_deriv=DNNSpec.relu_deriv,
        output_activation=DNNSpec.identity,
        output_activation_deriv=DNNSpec.identity_deriv
    )
)
pol_fa_spec_val = [
    FuncApproxSpec(
        state_feature_funcs=[
            lambda s: 1. if s == 1 else 0.,
            lambda s: 1. if s == 2 else 0.,
            lambda s: 1. if s == 3 else 0.
        ],
        action_feature_funcs=[],
        dnn_spec=DNNSpec(
            neurons=[3],
            hidden_activation=DNNSpec.relu,
            hidden_activation_deriv=DNNSpec.relu_deriv,
            output_activation=DNNSpec.sigmoid,
            output_activation_deriv=DNNSpec.sigmoid_deriv
        )
    )
]
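Note how the two specs differ in their output layers: the value-function approximator ends in an identity output (unbounded value estimates), while the policy approximator ends in a sigmoid, which keeps each raw output in (0, 1). A generic standalone illustration of how sigmoid scores can be normalized into action probabilities (made-up scores; not necessarily how this library builds its policy):

import numpy as np

# Generic illustration with made-up scores: a sigmoid maps raw outputs into
# (0, 1); normalizing them yields a probability distribution over actions.
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

raw_scores = np.array([0.8, -0.3, 1.5])   # hypothetical, one score per action
probs = sigmoid(raw_scores)
probs /= probs.sum()
print(probs, probs.sum())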
        errors = np.array([x[-1][0] for x in all_fwd_prop]) - \
            np.array(supervisory_seq)
        return get_generalized_back_prop(
            dnn_params=self.params,
            layer_inputs=layer_inputs,
            factors=errors,
            dObj_dSL=np.ones_like(errors),
            decay_param=gamma_lambda,
            hidden_activation_deriv=self.hidden_activation_deriv
        )


if __name__ == '__main__':
    this_dnn_obj = DNNSpec(
        neurons=[2],
        hidden_activation=DNNSpec.relu,
        hidden_activation_deriv=DNNSpec.relu_deriv
    )
    nn = DNN(
        feature_funcs=FuncApproxBase.get_identity_feature_funcs(3),
        dnn_obj=this_dnn_obj,
        reglr_coeff=0.,
        learning_rate=1.,
        adam=True,
        adam_decay1=0.9,
        adam_decay2=0.999
    )
    init_eval = nn.get_func_eval((2.0, 3.0, -4.0))
    print(init_eval)
    x_pts = np.arange(-10.0, 10.0, 0.5)
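The DNN built above takes three identity input features, has one hidden layer of two ReLU units (neurons=[2]) and a scalar output, and get_func_eval runs a forward pass at the point (2.0, 3.0, -4.0). A generic sketch of what such a forward pass computes, with random placeholder weights rather than the object's actual parameters (illustration only):

import numpy as np

# Generic forward pass for a 3-input, one-hidden-layer (2 ReLU units),
# scalar-output network. Weights are placeholders, not the DNN's parameters.
rng = np.random.default_rng(0)
W1, b1 = rng.normal(size=(2, 3)), np.zeros(2)
w2, b2 = rng.normal(size=2), 0.0
x = np.array([2.0, 3.0, -4.0])           # same point evaluated by nn above
hidden = np.maximum(W1 @ x + b1, 0.0)    # ReLU hidden layer
print(hidden @ w2 + b2)                  # scalar output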
num_samples_val = 100
softmax_flag = False
epsilon_val = 0.0
epsilon_half_life_val = 30
tol_val = 1e-4
fa_spec_val = FuncApproxSpec(
    state_feature_funcs=[
        lambda s: 1. if s == 1 else 0.,
        lambda s: 1. if s == 2 else 0.,
        lambda s: 1. if s == 3 else 0.
    ],
    action_feature_funcs=[],
    dnn_spec=DNNSpec(
        neurons=[2, 4],
        hidden_activation=DNNSpec.relu,
        hidden_activation_deriv=DNNSpec.relu_deriv
    )
)
adp_obj = ADP(
    mdp_rep_for_adp=mdp_rep_obj,
    num_samples=num_samples_val,
    softmax=softmax_flag,
    epsilon=epsilon_val,
    epsilon_half_life=epsilon_half_life_val,
    tol=tol_val,
    fa_spec=fa_spec_val
)


def policy_func(i: int) -> Mapping[str, float]:
    if i == 1: