Example 1
    def __init__(
        self,
        mdp_rep_for_rl_pg: MDPRepForRLPG,
        reinforce: bool,
        batch_size: int,
        num_batches: int,
        num_action_samples: int,
        max_steps: int,
        actor_lambda: float,
        critic_lambda: float,
        score_func: Callable[[A, Sequence[float]], Sequence[float]],
        sample_actions_gen_func: Callable[[Sequence[float], int], Sequence[A]],
        fa_spec: FuncApproxSpec,
        pol_fa_spec: Sequence[FuncApproxSpec]
    ) -> None:
        self.mdp_rep: MDPRepForRLPG = mdp_rep_for_rl_pg
        self.reinforce: bool = reinforce
        self.batch_size: int = batch_size
        self.num_batches: int = num_batches
        self.num_action_samples: int = num_action_samples
        self.max_steps: int = max_steps
        self.actor_lambda: float = actor_lambda
        self.critic_lambda: float = critic_lambda
        self.score_func: Callable[[A, Sequence[float]], Sequence[float]] =\
            score_func
        self.sample_actions_gen_func: Callable[[Sequence[float], int], Sequence[A]] =\
            sample_actions_gen_func
        self.vf_fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj()
        self.qvf_fa: FuncApproxBase = fa_spec.get_qvf_func_approx_obj()
        self.pol_fa: Sequence[FuncApproxBase] =\
            [s.get_vf_func_approx_obj() for s in pol_fa_spec]
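The score_func and sample_actions_gen_func arguments are only typed in this constructor. As a hedged illustration (not taken from this library), the pair below assumes a univariate Gaussian policy whose parameter vector is (mean, variance): the score is the gradient of the log-density with respect to those two parameters, and the sampler draws the requested number of actions from the corresponding normal distribution.

    import numpy as np
    from typing import Sequence

    def gaussian_score(action: float, params: Sequence[float]) -> Sequence[float]:
        # gradient of log N(action; mean, variance) with respect to (mean, variance)
        mean, var = params
        d_mean = (action - mean) / var
        d_var = -0.5 / var + (action - mean) ** 2 / (2. * var * var)
        return [d_mean, d_var]

    def gaussian_sample_actions(params: Sequence[float], n: int) -> Sequence[float]:
        # draw n actions from N(mean, variance)
        mean, var = params
        return list(np.random.normal(loc=mean, scale=np.sqrt(var), size=n))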
Example 2
 def actor_spec(self) -> Tuple[FuncApproxSpec, FuncApproxSpec]:
     ff = lambda s: (1. + self.r)**float(s[0])
     mean = FuncApproxSpec(state_feature_funcs=[ff],
                           sa_feature_funcs=[lambda x, ff=ff: ff(x[0])],
                           dnn_spec=None)
     variance = FuncApproxSpec(
         state_feature_funcs=[],
         sa_feature_funcs=[],
         dnn_spec=DNNSpec(
             neurons=[],
             hidden_activation=DNNSpec.log_squish,
             hidden_activation_deriv=DNNSpec.log_squish_deriv,
             output_activation=DNNSpec.pos_log_squish,
             output_activation_deriv=DNNSpec.pos_log_squish_deriv))
     return mean, variance
Example 3
    def __init__(self, mdp_rep_for_rl: MDPRepForRLFA, exploring_start: bool,
                 softmax: bool, epsilon: float, epsilon_half_life: float,
                 num_episodes: int, max_steps: int,
                 fa_spec: FuncApproxSpec) -> None:

        self.mdp_rep: MDPRepForRLFA = mdp_rep_for_rl
        self.exploring_start: bool = exploring_start
        self.softmax: bool = softmax
        self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func(
            epsilon, epsilon_half_life)
        self.num_episodes: int = num_episodes
        self.max_steps: int = max_steps
        self.vf_fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj()
        self.qvf_fa: FuncApproxBase = fa_spec.get_qvf_func_approx_obj()
        self.state_action_func = self.mdp_rep.state_action_func
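The get_epsilon_decay_func factory referenced above is not shown in these excerpts; it only has to return a Callable[[int], float]. A minimal sketch of what a half-life based decay could look like (an assumption, not the library's implementation):

    from typing import Callable

    def epsilon_decay(epsilon: float, epsilon_half_life: float) -> Callable[[int], float]:
        # exploration probability halves every epsilon_half_life episodes
        def eps(episode: int) -> float:
            return epsilon * 0.5 ** (episode / epsilon_half_life)
        return eps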
Example 4
 def __init__(self, mdp_rep_for_rl: MDPRepForRLFA, exploring_start: bool,
              algorithm: TDAlgorithm, softmax: bool, epsilon: float,
              epsilon_half_life: float, lambd: float, num_episodes: int,
              batch_size: int, max_steps: int,
              state_feature_funcs: Sequence[Callable[[S], float]],
              sa_feature_funcs: Sequence[Callable[[Tuple[S, A]], float]],
              learning_rate: float, learning_rate_decay: float) -> None:
     super().__init__(mdp_rep_for_rl=mdp_rep_for_rl,
                      exploring_start=exploring_start,
                      softmax=softmax,
                      epsilon=epsilon,
                      epsilon_half_life=epsilon_half_life,
                      num_episodes=num_episodes,
                      max_steps=max_steps,
                      fa_spec=FuncApproxSpec(
                          state_feature_funcs=state_feature_funcs,
                          sa_feature_funcs=sa_feature_funcs,
                          dnn_spec=None,
                          learning_rate=learning_rate,
                          add_unit_feature=False))
     self.vf_w: np.ndarray = np.zeros(self.vf_fa.num_features)
     self.qvf_w: np.ndarray = np.zeros(self.qvf_fa.num_features)
     self.vf_fa.params = [self.vf_w]
     self.qvf_fa.params = [self.qvf_w]
     self.algorithm: TDAlgorithm = algorithm
     self.gamma_lambda: float = self.mdp_rep.gamma * lambd
     self.batch_size: int = batch_size
     self.learning_rate_decay: float = learning_rate_decay
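This parameter list matches the TDLambdaExact construction in Example 17, and the explicit weight vectors vf_w / qvf_w together with the precomputed gamma_lambda are what a linear TD(lambda) update with eligibility traces operates on. The helper below is a generic sketch of that update, not this library's method; every name in it is introduced here for illustration.

    import numpy as np

    def td_lambda_step(w: np.ndarray, trace: np.ndarray, phi: np.ndarray,
                       phi_next: np.ndarray, reward: float, gamma: float,
                       lambd: float, alpha: float) -> None:
        # one linear TD(lambda) update; w and trace are modified in place
        trace *= gamma * lambd                # decay the eligibility trace
        trace += phi                          # accumulate the current feature vector
        td_error = reward + gamma * float(np.dot(w, phi_next)) - float(np.dot(w, phi))
        w += alpha * td_error * trace         # move the weights along the trace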
Example 5
    def __init__(
        self,
        mdp_rep_for_rl: MDPRepForRLFA,
        exploring_start: bool,
        softmax: bool,
        epsilon: float,
        epsilon_half_life: float,
        num_episodes: int,
        batch_size: int,
        max_steps: int,
        state_feature_funcs: Sequence[Callable[[S], float]],
        sa_feature_funcs: Sequence[Callable[[Tuple[S, A]], float]]
    ) -> None:

        super().__init__(
            mdp_rep_for_rl=mdp_rep_for_rl,
            exploring_start=exploring_start,
            softmax=softmax,
            epsilon=epsilon,
            epsilon_half_life=epsilon_half_life,
            num_episodes=num_episodes,
            max_steps=max_steps,
            fa_spec=FuncApproxSpec(
                state_feature_funcs=state_feature_funcs,
                sa_feature_funcs=sa_feature_funcs,
                dnn_spec=None,
                reglr_coeff=0.,
                learning_rate=0.,
                adam_params=(False, 0., 0.),
                add_unit_feature=True
            )
        )
        self.batch_size: int = batch_size
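This parameter list matches the LSPI construction in Example 17, and the FuncApproxSpec built here deliberately zeroes out learning_rate, reglr_coeff, and adam_params, which suggests the linear weights are fitted in closed form from each batch rather than by gradient steps. The function below is a generic least-squares (LSTDQ-style) sketch of that idea, not this library's code; the array layout is an assumption made for illustration.

    import numpy as np

    def lstdq_weights(phi: np.ndarray, phi_next: np.ndarray,
                      rewards: np.ndarray, gamma: float) -> np.ndarray:
        # phi: features of each sampled (s, a), shape (n_samples, n_features)
        # phi_next: features of (s', greedy a') for the same samples
        # solves A w = b with A = phi^T (phi - gamma * phi_next), b = phi^T rewards
        a_mat = phi.T @ (phi - gamma * phi_next)
        b_vec = phi.T @ rewards
        return np.linalg.solve(a_mat, b_vec)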
Example 6
 def __init__(self, mdp_rep_for_adp: MDPRepForADP, num_samples: int,
              softmax: bool, epsilon: float, epsilon_half_life: float,
              tol: float, fa_spec: FuncApproxSpec) -> None:
     self.mdp_rep: MDPRepForADP = mdp_rep_for_adp
     self.num_samples: int = num_samples
     self.softmax: bool = softmax
     self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func(
         epsilon, epsilon_half_life)
     self.tol: float = tol
     self.fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj()
     self.state_action_func: Callable[[S], Set[A]] =\
         self.mdp_rep.state_action_func
Example 7
 def get_actor_nu_spec() -> FuncApproxSpec:
     return FuncApproxSpec(
         state_feature_funcs=[],
         sa_feature_funcs=[],
         dnn_spec=DNNSpec(
             neurons=[],
             hidden_activation=DNNSpec.log_squish,
             hidden_activation_deriv=DNNSpec.log_squish_deriv,
             output_activation=DNNSpec.pos_log_squish,
             output_activation_deriv=DNNSpec.pos_log_squish_deriv
         )
     )
Example 8
    def critic_spec(self, neurons: Sequence[int]) -> FuncApproxSpec:
        def feature_func(state: StateType) -> float:
            t = float(state[0])
            # noinspection PyPep8Naming
            W = state[1]
            term1 = self.rho**(-t)
            term2 = np.exp((self.mu - self.r)**2 / (2 * self.sigma**2) * t)
            term3 = np.exp(-self.gamma * (1. + self.r)**(self.time_steps - t) *
                           W)
            return term1 * term2 * term3

        return FuncApproxSpec(
            state_feature_funcs=[feature_func],
            sa_feature_funcs=[
                lambda x, feature_func=feature_func: feature_func(x[0])
            ],
            dnn_spec=DNNSpec(neurons=neurons,
                             hidden_activation=DNNSpec.relu,
                             hidden_activation_deriv=DNNSpec.relu_deriv,
                             output_activation=DNNSpec.identity,
                             output_activation_deriv=DNNSpec.identity_deriv))
Example 9
    def get_actor_mu_spec(self, time_steps: int) -> FuncApproxSpec:
        tnu = self.get_nu()

        # noinspection PyShadowingNames
        def state_ff(state: Tuple[int, float], tnu=tnu) -> float:
            tte = self.expiry * (1. - float(state[0]) / time_steps)
            if tnu == 0:
                ret = 1. / (tte + self.epsilon)
            else:
                ret = tnu / (1. + (tnu * self.epsilon - 1.) * np.exp(-tnu * tte))
            return ret

        return FuncApproxSpec(
            state_feature_funcs=[state_ff],
            sa_feature_funcs=[lambda x, state_ff=state_ff: state_ff(x[0])],
            dnn_spec=DNNSpec(
                neurons=[],
                hidden_activation=DNNSpec.log_squish,
                hidden_activation_deriv=DNNSpec.log_squish_deriv,
                output_activation=DNNSpec.sigmoid,
                output_activation_deriv=DNNSpec.sigmoid_deriv
            )
        )
Example 10
    def get_critic_spec(self, time_steps: int) -> FuncApproxSpec:
        tnu = self.get_nu()
        gam = 1. - self.gamma

        # noinspection PyShadowingNames
        def state_ff(
            state: Tuple[int, float],
            tnu=tnu,
            gam=gam
        ) -> float:
            t = float(state[0]) * self.expiry / time_steps
            tte = self.expiry - t
            if tnu == 0:
                ret = tte + self.epsilon
            else:
                ret = (1. + (tnu * self.epsilon - 1.) * np.exp(-tnu * tte)) / tnu
            mult = state[1] ** gam / gam if gam != 0 else np.log(state[1])
            return ret ** self.gamma * mult / np.exp(self.rho * t)

        return FuncApproxSpec(
            state_feature_funcs=[state_ff],
            sa_feature_funcs=[lambda x, state_ff=state_ff: state_ff(x[0])],
            dnn_spec=None
        )
Example 11
 num_action_samples_val = 100
 max_steps_val = 100
 actor_lambda_val = 0.95
 critic_lambda_val = 0.95
 learning_rate_val = 0.1
 state_ff = [
     lambda s: 1. if s == 1 else 0.,
     lambda s: 1. if s == 2 else 0.,
     lambda s: 1. if s == 3 else 0.
 ]
 fa_spec_val = FuncApproxSpec(
     state_feature_funcs=state_ff,
     sa_feature_funcs=[(lambda x, f=f: f(x[0])) for f in state_ff],
     dnn_spec=DNNSpec(
         neurons=[2],
         hidden_activation=DNNSpec.relu,
         hidden_activation_deriv=DNNSpec.relu_deriv,
         output_activation=DNNSpec.identity,
         output_activation_deriv=DNNSpec.identity_deriv
     ),
     learning_rate=learning_rate_val
 )
 pol_fa_spec_val = [FuncApproxSpec(
     state_feature_funcs=state_ff,
     sa_feature_funcs=[(lambda x, f=f: f(x[0])) for f in state_ff],
     dnn_spec=DNNSpec(
         neurons=[2],
         hidden_activation=DNNSpec.relu,
         hidden_activation_deriv=DNNSpec.relu_deriv,
         output_activation=DNNSpec.sigmoid,
         output_activation_deriv=DNNSpec.sigmoid_deriv
     ),
Example 12
    mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
    mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_adp()

    num_samples_val = 100
    softmax_flag = False
    epsilon_val = 0.0
    epsilon_half_life_val = 30
    tol_val = 1e-4
    state_ff = [
        lambda s: 1. if s == 1 else 0.,
        lambda s: 1. if s == 2 else 0.,
        lambda s: 1. if s == 3 else 0.
    ]
    fa_spec_val = FuncApproxSpec(
        state_feature_funcs=state_ff,
        sa_feature_funcs=[(lambda x, f=f: f(x[0])) for f in state_ff],
        dnn_spec=DNNSpec(neurons=[2, 4],
                         hidden_activation=DNNSpec.relu,
                         hidden_activation_deriv=DNNSpec.relu_deriv,
                         output_activation=DNNSpec.identity,
                         output_activation_deriv=DNNSpec.identity_deriv))
    adp_obj = ADP(mdp_rep_for_adp=mdp_rep_obj,
                  num_samples=num_samples_val,
                  softmax=softmax_flag,
                  epsilon=epsilon_val,
                  epsilon_half_life=epsilon_half_life_val,
                  tol=tol_val,
                  fa_spec=fa_spec_val)

    def policy_func(i: int) -> Mapping[str, float]:
        if i == 1:
            ret = {'a': 0.4, 'b': 0.6}
        elif i == 2:
Example 13
    this_epsilon = 0.05
    this_epsilon_half_life = 30
    this_learning_rate = 0.1
    this_learning_rate_decay = 1e6
    this_lambd = 0.8
    this_num_episodes = 3000
    this_max_steps = 1000
    this_tdl_fa_offline = True
    state_ffs = FuncApproxBase.get_identity_feature_funcs(ic.lead_time + 1)
    sa_ffs = [(lambda x, f=f: f(x[0])) for f in state_ffs] + [lambda x: x[1]]
    this_fa_spec = FuncApproxSpec(
        state_feature_funcs=state_ffs,
        sa_feature_funcs=sa_ffs,
        dnn_spec=DNNSpec(
            neurons=[2, 4],
            hidden_activation=DNNSpec.relu,
            hidden_activation_deriv=DNNSpec.relu_deriv,
            output_activation=DNNSpec.identity,
            output_activation_deriv=DNNSpec.identity_deriv
        )
    )

    raa = RunAllAlgorithms(
        mdp_refined=mdp_ref_obj,
        tolerance=this_tolerance,
        exploring_start=exploring_start,
        first_visit_mc=this_first_visit_mc,
        num_samples=num_samples,
        softmax=this_softmax,
        epsilon=this_epsilon,
        epsilon_half_life=this_epsilon_half_life,
Example 14
                                     cons_util_func=util_func,
                                     beq_util_func=beq_util,
                                     discount_rate=rho)

    reinforce_val = True
    num_state_samples_val = 500
    num_next_state_samples_val = 30
    num_action_samples_val = 50
    num_batches_val = 3000
    actor_lambda_val = 0.99
    critic_lambda_val = 0.99

    actor_mu = FuncApproxSpec(
        state_feature_funcs=[],
        sa_feature_funcs=[],
        dnn_spec=DNNSpec(neurons=[],
                         hidden_activation=DNNSpec.log_squish,
                         hidden_activation_deriv=DNNSpec.log_squish_deriv,
                         output_activation=DNNSpec.sigmoid,
                         output_activation_deriv=DNNSpec.sigmoid_deriv))
    actor_nu = FuncApproxSpec(
        state_feature_funcs=[],
        sa_feature_funcs=[],
        dnn_spec=DNNSpec(neurons=[],
                         hidden_activation=DNNSpec.log_squish,
                         hidden_activation_deriv=DNNSpec.log_squish_deriv,
                         output_activation=DNNSpec.pos_log_squish,
                         output_activation_deriv=DNNSpec.pos_log_squish_deriv))
    actor_mean = FuncApproxSpec(state_feature_funcs=[],
                                sa_feature_funcs=[],
                                dnn_spec=None)
    actor_variance = FuncApproxSpec(
Example 15
 def get_actor_mean_spec() -> FuncApproxSpec:
     return FuncApproxSpec(
         state_feature_funcs=[],
         sa_feature_funcs=[],
         dnn_spec=None
     )
Example 16
    epsilon_half_life_val = 1000
    learning_rate_val = 0.1
    lambda_val = 0.7
    episodes_limit = 10000
    batch_size_val = 20
    max_steps_val = 1000
    offline_val = True
    state_ff = [lambda s: float(s)]
    sa_ff = [
        lambda x: float(x[0]),
        lambda x: 1. if x[1] == 'a' else 0.,
        lambda x: 1. if x[1] == 'b' else 0.,
        lambda x: 1. if x[1] == 'c' else 0.,
    ]
    fa_spec_val = FuncApproxSpec(state_feature_funcs=state_ff,
                                 sa_feature_funcs=sa_ff,
                                 dnn_spec=None,
                                 learning_rate=learning_rate_val)
    esl_obj = TDLambda(mdp_rep_obj, exploring_start_val, algorithm_type,
                       softmax_flag, epsilon_val, epsilon_half_life_val,
                       lambda_val, episodes_limit, batch_size_val,
                       max_steps_val, fa_spec_val, offline_val)

    def policy_func(i: int) -> Mapping[str, float]:
        if i == 1:
            ret = {'a': 0.4, 'b': 0.6}
        elif i == 2:
            ret = {'a': 0.7, 'c': 0.3}
        elif i == 3:
            ret = {'b': 1.0}
        else:
            raise ValueError
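As a hedged follow-on, the fitted Q-value function could be pulled from esl_obj in the same curried style used in Example 17 below (qvf = rl_fa_obj.get_qv_func_fa(None)); the meaning of the None argument and the sample state/action here are assumptions made for illustration.

    qvf = esl_obj.get_qv_func_fa(None)
    q_s1_a = qvf(1)('a')   # Q-value of action 'a' in state 1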
Example 17
    def get_rl_fa_price(self, num_dt: int, method: str, exploring_start: bool,
                        algorithm: TDAlgorithm, softmax: bool, epsilon: float,
                        epsilon_half_life: float, lambd: float, num_paths: int,
                        batch_size: int,
                        feature_funcs: Sequence[Callable[[Tuple[StateType, ActionType]], float]],
                        neurons: Optional[Sequence[int]],
                        learning_rate: float, learning_rate_decay: float,
                        adam: Tuple[bool, float, float],
                        offline: bool) -> float:
        dt = self.expiry / num_dt

        def sa_func(_: StateType) -> Set[ActionType]:
            return {True, False}

        # noinspection PyShadowingNames
        def terminal_state(s: StateType, num_dt=num_dt) -> bool:
            return s[0] > num_dt

        # noinspection PyShadowingNames
        def sr_func(s: StateType,
                    a: ActionType,
                    num_dt=num_dt) -> Tuple[StateType, float]:
            return self.state_reward_gen(s, a, num_dt)

        def init_s() -> StateType:
            return 0, np.array([self.spot_price])

        def init_sa() -> Tuple[StateType, ActionType]:
            return init_s(), choice([True, False])

        # noinspection PyShadowingNames
        mdp_rep_obj = MDPRepForRLFA(state_action_func=sa_func,
                                    gamma=ALMOSTONEGAMMA,
                                    terminal_state_func=terminal_state,
                                    state_reward_gen_func=sr_func,
                                    init_state_gen=init_s,
                                    init_state_action_gen=init_sa)

        fa_spec = FuncApproxSpec(
            state_feature_funcs=[],
            sa_feature_funcs=feature_funcs,
            dnn_spec=(None if neurons is None else (DNNSpec(
                neurons=neurons,
                hidden_activation=DNNSpec.log_squish,
                hidden_activation_deriv=DNNSpec.log_squish_deriv,
                output_activation=DNNSpec.pos_log_squish,
                output_activation_deriv=DNNSpec.pos_log_squish_deriv))),
            learning_rate=learning_rate,
            adam_params=adam,
            add_unit_feature=False)

        if method == "MC":
            rl_fa_obj = MonteCarlo(mdp_rep_for_rl=mdp_rep_obj,
                                   exploring_start=exploring_start,
                                   softmax=softmax,
                                   epsilon=epsilon,
                                   epsilon_half_life=epsilon_half_life,
                                   num_episodes=num_paths,
                                   max_steps=num_dt + 2,
                                   fa_spec=fa_spec)
        elif method == "TD0":
            rl_fa_obj = TD0(mdp_rep_for_rl=mdp_rep_obj,
                            exploring_start=exploring_start,
                            algorithm=algorithm,
                            softmax=softmax,
                            epsilon=epsilon,
                            epsilon_half_life=epsilon_half_life,
                            num_episodes=num_paths,
                            max_steps=num_dt + 2,
                            fa_spec=fa_spec)
        elif method == "TDL":
            rl_fa_obj = TDLambda(mdp_rep_for_rl=mdp_rep_obj,
                                 exploring_start=exploring_start,
                                 algorithm=algorithm,
                                 softmax=softmax,
                                 epsilon=epsilon,
                                 epsilon_half_life=epsilon_half_life,
                                 lambd=lambd,
                                 num_episodes=num_paths,
                                 batch_size=batch_size,
                                 max_steps=num_dt + 2,
                                 fa_spec=fa_spec,
                                 offline=offline)
        elif method == "TDE":
            rl_fa_obj = TDLambdaExact(mdp_rep_for_rl=mdp_rep_obj,
                                      exploring_start=exploring_start,
                                      algorithm=algorithm,
                                      softmax=softmax,
                                      epsilon=epsilon,
                                      epsilon_half_life=epsilon_half_life,
                                      lambd=lambd,
                                      num_episodes=num_paths,
                                      batch_size=batch_size,
                                      max_steps=num_dt + 2,
                                      state_feature_funcs=[],
                                      sa_feature_funcs=feature_funcs,
                                      learning_rate=learning_rate,
                                      learning_rate_decay=learning_rate_decay)
        else:
            rl_fa_obj = LSPI(mdp_rep_for_rl=mdp_rep_obj,
                             exploring_start=exploring_start,
                             softmax=softmax,
                             epsilon=epsilon,
                             epsilon_half_life=epsilon_half_life,
                             num_episodes=num_paths,
                             batch_size=batch_size,
                             max_steps=num_dt + 2,
                             state_feature_funcs=[],
                             sa_feature_funcs=feature_funcs)

        qvf = rl_fa_obj.get_qv_func_fa(None)
        # init_s = (0, np.array([self.spot_price]))
        # val_exec = qvf(init_s)(True)
        # val_cont = qvf(init_s)(False)
        # true_false_spot_max = max(val_exec, val_cont)

        all_paths = self.get_all_paths(0.0, num_paths, num_dt)
        prices = np.zeros(num_paths)

        # Walk each simulated path forward; at the first step where the discounted
        # exercise payoff beats the learned continuation value, record it as the
        # path's price (paths that are never exercised keep a price of 0).
        for path_num, path in enumerate(all_paths):
            steps = 0
            while steps <= num_dt:
                price_seq = path[:(steps + 1)]
                state = (steps, price_seq)
                exercise_price = np.exp(-self.ir(dt * steps)) *\
                    self.payoff(dt * steps, price_seq)
                continue_price = qvf(state)(False)
                steps += 1
                if exercise_price > continue_price:
                    prices[path_num] = exercise_price
                    steps = num_dt + 1
                    # print(state)
                    # print(exercise_price)
                    # print(continue_price)
                    # print(qvf(state)(True))

        return np.average(prices)