Example #1
import numpy as np


def SARSA_lambda(process: MDP,
                 env: Environment,
                 lambda_: float = 0.7,
                 alpha: float = 0.01,
                 n_iter: int = 5000,
                 max_ep_len: int = 200):
    """Tabular SARSA(lambda) control with accumulating eligibility traces."""

    Q_value = np.zeros((process.nb_states, process.nb_actions))
    for i in range(1, n_iter + 1):

        epsilon = 1 / i

        # ---- Init Eligibility Trace ----
        E_t = np.zeros((process.nb_states, process.nb_actions))
        current_state = env.get_random_state()
        counter = 1
        ep_finished = False
        while not ep_finished:

            # ---- Update Policy ----
            policy = process.get_Q_policy(Q_value, epsilon)
            env.policy = policy

            # ---- MDP stepping ----
            current_action = env.generate_action(current_state)
            reward = env.generate_return(current_state, current_action)
            next_state = env.step(current_state, current_action)
            next_action = env.generate_action(next_state)

            # ---- TD error and eligibility-trace accumulation ----
            error = reward + process.disc_fact * Q_value[next_state.index, next_action.index] - \
                    Q_value[current_state.index, current_action.index]
            E_t[current_state.index, current_action.index] += 1

            # ---- Updating Q_value function, then decaying the traces ----
            Q_value += alpha * error * E_t
            E_t *= process.disc_fact * lambda_
            current_state = next_state
            current_action = next_action

            # ---- Stop Condition ----
            counter += 1
            if next_state.terminal:
                ep_finished = True
            if counter > max_ep_len:
                ep_finished = True

    return process.get_Q_policy(Q_value)
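
The backward-view update above can be isolated from the MDP / Environment classes. Below is a minimal, self-contained sketch (not part of the original listing) of a single SARSA(lambda) step on an invented 3-state / 2-action table, with made-up transition values, showing the order of operations: TD error, trace accumulation, Q update, trace decay.

import numpy as np

gamma, lam, alpha = 0.9, 0.7, 0.1
Q = np.zeros((3, 2))   # tabular action-value estimates
E = np.zeros((3, 2))   # accumulating eligibility traces

# one observed transition (state, action, reward, next state, next action)
s, a, r, s_next, a_next = 0, 1, 1.0, 2, 0

td_error = r + gamma * Q[s_next, a_next] - Q[s, a]   # SARSA TD error
E[s, a] += 1                                         # accumulate trace for (s, a)
Q += alpha * td_error * E                            # credit every visited pair
E *= gamma * lam                                     # decay traces afterwards

print(Q[s, a])   # 0.1 = alpha * td_error, since E[s, a] was 1 at update time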
Example #2
import numpy as np


def GLIE(process: MDP,
         env: Environment,
         n_iter: int = 5000,
         eps: float = 0.01):
    """Every-visit Monte Carlo control with a GLIE (epsilon = 1/i) schedule."""

    Q_value = np.zeros((process.nb_states, process.nb_actions))
    count_state_action = np.zeros((process.nb_states, process.nb_actions))
    for i in range(1, n_iter + 1):
        epsilon = 1 / i
        policy = process.get_Q_policy(Q_value, epsilon)
        env.policy = policy
        states, actions, returns = env.generate_episode()
        G = 0
        # Walk the episode backwards so the discounted return G can be accumulated
        # incrementally; include index 0 so the first step is updated as well.
        for j in range(len(returns) - 1, -1, -1):
            G = process.disc_fact * G + returns[j]
            current_state = states[j]
            current_action = actions[j]
            count_state_action[current_state.index, current_action.index] += 1
            Q_value[current_state.index, current_action.index] += \
                (G - Q_value[current_state.index, current_action.index]) / \
                count_state_action[current_state.index, current_action.index]
    return process.get_Q_policy(Q_value)
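
The backward accumulation of G and the running-mean update are the heart of this example. Here is a minimal, self-contained sketch (not from the original listing) on an invented three-step episode, using plain (state, action, reward) triples instead of the Environment objects:

import numpy as np

gamma = 0.9
Q = np.zeros((3, 2))
N = np.zeros((3, 2))

# toy episode as (state, action, reward) triples, final step last
episode = [(0, 1, 0.0), (1, 0, 0.0), (2, 1, 1.0)]

G = 0.0
for s, a, r in reversed(episode):        # iterate from the final step backwards
    G = gamma * G + r                    # discounted return from this step onwards
    N[s, a] += 1
    Q[s, a] += (G - Q[s, a]) / N[s, a]   # incremental mean of observed returns

print(Q)   # Q[2, 1] = 1.0, Q[1, 0] = 0.9, Q[0, 1] = 0.81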
Example #3
import numpy as np


def Q_learning(process: MDP,
               env: Environment,
               alpha: float = 0.01,
               n_iter: int = 5000,
               max_ep_len: int = 200):
    """Tabular Q-learning control with an epsilon = 1/i exploration schedule."""

    Q_value = np.zeros((process.nb_states, process.nb_actions))
    for i in range(1, n_iter + 1):

        epsilon = 1 / i
        current_state = env.get_random_state()
        counter = 1
        ep_finished = False
        while not ep_finished:

            # ---- Update Policy ----
            policy = process.get_Q_policy(Q_value, epsilon)
            env.policy = policy

            # ---- MDP stepping ----
            current_action = env.generate_action(current_state)
            reward = env.generate_return(current_state, current_action)
            next_state = env.step(current_state, current_action)

            # ---- Updating Q_value function (off-policy max backup) ----
            Q_value[current_state.index, current_action.index] += alpha * (
                reward +
                process.disc_fact * Q_value[next_state.index, :].max() -
                Q_value[current_state.index, current_action.index])
            current_state = next_state

            # ---- Stop Condition ----
            counter += 1
            if next_state.terminal:
                ep_finished = True
            if counter > max_ep_len:
                ep_finished = True

    return process.get_Q_policy(Q_value)
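
To see the max backup in isolation from the MDP / Environment classes, here is a minimal, self-contained sketch (not part of the original listing) of tabular Q-learning on an invented three-state chain: action 1 moves one state to the right, action 0 stays put, and entering the terminal state 2 pays reward 1.

import numpy as np

rng = np.random.default_rng(0)
gamma, alpha, n_episodes = 0.9, 0.1, 500
Q = np.zeros((3, 2))

def step(s, a):
    """Deterministic toy dynamics; reward 1 only when entering the terminal state 2."""
    s_next = min(s + 1, 2) if a == 1 else s
    reward = 1.0 if (s_next == 2 and s != 2) else 0.0
    return s_next, reward

for _ in range(n_episodes):
    s = 0
    while s != 2:
        # epsilon-greedy action selection over the current estimates
        a = int(rng.integers(2)) if rng.random() < 0.1 else int(Q[s].argmax())
        s_next, r = step(s, a)
        # Q-learning backup: bootstrap from the greedy value of the next state
        Q[s, a] += alpha * (r + gamma * Q[s_next].max() - Q[s, a])
        s = s_next

print(Q[0, 1], Q[1, 1])   # should approach gamma * 1 = 0.9 and 1.0 respectively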