Code example #1
def J_N(x, mu, N, discount_factor=0.99):  # computes J(mu,N,x)
    if N < 0:
        print("N cannot be negative !")
        exit()
    elif N == 0:
        return 0
    else:
        new_state = env.f(x, mu(x))
        r = env.rewards[new_state[0]][new_state[1]]
        # reuse new_state and propagate discount_factor in the recursive call
        return r + discount_factor * J_N(new_state, mu, N - 1, discount_factor)
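
As a quick illustration, a minimal usage sketch follows; it assumes the grid-world env object used throughout these examples, and the constant policy mu is a made-up stand-in, not part of the original code.

def mu(x):
    # hypothetical constant policy: always take the first available action
    return env.action_space[0]


J = {x: J_N(x, mu, 50) for x in env.state_space}
for x in env.state_space:
    print("x =", x, "| J_50 =", round(J[x], 2))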
Code example #2
def optimal_policy(N):
    """
    compute the optimal policy for the environment
    """

    """ exact probability """
    p = {}
    """ exact reward"""
    r = {}

    for x in env.state_space:
        for u in env.action_space:
            for next_state in env.state_space:
                p[(x, u, next_state)] = 0

            # deterministic dynamics: f(x, u) is reached with probability 1
            new_state = env.f(x, u)
            p[(x, u, new_state)] = 1
            r[(x, u)] = env.rewards[new_state[0]][new_state[1]]

    """ compute the exact optimal policy """
    Q = {}
    for x in env.state_space:
        for u in env.action_space:
            Q[(x, u)] = Q_N(p, r, x, u, N)

    # return determine_optimal_policy_from_Q(Q)
    return Q
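
The helper determine_optimal_policy_from_Q referenced in the commented-out return is not listed in these examples; a plausible sketch, assuming it simply extracts the greedy action for each state, is:

def determine_optimal_policy_from_Q(Q):
    # greedy policy extraction: for each state, keep the action with the largest Q-value
    mu_star = {}
    for x in env.state_space:
        mu_star[x] = max(env.action_space, key=lambda u: Q[(x, u)])
    return mu_star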
Code example #3
def influence_of_T_on_Q(T, N):
    """
    display the difference between the Q computed with the exact p and r
    and the Q computed with the p and r estimated from a trajectory of length T
    """
    # exact probability
    p = {}

    # exact reward
    r = {}

    # instantiate the exact probability and reward function
    for x in env.state_space:
        for u in env.action_space:
            for next_state in env.state_space:
                p[(x, u, next_state)] = 0

            new_state = env.f(x, u)
            p[(x, u, new_state)] = 1
            r[(x, u)] = env.rewards[new_state[0]][new_state[1]]

    # estimate p and r once from a single trajectory of length T
    history, p_appr, r_appr = tj.create_trajectory((3, 0), T)

    Q = {}
    for x in env.state_space:
        for u in env.action_space:
            Q_optimal = round(fn.Q_N(p, r, x, u, N), 2)
            Q_learned = round(fn.Q_N(p_appr, r_appr, x, u, N), 2)
            Q[(x, u)] = (Q_optimal, Q_learned)

    for key in list(Q.keys()):
        diff = abs(Q[key][0] - Q[key][1])
        str_x = "(x,u) = " + str(key) + " | Q_exact = " + str(Q[key][0]) + " | Q_appr = " + str(Q[key][1]) + " | diff = " + str(diff)
        print(str_x)

    return Q
Code example #4
def J_N(x, mu, N, discount_factor=0.99):
    """
    computes J state-value recurrence function with policy µ
    """
    if N < 0:
        print("N cannot be negative !")
        return None
    elif N == 0:
        return 0
    else:
        new_state = env.f(x, mu[x])
        # propagate discount_factor in the recursive call
        return env.rewards[new_state[0]][new_state[1]] + discount_factor * J_N(new_state, mu, N - 1, discount_factor)
Code example #5
def J_N(x, mu, r, N, discount_factor=0.99):
    """
    computes J state-value recurrence function
    """
    if N < 0:
        print("N cannot be negative !")
        exit()
    elif N == 0:
        return 0
    else:
        # propagate discount_factor in the recursive call
        return r[(x, mu[x])] + discount_factor * J_N(env.f(x, mu[x]), mu, r, N - 1, discount_factor)
Code example #6
def protocol_1(discount_factor=0.99, alpha=0.05, epsilon=0.25):
    """
    first experimental protocol
    """
    error = []
    Q = {}

    # initialization
    for x in env.state_space:
        for u in env.action_space:
            # initialize Q to 0 everywhere
            Q[(x, u)] = 0

    for episode in range(100):
        # initial state
        state = (3, 0)

        for transition in range(1000):
            action = None
            p = np.random.default_rng().random()

            # epsilon-greedy policy
            if p < 1 - epsilon:  # exploitation
                action = get_max_action(Q, state)
            else:  # exploration
                action = tj.policy()

            next_state = env.f(state, action)

            # reward associated with reaching next_state
            reward = env.rewards[next_state[0]][next_state[1]]

            # compute the max value of Q for the state x'
            maxQ = get_max_value(Q, next_state)

            # update Q
            Q[(state, action)] = (1 - alpha) * Q[
                (state, action)] + alpha * (reward + discount_factor * maxQ)

            state = next_state

        print("episode : " + str(episode + 1))
        error.append(display(Q))

    return error
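
The helpers get_max_action and get_max_value used by the three protocols are not shown in these snippets; one possible implementation, assuming Q is the dictionary keyed by (state, action) built above, is sketched here (the display(Q) helper is also project-specific and not reproduced):

def get_max_action(Q, state):
    # greedy action: the action with the highest current Q-value in this state
    return max(env.action_space, key=lambda u: Q[(state, u)])


def get_max_value(Q, state):
    # best Q-value achievable from this state
    return max(Q[(state, u)] for u in env.action_space)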
Code example #7
def protocol_3(discount_factor=0.99, alpha=0.05, epsilon=0.25):
    """
    third experimental protocol
    """
    error = []
    Q = {}

    for x in env.state_space:
        for u in env.action_space:
            Q[(x, u)] = 0

    for episode in range(100):
        state = (3, 0)
        buffer = []
        for transition in range(1000):
            action = None
            p = np.random.default_rng().random()

            if p < 1 - epsilon:
                action = get_max_action(Q, state)
            else:
                action = tj.policy()

            next_state = env.f(state, action)
            reward = env.rewards[next_state[0]][next_state[1]]

            # add the transition to the buffer
            buffer.append((state, action, reward, next_state))

            # update Q ten times using the buffer
            for count in range(10):
                index = np.random.randint(0, len(buffer))
                x, u, r, next_x = buffer[index]
                maxQ = get_max_value(Q, next_x)

                # Q-learning update on the sampled transition (x, u, r, next_x)
                Q[(x, u)] = (1 - alpha) * Q[(x, u)] + alpha * (r + discount_factor * maxQ)

            state = next_state

        print("episode : " + str(episode + 1))
        error.append(display(Q))

    return error
Code example #8
def create_trajectory(initial_x, T):
    """
    creates ht (with ressources limitation algorithm)
    """
    trajectory = []
    N = {}
    R = {}
    Nx = {}
    p = {}
    r = {}

    for x in env.state_space:  # initializations for r and p computing
        for u in env.action_space:
            N[(x, u)] = 0
            R[(x, u)] = 0
            r[(x, u)] = -1000  # sentinel value for (x, u) pairs never visited
            for x0 in env.state_space:
                Nx[(x, u, x0)] = 0
                p[(x, u, x0)] = 0

    x = initial_x
    for i in range(T):  # random trajectory computing
        u = policy()
        new_x = env.f(x, u)
        rew = env.rewards[new_x[0]][new_x[1]]
        # add the current transition to the trajectory history
        trajectory.append([x, u, rew, new_x])

        N[(x, u)] += 1
        Nx[(x, u, new_x)] += 1
        R[(x, u)] += rew
        r[(x, u)] = R[(x, u)] / N[(x, u)]  # running mean of the observed rewards for (x, u)
        # refresh every p(x0 | x, u) with the updated visit count N[(x, u)]
        for x0 in env.state_space:
            p[(x, u, x0)] = Nx[(x, u, x0)] / N[(x, u)]

        x = new_x

    return trajectory, p, r
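
The policy() driving the trajectory is not included in the snippet; since the trajectory is described as random, a uniform random choice over env.action_space is a reasonable stand-in (an assumption, not the original implementation):

import random


def policy():
    # uniform random action used to explore the environment while collecting h_t
    # (assumes env.action_space is a list of actions)
    return random.choice(env.action_space)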
Code example #9
def protocol_2(discount_factor=0.99, epsilon=0.25):
    """
    second experimental protocol
    """
    error = []
    Q = {}

    for x in env.state_space:
        for u in env.action_space:
            Q[(x, u)] = 0

    for episode in range(100):
        alpha = 0.05
        state = (3, 0)
        for transition in range(1000):
            action = None
            p = np.random.default_rng().random()

            if p < 1 - epsilon:
                action = get_max_action(Q, state)
            else:
                action = tj.policy()

            next_state = env.f(state, action)
            reward = env.rewards[next_state[0]][next_state[1]]
            maxQ = get_max_value(Q, next_state)

            Q[(state, action)] = (1 - alpha) * Q[(state, action)] + alpha * (
                reward + discount_factor * maxQ)  # update Q

            state = next_state
            alpha *= 0.8  # the learning rate decays after every transition

        print("episode : " + str(episode + 1))
        error.append(display(Q))

    return error
Code example #10
def convergence_speed():
    """
    Computes and displays the convergence of p and r
    """
    p_error = []
    r_error = []
    T = [i for i in range(100, 1200, 50)]

    # compute the approximation error for each trajectory length t
    for t in T:
        history, p, r = tj.create_trajectory((3, 0), t)
        p_sum = 0
        r_sum = 0

        # compute the error
        for x in env.state_space:
            for u in env.action_space:
                new_state = env.f(x, u)
                p_sum += (1 - p[(x, u, new_state)])
                r_sum += abs(r[(x, u)] - env.rewards[new_state[0]][new_state[1]])

        p_error.append(p_sum)
        r_error.append(r_sum)

    # plot the convergence of p and r, along T
    fig, axs = plt.subplots(2, 1, figsize=(10, 10), constrained_layout=True)
    axs[0].plot(T, p_error)
    axs[0].set_ylabel('$p_{error}$')
    axs[0].set_xlabel('T')
    axs[0].set_title(r'Convergence speed of $\hat{p}$')
    axs[1].plot(T, r_error)
    axs[1].set_ylabel('$r_{error}$')
    axs[1].set_xlabel('T')
    axs[1].set_title(r'Convergence speed of $\hat{r}$')
    plt.show()

    return T, p_error, r_error
Code example #11
def Q_N(p, r, state, action, N, discount_factor=0.99):  # computes Q state-action value recurrence function
    if N < 0:
        print("N can't be negative")
        return None
    elif N == 0:
        return 0
    else:
        sum_Q = 0

        # only the states 'reachable' from state matter, since every other x'
        # has p(x'|x,u) = 0; iterating over the set avoids counting a state
        # twice when several actions lead to the same next state
        for x in {env.f(state, u) for u in env.action_space}:
            # value of every action from x at horizon N-1 (discount_factor is propagated)
            Qs = [Q_N(p, r, x, u1, N - 1, discount_factor) for u1 in env.action_space]

            # best value achievable from x
            max_Q = max(Qs)

            # accumulate the sum term of the Q_N recurrence
            sum_Q += p[(state, action, x)] * max_Q
        return r[(state, action)] + discount_factor * sum_Q
Code example #12
def draw(position, str_x):  # renders the reward grid and highlights the agent's position (signature inferred from the call below)
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.axis('off')
    cell_text = np.asarray(env.rewards, dtype=str)  # np.str was removed from recent NumPy; use the builtin str
    colors = [["w" for i in range(env.m)] for j in range(env.n)]
    colors[position[0]][position[1]] = "r"
    ax.table(cellText=cell_text, cellColours=colors, cellLoc='center', loc='center',
             colWidths=[0.07, 0.07, 0.07, 0.07, 0.07])
    plt.title(str_x, fontdict={'fontsize': 8})
    plt.show()


if __name__ == '__main__':
    s = (3, 0)  # initial state
    t = 0  # time
    str_x = "state = " + str(s) + " | t = " + str(t)
    print(str_x)
    draw(s, str_x)

    while True:
        u = policy(s)  # choose an action for the current state
        x = env.f(s, u, 0.1)  # compute new state
        r = round((0.99 ** t) * env.rewards[x[0]][x[1]], 4)  # discounted reward (gamma = 0.99)
        s = x  # update state
        t += 1
        str_x = "state = " + str(x) + " | action = " + str(u) + " | reward = " + str(r) + " | t = " + str(t)
        print(str_x)
        draw(s, str_x)
        time.sleep(2)  # leave time to visualize the environment
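
This last snippet relies on imports that are not shown; judging from the calls it makes (plt.subplots, np.asarray, time.sleep), a plausible header would be the following, where the project-specific imports are only guesses:

import time

import matplotlib.pyplot as plt
import numpy as np

# project-specific modules (exact names are assumptions, not visible in the snippet):
# import environment as env
# from agent import policy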