Example 1
import numpy as np

# The Easy21 environment and the epsilon_greedy helper are assumed to be
# imported from the accompanying project modules.
def sarsa_lambda(max_episode, gamma, lbd, N0, optimal_Q=None):
    # SARSA(lambda): on-policy TD control with accumulating eligibility traces
    # action values and visit counts, indexed by (player sum - 1, dealer card - 1, action)
    Q = np.zeros([21, 10, 2])
    N = np.zeros([21, 10, 2])
    mse = []
    for i in range(max_episode):
        # initialize eligibility traces
        E = np.zeros([21, 10, 2])
        # start a new episode
        episode = Easy21()
        x, y = episode.State()
        action = epsilon_greedy(N0, N, Q, x, y)
        # sample until terminal
        while not episode.is_game_end():
            # increment the visit count and the accumulating eligibility trace
            N[x - 1, y - 1, action] += 1
            E[x - 1, y - 1, action] += 1
            # run one step
            ([xp, yp], reward) = episode.step(action)
            if episode.is_game_end():
                # if the episode has reached a terminal state, Q[s', a'] is 0
                delta = reward - Q[x - 1, y - 1, action]
                actionp = 0
            else:
                actionp = epsilon_greedy(N0, N, Q, xp, yp)
                delta = (reward + gamma * Q[xp - 1, yp - 1, actionp]
                         - Q[x - 1, y - 1, action])
            # step size 1/N(s, a); update every traced pair, then decay the traces
            alpha = 1.0 / N[x - 1, y - 1, action]
            Q += (alpha * delta * E)
            E *= (gamma * lbd)
            x, y, action = xp, yp, actionp
        if (i % 1000 == 0) and (optimal_Q is not None):
            # record the summed squared error against the reference Q every 1000 episodes
            mse.append(np.sum((Q - optimal_Q)**2))
    return (Q, mse)
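
Both examples call an epsilon_greedy helper that is not shown here. A minimal sketch of such a helper, assuming the same (player sum, dealer card, action) indexing and the N0 / (N0 + N(s)) exploration schedule that appears later in Example 3, could look like this:

import numpy as np

def epsilon_greedy(N0, N, Q, x, y):
    # hypothetical helper, not part of the original example
    # exploration rate decays with the number of visits to state (x, y)
    epsilon = N0 / (N0 + np.sum(N[x - 1, y - 1]))
    if np.random.rand() < epsilon:
        # explore: pick a random action index
        return np.random.randint(Q.shape[2])
    # exploit: pick the greedy action for this state
    return int(np.argmax(Q[x - 1, y - 1]))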
Example 2
import numpy as np

# Easy21 and epsilon_greedy are assumed to be imported as in Example 1.
def monte_carlo(max_episode, gamma, N0):
    # every-visit Monte Carlo control
    Q = np.zeros([21, 10, 2])
    N = np.zeros([21, 10, 2])
    for i in range(max_episode):
        # start a new episode
        episode = Easy21()
        # the initial state of the episode
        x, y = episode.State()
        # sample until terminal
        history = []
        while not episode.is_game_end():
            # decide action
            action = epsilon_greedy(N0, N, Q, x, y)
            N[x - 1, y - 1, action] += 1
            # run one step
            state, reward = episode.step(action)
            history.append(([x, y], action, reward))
            [x, y] = state
        # calculate return Gt for each state in this episode
        Gt = 0
        for (state, action, reward) in reversed(history):
            [x, y] = state
            alpha = 1.0 / N[x - 1, y - 1, action]
            Gt = gamma * Gt + reward
            Q[x - 1, y - 1, action] += alpha * (Gt - Q[x - 1, y - 1, action])
    return Q
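
A possible way to combine the two routines (the hyperparameter values below are purely illustrative, not taken from the original code) is to use the Monte Carlo estimate as the reference optimal_Q when tracking the squared error of SARSA(lambda):

import numpy as np

# reference action-value function from Monte Carlo control
optimal_Q = monte_carlo(max_episode=1000000, gamma=1.0, N0=100)

# learning curves of SARSA(lambda) for a range of trace-decay values
for lbd in np.arange(0.0, 1.1, 0.1):
    Q, mse = sarsa_lambda(max_episode=10000, gamma=1.0, lbd=lbd,
                          N0=100, optimal_Q=optimal_Q)
    print(lbd, mse[-1])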
Example 3
import numpy as np
import dill as pickle

from environment import Easy21
import utils
import time

toc = time.time()  # record the start time of the run

env = Easy21()
N0 = 100
actions = [0, 1]


def reset():
    # shape (22, 11, 2): padded so raw player sums (1-21) and dealer cards (1-10)
    # can be used as indices directly (index 0 is unused)
    Q = np.zeros((22, 11, len(actions)))
    NSA = np.zeros((22, 11, len(actions)))
    wins = 0

    return Q, NSA, wins


Q, NSA, wins = reset()
with open('Q.dill', 'rb') as f:
    trueQ = pickle.load(f)
# N(s): total visits to state (p, d) over both actions
NS = lambda p, d: np.sum(NSA[p, d])

# step size 1/N(s, a) and exploration rate N0 / (N0 + N(s))
alpha = lambda p, d, a: 1 / NSA[p, d, a]
eps = lambda p, d: N0 / (N0 + NS(p, d))


# policy improvement - by epsilon-greedy
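
The snippet ends where the policy is about to be defined. One plausible epsilon-greedy policy consistent with the eps, Q and actions definitions above (a sketch, not the original continuation) is:

def policy(p, d):
    # hypothetical epsilon-greedy policy, not part of the original snippet
    # with probability eps(p, d) explore, otherwise act greedily w.r.t. Q
    if np.random.rand() < eps(p, d):
        return np.random.choice(actions)
    return int(np.argmax(Q[p, d]))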