Example #1
                    mse[i // mse_update] += (
                        Q.forward(phi(s, env.action_space[a])) -
                        QStar[s][a])**2
            mse[i // mse_update] /= QStar_sa_count

    return Q, mse


if __name__ == "__main__":
    env = Env()

    QStar = pickle.load(open('Q.dill', 'rb'))

    lambds = list(np.arange(0, 1.1, 0.1))
    mseLambdas = np.zeros((len(lambds), NUM_EPISODES // MSE_UPDATE))
    finalMSE = np.zeros(len(lambds))

    print("Training.")
    mse = defaultdict(float)
    for i, lambd in enumerate(lambds):
        Q, mse = func_approx(env, lambd, QStar)
        mseLambdas[i] = mse
        finalMSE[i] = mse[-1]

        print(f"Lambda {lambd}: Final MSE {mse[-1]}")
    print()

    print("Plotting.")
    utils.plotMseLambdas(finalMSE, lambds)
    utils.plotMseEpisodesLambdas(mseLambdas)
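Example #1 is a fragment: it relies on a linear function approximator Q with a forward method and a binary feature map phi(s, a) that are defined elsewhere in the project. The sketch below is only an illustration of what such a pair might look like, assuming an Easy21-style coarse coding; the names LinearQ, DEALER_RANGES, PLAYER_RANGES and ACTIONS are hypothetical, not taken from the original code.

# Illustrative sketch only: LinearQ, phi and the cuboid ranges below are
# assumptions, not the original project code.
import numpy as np

DEALER_RANGES = [(1, 4), (4, 7), (7, 10)]
PLAYER_RANGES = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]
ACTIONS = (0, 1)  # e.g. hit / stick

def phi(state, action):
    """Binary feature vector with one entry per (dealer, player, action) cuboid."""
    dealer, player = state
    features = np.zeros(len(DEALER_RANGES) * len(PLAYER_RANGES) * len(ACTIONS))
    i = 0
    for d_lo, d_hi in DEALER_RANGES:
        for p_lo, p_hi in PLAYER_RANGES:
            for act in ACTIONS:
                if d_lo <= dealer <= d_hi and p_lo <= player <= p_hi and act == action:
                    features[i] = 1.0
                i += 1
    return features

class LinearQ:
    """Linear action-value estimate Q(s, a) = phi(s, a) . w."""
    def __init__(self, n_features=36):
        self.w = np.zeros(n_features)

    def forward(self, features):
        return float(features @ self.w)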
Example #2
def td_control(env, trueQ):

    actions = env.action_space
    lambdas = cf['lambdas']
    mselambdas = np.zeros((len(lambdas), cf['n_episodes']))
    finalMSE = np.zeros(len(lambdas))

    # instantiate epsilon_t for e-greedy exploration strategy
    epsilon_t = exploration(cf['N0'])

    for i_lambda, lambda_decay in enumerate(lambdas):
        Q, Nsa, wins = reset(env.dim)

        for episode in range(cf['n_episodes']):  # terminal state for exploration?
            done = False
            E = np.zeros(env.dim)  # eligibility traces
            SA = list()  # (state, action) pairs visited this episode

            # init state
            s = env.reset()
            # pick an action with the e-greedy policy
            a = e_greedy_policy(Q[s], actions, epsilon_t(Nsa[s]))

            while not done:
                # take a step forward
                s_next, reward, done, _ = env.step(a)

                if done:  # Sarsa(lambda) update for the terminal transition
                    td_error = reward - Q[s[0], s[1], a]
                else:  # pick the next action with the e-greedy policy
                    a_next = e_greedy_policy(Q[s_next], actions,
                                             epsilon_t(Nsa[s_next]))
                    td_error = (reward
                                + cf['r_gamma'] * Q[s_next[0], s_next[1], a_next]
                                - Q[s[0], s[1], a])

                # add s(t), a(t) to the episode history
                SA.append([s, a])
                E[s[0], s[1], a] += 1
                Nsa[s[0], s[1], a] += 1

                # update the action-value function Q for every visited pair
                for (s_t, a_t) in SA:
                    alpha = 1 / Nsa[s_t[0], s_t[1], a_t]
                    Q[s_t[0], s_t[1], a_t] += alpha * td_error * E[s_t[0], s_t[1], a_t]
                    E[s_t[0], s_t[1], a_t] *= cf['r_gamma'] * lambda_decay

                if not done:
                    s = s_next
                    a = a_next

            # bookkeeping
            if reward == 1:
                wins += 1
            mse = np.sum(np.square(Q - trueQ)) / (env.dim[0] * env.dim[1] * env.dim[2])
            mselambdas[i_lambda, episode] = mse

        finalMSE[i_lambda] = mse
        print("Lambda=%.1f Episode %06d, MSE %5.3f, Wins %.3f" %
              (lambda_decay, episode, mse, wins / (episode + 1)))
        print("--------")

    utils.plotMseLambdas(finalMSE, lambdas)
    utils.plotMseEpisodesLambdas(mselambdas)
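Example #2 assumes two helpers that are not shown: exploration(N0), which builds the epsilon schedule, and e_greedy_policy(Q[s], actions, epsilon). The following is a minimal sketch inferred from the call sites above; the exact signatures and the schedule eps_t = N0 / (N0 + N(s)) are assumptions, not the original project code.

# Minimal sketch of the helpers assumed above; inferred from the call sites.
import numpy as np

def exploration(N0):
    """Return an epsilon schedule eps_t(N(s)) = N0 / (N0 + N(s))."""
    def epsilon_t(n_sa):
        # n_sa is the per-action visit count vector Nsa[s]; N(s) is its sum
        return N0 / (N0 + np.sum(n_sa))
    return epsilon_t

def e_greedy_policy(q_values, actions, epsilon):
    """With probability epsilon pick a random action, otherwise the greedy one."""
    if np.random.rand() < epsilon:
        return np.random.choice(actions)
    return actions[int(np.argmax(q_values))]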
Example #3
            if not terminated:
                state, a = statePrime, aPrime

        # bookkeeping
        if r == 1:
            wins += 1

        mse = np.sum(np.square(allQ() - trueQ.ravel())) / (21 * 10 * 2)
        mselambdas[li, episode] = mse

        if episode % 1000 == 0 or episode + 1 == episodes:
            print("Lambda=%.1f Episode %06d, MSE %5.3f, Wins %.3f" %
                  (lmd, episode, mse, wins / (episode + 1)))

    finalMSE[li] = mse
    print("Lambda=%.1f Episode %06d, MSE %5.3f, Wins %.3f" %
          (lmd, episode, mse, wins / (episode + 1)))
    print("--------")

# GRAPHING

all_MSEs = [mselambdas[0], mselambdas[1]]
lambdas = [0, 1]
title = 'Linear Function Approximation, MSE as a function of Lambdas in TD(lambda)'
utils.plotMseEpisodes(all_MSEs, lambdas, title)

lambdas = np.arange(0, 1.1, 0.1)
title = 'LFA, TD(lambda) MSE as a function of lambda in Sarsa(lambda)'
utils.plotMseLambdas(lambdas, finalMSE, title)
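Example #3 computes its MSE from allQ(), a helper (not shown, apparently a closure over module-level state, since it is called with no arguments) that evaluates the linear approximator at every (dealer, player, action) triple so the result can be compared element-wise with trueQ.ravel(). One possible shape for such a helper, reusing the hypothetical LinearQ/phi names sketched under Example #1, could be:

# Assumed helper: evaluate the approximator over the full 10 x 21 x 2 grid.
# The iteration order must match how trueQ was built before calling ravel().
import numpy as np

def allQ(q_model, feature_fn, dealer_range=range(1, 11),
         player_range=range(1, 22), actions=(0, 1)):
    values = [q_model.forward(feature_fn((dealer, player), a))
              for dealer in dealer_range
              for player in player_range
              for a in actions]
    return np.array(values)  # 10 * 21 * 2 = 420 entries, like trueQ.ravel()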
Example #4
# GRAPHING ---


episodes = 1000
N_0 = 100
gamma = 1
lam = 0.5
remember_every_MSE = False

lambdas = np.arange(0, 1.1, 0.1)
MSE_for_each_lambda = [
    ((TD_Learning(episodes, N_0, gamma, lam, remember_every_MSE) - trueQ) ** 2).mean()
    for lam in lambdas
]
title = 'TD Learning comparing variable lambda in Sarsa(lambda)'

utils.plotMseLambdas(lambdas, MSE_for_each_lambda, title)

episodes = 10000
remember_every_MSE = True
all_MSEs = list()
lambdas = [0, 1]
for lam in lambdas:
    all_MSEs.append(TD_Learning(episodes, N_0, gamma, lam, remember_every_MSE))
title = 'MSE over course of learning for TD Learning using Sarsa(lambda = {0, 1})'

utils.plotMseEpisodes(all_MSEs, lambdas, title)

lam = 0.3
remember_every_MSE = False
Q_s_a = TD_Learning(episodes, N_0, gamma, lam, remember_every_MSE)
title = 'optimal policy at lambda = {}'.format(lam)
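All four examples plot through a project-specific utils module (plotMseLambdas, plotMseEpisodes, plotMseEpisodesLambdas) that is never shown, and the call sites even disagree on argument order (Example #1 passes (finalMSE, lambds); Examples #3 and #4 pass (lambdas, finalMSE, title)). The matplotlib sketch below is only a guess at what two of these helpers might look like, following the three-argument call sites; it is not the original utils module.

# Hypothetical plotting helpers, matching the (lambdas, finalMSE, title) and
# (all_MSEs, lambdas, title) call sites; not the original utils module.
import matplotlib.pyplot as plt

def plotMseLambdas(lambdas, final_mse, title=''):
    """Final MSE against the true Q, one point per lambda."""
    plt.figure()
    plt.plot(lambdas, final_mse, marker='o')
    plt.xlabel('lambda')
    plt.ylabel('final MSE')
    plt.title(title)
    plt.show()

def plotMseEpisodes(all_mses, lambdas, title=''):
    """Per-episode MSE learning curves, one line per lambda."""
    plt.figure()
    for mse_curve, lam in zip(all_mses, lambdas):
        plt.plot(mse_curve, label='lambda = %.1f' % lam)
    plt.xlabel('episode')
    plt.ylabel('MSE')
    plt.legend()
    plt.title(title)
    plt.show()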