            mse[i // mse_update] += (Q.forward(phi(s, env.action_space[a])) - QStar[s][a]) ** 2
        mse[i // mse_update] /= QStar_sa_count

    return Q, mse


if __name__ == "__main__":
    env = Env()
    QStar = pickle.load(open('Q.dill', 'rb'))

    lambds = list(np.arange(0, 1.1, 0.1))
    mseLambdas = np.zeros((len(lambds), NUM_EPISODES // MSE_UPDATE))
    finalMSE = np.zeros(len(lambds))

    print("Training.")
    for i, lambd in enumerate(lambds):
        Q, mse = func_approx(env, lambd, QStar)
        # func_approx returns mse as a defaultdict keyed by checkpoint index;
        # convert it to an ordered array before indexing with -1
        mse = np.array([mse[k] for k in sorted(mse)])
        mseLambdas[i] = mse
        finalMSE[i] = mse[-1]
        print(f"Lambda {lambd}: Final MSE {mse[-1]}")
    print()

    print("Plotting.")
    utils.plotMseLambdas(finalMSE, lambds)
    utils.plotMseEpisodesLambdas(mseLambdas)
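# NOTE (sketch, not part of the original project): func_approx above assumes a
# feature map `phi(s, a)` and a model `Q` exposing `Q.forward(features)`, neither
# of which is shown in this excerpt. A minimal linear stand-in with that interface
# could look like the following; the feature encoding is a placeholder, not the
# project's actual one.
import numpy as np

class LinearQ:
    def __init__(self, n_features, lr=0.01):
        self.w = np.zeros(n_features)   # linear weights
        self.lr = lr                    # step size

    def forward(self, features):
        # Q(s, a) = w . phi(s, a)
        return float(np.dot(self.w, features))

    def update(self, features, td_error, eligibility=1.0):
        # semi-gradient step on the TD error for a linear model
        self.w += self.lr * td_error * eligibility * features


def phi_placeholder(state, action, n_features=36):
    # Hypothetical one-hot feature map, purely for illustration.
    features = np.zeros(n_features)
    features[hash((str(state), str(action))) % n_features] = 1.0
    return features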
def td_control(env, trueQ):
    actions = env.action_space
    lambdas = cf['lambdas']
    mselambdas = np.zeros((len(lambdas), cf['n_episodes']))
    finalMSE = np.zeros(len(lambdas))

    # instantiate epsilon_t for the e-greedy exploration strategy
    epsilon_t = exploration(cf['N0'])

    for i_lambda, lambda_decay in enumerate(lambdas):
        Q, Nsa, wins = reset(env.dim)

        for episode in range(cf['n_episodes']):
            done = 0
            E = np.zeros(env.dim)   # eligibility traces
            SA = list()             # (state, action) pairs visited this episode

            # initial state and e-greedy action
            s = env.reset()
            a = e_greedy_policy(Q[s], actions, epsilon_t(Nsa[s]))

            while not done:
                # take a step in the environment
                s_next, reward, done, _ = env.step(a)

                if done:
                    # terminal state: no bootstrap term in the Sarsa(lambda) update
                    td_error = reward - Q[s[0], s[1], a]
                else:
                    # pick the next action with the e-greedy policy
                    a_next = e_greedy_policy(Q[s_next], actions, epsilon_t(Nsa[s_next]))
                    td_error = (reward + cf['r_gamma'] * Q[s_next[0], s_next[1], a_next]
                                - Q[s[0], s[1], a])

                # record s(t), a(t) and bump its trace and visit count
                SA.append([s, a])
                E[s[0], s[1], a] += 1
                Nsa[s[0], s[1], a] += 1

                # update the action-value function Q for every visited pair
                # (loop variables renamed to avoid shadowing the current s, a)
                for (s_i, a_i) in SA:
                    Q[s_i[0], s_i[1], a_i] += (1 / Nsa[s_i[0], s_i[1], a_i]
                                               * td_error * E[s_i[0], s_i[1], a_i])
                    E[s_i[0], s_i[1], a_i] *= cf['r_gamma'] * lambda_decay

                if not done:
                    s = s_next
                    a = a_next

            # bookkeeping
            if reward == 1:
                wins += 1
            mse = np.sum(np.square(Q - trueQ)) / (env.dim[0] * env.dim[1] * env.dim[2])
            mselambdas[i_lambda, episode] = mse
            finalMSE[i_lambda] = mse
            print("Lambda=%.1f Episode %06d, MSE %5.3f, Wins %.3f"
                  % (lambda_decay, episode, mse, wins / (episode + 1)))
        print("--------")

    utils.plotMseLambdas(finalMSE, lambdas)
    utils.plotMseEpisodesLambdas(mselambdas)
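# NOTE (sketch, not from the repository): td_control relies on helpers defined
# elsewhere -- exploration(N0), e_greedy_policy(q_values, actions, eps) and
# reset(dim). Minimal versions consistent with the call sites above, assuming
# the common epsilon_t = N0 / (N0 + N(s)) schedule, might look like this:
import numpy as np

def exploration(N0):
    # returns epsilon_t as a function of the visit counts of the current state
    return lambda n_visits: N0 / (N0 + np.sum(n_visits))

def e_greedy_policy(q_values, actions, epsilon):
    # explore with probability epsilon, otherwise act greedily w.r.t. Q[s]
    if np.random.rand() < epsilon:
        return np.random.choice(actions)
    return int(np.argmax(q_values))

def reset(dim):
    # fresh Q table, visit counts and win counter for one lambda run
    return np.zeros(dim), np.zeros(dim), 0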
            if not terminated:
                state, a = statePrime, aPrime

        # bookkeeping
        if r == 1:
            wins += 1
        mse = np.sum(np.square(allQ() - trueQ.ravel())) / (21 * 10 * 2)
        mselambdas[li, episode] = mse
        if episode % 1000 == 0 or episode + 1 == episodes:
            print("Lambda=%.1f Episode %06d, MSE %5.3f, Wins %.3f"
                  % (lmd, episode, mse, wins / (episode + 1)))

    finalMSE[li] = mse
    print("Lambda=%.1f Episode %06d, MSE %5.3f, Wins %.3f"
          % (lmd, episode, mse, wins / (episode + 1)))
    print("--------")

# GRAPHING
# learning curves for lambda = 0 and lambda = 1 (first and last rows of mselambdas)
all_MSEs = [mselambdas[0], mselambdas[-1]]
lambdas = [0, 1]
title = 'Linear Function Approximation, MSE as a function of Lambdas in TD(lambda)'
utils.plotMseEpisodes(all_MSEs, lambdas, title)

lambdas = np.arange(0, 1.1, 0.1)
title = 'LFA, TD(lambda) MSE as a function of lambda in Sarsa(lambda)'
utils.plotMseLambdas(lambdas, finalMSE, title)
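# NOTE (sketch, not the project's utils module): the plotting helpers used
# above are not shown in this excerpt. Rough matplotlib equivalents matching
# the (lambdas, mse, title) / (all_mses, lambdas, title) call signatures:
import matplotlib.pyplot as plt

def plotMseLambdas_sketch(lambdas, final_mse, title):
    # final MSE against the true Q as a function of lambda
    plt.figure()
    plt.plot(lambdas, final_mse, marker='o')
    plt.xlabel('lambda')
    plt.ylabel('MSE')
    plt.title(title)
    plt.show()

def plotMseEpisodes_sketch(all_mses, lambdas, title):
    # per-episode learning curves for a few selected lambdas
    plt.figure()
    for mse_curve, lmd in zip(all_mses, lambdas):
        plt.plot(mse_curve, label='lambda = %.1f' % lmd)
    plt.xlabel('episode')
    plt.ylabel('MSE')
    plt.legend()
    plt.title(title)
    plt.show()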
# GRAPHING ---
episodes = 1000
N_0 = 100
gamma = 1
lam = 0.5
remember_every_MSE = False

lambdas = np.arange(0, 1.1, 0.1)
MSE_for_each_lambda = [((TD_Learning(episodes, N_0, gamma, lam, remember_every_MSE) - trueQ) ** 2).mean(axis=None)
                       for lam in lambdas]
title = 'TD Learning comparing variable lambda in Sarsa(lambda)'
utils.plotMseLambdas(lambdas, MSE_for_each_lambda, title)

episodes = 10000
remember_every_MSE = True
all_MSEs = list()
lambdas = [0, 1]
for lam in lambdas:
    all_MSEs.append(TD_Learning(episodes, N_0, gamma, lam, remember_every_MSE))
title = 'MSE over course of learning for TD Learning using Sarsa(lambda = {0, 1})'
utils.plotMseEpisodes(all_MSEs, lambdas, title)

lam = 0.3
remember_every_MSE = False
Q_s_a = TD_Learning(episodes, N_0, gamma, lam, remember_every_MSE)
title = 'optimal policy at lambda = {}'.format(lam)