            G += E * self.delta[k_id]
            E = self.gamma * E * ((1 - self.sigma[k_id]) * self.target_policy(self.S[(k + 1) % (n + 1)])['probs'][self.A[(k + 1) % (n + 1)]]
                                  + self.sigma[(k + 1) % (n + 1)])
            rho *= (1 - self.sigma[k_id] + self.sigma[k_id] * self.rho[k_id])

        delta = rho * (G - self.Q[S_tau][A_tau])
        self.Q[S_tau][A_tau] += self.alpha * delta
        self.t += 1
        if done:
            self.t = 0

    def get_policy(self, s, epsilon):
        a = np.argmax(self.Q[s])
        pi_probs = np.ones(self.env.nA) * epsilon / self.env.nA
        pi_probs[a] += (1 - epsilon)
        return {'action': np.random.choice(range(self.env.nA), p=pi_probs), 'probs': pi_probs}

    def get_sigma(self, a):
        return np.random.randint(2, size=self.env.nA)[a]


if __name__ == "__main__":
    envn = 'CliffWalking-v0'
    env = gym.make(envn)
    agent = Qsigma(env, n=3, gamma=0.9, epsilon=0.1, alpha=0.5)
    exp = f"experiments/{envn}_{agent}"
    train(env, agent, exp, num_episodes=50, max_runs=5)
    main_plot(exp, smoothing_window=10)
    plt.ylim([-100, 0])
    plt.show()
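# The n-step update above blends Sarsa-style sampled backups with Expected-Sarsa/tree-backup
# style expected backups through sigma. As a reference point, here is a minimal, self-contained
# sketch of the one-step Q(sigma) target (not part of the agent above; the function name and
# arguments are illustrative assumptions): sigma=1 recovers the Sarsa target, sigma=0 the
# Expected-Sarsa target.
def _one_step_q_sigma_target(r, q_next, pi_next, a_next, sigma, gamma=0.9):
    """r + gamma * [ sigma * Q(S',A') + (1 - sigma) * sum_a pi(a|S') Q(S',a) ]."""
    expected_q = np.dot(pi_next, q_next)  # Expected-Sarsa term
    return r + gamma * (sigma * q_next[a_next] + (1 - sigma) * expected_q)

# Example: _one_step_q_sigma_target(-1.0, np.array([1.0, 0.5]), np.array([0.9, 0.1]), a_next=0, sigma=0.5)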
    for _ in range(50):
        agents = [
            SarsaAgent(env, epsilon=epsilon, alpha=alpha, gamma=gamma),
            ExpSarsaAgent(env, epsilon=epsilon, alpha=alpha, gamma=gamma),
            QslAgent(env, epsilon=epsilon, alpha=alpha, gamma=gamma, sigma_strat='static', sigma=0.5, lamb=1),
            QslAgent(env, epsilon=epsilon, alpha=alpha, gamma=gamma, lamb=0.8),
        ]
        experiments = []
        for agent in agents:
            expn = f"experiments/{str(agent)}"
            train(env, agent, expn, num_episodes=num_episodes, max_runs=100)
            experiments.append(expn)
    return experiments


if __name__ == "__main__":
    envn = 'StochWindyGridWorld-v0'
    env = gym.make(envn)
    experiments = run_exp(env, num_episodes=200)
    main_plot(experiments, smoothing_window=15)
    plt.ylim([-100, -30])
    plt.savefig('plot.png')
    plt.show()
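# main_plot smooths the per-episode returns before plotting (smoothing_window=15 above). As a
# rough sketch of what that kind of smoothing does (this is not the library's implementation,
# just a trailing moving average for illustration; the helper name is made up):
def _moving_average(returns, window=15):
    out = []
    for i in range(len(returns)):
        chunk = returns[max(0, i - window + 1):i + 1]  # last `window` returns seen so far
        out.append(sum(chunk) / len(chunk))
    return out

# Example: _moving_average([-100, -80, -60, -55, -50], window=3) -> [-100.0, -90.0, -80.0, -65.0, -55.0]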
""" self.a = self.pi_eps(sp) if not done else -1 # !b #!b self.a = .... pi_probs = self.pi_probs(sp) exp_sarsa_target = np.dot(pi_probs, self.Q[sp]) """ now that you know A' = self.a, perform the update to self.Q[s][a] here """ delta = r + (self.gamma * exp_sarsa_target if not done else 0) - self.Q[s][a] # !b self.Q[s][a] += self.alpha * delta # !b self.t = 0 if done else self.t + 1 # update current iteration number def __str__(self): return f"ExpSarsa($\\gamma={self.gamma},\\epsilon={self.epsilon},\\alpha={self.alpha}$)" def experiment(): envn = 'StochWindyGridWorld-v0' env = gym.make(envn) agent = ExpSarsaAgent(env, epsilon=0.1, alpha=0.5) exp = f"experiments/{str(agent)}" train(env, agent, exp, num_episodes=200, max_runs=10) return env, exp if __name__ == "__main__": env, q_experiment = q_agent_exp() # get results from Q-learning env, sarsa_exp = experiment() main_plot([q_experiment, sarsa_exp], smoothing_window=10) plt.ylim([-100, 0]) plt.title("Q and Sarsa learning on " + env.spec._env_name) plt.show()