import numpy as np
# QL_utils (plotting/rendering helpers) is provided with this notebook.

def run_loop(env, agent, title, max_e=1000, render=False, update=True, plot_frequency=5e3):
    t = 0  # total steps across all episodes
    i = 0  # steps in the current episode
    e = 0  # episodes completed
    s, r, d, _ = env.reset()
    a_ = agent.action(s)
    ep_lens = []
    rewards = []
    r_sum = 0
    since_last_plot = 0
    while True:
        i += 1
        t += 1
        since_last_plot += 1
        a = a_
        s_, r, d, _ = env.step(a)
        a_ = agent.action(s_)
        if update:
            agent.update(s=s, a=a, r=r, s_=s_, a_=a_, d=d)
        r_sum += r
        s = np.copy(s_)
        if render:
            QL_utils.render_helper(env, title, i)
        # Episode finished (or a hard cap on episode length was hit).
        if d or i > 1e6:
            # Plot at most once every plot_frequency steps, at episode boundaries.
            if since_last_plot > plot_frequency:
                since_last_plot = 0
                QL_utils.plot_helper(title, e, agent, env)
            ep_lens.append(i)
            rewards.append(r_sum)
            r_sum = 0
            e += 1
            i = 0
            s, r, d, _ = env.reset()
            a_ = agent.action(s)
            if max_e and e >= max_e:
                break
    return ep_lens, rewards
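Note that run_loop only assumes the agent exposes action(s) and update(s, a, r, s_, a_, d). For reference, here is a minimal sketch of a TabularNStepQLearning agent with that interface; the class name and constructor arguments come from the training cell below, but the internals (epsilon-greedy exploration, an n-step transition buffer, a greedy bootstrap) are assumptions, not the notebook's actual implementation.

from collections import deque

class TabularNStepQLearning:
    # Hypothetical sketch of the agent interface run_loop drives.
    def __init__(self, state_shape, num_actions, n=3, alpha=0.1, gamma=1.0, epsilon=0.1):
        self.Q = np.zeros(tuple(state_shape) + (num_actions,))
        self.num_actions = num_actions
        self.n = n
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.buffer = deque()  # the last <= n transitions (s, a, r)

    def action(self, s):
        # Epsilon-greedy over the tabular Q-values of state s.
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_actions)
        return int(np.argmax(self.Q[tuple(s)]))

    def _apply(self, s_, bootstrap):
        # Update the oldest buffered state-action pair with the n-step return.
        G = sum((self.gamma ** k) * r_k for k, (_, _, r_k) in enumerate(self.buffer))
        if bootstrap:
            G += (self.gamma ** len(self.buffer)) * np.max(self.Q[tuple(s_)])
        s0, a0, _ = self.buffer[0]
        self.Q[s0 + (a0,)] += self.alpha * (G - self.Q[s0 + (a0,)])

    def update(self, s, a, r, s_, a_, d):
        # a_ is accepted for interface compatibility but unused here:
        # Q-learning bootstraps with the greedy action, not the taken one.
        self.buffer.append((tuple(s), a, r))
        if len(self.buffer) == self.n:
            self._apply(s_, bootstrap=not d)
            self.buffer.popleft()
        if d:
            # Episode over: flush the remaining tail without bootstrapping.
            while self.buffer:
                self._apply(s_, bootstrap=False)
                self.buffer.popleft()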
import matplotlib.pyplot as plt

# n and num_runs are set in an earlier cell.
TN_QLearning_rewards = []
env = Cliff()
for i in range(num_runs):
    # Create agent
    TN_QLearning = TabularNStepQLearning(env.state_shape, env.num_actions, n=n)
    # Run training loop
    _, rewards = run_loop(env, TN_QLearning, str(n) + '-step QLearning, run: ' + str(i))
    TN_QLearning_rewards.append(rewards)
TN_QLearning_rewards = np.array(TN_QLearning_rewards)

# Run the last QLearning agent with visualizations.
# Try running this a couple of times.
run_loop(env, TN_QLearning, 'QLearning, n=' + str(n), max_e=1, render=True)

# Plot the rewards
plt.figure()
include_sd = False  # include the standard deviation in the plot
QL_utils.reward_plotter(TN_QLearning_rewards, 'QLearning', 'r', include_sd=include_sd, smooth_factor=2)
axes = plt.gca()
axes.set_ylim([-100, 0])
plt.show()
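The Cliff environment itself is supplied with the notebook; the one thing run_loop relies on is that reset() returns the same (s, r, d, info) 4-tuple as step(). As a rough stand-in, a classic 4x12 cliff-walking grid with that interface might look like the sketch below. The dimensions, action encoding, and reward values (-1 per step, -100 for falling off the cliff) are assumptions based on the standard task and the plot's y-limits, not the notebook's actual implementation.

class Cliff:
    # Hypothetical 4x12 cliff-walking grid matching run_loop's expectations.
    def __init__(self, height=4, width=12):
        self.state_shape = (height, width)
        self.num_actions = 4  # 0: up, 1: right, 2: down, 3: left
        self.h, self.w = height, width

    def reset(self):
        # Start in the bottom-left corner; mirror step()'s return signature.
        self.pos = np.array([self.h - 1, 0])
        return np.copy(self.pos), 0.0, False, {}

    def step(self, a):
        moves = [(-1, 0), (0, 1), (1, 0), (0, -1)]
        self.pos = np.clip(self.pos + moves[a], 0, [self.h - 1, self.w - 1])
        r, d = -1.0, False
        row, col = self.pos
        if row == self.h - 1 and 0 < col < self.w - 1:
            # Stepped off the cliff: large penalty, back to the start state.
            r = -100.0
            self.pos = np.array([self.h - 1, 0])
        elif row == self.h - 1 and col == self.w - 1:
            d = True  # reached the goal in the bottom-right corner
        return np.copy(self.pos), r, d, {}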