import numpy as np
import matplotlib.pyplot as plt

plt.figure(figsize=[10, 4])
rewards = []

# Training loop: play one full episode per iteration, then anneal epsilon.
for i in range(max_iterations):
    rewards.append(play_and_train(env, agent))
    # Anneal exploration; reuses the discount constant (0.99) as the decay rate.
    agent.epsilon = agent.epsilon * discount

    if i % 100 == 0:
        print('Iteration {}, Average reward {:.2f}, Epsilon {:.3f}'.format(
            i, np.mean(rewards[-100:]), agent.epsilon))
        if visualize:
            plt.subplot(1, 2, 1)
            plt.plot(rewards, color='r')
            plt.xlabel('Iterations')
            plt.ylabel('Total Reward')
            plt.subplot(1, 2, 2)
            # Distribution of episode rewards collected so far.
            plt.hist(rewards, bins=20, range=[-700, +20], color='blue')
            plt.show()
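# play_and_train is called above but not defined in this section. A minimal
# sketch of what it does, assuming a classic Gym-style env (reset() returns a
# state, step() returns a 4-tuple) and an agent exposing get_action(s) and
# update(s, a, r, next_s); the t_max cap is an assumption, not from the source.
def play_and_train(env, agent, t_max=10**4):
    """Run one full episode, updating the agent after every step.
    Returns the total (undiscounted) reward collected."""
    total_reward = 0.0
    s = env.reset()
    for _ in range(t_max):
        a = agent.get_action(s)           # epsilon-greedy action
        next_s, r, done, _ = env.step(a)  # classic 4-tuple Gym step API
        agent.update(s, a, r, next_s)     # one TD update on this transition
        s = next_s
        total_reward += r
        if done:
            break
    return total_reward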
discount = 0.99
agent = QLearningAgent(alpha, epsilon, discount, getActionRange)
agent2 = SarsaAgent(alpha, epsilon, discount, getActionRange)

plt.figure(figsize=[10, 4])
rewards1 = []
rewards2 = []

# Training loop: train both agents side by side on the same environment.
for i in range(max_iterations):
    rewards1.append(play_and_train(env, agent))
    rewards2.append(play_and_train(env, agent2))

    # Anneal exploration every 100 episodes, with a floor so the agents
    # never stop exploring entirely.
    if (i + 1) % 100 == 0:
        agent.epsilon = max(agent.epsilon * 0.99, 1e-5)
        agent2.epsilon = max(agent2.epsilon * 0.99, 1e-5)
        # Optionally anneal the learning rate the same way:
        # agent.alpha = max(agent.alpha * 0.99, 1e-5)
        # agent2.alpha = max(agent2.alpha * 0.99, 1e-5)

    if i % 100 == 0:
        print('Iteration {}, Q-learning reward {:.2f}, SARSA reward {:.2f}, '
              'Epsilon {:.3f}'.format(i, np.mean(rewards1[-100:]),
                                      np.mean(rewards2[-100:]), agent.epsilon))
        if visualize:
            plt.subplot(1, 2, 1)
            plt.plot(rewards1, color='r', label='Q-learning')
            plt.plot(rewards2, color='b', label='SARSA')
            plt.xlabel('Iterations')
            plt.ylabel('Total Reward')
            plt.legend()
            plt.show()
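# The essential difference between QLearningAgent and SarsaAgent is the TD
# target used in the update. A minimal sketch of that difference, assuming
# tabular Q-values in a dict and that getActionRange(state) (passed to both
# constructors above) returns the legal actions; these helper functions are
# illustrative, not the agents' actual internals.
def q_learning_target(Q, r, next_s, gamma=0.99):
    # Off-policy: bootstrap on the greedy action in next_s, regardless of
    # what the epsilon-greedy behaviour policy will actually pick.
    return r + gamma * max(Q[(next_s, a)] for a in getActionRange(next_s))

def sarsa_target(Q, r, next_s, next_a, gamma=0.99):
    # On-policy: bootstrap on the action the agent really takes next, so
    # the cost of exploration is baked into the value estimates.
    return r + gamma * Q[(next_s, next_a)]

# Both agents then blend the target into the estimate at learning rate alpha:
#   Q[(s, a)] = (1 - alpha) * Q[(s, a)] + alpha * target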