num_consecutive_iterations = 100
# Rolling buffer holding only the scores of the most recent 100 episodes
# (think of it as a fixed-size queue of capacity 100).
last_time_steps = np.zeros(num_consecutive_iterations)
env = gym.make('CartPole-v0')

for episode in range(5000):
    observation = env.reset()  # reset the environment for this episode
    episode_reward = 0
    action = q_f.get_actions(observation)
    next_action = action
    for t in range(max_number_of_steps):
        # env.render()
        next_observation, reward, done, info = env.step(action)
        next_action = q_f.get_actions(next_observation)
        # On-policy SARSA update; a large penalty (-200) is applied when the episode ends.
        q_f.learn(observation, action, next_observation, next_action,
                  -200 if done else reward)
        observation = next_observation
        action = next_action
        episode_reward += reward
        if done:
            print('%d Episode finished after %d time steps / mean %f' %
                  (episode, t + 1, last_time_steps.mean()))
            # Update the rolling buffer of the last 100 episode scores
            # (in CartPole-v0 the reward is +1 per step, so the episode reward
            # equals the number of time steps survived).
            last_time_steps = np.hstack((last_time_steps[1:], [episode_reward]))
            break
    # Stop once the average score over the last 100 episodes reaches the goal (195).
    if last_time_steps.mean() >= goal_average_steps:
        print('Episode %d: agent trained successfully!' % episode)
        break
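# The q_f agent used in the loop above is assumed to be defined earlier. As a
# point of reference, here is a minimal sketch of a tabular SARSA agent exposing
# the same get_actions / learn interface. The class name, the bin edges used to
# discretize CartPole's continuous observation, and the hyperparameters are
# illustrative assumptions, not the original q_f implementation.
from collections import defaultdict

import numpy as np


class SarsaAgent:
    """Tabular SARSA agent with epsilon-greedy action selection."""

    def __init__(self, n_actions=2, alpha=0.5, gamma=0.99, epsilon=0.1):
        self.n_actions = n_actions
        self.alpha = alpha        # learning rate
        self.gamma = gamma        # discount factor
        self.epsilon = epsilon    # exploration probability
        self.q = defaultdict(lambda: np.zeros(n_actions))

    def _discretize(self, observation):
        # Map the continuous CartPole observation onto a coarse grid of bins
        # so that a tabular Q function can be indexed by state.
        bins = [
            np.linspace(-2.4, 2.4, 4),    # cart position
            np.linspace(-3.0, 3.0, 4),    # cart velocity
            np.linspace(-0.21, 0.21, 4),  # pole angle
            np.linspace(-2.0, 2.0, 4),    # pole angular velocity
        ]
        return tuple(int(np.digitize(x, b)) for x, b in zip(observation, bins))

    def get_actions(self, observation):
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
        state = self._discretize(observation)
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.q[state]))

    def learn(self, observation, action, next_observation, next_action, reward):
        # SARSA update: Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a))
        state = self._discretize(observation)
        next_state = self._discretize(next_observation)
        td_target = reward + self.gamma * self.q[next_state][next_action]
        self.q[state][action] += self.alpha * (td_target - self.q[state][action])


# With, e.g., q_f = SarsaAgent(), max_number_of_steps = 200 and
# goal_average_steps = 195 (assumed values), the training loop above runs as written.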
              learning_rate=0.8, discount_rate=0.9, random_action_prob=0.5,
              random_action_decay_rate=0.99, dyna_iterations=0)
start_state = gw.grid_coordinates_to_indices(start)
iterations = 1000

### IMPORTANT
# You do need to write your own generate_experience function,
# based on either epsilon-greedy or an exploration function.
# Make sure your submission includes a new rl_qlearn.py that contains
# the updated gw.generate_experience_your_name.
### IMPORTANT

flat_policies, flat_utilities = sa.learn(start_state, gw.generate_experience,
                                         iterations=iterations)

new_shape = (gw.shape[0], gw.shape[1], iterations)
sa_utility_grids = flat_utilities.reshape(new_shape)
sa_policy_grids = flat_policies.reshape(new_shape)

print('Final result of Sarsa:')
print(sa_policy_grids[:, :, -1])
print(sa_utility_grids[:, :, -1])

plt.figure()
gw.plot_policy(sa_utility_grids[:, :, -1], sa_policy_grids[:, :, -1])
plot_convergence(sa_utility_grids, sa_policy_grids)
plt.show()
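# The IMPORTANT block above asks for a hand-written experience generator. Below
# is a minimal epsilon-greedy sketch of one, written as a standalone function
# for illustration; in the actual submission it would be a method on the
# grid-world object. The signature and the helpers it calls
# (gw.sample_transition, gw.is_terminal) are assumptions about the interface in
# rl_qlearn.py, not the real API -- adapt the names to whatever the assignment's
# GridWorld class exposes.
import numpy as np


def generate_experience_your_name(gw, state, q_values, epsilon=0.1):
    """Choose an action epsilon-greedily and sample one transition from the grid world."""
    num_actions = q_values.shape[1]
    if np.random.rand() < epsilon:
        action = np.random.randint(num_actions)   # explore: random action
    else:
        action = int(np.argmax(q_values[state]))  # exploit: greedy w.r.t. current Q
    # Placeholder environment calls; replace with the real grid-world methods.
    next_state, reward = gw.sample_transition(state, action)
    done = gw.is_terminal(next_state)
    return action, reward, next_state, done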