Code Example #1
import gym
import numpy as np

# q_f is assumed to be an agent defined elsewhere that exposes
# get_actions(observation) and learn(s, a, s_next, a_next, reward).
goal_average_steps = 195       # CartPole-v0 counts as solved at a 195-step average
max_number_of_steps = 200      # CartPole-v0 caps each episode at 200 steps
num_consecutive_iterations = 100
last_time_steps = np.zeros(
    num_consecutive_iterations)  # keep only the last 100 episode scores (a fixed-size buffer of 100)

env = gym.make('CartPole-v0')
for episode in range(5000):
    observation = env.reset()  # reset the environment for this episode
    episode_reward = 0
    action = q_f.get_actions(observation)  # pick the first action of the episode
    next_action = action
    for t in range(max_number_of_steps):
        # env.render()  # uncomment to visualize the episode
        next_observation, reward, done, info = env.step(action)
        next_action = q_f.get_actions(next_observation)

        # SARSA update on (s, a, r, s', a'); a failed episode is penalized with -200
        q_f.learn(observation, action, next_observation, next_action,
                  -200 if done else reward)
        observation = next_observation
        action = next_action
        episode_reward += reward
        if done:
            print('Episode %d finished after %d time steps / mean %f' %
                  (episode, t + 1, last_time_steps.mean()))
            last_time_steps = np.hstack(
                (last_time_steps[1:], [episode_reward]))  # update the rolling buffer of the last 100 episode scores
            break
    # stop once the mean score over the last 100 episodes reaches the goal (195)
    if (last_time_steps.mean() >= goal_average_steps):
        print('Episode %d train agent successfully!' % episode)
        break
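The loop above assumes a q_f agent object that is never defined in the snippet. A minimal sketch of what such an object could look like, assuming a tabular SARSA learner over a coarse discretization of CartPole's four continuous state variables (the class name, bin bounds, and hyperparameters below are illustrative assumptions, not part of the original code):

import numpy as np

class SarsaAgent:
    """Hypothetical tabular SARSA agent for CartPole (illustrative only)."""

    def __init__(self, n_actions=2, bins=6, alpha=0.5, gamma=0.99, epsilon=0.1):
        self.n_actions = n_actions
        self.bins = bins
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.q_table = np.zeros((bins ** 4, n_actions))

    def _discretize(self, observation):
        # bucket each of the 4 continuous state variables into `bins` intervals
        bounds = [(-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-2.0, 2.0)]
        state = 0
        for value, (lo, hi) in zip(observation, bounds):
            idx = int(np.digitize(value, np.linspace(lo, hi, self.bins - 1)))
            state = state * self.bins + idx
        return state

    def get_actions(self, observation):
        # epsilon-greedy action selection
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.q_table[self._discretize(observation)]))

    def learn(self, observation, action, next_observation, next_action, reward):
        # SARSA update: Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a))
        s = self._discretize(observation)
        s_next = self._discretize(next_observation)
        td_target = reward + self.gamma * self.q_table[s_next, next_action]
        self.q_table[s, action] += self.alpha * (td_target - self.q_table[s, action])

q_f = SarsaAgent()  # would need to be constructed before the training loop above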
Code Example #2
               learning_rate=0.8,
               discount_rate=0.9,
               random_action_prob=0.5,
               random_action_decay_rate=0.99,
               dyna_iterations=0)

    start_state = gw.grid_coordinates_to_indices(start)

    iterations = 1000
    ### IMPORTANT
    # You need to write your own generate_experience function,
    # based on either epsilon-greedy or an exploration function
    # (an illustrative epsilon-greedy sketch follows this listing).
    # Make sure your submission includes a new rl_qlearn.py that
    # contains the updated gw.generate_experience_your_name.
    ### IMPORTANT
    flat_policies, flat_utilities = sa.learn(start_state,
                                             gw.generate_experience,
                                             iterations=iterations)

    new_shape = (gw.shape[0], gw.shape[1], iterations)
    sa_utility_grids = flat_utilities.reshape(new_shape)
    sa_policy_grids = flat_policies.reshape(new_shape)
    print('Final result of Sarsa:')
    print(sa_policy_grids[:, :, -1])
    print(sa_utility_grids[:, :, -1])

    plt.figure()
    gw.plot_policy(sa_utility_grids[:, :, -1], sa_policy_grids[:, :, -1])
    plot_convergence(sa_utility_grids, sa_policy_grids)
    plt.show()
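The IMPORTANT note in the listing asks for a custom experience-generating routine based on epsilon-greedy exploration. The actual signature expected by sa.learn and the grid-world API are not visible in this fragment, so the function arguments, the gw.step and gw.num_actions helpers, and the return tuple below are assumptions; this is only a sketch of the epsilon-greedy idea:

import numpy as np

def generate_experience_your_name(state, q_values, gw, epsilon=0.1):
    """Hypothetical epsilon-greedy experience generator (signature assumed)."""
    if np.random.rand() < epsilon:
        action = np.random.randint(gw.num_actions)    # explore: uniformly random action
    else:
        action = int(np.argmax(q_values[state]))      # exploit: greedy w.r.t. current Q
    next_state, reward = gw.step(state, action)       # assumed grid-world transition helper
    return state, action, reward, next_state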