Beispiel #1
0
def run_loop(env,
             agent,
             title,
             max_e=1000,
             render=False,
             update=True,
             plot_frequency=5e3):
    """Step ``agent`` through ``env`` and record per-episode statistics.

    The action for the next state is always selected *before* the agent is
    updated, so every ``agent.update`` call receives the full
    ``(s, a, r, s_, a_, d)`` transition.

    Args:
        env: Environment whose ``reset()`` and ``step(action)`` both return a
            4-tuple ``(state, reward, done, extra)``.
        agent: Object providing ``action(state)`` and
            ``update(s=, a=, r=, s_=, a_=, d=)``.
        title: Label forwarded to the ``QL_utils`` render/plot helpers.
        max_e: Number of completed episodes after which the loop stops. A
            falsy value (``0``/``None``) disables the stop condition, so the
            loop then never terminates on its own.
        render: When true, ``QL_utils.render_helper`` is called every step.
        update: When true, ``agent.update`` is called every step.
        plot_frequency: Minimum number of steps that must have elapsed since
            the previous plot before ``QL_utils.plot_helper`` is called again;
            only checked at episode boundaries.

    Returns:
        A tuple ``(ep_lens, rewards)``: the step count and the summed reward
        of each finished episode, in order.
    """

    def _start_episode():
        # Only the state from reset() is used; the remaining three values
        # are discarded.
        first_state, _, _, _ = env.reset()
        return first_state, agent.action(first_state)

    episode_lengths = []
    episode_returns = []
    episodes_done = 0
    steps_in_episode = 0
    steps_since_plot = 0
    running_return = 0

    state, upcoming_action = _start_episode()

    while True:
        steps_in_episode += 1
        steps_since_plot += 1

        # Execute the action chosen on the previous iteration, then pick the
        # follow-up action right away so the update sees both.
        action = upcoming_action
        next_state, reward, done, _ = env.step(action)
        upcoming_action = agent.action(next_state)

        if update:
            agent.update(s=state,
                         a=action,
                         r=reward,
                         s_=next_state,
                         a_=upcoming_action,
                         d=done)
        running_return += reward
        state = np.copy(next_state)

        if render:
            QL_utils.render_helper(env, title, steps_in_episode)

        # An episode ends when the environment reports done, or is cut off
        # once it exceeds one million steps.
        episode_over = done or steps_in_episode > 1e6
        if episode_over:
            if steps_since_plot > plot_frequency:
                steps_since_plot = 0
                QL_utils.plot_helper(title, episodes_done, agent, env)

            episode_lengths.append(steps_in_episode)
            episode_returns.append(running_return)
            episodes_done += 1
            running_return = 0
            steps_in_episode = 0
            state, upcoming_action = _start_episode()

        # Evaluated after every step, not only at episode boundaries.
        if max_e and episodes_done >= max_e:
            return episode_lengths, episode_returns
Beispiel #2
0
    # Train `num_runs` n-step Q-learning agents on the Cliff environment and
    # collect the per-episode rewards of every run.
    # NOTE(review): `n` and `num_runs` come from the enclosing scope, which is
    # not visible in this fragment — confirm where they are defined.
    TN_QLearning_rewards = []
    env = Cliff()
    for i in range(num_runs):
        # Create agent (a fresh one per run; the same env instance is reused)
        TN_QLearning = TabularNStepQLearning(env.state_shape,
                                             env.num_actions,
                                             n=n)

        # Run training loop; the first returned value is discarded and only
        # the rewards are kept.
        _, rewards = run_loop(env, TN_QLearning,
                              str(n) + '-step QLearning, run: ' + str(i))
        TN_QLearning_rewards.append(rewards)
    # Stack the per-run reward lists into a single array.
    # NOTE(review): assumes every run returns the same number of episodes so
    # the result is a regular 2-D array — confirm against run_loop.
    TN_QLearning_rewards = np.array(TN_QLearning_rewards)

    # Run the last QLearning agent using visualizations.
    # Try running this a couple of times
    # `TN_QLearning` here is the agent left over from the final loop
    # iteration. `update` is not passed, so run_loop's default applies.
    run_loop(env, TN_QLearning, 'QLearning, n=' + str(n), max_e=1, render=True)

    # Plot the rewards
    plt.figure()
    include_sd = False  # include standard deviation in plot
    QL_utils.reward_plotter(TN_QLearning_rewards,
                            'QLearning',
                            'r',
                            include_sd=include_sd,
                            smooth_factor=2)

    # Fix the y-axis range to [-100, 0] before showing the figure.
    axes = plt.gca()
    axes.set_ylim([-100, 0])
    plt.show()