Example #1
def test_agent_manager_partial_fit_and_tuple_env():
    # Define train and evaluation envs
    train_env = (
        GridWorld,
        None,
    )  # tuple (constructor, kwargs) must also work in AgentManager
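    # For illustration, the kwargs slot could hold real constructor arguments instead of
    # None, e.g. (GridWorld, dict(nrows=5, ncols=5)), if GridWorld accepts those parameters.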

    # Parameters
    params = {}
    eval_kwargs = dict(eval_horizon=10)

    # Run AgentManager
    stats = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs=params,
        n_fit=4,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        seed=123,
    )
    stats2 = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs=params,
        n_fit=4,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        seed=123,
    )

    # Run partial fit
    stats.fit(10)
    stats.fit(20)
    for agent in stats.agent_handlers:
        assert agent.total_budget == 30

    # Run fit
    stats2.fit()

    # learning curves
    plot_writer_data(
        [stats],
        tag="episode_rewards",
        show=False,
        preprocess_func=np.cumsum,
    )

    # compare final policies
    evaluate_agents([stats], show=False)

    # delete some writers
    stats.set_writer(0, None)
    stats.set_writer(3, None)

    stats.clear_output_dir()
    stats2.clear_output_dir()
Example #2
def test_plot_writer_data_with_directory_input(outdir_id_style):
    with tempfile.TemporaryDirectory() as tmpdirname:
        output_dir = tmpdirname + "/rlberry_data"
        manager = _create_and_fit_agent_manager(output_dir, outdir_id_style)
        del manager

        os.system("ls " + tmpdirname + "/rlberry_data/manager_data")

        #
        # Single directory
        #

        data_source = output_dir
        output = plot_writer_data(
            data_source,
            tag="reward",
            preprocess_func=_compute_reward,
            title="Cumulative Reward",
            show=False,
            savefig_fname=tmpdirname + "/test.png",
        )
        assert (
            os.path.getsize(tmpdirname + "/test.png") > 1000
        ), "plot_writer_data saved an empty image"
        assert len(output) > 1

        list_dirs = list(Path(tmpdirname + "/rlberry_data/manager_data").iterdir())
        list_dirs = [str(dir) for dir in list_dirs]

        #
        # List of directories
        #
        output_with_list_dirs = plot_writer_data(
            list_dirs,
            tag="reward",
            preprocess_func=_compute_reward,
            title="Cumulative Reward",
            show=False,
            savefig_fname=tmpdirname + "/test.png",
        )

        assert np.all(output.shape == output_with_list_dirs.shape)
Example #3
def test_plot_writer_data_with_manager_input(outdir_id_style):
    with tempfile.TemporaryDirectory() as tmpdirname:
        output_dir = tmpdirname + "/rlberry_data"
        manager = _create_and_fit_agent_manager(output_dir, outdir_id_style)
        os.system("ls " + tmpdirname + "/rlberry_data/manager_data")

        # Plot of the cumulative reward
        data_source = manager
        output = plot_writer_data(
            data_source,
            tag="reward",
            preprocess_func=_compute_reward,
            title="Cumulative Reward",
            show=False,
            savefig_fname=tmpdirname + "/test.png",
        )
        assert (
            os.path.getsize(tmpdirname + "/test.png") > 1000
        ), "plot_writer_data saved an empty image"
        assert len(output) > 1
Example #4
    ppo_stats = AgentManager(
        PPOAgent,
        train_env,
        fit_budget=N_EPISODES,
        init_kwargs=params_ppo,
        eval_kwargs=eval_kwargs,
        n_fit=4,
        output_dir="dev/",
        parallelization="process",
    )
    ppo_stats.fit()  # fit the 4 agents
    ppo_stats_fname = ppo_stats.save()
    del ppo_stats

    # -------------------------------
    # Load and plot results
    # --------------------------------
    ppo_stats = AgentManager.load(ppo_stats_fname)

    # learning curves
    plot_writer_data(
        ppo_stats,
        tag="episode_rewards",
        preprocess_func=np.cumsum,
        title="Cumulative Rewards",
        show=False,
    )

    # compare final policies
    output = evaluate_agents([ppo_stats], n_simulations=15)
    print(output)
Example #5
        max_workers=2,
    )

    agent_manager_list = [rsucbvi_stats, rskernel_stats, a2c_stats]

    for st in agent_manager_list:
        st.fit()

    # Fit RSUCBVI for 50 more episodes
    rsucbvi_stats.fit(budget=50)

    # learning curves
    plot_writer_data(
        agent_manager_list,
        tag="episode_rewards",
        preprocess_func=np.cumsum,
        title="cumulative rewards",
        show=False,
    )

    plot_writer_data(
        agent_manager_list, tag="episode_rewards", title="episode rewards", show=False
    )

    # compare final policies
    output = evaluate_agents(agent_manager_list)

    print(output)

    # uncomment to delete output directories
    # for st in agent_manager_list:
    #     st.clear_output_dir()
Example #6
    # Oracle (optimal policy)
    oracle_stats = AgentManager(
        ValueIterationAgent,
        env,
        init_kwargs=params_oracle,
        fit_budget=n_episodes,
        eval_kwargs=eval_kwargs,
        n_fit=1,
    )

    # fit
    stats.fit()
    stats_ucbvi.fit()
    stats_random.fit()
    oracle_stats.fit()

    # visualize results
    plot_writer_data(
        [stats, stats_ucbvi, stats_random],
        tag="episode_rewards",
        preprocess_func=np.cumsum,
        title="Cumulative Rewards",
        show=False,
    )
    plot_writer_data(
        [stats, stats_ucbvi, stats_random], tag="dw_time_elapsed", show=False
    )
    evaluate_agents(
        [stats, stats_ucbvi, stats_random, oracle_stats], n_simulations=20
    )
Example #7
eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20)

multimanagers = MultipleManagers()

multimanagers.append(
    AgentManager(
        UCBVIAgent,
        env,
        fit_budget=N_EP,
        init_kwargs=params["ucbvi"],
        eval_kwargs=eval_kwargs,
    ))

multimanagers.append(
    AgentManager(
        OptQLAgent,
        env,
        fit_budget=N_EP,
        init_kwargs=params["optql"],
        eval_kwargs=eval_kwargs,
    ))

multimanagers.run()

plot_writer_data(
    multimanagers.managers,
    tag="episode_rewards",
    preprocess_func=np.cumsum,
    title="Cumulative Rewards",
)
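
# As in the other examples, the fitted managers can also be compared directly; a minimal
# follow-up sketch, assuming evaluate_agents is imported alongside AgentManager:
evaluate_agents(multimanagers.managers, n_simulations=20, show=False)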
Example #8
agent = AgentManager(
    UCBAgent,
    (env_ctor, env_kwargs),
    fit_budget=T,
    init_kwargs={"B": 2},
    n_fit=M,
    parallelization="process",
    mp_context="fork",
)
# these parameters should give parallel computing even in notebooks
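# Note: the "fork" start method is only available on Unix-like systems; on Windows,
# "spawn" is the usual multiprocessing start method and would be needed instead.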

# Agent training

agent.fit()


# Compute and plot (pseudo-)regret
def compute_pseudo_regret(actions):
    return np.cumsum(np.max(means) - means[actions.astype(int)])


fig = plt.figure(1, figsize=(5, 3))
ax = plt.gca()
output = plot_writer_data(
    [agent],
    tag="action",
    preprocess_func=compute_pseudo_regret,
    title="Cumulative Pseudo-Regret",
    ax=ax,
)
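
# The figure created above can be saved like any other matplotlib figure;
# the filename below is only illustrative.
fig.savefig("pseudo_regret.png", dpi=150, bbox_inches="tight")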
Example #9
for agent in agents:
    agent.fit()


# Compute and plot regret
def compute_regret(rewards):
    return np.cumsum(np.max(means) - rewards)


# Compute and plot (pseudo-)regret
def compute_pseudo_regret(actions):
    return np.cumsum(np.max(means) - means[actions.astype(int)])
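
# Unlike compute_regret, which cumulates gaps against the realized (noisy) rewards,
# compute_pseudo_regret replaces each reward by the mean of the chosen arm, so every
# per-step term is the expected optimality gap.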


output = plot_writer_data(
    agents,
    tag="action",
    preprocess_func=compute_pseudo_regret,
    title="Cumulative Pseudo-Regret",
)

output = plot_writer_data(
    agents,
    tag="reward",
    preprocess_func=compute_regret,
    title="Cumulative Regret",
)


# Compute and plot number of times each arm was selected
def compute_na(actions, a):
    return np.cumsum(actions == a)
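
# compute_na takes two arguments, whereas preprocess_func receives a single array in the
# calls above; a minimal sketch, assuming the same interface, that fixes the arm index
# with functools.partial:
from functools import partial

output = plot_writer_data(
    agents,
    tag="action",
    preprocess_func=partial(compute_na, a=0),
    title="Number of draws of arm 0",
)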
Example #10
env = env_ctor(**env_kwargs)
agent = AgentManager(VIAgent, (env_ctor, env_kwargs), fit_budget=10, n_fit=3)

agent.fit(budget=10)
# comment the line above if you only want to load data from rlberry_data.


# We use the following preprocessing function to plot the cumulative reward.
def compute_reward(rewards):
    return np.cumsum(rewards)


# Plot of the cumulative reward.
output = plot_writer_data(
    agent, tag="reward", preprocess_func=compute_reward, title="Cumulative Reward"
)
# The output is for 500 global steps because it uses 10 fit_budget * horizon

# Log-log plot:
fig, ax = plt.subplots(1, 1)
plot_writer_data(
    agent,
    tag="reward",
    preprocess_func=compute_reward,
    title="Cumulative Reward",
    ax=ax,
    show=False,  # necessary to customize axes
)
ax.set_xlim(100, 500)
ax.relim()
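
# The "Log-log plot" heading implies logarithmic axes; assuming standard matplotlib usage,
# the scales still have to be set explicitly on the customized axes:
ax.set_xscale("log")
ax.set_yscale("log")
plt.show()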
Example #11
def test_agent_manager_1():
    # Define train and evaluation envs
    train_env = (GridWorld, {})

    # Parameters
    params = dict(hyperparameter1=-1, hyperparameter2=100)
    eval_kwargs = dict(eval_horizon=10)

    # Check DummyAgent
    agent = DummyAgent(train_env[0](**train_env[1]), **params)
    agent.fit(10)
    agent.policy(None)

    # Run AgentManager
    params_per_instance = [dict(hyperparameter2=ii) for ii in range(4)]
    stats_agent1 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
        init_kwargs_per_instance=params_per_instance,
    )
    stats_agent2 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
    )
    agent_manager_list = [stats_agent1, stats_agent2]
    for st in agent_manager_list:
        st.fit()

    for ii, instance in enumerate(stats_agent1.agent_handlers):
        assert instance.hyperparameter1 == -1
        assert instance.hyperparameter2 == ii

    for ii, instance in enumerate(stats_agent2.agent_handlers):
        assert instance.hyperparameter1 == -1
        assert instance.hyperparameter2 == 100

    # learning curves
    plot_writer_data(agent_manager_list, tag="episode_rewards", show=False)

    # compare final policies
    evaluate_agents(agent_manager_list, show=False)

    # check if fitted
    for agent_manager in agent_manager_list:
        assert len(agent_manager.agent_handlers) == 4
        for agent in agent_manager.agent_handlers:
            assert agent.fitted

    # test saving/loading
    fname = stats_agent1.save()
    loaded_stats = AgentManager.load(fname)
    assert stats_agent1.unique_id == loaded_stats.unique_id

    # test hyperparameter optimization call
    loaded_stats.optimize_hyperparams(n_trials=5)
    loaded_stats.optimize_hyperparams(n_trials=5, continue_previous=True)

    for st in agent_manager_list:
        st.clear_output_dir()
Example #12
def test_agent_manager_2():
    # Define train and evaluation envs
    train_env = (GridWorld, {})
    eval_env = (GridWorld, {})

    # Parameters
    params = {}
    eval_kwargs = dict(eval_horizon=10)

    # Run AgentManager
    stats_agent1 = AgentManager(
        DummyAgent,
        train_env,
        eval_env=eval_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
    )
    stats_agent2 = AgentManager(
        DummyAgent,
        train_env,
        eval_env=eval_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
    )
    agent_manager_list = [stats_agent1, stats_agent2]
    for st in agent_manager_list:
        st.fit()

    # compare final policies
    evaluate_agents(agent_manager_list, show=False)
    evaluate_agents(agent_manager_list, show=False)

    # learning curves
    plot_writer_data(agent_manager_list, tag="episode_rewards", show=False)

    # check if fitted
    for agent_manager in agent_manager_list:
        assert len(agent_manager.agent_handlers) == 4
        for agent in agent_manager.agent_handlers:
            assert agent.fitted

    # test saving/loading
    fname = stats_agent1.save()
    loaded_stats = AgentManager.load(fname)
    assert stats_agent1.unique_id == loaded_stats.unique_id

    # test hyperparameter optimization
    loaded_stats.optimize_hyperparams(n_trials=5)

    # delete some writers
    stats_agent1.set_writer(1, None)
    stats_agent1.set_writer(2, None)

    stats_agent1.clear_output_dir()
    stats_agent2.clear_output_dir()
Example #13
    enable_tensorboard=True,
)

# basic version
# env_kwargs = dict(id = "CartPole-v0")
# agent = AgentManager(SACAgent, (gym_make, env_kwargs), fit_budget=200, n_fit=1)

# # timothe's
# env = gym_make("CartPole-v0")
# agent = AgentManager(
#     SACAgent, (env.__class__, dict()), fit_budget=200, n_fit=1, enable_tensorboard=True,
# )

# Omar's
# env = gym_make("CartPole-v0")
# from copy import deepcopy
# def env_constructor():
#     return deepcopy(env)
# agent = AgentManager(
#     SACAgent, (env_constructor, dict()), fit_budget=200, n_fit=1, enable_tensorboard=True,
# )


agent.fit()

# Plot the training losses.
output = plot_writer_data(agent, tag="loss_q1", title="Loss q1")
output = plot_writer_data(agent, tag="loss_q2", title="Loss q2")
output = plot_writer_data(agent, tag="loss_v", title="Loss critic")
output = plot_writer_data(agent, tag="loss_act", title="Loss actor")
Example #14
    stats = AgentManager(
        DQNAgent,
        env,
        fit_budget=fit_budget,
        eval_env=env,
        init_kwargs=params,
        n_fit=n_fit,
        parallelization="process",
        agent_name="dqn",
    )

    stats_alternative = AgentManager(
        DQNAgent,
        env,
        fit_budget=fit_budget,
        eval_env=env,
        init_kwargs=params_alternative,
        n_fit=n_fit,
        parallelization="process",
        agent_name="dqn_smaller_net",
    )

    # fit everything in parallel
    multimanagers = MultipleManagers()
    multimanagers.append(stats)
    multimanagers.append(stats_alternative)
    multimanagers.run()

    plot_writer_data(multimanagers.managers, tag="episode_rewards", show=False)
    plot_writer_data(multimanagers.managers, tag="dw_time_elapsed", show=False)
    plot_writer_data(multimanagers.managers, tag="eval_rewards", show=False)
    plot_writer_data(multimanagers.managers, tag="q_loss")

    stats.save()
    stats.clear_output_dir()