Beispiel #1
0
def test_agent_stats_1():
    """Smoke-test AgentStats with DummyAgent on GridWorld: fitting,
    plotting, policy comparison, pickle round-trip and hyperparameter
    optimization."""
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    agent_params = {"n_episodes": 500}
    comparison_horizon = 20

    # Sanity check: DummyAgent alone fits and produces a policy
    single_agent = DummyAgent(train_env, **agent_params)
    single_agent.fit()
    single_agent.policy(None)

    # Build two identical AgentStats instances
    all_stats = [
        AgentStats(DummyAgent,
                   train_env,
                   init_kwargs=agent_params,
                   n_fit=4,
                   eval_horizon=10)
        for _ in range(2)
    ]

    # learning curves
    plot_episode_rewards(all_stats, cumulative=True, show=False)

    # compare final policies, with and without stationary policies
    compare_policies(all_stats,
                     eval_env,
                     eval_horizon=comparison_horizon,
                     n_sim=10,
                     show=False)
    compare_policies(all_stats,
                     eval_env,
                     eval_horizon=comparison_horizon,
                     n_sim=10,
                     show=False,
                     stationary_policy=False)

    # every AgentStats must have fitted all 4 of its agents
    for single_stats in all_stats:
        assert len(single_stats.fitted_agents) == 4
        assert all(agent.fitted for agent in single_stats.fitted_agents)

    # pickle round-trip: identifier must survive save/load
    fname = 'test_agent_stats_file.pickle'
    all_stats[0].save(fname)
    loaded_stats = AgentStats.load(fname)
    assert all_stats[0].identifier == loaded_stats.identifier

    # delete file
    os.remove(fname)

    # hyperparameter optimization, fresh run then resumed run
    loaded_stats.optimize_hyperparams()
    loaded_stats.optimize_hyperparams(continue_previous=True)
def run_experiment(params, optimize_hyperparams):
    """
    Main experiment function.

    Parameters
    ----------
    params : dict
        Maps agent key ('ppo', 'a2c') to its ``init_kwargs`` dict; each
        entry must contain a 'horizon' key, used as evaluation horizon.
    optimize_hyperparams : bool
        If True, run hyperparameter optimization before plotting and
        policy comparison.
    """
    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats, one per agent
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2)

    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2)

    agent_stats_list = stats.values()

    # Optimize hyperparams; loop variable renamed so it does not shadow
    # the `stats` dict above
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            # timeout after 10 seconds
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10)

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
def test_agent_stats_2():
    """Check AgentStats with an explicit eval_env, writer overrides,
    default save/load path and hyperparameter optimization."""
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    init_params = {"n_episodes": 500}

    def make_stats():
        # Two identical AgentStats are built from this one configuration
        return AgentStats(DummyAgent,
                          train_env,
                          eval_env=eval_env,
                          init_kwargs=init_params,
                          n_fit=4,
                          eval_horizon=10,
                          n_jobs=1)

    stats_agent1 = make_stats()
    stats_agent2 = make_stats()
    agent_stats_list = [stats_agent1, stats_agent2]

    # disable writers of instances 1 and 2 of the first AgentStats
    for writer_idx in (1, 2):
        stats_agent1.set_writer(writer_idx, None)

    # compare final policies, stationary and non-stationary
    compare_policies(agent_stats_list, n_sim=10, show=False)
    compare_policies(agent_stats_list,
                     n_sim=10,
                     show=False,
                     stationary_policy=False)

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # all 4 agents of each AgentStats must be fitted
    for agent_stats in agent_stats_list:
        assert len(agent_stats.fitted_agents) == 4
        assert all(agent.fitted for agent in agent_stats.fitted_agents)

    # save to the default output dir, then reload and compare identifiers
    dirname = stats_agent1.output_dir
    fname = dirname / 'stats'
    stats_agent1.save()
    loaded_stats = AgentStats.load(fname)
    assert stats_agent1.identifier == loaded_stats.identifier

    # clean up the pickle file and its directory
    os.remove(fname.with_suffix('.pickle'))
    dirname.rmdir()

    # hyperparameter optimization on the reloaded stats
    loaded_stats.optimize_hyperparams()
Beispiel #4
0
def run_experiment(params,
                   optimize_hyperparams,
                   rlberry_seed):
    """
    Main experiment function.

    Parameters
    ----------
    params : dict
        Maps agent key ('ppo', 'a2c') to its ``init_kwargs`` dict; each
        entry must contain a 'horizon' key, used as evaluation horizon.
    optimize_hyperparams : bool
        If True, run hyperparameter optimization before fitting.
    rlberry_seed : int
        Global seed passed to rlberry's seeding module.
    """
    seeding.set_global_seed(rlberry_seed)

    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats; fs_observer.dir is presumably a sacred-style
    # run directory defined elsewhere in this script -- TODO confirm
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)

    # uncomment to disable writer of the 2nd PPO thread
    # stats['ppo'].set_writer(1, None)

    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)

    # uncomment to disable writer of the 1st A2C thread
    # stats['a2c'].set_writer(0, None)

    agent_stats_list = stats.values()

    # Optimize hyperparams; loop variable renamed so it does not shadow
    # the `stats` dict above
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            # timeout after 10 seconds
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10, n_fit=2)

    # Fit with best hyperparams and save results
    for agent_stats in agent_stats_list:
        agent_stats.fit()
        agent_stats.save_results()

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
def test_agent_stats_partial_fit_and_tuple_env():
    """AgentStats must accept a (constructor, kwargs) tuple as env and
    support partial_fit in several increments."""
    # tuple (constructor, kwargs) must also work in AgentStats
    train_env = (GridWorld, None)

    # Parameters
    agent_params = {"n_episodes": 500}
    comparison_horizon = 20

    # Run AgentStats
    stats = AgentStats(DummyAgent,
                       train_env,
                       init_kwargs=agent_params,
                       n_fit=4,
                       eval_horizon=10)
    stats2 = AgentStats(DummyAgent,
                        train_env,
                        init_kwargs=agent_params,
                        n_fit=4,
                        eval_horizon=10)

    # disable writers of instances 0 and 3
    for writer_idx in (0, 3):
        stats.set_writer(writer_idx, None)

    # two partial fits: 0.1 + 0.5 -> fraction_fitted == 0.6
    stats.partial_fit(0.1)
    stats.partial_fit(0.5)
    assert all(agent.fraction_fitted == 0.6 for agent in stats.fitted_agents)

    # further partial fits must saturate at 1.0
    for _ in range(2):
        stats.partial_fit(0.5)
        assert all(agent.fraction_fitted == 1.0
                   for agent in stats.fitted_agents)

    # full fit of the second instance
    stats2.fit()

    # learning curves
    plot_episode_rewards([stats], cumulative=True, show=False)

    # compare final policies
    compare_policies([stats], eval_horizon=comparison_horizon, n_sim=10,
                     show=False)
Beispiel #6
0
def test_agent_stats_partial_fit():
    """partial_fit on AgentStats: fitted fractions add up across calls
    and cap at 1.0."""
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    agent_params = {"n_episodes": 500}
    comparison_horizon = 20

    # Sanity check: DummyAgent alone fits and produces a policy
    single_agent = DummyAgent(train_env, **agent_params)
    single_agent.fit()
    single_agent.policy(None)

    # Run AgentStats
    stats = AgentStats(DummyAgent,
                       train_env,
                       init_kwargs=agent_params,
                       n_fit=4,
                       eval_horizon=10)

    # two increments: 0.1 + 0.5 -> fraction_fitted == 0.6
    stats.partial_fit(0.1)
    stats.partial_fit(0.5)
    assert all(agent.fraction_fitted == 0.6 for agent in stats.fitted_agents)

    # further increments must saturate at 1.0
    for _ in range(2):
        stats.partial_fit(0.5)
        assert all(agent.fraction_fitted == 1.0
                   for agent in stats.fitted_agents)

    # learning curves
    plot_episode_rewards([stats], cumulative=True, show=False)

    # compare final policies
    compare_policies([stats],
                     eval_env,
                     eval_horizon=comparison_horizon,
                     n_sim=10,
                     show=False)
# PPO hyperparameters; N_EPISODES, GAMMA and HORIZON are assumed to be
# constants defined earlier in the script -- TODO confirm
params_ppo = {
    "n_episodes": N_EPISODES,
    "gamma": GAMMA,
    "horizon": HORIZON,
    "learning_rate": 0.0003
}

# -----------------------------
# Run AgentStats
# -----------------------------
# Fit 4 instances of PPOAgent on train_env (defined earlier in the script)
ppo_stats = AgentStats(PPOAgent, train_env, init_kwargs=params_ppo, n_fit=4)

# Attach a SummaryWriter (presumably torch.utils.tensorboard -- TODO confirm
# the import) to workers 0 and 1
ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={'comment': 'worker_0'})
ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={'comment': 'worker_1'})

agent_stats_list = [ppo_stats]

agent_stats_list[0].fit()
agent_stats_list[0].save(
)  # after fit, writers are set to None to avoid pickle problems.

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list,
                          eval_env,
                          eval_horizon=HORIZON,
                          n_sim=10)
print(output)
Beispiel #8
0
    'n_episodes': 500,
    'feature_map_fn': feature_map_fn,
    'horizon': 10,
    'bonus_scale_factor': 0.0,
    'gamma': 0.99
}

# Parameters for the value-iteration baseline
params_oracle = {'horizon': 10, 'gamma': 0.99}

# LSVI-UCB with the `params` dict defined earlier in the script, 4 instances
stats = AgentStats(LSVIUCBAgent,
                   env,
                   eval_horizon=10,
                   init_kwargs=params,
                   n_fit=4)

# Same agent with a different configuration; `params_greedy` is assumed to
# be defined earlier in the script -- TODO confirm
stats_random = AgentStats(LSVIUCBAgent,
                          env,
                          eval_horizon=10,
                          init_kwargs=params_greedy,
                          n_fit=1,
                          agent_name='LSVI-random-expl')

# Value-iteration baseline, single instance
oracle_stats = AgentStats(ValueIterationAgent,
                          env,
                          eval_horizon=10,
                          init_kwargs=params_oracle,
                          n_fit=1)

# learning curves of the two LSVI variants
plot_episode_rewards([stats, stats_random], cumulative=True, show=False)
# final-policy comparison including the baseline; show=True opens the plot
compare_policies([stats, stats_random, oracle_stats], show=True)
# 5x10 grid-world environment
env = GridWorld(nrows=5, ncols=10)

params = {}

# UCBVI configuration; N_EP, HORIZON and GAMMA are assumed to be constants
# defined earlier in the script -- TODO confirm
params['ucbvi'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'stage_dependent': True,
    'gamma': GAMMA,
    'real_time_dp': True,
    'bonus_scale_factor': 1.0,
}

# OptQL configuration
params['optql'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'gamma': GAMMA,
    'bonus_scale_factor': 1.0,
}

# Collect one AgentStats per agent; mstats.run() runs all appended stats
mstats = MultipleStats()

mstats.append(AgentStats(UCBVIAgent, env, init_kwargs=params['ucbvi']))

mstats.append(AgentStats(OptQLAgent, env, init_kwargs=params['optql']))

mstats.run()

# cumulative learning curves of all agents
plot_episode_rewards(mstats.allstats, cumulative=True)
from rlberry.agents.ppo import PPOAgent
from rlberry.envs.benchmarks.ball_exploration import PBall2D
from rlberry.seeding import seeding
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies

# Make the demo reproducible
seeding.set_global_seed(1223)

env = PBall2D()
n_episodes = 400
horizon = 100

# PPO hyperparameters; reuse n_episodes/horizon defined above instead of
# repeating the literals, so changing them in one place is enough
ppo_params = {}
ppo_params['n_episodes'] = n_episodes
ppo_params['horizon'] = horizon
ppo_params['gamma'] = 0.99
ppo_params['learning_rate'] = 0.001
ppo_params['eps_clip'] = 0.2
ppo_params['k_epochs'] = 4

ppo_stats = AgentStats(PPOAgent,
                       env,
                       eval_horizon=horizon,
                       init_kwargs=ppo_params,
                       n_fit=2)

# First partial fit (30% of the budget), then inspect without showing plots
ppo_stats.partial_fit(0.3)
plot_episode_rewards([ppo_stats], show=False, cumulative=True)
compare_policies([ppo_stats], show=False)

# Second partial fit (another 20%), then show the final comparison
ppo_stats.partial_fit(0.2)
plot_episode_rewards([ppo_stats], show=False, cumulative=True)
compare_policies([ppo_stats], show=True)