Example 1
def run_experiment(params, optimize_hyperparams):
    """
    Main experiment function
    """
    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2)

    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2)

    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for stats in agent_stats_list:
            # timeout after 10 seconds
            stats.optimize_hyperparams(n_trials=50, timeout=10)

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
Example 2
def test_discount_optimization():
    seeding.set_global_seed(42)

    class ValueIterationAgentToOptimize(ValueIterationAgent):
        @classmethod
        def sample_parameters(cls, trial):
            """
            Sample hyperparameters for hyperparam optimization using Optuna (https://optuna.org/)
            """
            gamma = trial.suggest_categorical('gamma', [0.1, 0.99])
            return {'gamma': gamma}

    env = GridWorld(nrows=3, ncols=10,
                    reward_at={(1, 1): 0.1, (2, 9): 1.0},
                    walls=((1, 4), (2, 4), (1, 5)),
                    success_probability=0.9)

    vi_params = {'gamma': 0.1, 'epsilon': 1e-3}

    vi_stats = AgentStats(ValueIterationAgentToOptimize, env, eval_horizon=20, init_kwargs=vi_params, n_fit=4, n_jobs=1)

    vi_stats.optimize_hyperparams(n_trials=5, timeout=30, n_sim=5, n_fit=1, n_jobs=1,
                                  sampler_method='random', pruner_method='none')

    assert vi_stats.best_hyperparams['gamma'] == 0.99
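# Illustrative sketch (an addition, not part of the test above): the same hook
# can also sample continuous hyperparameters; the ranges below are hypothetical
# and must correspond to the agent's __init__ keyword arguments.
class ValueIterationAgentContinuousSearch(ValueIterationAgent):
    @classmethod
    def sample_parameters(cls, trial):
        gamma = trial.suggest_float('gamma', 0.9, 0.999)
        epsilon = trial.suggest_float('epsilon', 1e-6, 1e-2, log=True)
        return {'gamma': gamma, 'epsilon': epsilon}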
Example 3
def run_experiment(params,
                   optimize_hyperparams,
                   rlberry_seed):
    """
    Main experiment function
    """
    seeding.set_global_seed(rlberry_seed)

    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)

    # uncomment to disable writer of the 2nd PPO thread
    # stats['ppo'].set_writer(1, None)

    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)

    # uncomment to disable writer of the 1st A2C thread
    # stats['a2c'].set_writer(0, None)

    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for stats in agent_stats_list:
            # timeout after 10 seconds
            stats.optimize_hyperparams(n_trials=50, timeout=10, n_fit=2)

    # Fit with best hyperparams and save results
    for stats in agent_stats_list:
        stats.fit()
        stats.save_results()

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
Example 4
def test_hyperparam_optim_random():
    # Define train env
    train_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}

    # Run AgentStats
    stats_agent = AgentStats(DummyAgent, train_env, init_kwargs=params,
                             n_fit=4, eval_horizon=10, n_jobs=1)

    # test hyperparameter optimization with random sampler
    stats_agent.optimize_hyperparams(sampler_method="random")
Example 5
def test_hyperparam_optim_tpe():
    # Define train env
    train_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}

    # Run AgentStats
    stats_agent = AgentStats(DummyAgent, train_env, init_kwargs=params,
                             n_fit=4, eval_horizon=10, n_jobs=1)

    # test hyperparameter optimization with TPE sampler
    # using hyperopt default values
    sampler_kwargs = TPESampler.hyperopt_parameters()
    stats_agent.optimize_hyperparams(sampler_kwargs=sampler_kwargs)
Example 6
def test_agent_stats_2():
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}

    # Run AgentStats
    stats_agent1 = AgentStats(DummyAgent,
                              train_env,
                              eval_env=eval_env,
                              init_kwargs=params,
                              n_fit=4,
                              eval_horizon=10,
                              n_jobs=1)
    stats_agent2 = AgentStats(DummyAgent,
                              train_env,
                              eval_env=eval_env,
                              init_kwargs=params,
                              n_fit=4,
                              eval_horizon=10,
                              n_jobs=1)
    agent_stats_list = [stats_agent1, stats_agent2]

    # set some writers
    stats_agent1.set_writer(1, None)
    stats_agent1.set_writer(2, None)

    # compare final policies
    compare_policies(agent_stats_list, n_sim=10, show=False)
    compare_policies(agent_stats_list,
                     n_sim=10,
                     show=False,
                     stationary_policy=False)

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # check if fitted
    for agent_stats in agent_stats_list:
        assert len(agent_stats.fitted_agents) == 4
        for agent in agent_stats.fitted_agents:
            assert agent.fitted

    # test saving/loading
    dirname = stats_agent1.output_dir
    fname = dirname / 'stats'
    stats_agent1.save()
    loaded_stats = AgentStats.load(fname)
    assert stats_agent1.identifier == loaded_stats.identifier

    # delete file
    os.remove(fname.with_suffix('.pickle'))
    dirname.rmdir()

    # test hyperparameter optimization
    loaded_stats.optimize_hyperparams()
Example 7
def test_hyperparam_optim_grid():
    # Define train env
    train_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}

    # Run AgentStats
    stats_agent = AgentStats(DummyAgent, train_env, init_kwargs=params,
                             n_fit=4, eval_horizon=10, n_jobs=1)

    # test hyperparameter optimization with grid sampler
    search_space = {"hyperparameter1": [1, 2, 3],
                    "hyperparameter2": [-5, 0, 5]}
    sampler_kwargs = {"search_space": search_space}
    stats_agent.optimize_hyperparams(n_trials=3*3,
                                     sampler_method="grid",
                                     sampler_kwargs=sampler_kwargs)
Example 8
def test_agent_stats_1():
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}
    horizon = 20

    # Check DummyAgent
    agent = DummyAgent(train_env, **params)
    agent.fit()
    agent.policy(None)

    # Run AgentStats
    stats_agent1 = AgentStats(DummyAgent,
                              train_env,
                              init_kwargs=params,
                              n_fit=4,
                              eval_horizon=10)
    stats_agent2 = AgentStats(DummyAgent,
                              train_env,
                              init_kwargs=params,
                              n_fit=4,
                              eval_horizon=10)
    agent_stats_list = [stats_agent1, stats_agent2]

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    compare_policies(agent_stats_list,
                     eval_env,
                     eval_horizon=horizon,
                     n_sim=10,
                     show=False)
    compare_policies(agent_stats_list,
                     eval_env,
                     eval_horizon=horizon,
                     n_sim=10,
                     show=False,
                     stationary_policy=False)

    # check if fitted
    for agent_stats in agent_stats_list:
        assert len(agent_stats.fitted_agents) == 4
        for agent in agent_stats.fitted_agents:
            assert agent.fitted

    # test saving/loading
    stats_agent1.save('test_agent_stats_file.pickle')
    loaded_stats = AgentStats.load('test_agent_stats_file.pickle')
    assert stats_agent1.identifier == loaded_stats.identifier

    # delete file
    os.remove('test_agent_stats_file.pickle')

    # test hyperparameter optimization
    loaded_stats.optimize_hyperparams()
    loaded_stats.optimize_hyperparams(continue_previous=True)
Example 9
def test_agent_stats_seeding():
    sd.set_global_seed(3456)
    for env in [MountainCar(), (gym_make, {'env_name': 'MountainCar-v0'})]:
        agent_stats = AgentStats(RSUCBVIAgent,
                                 env,
                                 init_kwargs={
                                     'n_episodes': 2,
                                     'horizon': 10
                                 },
                                 n_fit=6)
        agent_stats.fit()

        for ii in range(2, agent_stats.n_fit):
            traj1 = get_env_trajectory(agent_stats.fitted_agents[ii - 2].env,
                                       horizon=10)
            traj2 = get_env_trajectory(agent_stats.fitted_agents[ii - 1].env,
                                       horizon=10)
            traj3 = get_env_trajectory(agent_stats.fitted_agents[ii].env,
                                       horizon=10)
            assert not compare_trajectories(traj1, traj2)
            assert not compare_trajectories(traj1, traj3)
            assert not compare_trajectories(traj2, traj3)
Example 10
def test_agent_stats_partial_fit():
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}
    horizon = 20

    # Check DummyAgent
    agent = DummyAgent(train_env, **params)
    agent.fit()
    agent.policy(None)

    # Run AgentStats
    stats = AgentStats(DummyAgent,
                       train_env,
                       init_kwargs=params,
                       n_fit=4,
                       eval_horizon=10)

    # Run partial fit
    stats.partial_fit(0.1)
    stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 0.6
    for _ in range(2):
        stats.partial_fit(0.5)
        for agent in stats.fitted_agents:
            assert agent.fraction_fitted == 1.0

    # learning curves
    plot_episode_rewards([stats], cumulative=True, show=False)

    # compare final policies
    compare_policies([stats],
                     eval_env,
                     eval_horizon=horizon,
                     n_sim=10,
                     show=False)
Example 11
        break
env.close()

#
# Training several agents and comparing different hyperparams
#
from rlberry.stats import AgentStats, MultipleStats, agent_stats, compare_policies

stats = AgentStats(
    A2CAgent,
    env,
    eval_horizon=200,
    agent_name='A2C baseline',
    init_kwargs={
        'policy': 'MlpPolicy',
        'verbose': 1
    },
    fit_kwargs={'total_timesteps': 1000},
    policy_kwargs={'deterministic': True},
    n_fit=4,
    n_jobs=4,
    joblib_backend='loky'
)  # we might need 'threading' here, since stable baselines creates processes
# 'multiprocessing' does not work, 'loky' seems good

stats_alternative = AgentStats(A2CAgent,
                               env,
                               eval_horizon=200,
                               agent_name='A2C high learning rate',
                               init_kwargs={
                                   'policy': 'MlpPolicy',
Example 12
    'gamma': 0.99
}

params_greedy = {
    'n_episodes': 500,
    'feature_map_fn': feature_map_fn,
    'horizon': 10,
    'bonus_scale_factor': 0.0,
    'gamma': 0.99
}

params_oracle = {'horizon': 10, 'gamma': 0.99}

stats = AgentStats(LSVIUCBAgent,
                   env,
                   eval_horizon=10,
                   init_kwargs=params,
                   n_fit=4)

stats_random = AgentStats(LSVIUCBAgent,
                          env,
                          eval_horizon=10,
                          init_kwargs=params_greedy,
                          n_fit=1,
                          agent_name='LSVI-random-expl')

oracle_stats = AgentStats(ValueIterationAgent,
                          env,
                          eval_horizon=10,
                          init_kwargs=params_oracle,
                          n_fit=1)
Example 13
    "bonus_scale_factor": BONUS_SCALE_FACTOR,
    "min_dist": MIN_DIST,
    "bandwidth": 0.1,
    "beta": 1.0,
    "kernel_type": "gaussian",
}

params_ppo = {"n_episodes": N_EPISODES,
              "gamma": GAMMA,
              "horizon": HORIZON,
              "learning_rate": 0.0003}

# -----------------------------
# Run AgentStats
# -----------------------------
rsucbvi_stats = AgentStats(RSUCBVIAgent, train_env,
                           init_kwargs=params, n_fit=4)
rskernel_stats = AgentStats(RSKernelUCBVIAgent, train_env,
                            init_kwargs=params_kernel, n_fit=4)
ppo_stats = AgentStats(PPOAgent, train_env, init_kwargs=params_ppo, n_fit=4)

agent_stats_list = [rsucbvi_stats, rskernel_stats, ppo_stats]

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list, eval_env,
                          eval_horizon=HORIZON, n_sim=10)
print(output)
Example 14
BONUS_SCALE_FACTOR = 0.1
MIN_DIST = 0.1

params_ppo = {
    "n_episodes": N_EPISODES,
    "gamma": GAMMA,
    "horizon": HORIZON,
    "learning_rate": 0.0003
}

# -------------------------------
# Run AgentStats and save results
# --------------------------------
ppo_stats = AgentStats(PPOAgent,
                       train_env,
                       eval_horizon=HORIZON,
                       init_kwargs=params_ppo,
                       n_fit=4)

# hyperparam optim
best_trial, data = ppo_stats.optimize_hyperparams(
    n_trials=10,
    timeout=None,
    n_sim=5,
    n_fit=2,
    n_jobs=2,
    sampler_method='optuna_default')

initial_n_trials = len(ppo_stats.study.trials)

# save
Example 15
def test_agent_stats_partial_fit_and_tuple_env():
    # Define train and evaluation envs
    # tuple (constructor, kwargs) must also work in AgentStats
    train_env = (GridWorld, None)

    # Parameters
    params = {"n_episodes": 500}
    horizon = 20

    # Run AgentStats
    stats = AgentStats(DummyAgent,
                       train_env,
                       init_kwargs=params,
                       n_fit=4,
                       eval_horizon=10)
    stats2 = AgentStats(DummyAgent,
                        train_env,
                        init_kwargs=params,
                        n_fit=4,
                        eval_horizon=10)
    # set some writers
    stats.set_writer(0, None)
    stats.set_writer(3, None)

    # Run partial fit
    stats.partial_fit(0.1)
    stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 0.6
    for _ in range(2):
        stats.partial_fit(0.5)
        for agent in stats.fitted_agents:
            assert agent.fraction_fitted == 1.0

    # Run fit
    stats2.fit()

    # learning curves
    plot_episode_rewards([stats], cumulative=True, show=False)

    # compare final policies
    compare_policies([stats], eval_horizon=horizon, n_sim=10, show=False)
Example 16
params_ppo_bonus = {
              'n_episodes': N_EPISODES,
              'gamma': GAMMA,
              'horizon': HORIZON,
              'batch_size': 16,
              'entr_coef': 8e-7,
              'k_epochs': 10,
              'eps_clip': 0.2,
              'learning_rate': 0.03,
              'use_bonus': True,
              'uncertainty_estimator_kwargs': {
                  'uncertainty_estimator_fn': uncertainty_estimator_fn
              }
              }


# -----------------------------
# Run AgentStats
# -----------------------------
ppo_stats = AgentStats(PPOAgent, env, eval_env=eval_env, init_kwargs=params_ppo, n_fit=4, agent_name='PPO')
ppo_bonus_stats = AgentStats(PPOAgent, env, eval_env=eval_env, init_kwargs=params_ppo_bonus, n_fit=4, agent_name='PPO-Bonus')

agent_stats_list = [ppo_bonus_stats, ppo_stats]

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list, eval_horizon=HORIZON, n_sim=20)
print(output)
Example 17
def env_constructor(n_envs=4):
    env = make_atari_env('MontezumaRevenge-v0', n_envs=n_envs)
    env = VecFrameStack(env, n_stack=4)
    return env


#
# Training several agents and comparing different hyperparams
#

stats = AgentStats(A2CAgent, (env_constructor, None),
                   eval_horizon=200,
                   agent_name='A2C baseline',
                   init_kwargs={
                       'policy': 'CnnPolicy',
                       'verbose': 10
                   },
                   fit_kwargs={'total_timesteps': 1000},
                   policy_kwargs={'deterministic': True},
                   n_fit=4,
                   n_jobs=4,
                   joblib_backend='threading')

stats_alternative = AgentStats(A2CAgent, (env_constructor, None),
                               eval_horizon=200,
                               agent_name='A2C high learning rate',
                               init_kwargs={
                                   'policy': 'CnnPolicy',
                                   'verbose': 10,
                                   'learning_rate': 0.01
                               },
                               fit_kwargs={'total_timesteps': 1000},
Example 18
def load_experiment_results(output_dir, experiment_name):
    """
    Parameters
    ----------
    output_dir : str or Path, or list
        directory (or list of directories) where experiment results are stored
        (command line argument --output_dir when running the experiment)
    experiment_name : str or Path, or list
        name of yaml file describing the experiment.

    Returns
    -------
    output_data: dict
        dictionary such that

        output_data['experiment_dirs'] = list of paths to experiment directory (output_dir/experiment_name)
        output_data['agent_list'] = list containing the names of the agents in the experiment
        output_data['stats'][agent_name] = fitted AgentStats for agent_name
        output_data['dataframes'][agent_name] = dict of pandas data frames from the last run of the experiment
        output_data['data_dir'][agent_name] = directory from which the results were loaded
    """
    output_data = {}
    output_data['agent_list'] = []
    output_data['stats'] = {}
    output_data['dataframes'] = {}
    output_data['data_dir'] = {}

    # preprocess input
    if not isinstance(output_dir, list):
        output_dir = [output_dir]
    if not isinstance(experiment_name, list):
        experiment_name = [experiment_name]
    ndirs = len(output_dir)

    if ndirs > 1:
        assert len(experiment_name) == ndirs, \
            "Number of experiment names must match the number of output_dirs"
    else:
        output_dir = len(experiment_name) * output_dir

    results_dirs = []
    for dd, exper in zip(output_dir, experiment_name):
        results_dirs.append(Path(dd) / Path(exper).stem)
    output_data['experiment_dirs'] = results_dirs

    # Subdirectories with data for each agent
    subdirs = []
    for dd in results_dirs:
        subdirs.extend([f for f in dd.iterdir() if f.is_dir()])

    # Create dictionary dict[agent_name] = most recent result dir
    data_dirs = {}
    for dd in subdirs:
        data_dirs[dd.name] = _get_most_recent_path(
            [f for f in dd.iterdir() if f.is_dir()])

    # Load data from each subdir
    for agent_name in data_dirs:
        output_data['agent_list'].append(agent_name)

        # store data_dir
        output_data['data_dir'][agent_name] = data_dirs[agent_name]

        # store AgentStats
        output_data['stats'][agent_name] = None
        fname = data_dirs[agent_name] / 'stats.pickle'
        try:
            output_data['stats'][agent_name] = AgentStats.load(fname)
            logger.info("... loaded " + str(fname))
        except Exception:
            logger.warning("... failed to load " + str(fname))

        # store data frames
        dataframes = {}
        csv_files = [
            f for f in data_dirs[agent_name].iterdir() if f.suffix == '.csv'
        ]
        for ff in csv_files:
            dataframes[ff.stem] = pd.read_csv(ff)
            logger.info("... loaded " + str(ff))
        output_data['dataframes'][agent_name] = dataframes

    return output_data
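# Minimal usage sketch (an addition; the paths are hypothetical and assume an
# experiment was previously run with --output_dir 'results' and the config
# file 'experiment.yaml'):
data = load_experiment_results('results', 'experiment.yaml')
for agent_name in data['agent_list']:
    agent_stats = data['stats'][agent_name]      # fitted AgentStats, or None if loading failed
    dataframes = data['dataframes'][agent_name]  # pandas DataFrames keyed by csv file name
    print(agent_name, data['data_dir'][agent_name], list(dataframes.keys()))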
Example 19
env = GridWorld(nrows=5, ncols=10)

params = {}

params['ucbvi'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'stage_dependent': True,
    'gamma': GAMMA,
    'real_time_dp': True,
    'bonus_scale_factor': 1.0,
}

params['optql'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'gamma': GAMMA,
    'bonus_scale_factor': 1.0,
}

mstats = MultipleStats()

mstats.append(AgentStats(UCBVIAgent, env, init_kwargs=params['ucbvi']))

mstats.append(AgentStats(OptQLAgent, env, init_kwargs=params['optql']))

mstats.run()

plot_episode_rewards(mstats.allstats, cumulative=True)
Example 20
N_EPISODES = 100
GAMMA = 0.99
HORIZON = 50
BONUS_SCALE_FACTOR = 0.1
MIN_DIST = 0.1


params_ppo = {"n_episodes": N_EPISODES,
              "gamma": GAMMA,
              "horizon": HORIZON,
              "learning_rate": 0.0003}

# -------------------------------
# Run AgentStats and save results
# --------------------------------
ppo_stats = AgentStats(PPOAgent, train_env, init_kwargs=params_ppo, n_fit=4)
ppo_stats.fit()  # fit the 4 agents
ppo_stats.save('ppo_stats')
del ppo_stats

# -------------------------------
# Load and plot results
# --------------------------------
ppo_stats = AgentStats.load('ppo_stats')
agent_stats_list = [ppo_stats]

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list, eval_env,
Example 21
    "horizon": HORIZON
}

params_ppo = {
    "n_episodes": N_EPISODES,
    "gamma": GAMMA,
    "horizon": HORIZON,
    "learning_rate": 0.0003
}

# -----------------------------
# Run AgentStats
# -----------------------------
oracle_stats = AgentStats(MBQVIAgent,
                          d_train_env,
                          init_kwargs=params_oracle,
                          n_fit=4,
                          agent_name="Oracle")
ppo_stats = AgentStats(PPOAgent,
                       train_env,
                       init_kwargs=params_ppo,
                       n_fit=4,
                       agent_name="PPO")

agent_stats_list = [oracle_stats, ppo_stats]

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list, eval_horizon=HORIZON, n_sim=10)
Example 22
}

params['rsucbvi'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'gamma': 1.0,
    'bonus_scale_factor': 1.0,
    'min_dist': 0.05,
    'max_repr': 800
}

mstats = MultipleStats()
mstats.append(
    AgentStats(AdaptiveQLAgent,
               env,
               init_kwargs=params['adaql'],
               n_fit=4,
               n_jobs=4)
)
mstats.append(
    AgentStats(RSUCBVIAgent, env, init_kwargs=params['rsucbvi'], n_fit=2)
)

mstats.run(save=False)

plot_episode_rewards(mstats.allstats, cumulative=True)

for stats in mstats.allstats:
    agent = stats.fitted_agents[0]
    try:
        agent.Qtree.plot(0, 25)
Example 23
from rlberry.agents.ppo import PPOAgent
from rlberry.envs.benchmarks.ball_exploration import PBall2D
from rlberry.seeding import seeding
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies

seeding.set_global_seed(1223)

env = PBall2D()
n_episodes = 400
horizon = 100

ppo_params = {}
ppo_params['n_episodes'] = n_episodes
ppo_params['horizon'] = horizon
ppo_params['gamma'] = 0.99
ppo_params['learning_rate'] = 0.001
ppo_params['eps_clip'] = 0.2
ppo_params['k_epochs'] = 4

ppo_stats = AgentStats(PPOAgent,
                       env,
                       eval_horizon=100,
                       init_kwargs=ppo_params,
                       n_fit=2)
ppo_stats.partial_fit(0.3)
plot_episode_rewards([ppo_stats], show=False, cumulative=True)
compare_policies([ppo_stats], show=False)
ppo_stats.partial_fit(0.2)
plot_episode_rewards([ppo_stats], show=False, cumulative=True)
compare_policies([ppo_stats], show=True)
Example 24
# -----------------------------
N_EPISODES = 500
GAMMA = 0.99
HORIZON = 50

params_ppo = {
    "n_episodes": N_EPISODES,
    "gamma": GAMMA,
    "horizon": HORIZON,
    "learning_rate": 0.0003
}

# -----------------------------
# Run AgentStats
# -----------------------------
ppo_stats = AgentStats(PPOAgent, train_env, init_kwargs=params_ppo, n_fit=4)

ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={'comment': 'worker_0'})
ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={'comment': 'worker_1'})

agent_stats_list = [ppo_stats]

agent_stats_list[0].fit()
# after fit, writers are set to None to avoid pickle problems
agent_stats_list[0].save()

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list,
Example 25
def parse_experiment_config(
    path: Path,
    n_fit: int = 4,
    n_jobs: int = 4,
    output_base_dir: str = 'results',
    joblib_backend: str = 'loky'
) -> Generator[Tuple[int, AgentStats], None, None]:
    """
    Read .yaml files, set the global seed and convert agent configurations
    to AgentStats instances.

    Example of experiment config:

    ```experiment.yaml
        description: 'My cool experiment'
        seed: 42
        n_episodes: 1000
        horizon: 50
        train_env: 'env_train.yaml'     # see read_env_config()
        eval_env: 'env_eval.yaml'
        agents:
        - 'agent1.yaml'                 # see read_agent_config()
        - 'agent2.yaml'
    ```

    Parameters
    ----------
    path : Path
        Path to an experiment config
    n_fit : int
        Number of instances of each agent to fit
    n_jobs : int
        Number of parallel jobs
    output_base_dir : str
        Directory where to save AgentStats results.
    joblib_backend : str
        Backend used by joblib for parallel fits.

    Yields
    ------
    seed : int
        global seed
    agent_stats : AgentStats
        the AgentStats to fit
    """
    with path.open() as file:
        config = yaml.safe_load(file)
        train_env = read_env_config(config["train_env"])
        eval_env = read_env_config(config["eval_env"])

        for agent_path in config["agents"]:
            # set seed before creating AgentStats
            seed = config["seed"]
            set_global_seed(seed)

            agent_name = Path(agent_path).stem
            agent_class, agent_config = read_agent_config(agent_path)

            # Process output dir, avoid erasing previous results
            output_dir = Path(output_base_dir) / path.stem / agent_name
            last = 0

            try:
                subdirs = [f for f in output_dir.iterdir() if f.is_dir()]
            except FileNotFoundError:
                subdirs = []

            for dd in subdirs:
                try:
                    idx = int(dd.stem)
                except ValueError:
                    continue
                if idx > last:
                    last = idx

            # kwargs
            init_kwargs = agent_config['init_kwargs']
            fit_kwargs = agent_config['fit_kwargs']
            policy_kwargs = agent_config['policy_kwargs']

            # check if there are global kwargs
            if 'global_init_kwargs' in config:
                init_kwargs.update(config['global_init_kwargs'])
            if 'global_fit_kwargs' in config:
                fit_kwargs.update(config['global_fit_kwargs'])
            if 'global_policy_kwargs' in config:
                policy_kwargs.update(config['global_policy_kwargs'])

            # check eval_horizon
            if 'eval_horizon' in config:
                eval_horizon = config['eval_horizon']
            else:
                eval_horizon = None

            # append run index to dir
            output_dir = output_dir / str(last + 1)

            yield seed, AgentStats(agent_class=agent_class,
                                   init_kwargs=init_kwargs,
                                   fit_kwargs=fit_kwargs,
                                   policy_kwargs=policy_kwargs,
                                   agent_name=agent_name,
                                   train_env=train_env,
                                   eval_env=eval_env,
                                   eval_horizon=eval_horizon,
                                   n_fit=n_fit,
                                   n_jobs=n_jobs,
                                   output_dir=output_dir,
                                   joblib_backend=joblib_backend)
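# Minimal usage sketch (an addition; the config filename is hypothetical and
# assumed to follow the format shown in the docstring above):
for seed, agent_stats in parse_experiment_config(Path('experiment.yaml'),
                                                 n_fit=4,
                                                 n_jobs=4):
    # the global seed has already been set inside the parser
    agent_stats.fit()
    agent_stats.save_results()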