import os

from rlberry.envs.finite import GridWorld
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies
# DummyAgent is defined in the test module (see the sketch after this test)


def test_agent_stats_1():
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}
    horizon = 20

    # Check DummyAgent
    agent = DummyAgent(train_env, **params)
    agent.fit()
    agent.policy(None)

    # Run AgentStats
    stats_agent1 = AgentStats(DummyAgent, train_env, init_kwargs=params,
                              n_fit=4, eval_horizon=10)
    stats_agent2 = AgentStats(DummyAgent, train_env, init_kwargs=params,
                              n_fit=4, eval_horizon=10)
    agent_stats_list = [stats_agent1, stats_agent2]

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    compare_policies(agent_stats_list, eval_env,
                     eval_horizon=horizon, n_sim=10, show=False)
    compare_policies(agent_stats_list, eval_env,
                     eval_horizon=horizon, n_sim=10, show=False,
                     stationary_policy=False)

    # check if fitted
    for agent_stats in agent_stats_list:
        assert len(agent_stats.fitted_agents) == 4
        for agent in agent_stats.fitted_agents:
            assert agent.fitted

    # test saving/loading
    stats_agent1.save('test_agent_stats_file.pickle')
    loaded_stats = AgentStats.load('test_agent_stats_file.pickle')
    assert stats_agent1.identifier == loaded_stats.identifier

    # delete file
    os.remove('test_agent_stats_file.pickle')

    # test hyperparameter optimization
    loaded_stats.optimize_hyperparams()
    loaded_stats.optimize_hyperparams(continue_previous=True)
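# The tests above and below rely on a DummyAgent defined in the test module.
# A minimal sketch of the interface they exercise (fit / partial_fit / policy,
# the `fitted` and `fraction_fitted` attributes, and a sample_parameters hook
# for optimize_hyperparams); the exact original implementation may differ:
from rlberry.agents import Agent


class DummyAgent(Agent):
    def __init__(self, env, n_episodes=100, hyperparameter=0, **kwargs):
        Agent.__init__(self, env, **kwargs)
        self.name = 'DummyAgent'
        self.n_episodes = n_episodes
        self.hyperparameter = hyperparameter
        self.fitted = False
        self.fraction_fitted = 0.0

    def fit(self, **kwargs):
        self.fitted = True

    def partial_fit(self, fraction, **kwargs):
        # fractions passed over successive calls accumulate, capped at 1.0,
        # matching the assertions in the partial-fit tests below
        self.fraction_fitted = min(1.0, self.fraction_fitted + fraction)
        self.fitted = self.fraction_fitted >= 1.0

    def policy(self, observation, **kwargs):
        return 0

    @classmethod
    def sample_parameters(cls, trial):
        # assumed hook used by optimize_hyperparams (receives an optuna trial)
        hyperparameter = trial.suggest_categorical('hyperparameter', [1, 2, 3])
        return {'hyperparameter': hyperparameter}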
def test_agent_stats_2():
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}

    # Run AgentStats
    stats_agent1 = AgentStats(DummyAgent, train_env, eval_env=eval_env,
                              init_kwargs=params, n_fit=4,
                              eval_horizon=10, n_jobs=1)
    stats_agent2 = AgentStats(DummyAgent, train_env, eval_env=eval_env,
                              init_kwargs=params, n_fit=4,
                              eval_horizon=10, n_jobs=1)
    agent_stats_list = [stats_agent1, stats_agent2]

    # set some writers
    stats_agent1.set_writer(1, None)
    stats_agent1.set_writer(2, None)

    # compare final policies
    compare_policies(agent_stats_list, n_sim=10, show=False)
    compare_policies(agent_stats_list, n_sim=10, show=False,
                     stationary_policy=False)

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # check if fitted
    for agent_stats in agent_stats_list:
        assert len(agent_stats.fitted_agents) == 4
        for agent in agent_stats.fitted_agents:
            assert agent.fitted

    # test saving/loading
    dirname = stats_agent1.output_dir
    fname = dirname / 'stats'
    stats_agent1.save()
    loaded_stats = AgentStats.load(fname)
    assert stats_agent1.identifier == loaded_stats.identifier

    # delete file
    os.remove(fname.with_suffix('.pickle'))
    dirname.rmdir()

    # test hyperparameter optimization
    loaded_stats.optimize_hyperparams()
def run_experiment(params, optimize_hyperparams):
    """
    Main experiment function
    """
    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent, env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2)
    stats['a2c'] = AgentStats(A2CAgent, env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2)
    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            # timeout after 10 seconds
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10)

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
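# Minimal usage sketch for run_experiment. The parameter values below are
# hypothetical placeholders (only the 'horizon' key is required by the
# function above; the other keys mirror the PPO parameters used elsewhere
# in these examples):
params = {
    'ppo': {'n_episodes': 100, 'horizon': 50, 'gamma': 0.99,
            'learning_rate': 0.0003},
    'a2c': {'n_episodes': 100, 'horizon': 50, 'gamma': 0.99},
}
run_experiment(params, optimize_hyperparams=False)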
def test_agent_stats_partial_fit_and_tuple_env():
    # Define train env as a tuple (constructor, kwargs),
    # which must also work in AgentStats
    train_env = (GridWorld, None)

    # Parameters
    params = {"n_episodes": 500}
    horizon = 20

    # Run AgentStats
    stats = AgentStats(DummyAgent, train_env, init_kwargs=params,
                       n_fit=4, eval_horizon=10)
    stats2 = AgentStats(DummyAgent, train_env, init_kwargs=params,
                        n_fit=4, eval_horizon=10)

    # set some writers
    stats.set_writer(0, None)
    stats.set_writer(3, None)

    # Run partial fit
    stats.partial_fit(0.1)
    stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 0.6
    for _ in range(2):
        stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 1.0

    # Run fit
    stats2.fit()

    # learning curves
    plot_episode_rewards([stats], cumulative=True, show=False)

    # compare final policies
    compare_policies([stats], eval_horizon=horizon, n_sim=10, show=False)
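# Sketch of the (constructor, kwargs) convention tested above: AgentStats is
# expected to instantiate the environment roughly like this (an illustration
# of the convention, not rlberry's actual internals):
ctor, kwargs = (GridWorld, None)
env_instance = ctor(**(kwargs or {}))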
def test_agent_stats_partial_fit():
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}
    horizon = 20

    # Check DummyAgent
    agent = DummyAgent(train_env, **params)
    agent.fit()
    agent.policy(None)

    # Run AgentStats
    stats = AgentStats(DummyAgent, train_env, init_kwargs=params,
                       n_fit=4, eval_horizon=10)

    # Run partial fit
    stats.partial_fit(0.1)
    stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 0.6
    for _ in range(2):
        stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 1.0

    # learning curves
    plot_episode_rewards([stats], cumulative=True, show=False)

    # compare final policies
    compare_policies([stats], eval_env, eval_horizon=horizon,
                     n_sim=10, show=False)
def run_experiment(params, optimize_hyperparams, rlberry_seed):
    """
    Main experiment function
    """
    seeding.set_global_seed(rlberry_seed)

    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent, env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)
    # uncomment to disable writer of the 2nd PPO thread
    # stats['ppo'].set_writer(1, None)
    stats['a2c'] = AgentStats(A2CAgent, env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)
    # uncomment to disable writer of the 1st A2C thread
    # stats['a2c'].set_writer(0, None)
    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            # timeout after 10 seconds
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10, n_fit=2)

    # Fit with best hyperparams and save results
    for agent_stats in agent_stats_list:
        agent_stats.fit()
        agent_stats.save_results()

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
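# The function above references a global `fs_observer`; a minimal sketch of
# the sacred scaffolding it presumably comes from (the variable name is taken
# from the snippet, but the 'results' directory is an assumption):
from sacred.observers import FileStorageObserver

fs_observer = FileStorageObserver.create('results')
# once a run starts, fs_observer.dir points at the run directory created by
# sacred, and AgentStats writes its output there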
from torch.utils.tensorboard import SummaryWriter

params_ppo = {"n_episodes": N_EPISODES,
              "gamma": GAMMA,
              "horizon": HORIZON,
              "learning_rate": 0.0003}

# -----------------------------
# Run AgentStats
# -----------------------------
ppo_stats = AgentStats(PPOAgent, train_env, init_kwargs=params_ppo, n_fit=4)
ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={'comment': 'worker_0'})
ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={'comment': 'worker_1'})
agent_stats_list = [ppo_stats]

agent_stats_list[0].fit()
# after fit, writers are set to None to avoid pickle problems
agent_stats_list[0].save()

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list, eval_env,
                          eval_horizon=HORIZON, n_sim=10)
print(output)
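# Sketch: reloading the stats saved above (assuming, as in the saving/loading
# test earlier, that save() without arguments writes to output_dir / 'stats'):
loaded = AgentStats.load(ppo_stats.output_dir / 'stats')
plot_episode_rewards([loaded], cumulative=True, show=False)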
# `params` (with a nonzero bonus_scale_factor) and `feature_map_fn` are
# defined earlier in the original script; the dict below is the
# no-bonus (greedy) variant used for the random-exploration baseline
params_greedy = {'n_episodes': 500,
                 'feature_map_fn': feature_map_fn,
                 'horizon': 10,
                 'bonus_scale_factor': 0.0,
                 'gamma': 0.99}

params_oracle = {'horizon': 10, 'gamma': 0.99}

stats = AgentStats(LSVIUCBAgent, env,
                   eval_horizon=10,
                   init_kwargs=params,
                   n_fit=4)
stats_random = AgentStats(LSVIUCBAgent, env,
                          eval_horizon=10,
                          init_kwargs=params_greedy,
                          n_fit=1,
                          agent_name='LSVI-random-expl')
oracle_stats = AgentStats(ValueIterationAgent, env,
                          eval_horizon=10,
                          init_kwargs=params_oracle,
                          n_fit=1)

plot_episode_rewards([stats, stats_random], cumulative=True, show=False)
compare_policies([stats, stats_random, oracle_stats], show=True)
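# The snippet assumes a `feature_map_fn` that builds the feature map used by
# LSVIUCBAgent. A minimal sketch for a finite environment; the OneHotFeatureMap
# class and the interface (a `dim` attribute and a map(observation, action)
# method) are assumptions for illustration:
import numpy as np


class OneHotFeatureMap:
    def __init__(self, n_states, n_actions):
        self.n_states = n_states
        self.n_actions = n_actions
        self.dim = n_states * n_actions

    def map(self, observation, action):
        # one-hot vector indexed by the (state, action) pair
        feat = np.zeros(self.dim)
        feat[observation * self.n_actions + action] = 1.0
        return feat


def feature_map_fn(env):
    return OneHotFeatureMap(env.observation_space.n, env.action_space.n)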
                   # (closing of the first AgentStats call, whose
                   # beginning is elided in this fragment)
                   n_fit=4,
                   n_jobs=4,
                   joblib_backend='loky')
# we might need 'threading' here, since stable baselines creates processes
# 'multiprocessing' does not work, 'loky' seems good

stats_alternative = AgentStats(A2CAgent, env,
                               eval_horizon=200,
                               agent_name='A2C high learning rate',
                               init_kwargs={'policy': 'MlpPolicy',
                                            'verbose': 1,
                                            'learning_rate': 0.01},
                               fit_kwargs={'total_timesteps': 1000},
                               policy_kwargs={'deterministic': True},
                               n_fit=4,
                               n_jobs=4,
                               joblib_backend='loky')

# Fit everything in parallel
mstats = MultipleStats()
mstats.append(stats)
mstats.append(stats_alternative)
mstats.run()

# Plot policy evaluation
compare_policies(mstats.allstats)
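# A minimal sketch (assumption, following the comment above about backends):
# if 'loky' conflicts with the subprocesses that stable-baselines spawns, the
# same stats object can be rebuilt with joblib's 'threading' backend instead.
stats_threaded = AgentStats(A2CAgent, env,
                            eval_horizon=200,
                            agent_name='A2C threading backend',
                            init_kwargs={'policy': 'MlpPolicy', 'verbose': 1},
                            fit_kwargs={'total_timesteps': 1000},
                            policy_kwargs={'deterministic': True},
                            n_fit=4,
                            n_jobs=4,
                            joblib_backend='threading')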
from rlberry.agents.ppo import PPOAgent
from rlberry.envs.benchmarks.ball_exploration import PBall2D
from rlberry.seeding import seeding
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies

seeding.set_global_seed(1223)

env = PBall2D()
n_episodes = 400
horizon = 100

ppo_params = {'n_episodes': n_episodes,
              'horizon': horizon,
              'gamma': 0.99,
              'learning_rate': 0.001,
              'eps_clip': 0.2,
              'k_epochs': 4}

ppo_stats = AgentStats(PPOAgent, env, eval_horizon=horizon,
                       init_kwargs=ppo_params, n_fit=2)

# fit 30% of the budget, then plot
ppo_stats.partial_fit(0.3)
plot_episode_rewards([ppo_stats], show=False, cumulative=True)
compare_policies([ppo_stats], show=False)

# fit 20% more, then plot again
ppo_stats.partial_fit(0.2)
plot_episode_rewards([ppo_stats], show=False, cumulative=True)
compare_policies([ppo_stats], show=True)
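# Sketch (assumption, mirroring the partial-fit tests above): fractions passed
# to successive partial_fit calls accumulate, so after the two calls each
# fitted agent should report fraction_fitted == 0.5 of its training budget.
for agent in ppo_stats.fitted_agents:
    assert agent.fraction_fitted == 0.5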