def test_agent_manager_partial_fit_and_tuple_env():
    # Define train and evaluation envs
    train_env = (
        GridWorld,
        None,
    )  # tuple (constructor, kwargs) must also work in AgentManager

    # Parameters
    params = {}
    eval_kwargs = dict(eval_horizon=10)

    # Run AgentManager
    stats = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs=params,
        n_fit=4,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        seed=123,
    )
    stats2 = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs=params,
        n_fit=4,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        seed=123,
    )

    # Run partial fit
    stats.fit(10)
    stats.fit(20)
    for agent in stats.agent_handlers:
        assert agent.total_budget == 30

    # Run fit
    stats2.fit()

    # learning curves
    plot_writer_data(
        [stats], tag="episode_rewards", show=False, preprocess_func=np.cumsum
    )

    # compare final policies
    evaluate_agents([stats], show=False)

    # delete some writers
    stats.set_writer(0, None)
    stats.set_writer(3, None)

    stats.clear_output_dir()
    stats2.clear_output_dir()
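# The tests in this file rely on names defined elsewhere in the test module. Below
# is a minimal sketch of the assumed imports and of a DummyAgent fixture consistent
# with the assertions used here (budget accumulation, `fitted` flag, hyperparameter
# attributes); the actual fixture in the rlberry test suite may differ.
import numpy as np

from rlberry.agents import AgentWithSimplePolicy
from rlberry.envs import GridWorld
from rlberry.manager import AgentManager, evaluate_agents, plot_writer_data


class DummyAgent(AgentWithSimplePolicy):
    """Hypothetical stand-in agent used only to exercise AgentManager."""

    name = "DummyAgent"

    def __init__(self, env, hyperparameter1=0, hyperparameter2=0, **kwargs):
        AgentWithSimplePolicy.__init__(self, env, **kwargs)
        self.hyperparameter1 = hyperparameter1
        self.hyperparameter2 = hyperparameter2
        self.fitted = False
        self.total_budget = 0.0

    def fit(self, budget, **kwargs):
        # Accumulate budget across successive (partial) fit calls.
        self.fitted = True
        self.total_budget += budget

    def policy(self, observation):
        return 0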
ppo_stats = AgentManager(
    PPOAgent,
    train_env,
    fit_budget=N_EPISODES,
    init_kwargs=params_ppo,
    eval_kwargs=eval_kwargs,
    n_fit=4,
    output_dir="dev/",
    parallelization="process",
)
ppo_stats.fit()  # fit the 4 agents
ppo_stats_fname = ppo_stats.save()
del ppo_stats

# -------------------------------
# Load and plot results
# -------------------------------
ppo_stats = AgentManager.load(ppo_stats_fname)

# learning curves
plot_writer_data(
    ppo_stats,
    tag="episode_rewards",
    preprocess_func=np.cumsum,
    title="Cumulative Rewards",
    show=False,
)

# compare final policies
output = evaluate_agents([ppo_stats], n_simulations=15)
print(output)
eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20)

# -----------------------------
# Run AgentManager
# -----------------------------
ppo_stats = AgentManager(
    PPOAgent,
    train_env,
    fit_budget=N_EPISODES,
    init_kwargs=params_ppo,
    eval_kwargs=eval_kwargs,
    n_fit=4,
)
ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={"comment": "worker_0"})
ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={"comment": "worker_1"})

agent_manager_list = [ppo_stats]

agent_manager_list[0].fit()

# after fit, writers are set to None to avoid pickle problems
agent_manager_list[0].save()

# compare final policies
output = evaluate_agents(agent_manager_list)
print(output)
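# Both PPO snippets above assume an earlier setup roughly along these lines. The
# environment, budget, and init kwargs below are illustrative placeholders, not the
# values used in the original scripts.
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from rlberry.agents.torch import PPOAgent
from rlberry.envs.benchmarks.ball_exploration import PBall2D
from rlberry.manager import AgentManager, evaluate_agents, plot_writer_data

N_EPISODES = 400  # training budget (placeholder)
HORIZON = 100  # evaluation horizon (placeholder)
train_env = (PBall2D, None)  # (constructor, kwargs) tuple
params_ppo = dict(horizon=HORIZON)  # PPO init kwargs (placeholder)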
stats_alternative.optimize_hyperparams(
    timeout=600,
    n_optuna_workers=2,
    n_fit=2,
    optuna_parallelization="process",
    fit_fraction=1.0,
)

# Fit everything in parallel
multimanagers = MultipleManagers()
multimanagers.append(stats)
multimanagers.append(stats_alternative)

multimanagers.run()

# Plot policy evaluation
out = evaluate_agents(multimanagers.managers)
print(out)

# Visualize policy
env = stats_alternative.build_eval_env()
agent = stats_alternative.agent_handlers[0]
obs = env.reset()
for i in range(2500):
    action = agent.policy(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        break
env.close()
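# The hyperparameter-optimization / MultipleManagers snippet above assumes two
# managers created earlier. A hedged sketch of the assumed import and setup; the
# agent class, environment, and budgets are illustrative assumptions.
from rlberry.manager import MultipleManagers

stats = AgentManager(
    PPOAgent,
    train_env,
    fit_budget=N_EPISODES,
    eval_kwargs=dict(eval_horizon=HORIZON, n_simulations=20),
    n_fit=2,
    parallelization="process",
)
stats_alternative = AgentManager(
    PPOAgent,
    train_env,
    fit_budget=N_EPISODES,
    eval_kwargs=dict(eval_horizon=HORIZON, n_simulations=20),
    n_fit=2,
    parallelization="process",
)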
# Oracle (optimal policy)
oracle_stats = AgentManager(
    ValueIterationAgent,
    env,
    init_kwargs=params_oracle,
    fit_budget=n_episodes,
    eval_kwargs=eval_kwargs,
    n_fit=1,
)

# fit
stats.fit()
stats_ucbvi.fit()
stats_random.fit()
oracle_stats.fit()

# visualize results
plot_writer_data(
    [stats, stats_ucbvi, stats_random],
    tag="episode_rewards",
    preprocess_func=np.cumsum,
    title="Cumulative Rewards",
    show=False,
)
plot_writer_data(
    [stats, stats_ucbvi, stats_random], tag="dw_time_elapsed", show=False
)
evaluate_agents([stats, stats_ucbvi, stats_random, oracle_stats], n_simulations=20)
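# The comparison above assumes managers for the learning agents (`stats`,
# `stats_ucbvi`, `stats_random`) and the environment / oracle parameters were
# defined earlier. A hedged sketch of the assumed imports and of one such manager;
# agent choices and parameter values are illustrative assumptions.
from rlberry.agents import UCBVIAgent
from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.envs import GridWorld

env = (GridWorld, dict(nrows=5, ncols=5))
n_episodes = 500
horizon = 20
eval_kwargs = dict(eval_horizon=horizon, n_simulations=20)
params_oracle = dict(horizon=horizon)

stats_ucbvi = AgentManager(
    UCBVIAgent,
    env,
    init_kwargs=dict(horizon=horizon),
    fit_budget=n_episodes,
    eval_kwargs=eval_kwargs,
    n_fit=4,
)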
def test_agent_manager_1():
    # Define train and evaluation envs
    train_env = (GridWorld, {})

    # Parameters
    params = dict(hyperparameter1=-1, hyperparameter2=100)
    eval_kwargs = dict(eval_horizon=10)

    # Check DummyAgent
    agent = DummyAgent(train_env[0](**train_env[1]), **params)
    agent.fit(10)
    agent.policy(None)

    # Run AgentManager
    params_per_instance = [dict(hyperparameter2=ii) for ii in range(4)]
    stats_agent1 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
        init_kwargs_per_instance=params_per_instance,
    )
    stats_agent2 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
    )
    agent_manager_list = [stats_agent1, stats_agent2]
    for st in agent_manager_list:
        st.fit()

    for ii, instance in enumerate(stats_agent1.agent_handlers):
        assert instance.hyperparameter1 == -1
        assert instance.hyperparameter2 == ii

    for ii, instance in enumerate(stats_agent2.agent_handlers):
        assert instance.hyperparameter1 == -1
        assert instance.hyperparameter2 == 100

    # learning curves
    plot_writer_data(agent_manager_list, tag="episode_rewards", show=False)

    # compare final policies
    evaluate_agents(agent_manager_list, show=False)

    # check if fitted
    for agent_manager in agent_manager_list:
        assert len(agent_manager.agent_handlers) == 4
        for agent in agent_manager.agent_handlers:
            assert agent.fitted

    # test saving/loading
    fname = stats_agent1.save()
    loaded_stats = AgentManager.load(fname)
    assert stats_agent1.unique_id == loaded_stats.unique_id

    # test hyperparameter optimization call
    loaded_stats.optimize_hyperparams(n_trials=5)
    loaded_stats.optimize_hyperparams(n_trials=5, continue_previous=True)

    for st in agent_manager_list:
        st.clear_output_dir()
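# The `optimize_hyperparams` calls above require the agent class to define a
# `sample_parameters` classmethod describing its Optuna search space. A hedged
# sketch of what the DummyAgent fixture presumably provides; the parameter names
# follow the test's assertions, while the ranges are illustrative assumptions.
class DummyAgentHPOSketch(DummyAgent):
    @classmethod
    def sample_parameters(cls, trial):
        # Draw candidate hyperparameters from the Optuna trial.
        hyperparameter1 = trial.suggest_int("hyperparameter1", -1, 1)
        hyperparameter2 = trial.suggest_int("hyperparameter2", 0, 100)
        return dict(hyperparameter1=hyperparameter1, hyperparameter2=hyperparameter2)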
def test_agent_manager_2():
    # Define train and evaluation envs
    train_env = (GridWorld, {})
    eval_env = (GridWorld, {})

    # Parameters
    params = {}
    eval_kwargs = dict(eval_horizon=10)

    # Run AgentManager
    stats_agent1 = AgentManager(
        DummyAgent,
        train_env,
        eval_env=eval_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
    )
    stats_agent2 = AgentManager(
        DummyAgent,
        train_env,
        eval_env=eval_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
    )
    agent_manager_list = [stats_agent1, stats_agent2]
    for st in agent_manager_list:
        st.fit()

    # compare final policies
    evaluate_agents(agent_manager_list, show=False)
    evaluate_agents(agent_manager_list, show=False)

    # learning curves
    plot_writer_data(agent_manager_list, tag="episode_rewards", show=False)

    # check if fitted
    for agent_manager in agent_manager_list:
        assert len(agent_manager.agent_handlers) == 4
        for agent in agent_manager.agent_handlers:
            assert agent.fitted

    # test saving/loading
    fname = stats_agent1.save()
    loaded_stats = AgentManager.load(fname)
    assert stats_agent1.unique_id == loaded_stats.unique_id

    # test hyperparameter optimization
    loaded_stats.optimize_hyperparams(n_trials=5)

    # delete some writers
    stats_agent1.set_writer(1, None)
    stats_agent1.set_writer(2, None)

    stats_agent1.clear_output_dir()
    stats_agent2.clear_output_dir()
ppo_params["k_epochs"] = 4 eval_kwargs = dict(eval_horizon=horizon, n_simulations=20) ppo_stats = AgentManager( PPOAgent, env, fit_budget=n_episodes, eval_kwargs=eval_kwargs, init_kwargs=ppo_params, n_fit=2, ) ppo_stats.fit(n_episodes // 2) plot_writer_data( ppo_stats, tag="episode_rewards", preprocess_func=np.cumsum, title="Cumulative Rewards", show=False, ) evaluate_agents([ppo_stats], show=False) ppo_stats.fit(n_episodes // 4) plot_writer_data( ppo_stats, tag="episode_rewards", preprocess_func=np.cumsum, title="Cumulative Rewards", show=False, ) evaluate_agents([ppo_stats], show=True)