def test_agent_manager_partial_fit_and_tuple_env():
    # Define train and evaluation envs
    train_env = (
        GridWorld,
        None,
    )  # tuple (constructor, kwargs) must also work in AgentManager

    # Parameters
    params = {}
    eval_kwargs = dict(eval_horizon=10)

    # Run AgentManager
    stats = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs=params,
        n_fit=4,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        seed=123,
    )
    stats2 = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs=params,
        n_fit=4,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        seed=123,
    )

    # Run partial fit
    stats.fit(10)
    stats.fit(20)
    for agent in stats.agent_handlers:
        assert agent.total_budget == 30

    # Run fit
    stats2.fit()

    # learning curves
    plot_writer_data(
        [stats], tag="episode_rewards", show=False, preprocess_func=np.cumsum
    )

    # compare final policies
    evaluate_agents([stats], show=False)

    # delete some writers
    stats.set_writer(0, None)
    stats.set_writer(3, None)

    stats.clear_output_dir()
    stats2.clear_output_dir()
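# The `total_budget` assertion above assumes an agent whose `fit` accumulates
# the budgets it receives across calls. A minimal sketch of such an agent, for
# illustration only (the real DummyAgent is defined elsewhere in the test suite,
# and the class name here is hypothetical):
class _BudgetAccumulatingAgent:
    def __init__(self, env, **kwargs):
        self.env = env
        self.total_budget = 0
        self.fitted = False

    def fit(self, budget, **kwargs):
        # Partial fits add up: fit(10) then fit(20) gives total_budget == 30.
        self.total_budget += budget
        self.fitted = True

    def policy(self, observation):
        return 0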
def test_plot_writer_data_with_directory_input(outdir_id_style):
    with tempfile.TemporaryDirectory() as tmpdirname:
        output_dir = tmpdirname + "/rlberry_data"
        manager = _create_and_fit_agent_manager(output_dir, outdir_id_style)
        del manager
        os.system("ls " + tmpdirname + "/rlberry_data/manager_data")

        #
        # Single directory
        #
        data_source = output_dir
        output = plot_writer_data(
            data_source,
            tag="reward",
            preprocess_func=_compute_reward,
            title="Cumulative Reward",
            show=False,
            savefig_fname=tmpdirname + "/test.png",
        )
        assert (
            os.path.getsize(tmpdirname + "/test.png") > 1000
        ), "plot_writer_data saved an empty image"
        assert len(output) > 1

        list_dirs = list(Path(tmpdirname + "/rlberry_data/manager_data").iterdir())
        list_dirs = [str(dir) for dir in list_dirs]

        #
        # List of directories
        #
        output_with_list_dirs = plot_writer_data(
            list_dirs,
            tag="reward",
            preprocess_func=_compute_reward,
            title="Cumulative Reward",
            show=False,
            savefig_fname=tmpdirname + "/test.png",
        )
        assert np.all(output.shape == output_with_list_dirs.shape)
def test_plot_writer_data_with_manager_input(outdir_id_style):
    with tempfile.TemporaryDirectory() as tmpdirname:
        output_dir = tmpdirname + "/rlberry_data"
        manager = _create_and_fit_agent_manager(output_dir, outdir_id_style)
        os.system("ls " + tmpdirname + "/rlberry_data/manager_data")

        # Plot of the cumulative reward
        data_source = manager
        output = plot_writer_data(
            data_source,
            tag="reward",
            preprocess_func=_compute_reward,
            title="Cumulative Reward",
            show=False,
            savefig_fname=tmpdirname + "/test.png",
        )
        assert (
            os.path.getsize(tmpdirname + "/test.png") > 1000
        ), "plot_writer_data saved an empty image"
        assert len(output) > 1
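# Both tests above rely on a `_compute_reward` preprocessing helper that is not
# shown in this excerpt. A plausible sketch of its body, assumed here by analogy
# with the `compute_reward` helper used in the example scripts below (an
# assumption, not the repository's exact definition):
def _compute_reward(rewards):
    # Turn per-step rewards into a cumulative-reward curve.
    return np.cumsum(rewards)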
ppo_stats = AgentManager(
    PPOAgent,
    train_env,
    fit_budget=N_EPISODES,
    init_kwargs=params_ppo,
    eval_kwargs=eval_kwargs,
    n_fit=4,
    output_dir="dev/",
    parallelization="process",
)
ppo_stats.fit()  # fit the 4 agents

ppo_stats_fname = ppo_stats.save()
del ppo_stats

# -------------------------------
# Load and plot results
# -------------------------------
ppo_stats = AgentManager.load(ppo_stats_fname)

# learning curves
plot_writer_data(
    ppo_stats,
    tag="episode_rewards",
    preprocess_func=np.cumsum,
    title="Cumulative Rewards",
    show=False,
)

# compare final policies
output = evaluate_agents([ppo_stats], n_simulations=15)
print(output)
    max_workers=2,
)

agent_manager_list = [rsucbvi_stats, rskernel_stats, a2c_stats]

for st in agent_manager_list:
    st.fit()

# Fit RSUCBVI for 50 more episodes
rsucbvi_stats.fit(budget=50)

# learning curves
plot_writer_data(
    agent_manager_list,
    tag="episode_rewards",
    preprocess_func=np.cumsum,
    title="cumulative rewards",
    show=False,
)

plot_writer_data(
    agent_manager_list, tag="episode_rewards", title="episode rewards", show=False
)

# compare final policies
output = evaluate_agents(agent_manager_list)
print(output)

# uncomment to delete output directories
# for st in agent_manager_list:
# Oracle (optimal policy)
oracle_stats = AgentManager(
    ValueIterationAgent,
    env,
    init_kwargs=params_oracle,
    fit_budget=n_episodes,
    eval_kwargs=eval_kwargs,
    n_fit=1,
)

# fit
stats.fit()
stats_ucbvi.fit()
stats_random.fit()
oracle_stats.fit()

# visualize results
plot_writer_data(
    [stats, stats_ucbvi, stats_random],
    tag="episode_rewards",
    preprocess_func=np.cumsum,
    title="Cumulative Rewards",
    show=False,
)

plot_writer_data([stats, stats_ucbvi, stats_random], tag="dw_time_elapsed", show=False)

evaluate_agents([stats, stats_ucbvi, stats_random, oracle_stats], n_simulations=20)
eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20)

multimanagers = MultipleManagers()

multimanagers.append(
    AgentManager(
        UCBVIAgent,
        env,
        fit_budget=N_EP,
        init_kwargs=params["ucbvi"],
        eval_kwargs=eval_kwargs,
    )
)
multimanagers.append(
    AgentManager(
        OptQLAgent,
        env,
        fit_budget=N_EP,
        init_kwargs=params["optql"],
        eval_kwargs=eval_kwargs,
    )
)

multimanagers.run()

plot_writer_data(
    multimanagers.managers,
    tag="episode_rewards",
    preprocess_func=np.cumsum,
    title="Cumulative Rewards",
)
agent = AgentManager(
    UCBAgent,
    (env_ctor, env_kwargs),
    fit_budget=T,
    init_kwargs={"B": 2},
    n_fit=M,
    parallelization="process",
    mp_context="fork",
)  # these parameters should give parallel computing even in notebooks

# Agent training
agent.fit()


# Compute and plot (pseudo-)regret
def compute_pseudo_regret(actions):
    return np.cumsum(np.max(means) - means[actions.astype(int)])


fig = plt.figure(1, figsize=(5, 3))
ax = plt.gca()
output = plot_writer_data(
    [agent],
    tag="action",
    preprocess_func=compute_pseudo_regret,
    title="Cumulative Pseudo-Regret",
    ax=ax,
)
agent.fit()


# Compute and plot regret
def compute_regret(rewards):
    return np.cumsum(np.max(means) - rewards)


# Compute and plot (pseudo-)regret
def compute_pseudo_regret(actions):
    return np.cumsum(np.max(means) - means[actions.astype(int)])


output = plot_writer_data(
    agents,
    tag="action",
    preprocess_func=compute_pseudo_regret,
    title="Cumulative Pseudo-Regret",
)

output = plot_writer_data(
    agents,
    tag="reward",
    preprocess_func=compute_regret,
    title="Cumulative Regret",
)


# Compute and plot number of times each arm was selected
def compute_na(actions, a):
    return np.cumsum(actions == a)
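# Unlike the helpers above, `compute_na` takes an extra argument `a`, so it
# cannot be passed to plot_writer_data directly. A minimal sketch of one way to
# bind the arm index, assuming `preprocess_func` expects a one-argument
# callable (the loop and titles are illustrative, not code from the original):
from functools import partial

for arm in range(len(means)):
    output = plot_writer_data(
        agents,
        tag="action",
        preprocess_func=partial(compute_na, a=arm),
        title=f"Draws of arm {arm}",
    )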
env = env_ctor(**env_kwargs)
agent = AgentManager(VIAgent, (env_ctor, env_kwargs), fit_budget=10, n_fit=3)
agent.fit(budget=10)
# comment the line above if you only want to load data from rlberry_data.


# We use the following preprocessing function to plot the cumulative reward.
def compute_reward(rewards):
    return np.cumsum(rewards)


# Plot of the cumulative reward.
output = plot_writer_data(
    agent, tag="reward", preprocess_func=compute_reward, title="Cumulative Reward"
)
# The output spans 500 global steps: fit_budget (10 episodes) times the horizon.

# Log-Log plot:
fig, ax = plt.subplots(1, 1)
plot_writer_data(
    agent,
    tag="reward",
    preprocess_func=compute_reward,
    title="Cumulative Reward",
    ax=ax,
    show=False,  # necessary to customize axes
)
ax.set_xlim(100, 500)
ax.relim()
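# The snippet above is labeled "Log-Log plot" but stops before switching the
# axis scales. A minimal completion under that assumption, using standard
# Matplotlib calls (not code from the original script):
ax.set_xscale("log")
ax.set_yscale("log")
plt.show()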
def test_agent_manager_1():
    # Define train and evaluation envs
    train_env = (GridWorld, {})

    # Parameters
    params = dict(hyperparameter1=-1, hyperparameter2=100)
    eval_kwargs = dict(eval_horizon=10)

    # Check DummyAgent
    agent = DummyAgent(train_env[0](**train_env[1]), **params)
    agent.fit(10)
    agent.policy(None)

    # Run AgentManager
    params_per_instance = [dict(hyperparameter2=ii) for ii in range(4)]
    stats_agent1 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
        init_kwargs_per_instance=params_per_instance,
    )
    stats_agent2 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
    )
    agent_manager_list = [stats_agent1, stats_agent2]
    for st in agent_manager_list:
        st.fit()

    for ii, instance in enumerate(stats_agent1.agent_handlers):
        assert instance.hyperparameter1 == -1
        assert instance.hyperparameter2 == ii

    for ii, instance in enumerate(stats_agent2.agent_handlers):
        assert instance.hyperparameter1 == -1
        assert instance.hyperparameter2 == 100

    # learning curves
    plot_writer_data(agent_manager_list, tag="episode_rewards", show=False)

    # compare final policies
    evaluate_agents(agent_manager_list, show=False)

    # check if fitted
    for agent_manager in agent_manager_list:
        assert len(agent_manager.agent_handlers) == 4
        for agent in agent_manager.agent_handlers:
            assert agent.fitted

    # test saving/loading
    fname = stats_agent1.save()
    loaded_stats = AgentManager.load(fname)
    assert stats_agent1.unique_id == loaded_stats.unique_id

    # test hyperparameter optimization call
    loaded_stats.optimize_hyperparams(n_trials=5)
    loaded_stats.optimize_hyperparams(n_trials=5, continue_previous=True)

    for st in agent_manager_list:
        st.clear_output_dir()
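# `optimize_hyperparams` relies on the agent class exposing a search space for
# Optuna trials. A minimal sketch of how that could look, assuming rlberry's
# `sample_parameters(cls, trial)` hook (the class name and the suggested values
# here are illustrative assumptions, not the test suite's actual agent):
class _TunableDummyAgent(DummyAgent):
    @classmethod
    def sample_parameters(cls, trial):
        # The Optuna trial proposes candidate hyperparameters for each run.
        hyperparameter1 = trial.suggest_categorical("hyperparameter1", [-1, 0, 1])
        hyperparameter2 = trial.suggest_categorical("hyperparameter2", [10, 100])
        return dict(hyperparameter1=hyperparameter1, hyperparameter2=hyperparameter2)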
def test_agent_manager_2():
    # Define train and evaluation envs
    train_env = (GridWorld, {})
    eval_env = (GridWorld, {})

    # Parameters
    params = {}
    eval_kwargs = dict(eval_horizon=10)

    # Run AgentManager
    stats_agent1 = AgentManager(
        DummyAgent,
        train_env,
        eval_env=eval_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
    )
    stats_agent2 = AgentManager(
        DummyAgent,
        train_env,
        eval_env=eval_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
    )
    agent_manager_list = [stats_agent1, stats_agent2]
    for st in agent_manager_list:
        st.fit()

    # compare final policies
    evaluate_agents(agent_manager_list, show=False)
    evaluate_agents(agent_manager_list, show=False)

    # learning curves
    plot_writer_data(agent_manager_list, tag="episode_rewards", show=False)

    # check if fitted
    for agent_manager in agent_manager_list:
        assert len(agent_manager.agent_handlers) == 4
        for agent in agent_manager.agent_handlers:
            assert agent.fitted

    # test saving/loading
    fname = stats_agent1.save()
    loaded_stats = AgentManager.load(fname)
    assert stats_agent1.unique_id == loaded_stats.unique_id

    # test hyperparameter optimization
    loaded_stats.optimize_hyperparams(n_trials=5)

    # delete some writers
    stats_agent1.set_writer(1, None)
    stats_agent1.set_writer(2, None)

    stats_agent1.clear_output_dir()
    stats_agent2.clear_output_dir()
    enable_tensorboard=True,
)

# basic version
# env_kwargs = dict(id="CartPole-v0")
# agent = AgentManager(SACAgent, (gym_make, env_kwargs), fit_budget=200, n_fit=1)

# Timothe's version
# env = gym_make("CartPole-v0")
# agent = AgentManager(
#     SACAgent, (env.__class__, dict()), fit_budget=200, n_fit=1,
#     enable_tensorboard=True,
# )

# Omar's version
# env = gym_make("CartPole-v0")
# from copy import deepcopy
# def env_constructor():
#     return deepcopy(env)
# agent = AgentManager(
#     SACAgent, (env_constructor, dict()), fit_budget=200, n_fit=1,
#     enable_tensorboard=True,
# )

agent.fit()

# Plot the training losses.
output = plot_writer_data(agent, tag="loss_q1", title="Loss q1")
output = plot_writer_data(agent, tag="loss_q2", title="Loss q2")
output = plot_writer_data(agent, tag="loss_v", title="Loss critic")
output = plot_writer_data(agent, tag="loss_act", title="Loss actor")
    init_kwargs=params,
    n_fit=n_fit,
    parallelization="process",
    agent_name="dqn",
)

stats_alternative = AgentManager(
    DQNAgent,
    env,
    fit_budget=fit_budget,
    eval_env=env,
    init_kwargs=params_alternative,
    n_fit=n_fit,
    parallelization="process",
    agent_name="dqn_smaller_net",
)

# fit everything in parallel
multimanagers = MultipleManagers()
multimanagers.append(stats)
multimanagers.append(stats_alternative)
multimanagers.run()

plot_writer_data(multimanagers.managers, tag="episode_rewards", show=False)
plot_writer_data(multimanagers.managers, tag="dw_time_elapsed", show=False)
plot_writer_data(multimanagers.managers, tag="eval_rewards", show=False)
plot_writer_data(multimanagers.managers, tag="q_loss")

stats.save()
stats.clear_output_dir()