def _fit_agent_manager(agent, env="continuous_state", init_kwargs=None):
    """
    Check that the agent is compatible with :class:`~rlberry.manager.AgentManager`.

    Parameters
    ----------
    agent: rlberry agent module
        Agent class to test.
    env: tuple (env_ctor, env_kwargs) or str in {"continuous_state", "discrete_state"}, default="continuous_state"
        If tuple, env is the constructor and keyword arguments of the environment on which to test.
        If str, a default benchmark environment is used.
    init_kwargs : dict
        Arguments required by the agent's constructor.
    """
    if init_kwargs is None:
        init_kwargs = {}

    train_env = _make_env(env)
    try:
        agent = AgentManager(
            agent, train_env, fit_budget=5, n_fit=1, seed=SEED, init_kwargs=init_kwargs
        )
        agent.fit()
    except Exception as exc:
        raise RuntimeError("Agent not compatible with AgentManager") from exc
    return agent
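# Usage sketch (illustrative, not part of the check module): any rlberry agent
# class can be passed to the checker. ValueIterationAgent is rlberry's
# dynamic-programming agent; "discrete_state" selects the default discrete
# benchmark environment.
if __name__ == "__main__":
    from rlberry.agents.dynprog import ValueIterationAgent

    manager = _fit_agent_manager(ValueIterationAgent, env="discrete_state")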
def test_agent_manager_and_multiple_managers_seeding(env, agent_class):
    agent_manager = AgentManager(
        agent_class, env, fit_budget=2, init_kwargs={"horizon": 10}, n_fit=6, seed=3456
    )
    agent_manager_test = AgentManager(
        agent_class, env, fit_budget=2, init_kwargs={"horizon": 10}, n_fit=6, seed=3456
    )

    multimanagers = MultipleManagers()
    multimanagers.append(agent_manager)
    multimanagers.append(agent_manager_test)
    multimanagers.run()

    stats1, stats2 = multimanagers.managers

    # Environment seeding: instances within one manager must differ,
    # while matching instances across the two managers must coincide.
    for ii in range(2, agent_manager.n_fit):
        traj1 = get_env_trajectory(stats1.agent_handlers[ii - 2].env, horizon=10)
        traj2 = get_env_trajectory(stats1.agent_handlers[ii - 1].env, horizon=10)
        traj3 = get_env_trajectory(stats1.agent_handlers[ii].env, horizon=10)

        traj1_test = get_env_trajectory(stats2.agent_handlers[ii - 2].env, horizon=10)
        traj2_test = get_env_trajectory(stats2.agent_handlers[ii - 1].env, horizon=10)
        traj3_test = get_env_trajectory(stats2.agent_handlers[ii].env, horizon=10)

        assert not compare_trajectories(traj1, traj2)
        assert not compare_trajectories(traj1, traj3)
        assert not compare_trajectories(traj2, traj3)
        assert compare_trajectories(traj1, traj1_test)
        assert compare_trajectories(traj2, traj2_test)
        assert compare_trajectories(traj3, traj3_test)

    # Same check for the agents' random number generators.
    for ii in range(2, agent_manager.n_fit):
        rand1 = stats1.agent_handlers[ii - 2].seeder.rng.integers(2**32)
        rand2 = stats1.agent_handlers[ii - 1].seeder.rng.integers(2**32)
        rand3 = stats1.agent_handlers[ii].seeder.rng.integers(2**32)

        rand1_test = stats2.agent_handlers[ii - 2].seeder.rng.integers(2**32)
        rand2_test = stats2.agent_handlers[ii - 1].seeder.rng.integers(2**32)
        rand3_test = stats2.agent_handlers[ii].seeder.rng.integers(2**32)

        assert rand1 != rand2
        assert rand1 != rand3
        assert rand2 != rand3
        assert rand1 == rand1_test
        assert rand2 == rand2_test
        assert rand3 == rand3_test

    stats1.clear_output_dir()
    stats2.clear_output_dir()
def check_bandit_agent(Agent, environment=BernoulliBandit, seed=42):
    """
    Check a bandit agent in rlberry on a bandit problem
    (a Bernoulli bandit by default).

    Parameters
    ----------
    Agent: rlberry agent module
        Agent class that we want to test.
    environment: rlberry env module
        Environment (i.e. bandit instance) on which to test the agent.
    seed :
        Seed sequence from which to spawn the random number generator.

    Returns
    -------
    result : bool
        Whether the agent is a valid/compatible bandit agent.

    Examples
    --------
    >>> from rlberry.agents.bandits import IndexAgent
    >>> from rlberry.utils import check_bandit_agent
    >>> import numpy as np
    >>> class UCBAgent(IndexAgent):
    ...     name = "UCB"
    ...     def __init__(self, env, **kwargs):
    ...         def index(r, t):
    ...             return np.mean(r) + np.sqrt(np.log(t**2) / (2 * len(r)))
    ...         IndexAgent.__init__(self, env, index, **kwargs)
    >>> check_bandit_agent(UCBAgent)
    True
    """
    env_ctor = environment
    env_kwargs = {}

    # Fit two identically seeded managers and check that their policies agree.
    agent1 = AgentManager(
        Agent, (env_ctor, env_kwargs), fit_budget=10, n_fit=1, seed=seed
    )
    agent2 = AgentManager(
        Agent, (env_ctor, env_kwargs), fit_budget=10, n_fit=1, seed=seed
    )
    agent1.fit()
    agent2.fit()

    env = env_ctor(**env_kwargs)
    state = env.reset()
    result = True
    for _ in range(5):  # test reproducibility on 5 actions
        action1 = agent1.agent_handlers[0].policy(state)
        action2 = agent2.agent_handlers[0].policy(state)
        if action1 != action2:
            result = False
    return result
def test_recursive_vs_not_recursive():
    env_ctor = NormalBandit
    env_kwargs = {}
    agent1 = AgentManager(
        UCBAgent, (env_ctor, env_kwargs), fit_budget=10, n_fit=1, seed=TEST_SEED
    )
    agent2 = AgentManager(
        RecursiveUCBAgent,
        (env_ctor, env_kwargs),
        fit_budget=10,
        n_fit=1,
        seed=TEST_SEED,
    )
    agent1.fit()
    agent2.fit()

    env = env_ctor(**env_kwargs)
    state = env.reset()
    for _ in range(5):  # test reproducibility on 5 actions
        action1 = agent1.agent_handlers[0].policy(state)
        action2 = agent2.agent_handlers[0].policy(state)
        assert action1 == action2
def test_equality():
    # Define train and evaluation envs
    train_env = (GridWorld, {})

    # Parameters
    params = dict(hyperparameter1=-1, hyperparameter2=100)
    eval_kwargs = dict(eval_horizon=10)

    # Run AgentManager
    params_per_instance = [dict(hyperparameter2=ii) for ii in range(4)]
    stats_agent1 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
        init_kwargs_per_instance=params_per_instance,
    )
    stats_agent2 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
        init_kwargs_per_instance=params_per_instance,
    )
    stats_agent3 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=42,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
        init_kwargs_per_instance=params_per_instance,
    )

    assert stats_agent1 == stats_agent2
    assert stats_agent1 != stats_agent3
def _create_and_fit_agent_manager(output_dir, outdir_id_style):
    env_ctor = GridWorld
    env_kwargs = dict(nrows=2, ncols=2, reward_at={(1, 1): 0.1, (2, 2): 1.0})

    manager = AgentManager(
        VIAgent,
        (env_ctor, env_kwargs),
        fit_budget=10,
        n_fit=3,
        output_dir=output_dir,
        outdir_id_style=outdir_id_style,
    )
    manager.fit()
    manager.save()
    return manager
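# Usage sketch (illustrative): fit and save into a temporary directory. The
# "timestamp" value for outdir_id_style is an assumption about the styles
# AgentManager accepts, not taken from this file.
if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        manager = _create_and_fit_agent_manager(tmpdir, "timestamp")
        print(manager.output_dir_)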
def check_fit_additive(agent, env="continuous_state", init_kwargs=None):
    """
    Check that fitting the agent twice with a budget of 3 is equivalent to
    fitting it once with a budget of 6.

    Parameters
    ----------
    agent: rlberry agent module
        Agent class to test.
    env: tuple (env_ctor, env_kwargs) or str in {"continuous_state", "discrete_state"}, default="continuous_state"
        If tuple, env is the constructor and keyword arguments of the environment on which to test.
        If str, a default benchmark environment is used.
    init_kwargs : dict
        Arguments required by the agent's constructor.
    """
    if init_kwargs is None:
        init_kwargs = {}

    train_env = _make_env(env)

    agent1 = AgentManager(
        agent, train_env, fit_budget=5, n_fit=1, seed=SEED, init_kwargs=init_kwargs
    )
    agent1.fit(3)
    agent1.fit(3)

    agent2 = AgentManager(
        agent, train_env, fit_budget=5, n_fit=1, seed=SEED, init_kwargs=init_kwargs
    )
    agent2.fit(6)

    result = check_agents_almost_equal(
        agent1.agent_handlers[0], agent2.agent_handlers[0]
    )

    assert result, (
        "Error: fitting the agent twice for 3 steps is not equivalent to "
        "fitting it once for 6 steps."
    )
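# Usage sketch (illustrative, mirroring _fit_agent_manager above): the check
# raises an AssertionError if the two fitting schedules diverge; otherwise it
# returns silently.
if __name__ == "__main__":
    from rlberry.agents.dynprog import ValueIterationAgent

    check_fit_additive(ValueIterationAgent, env="discrete_state")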
def test_version():
    # Define train and evaluation envs
    train_env = (GridWorld, {})

    # Parameters
    params = dict(hyperparameter1=-1, hyperparameter2=100)
    eval_kwargs = dict(eval_horizon=10)

    # Run AgentManager
    params_per_instance = [dict(hyperparameter2=ii) for ii in range(4)]
    stats_agent1 = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        init_kwargs=params,
        n_fit=4,
        seed=123,
        init_kwargs_per_instance=params_per_instance,
    )
    version = stats_agent1.rlberry_version
    assert (version is not None) and (len(version) > 0)
def test_hyperparam_optim_cmaes():
    # Define train env
    train_env = (GridWorld, {})

    # Run AgentManager
    stats_agent = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs={},
        fit_budget=1,
        eval_kwargs={"eval_horizon": 5},
        n_fit=4,
    )

    # Test hyperparameter optimization with the CMA-ES sampler
    stats_agent.optimize_hyperparams(sampler_method="cmaes", n_trials=5)
    stats_agent.clear_output_dir()
def experiment_generator():
    """
    Parse command-line arguments and yield AgentManager instances.
    """
    args = docopt(__doc__)
    max_workers = int(args["--max_workers"])
    if max_workers == -1:
        max_workers = None
    for (_, agent_manager_kwargs) in parse_experiment_config(
        Path(args["<experiment_path>"]),
        n_fit=int(args["--n_fit"]),
        max_workers=max_workers,
        output_base_dir=args["--output_dir"],
        parallelization=args["--parallelization"],
    ):
        if args["--enable_tensorboard"]:
            if check_packages.TENSORBOARD_INSTALLED:
                agent_manager_kwargs.update(dict(enable_tensorboard=True))
            else:
                logger.warning(
                    "Option --enable_tensorboard is not available: tensorboard is not installed."
                )
        yield AgentManager(**agent_manager_kwargs)
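# Usage sketch (illustrative): the generator is typically consumed by a main
# entry point that fits and saves every AgentManager described in the
# experiment configuration file.
if __name__ == "__main__":
    for manager in experiment_generator():
        manager.fit()
        manager.save()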
def test_discount_optimization():
    class ValueIterationAgentToOptimize(ValueIterationAgent):
        @classmethod
        def sample_parameters(cls, trial):
            """
            Sample hyperparameters for hyperparameter optimization using
            Optuna (https://optuna.org/).
            """
            gamma = trial.suggest_categorical("gamma", [0.1, 0.99])
            return {"gamma": gamma}

    env = (
        GridWorld,
        dict(
            nrows=3,
            ncols=10,
            reward_at={(1, 1): 0.1, (2, 9): 1.0},
            walls=((1, 4), (2, 4), (1, 5)),
            success_probability=0.9,
        ),
    )

    vi_params = {"gamma": 0.1, "epsilon": 1e-3}

    vi_stats = AgentManager(
        ValueIterationAgentToOptimize,
        env,
        fit_budget=0,
        eval_kwargs=dict(eval_horizon=20),
        init_kwargs=vi_params,
        n_fit=4,
        seed=123,
    )

    vi_stats.optimize_hyperparams(
        n_trials=5, n_fit=1, sampler_method="random", pruner_method="none"
    )

    assert vi_stats.optuna_study
    vi_stats.clear_output_dir()
def test_hyperparam_optim_tpe():
    # Define train env
    train_env = (GridWorld, {})

    # Run AgentManager
    stats_agent = AgentManager(
        DummyAgent,
        train_env,
        fit_budget=1,
        init_kwargs={},
        eval_kwargs={"eval_horizon": 5},
        n_fit=4,
    )

    # Test hyperparameter optimization with the TPE sampler,
    # using hyperopt default values
    sampler_kwargs = TPESampler.hyperopt_parameters()
    stats_agent.optimize_hyperparams(sampler_kwargs=sampler_kwargs, n_trials=5)
    stats_agent.clear_output_dir()
def test_hyperparam_optim_random(parallelization):
    # Define train env
    train_env = (GridWorld, {})

    # Run AgentManager
    stats_agent = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs={},
        fit_budget=1,
        eval_kwargs={"eval_horizon": 5},
        n_fit=4,
        parallelization=parallelization,
    )

    # Test hyperparameter optimization with the random sampler
    stats_agent.optimize_hyperparams(
        sampler_method="random", n_trials=5, optuna_parallelization=parallelization
    )
    stats_agent.clear_output_dir()
def test_jax_dqn(lambda_):
    if not _IMPORT_SUCCESSFUL:
        return

    env = (gym_make, dict(id="CartPole-v0"))
    params = dict(
        chunk_size=4, batch_size=128, target_update_interval=5, lambda_=lambda_
    )

    stats = AgentManager(
        DQNAgent,
        env,
        fit_budget=20,
        eval_env=env,
        init_kwargs=params,
        n_fit=1,
        parallelization="thread",
    )
    stats.fit()
    stats.clear_output_dir()
def check_save_load(agent, env="continuous_state", init_kwargs=None):
    """
    Check that the agent saves a non-empty file and can be loaded back.

    Parameters
    ----------
    agent: rlberry agent module
        Agent class to test.
    env: tuple (env_ctor, env_kwargs) or str in {"continuous_state", "discrete_state"}, default="continuous_state"
        If tuple, env is the constructor and keyword arguments of the environment on which to test.
        If str, a default benchmark environment is used.
    init_kwargs : dict
        Arguments required by the agent's constructor.
    """
    if init_kwargs is None:
        init_kwargs = {}

    train_env = _make_env(env)
    env = train_env[0](**train_env[1])
    with tempfile.TemporaryDirectory() as tmpdirname:
        agent = AgentManager(
            agent,
            train_env,
            fit_budget=5,
            n_fit=1,
            seed=SEED,
            init_kwargs=init_kwargs,
            output_dir=tmpdirname,
        )
        agent.fit(3)

        assert (
            os.path.getsize(str(agent.output_dir_) + "/agent_handlers/idx_0.pickle")
            > 1
        ), "The saved file is empty."

        try:
            agent.load(str(agent.output_dir_) + "/agent_handlers/idx_0.pickle")
        except Exception as exc:
            raise RuntimeError("Failed to load the agent file.") from exc
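# Usage sketch (illustrative): env also accepts an explicit (constructor,
# kwargs) tuple, e.g. the GridWorld used in the tests above.
if __name__ == "__main__":
    from rlberry.agents.dynprog import ValueIterationAgent
    from rlberry.envs import GridWorld

    check_save_load(ValueIterationAgent, env=(GridWorld, {}))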
def test_hyperparam_optim_grid():
    # Define train env
    train_env = (GridWorld, {})

    # Run AgentManager
    stats_agent = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs={},
        fit_budget=1,
        eval_kwargs={"eval_horizon": 5},
        n_fit=4,
    )

    # Test hyperparameter optimization with the grid sampler
    search_space = {"hyperparameter1": [1, 2, 3], "hyperparameter2": [-5, 0, 5]}
    sampler_kwargs = {"search_space": search_space}
    stats_agent.optimize_hyperparams(
        n_trials=3 * 3, sampler_method="grid", sampler_kwargs=sampler_kwargs
    )
    stats_agent.clear_output_dir()
HORIZON = 50
BONUS_SCALE_FACTOR = 0.1
MIN_DIST = 0.1

params_ppo = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003}

eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20)

# ---------------------------------
# Run AgentManager and save results
# ---------------------------------
ppo_stats = AgentManager(
    PPOAgent,
    train_env,
    fit_budget=N_EPISODES,
    init_kwargs=params_ppo,
    eval_kwargs=eval_kwargs,
    n_fit=4,
    output_dir="dev/",
    parallelization="process",
)
ppo_stats.fit()  # fit the 4 agents
ppo_stats_fname = ppo_stats.save()
del ppo_stats

# ---------------------------------
# Load and plot results
# ---------------------------------
ppo_stats = AgentManager.load(ppo_stats_fname)

# learning curves
plot_writer_data(
# -----------------------------
N_EPISODES = 100
GAMMA = 0.99
HORIZON = 50

params_ppo = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003}

eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20)

# -----------------------------
# Run AgentManager
# -----------------------------
ppo_stats = AgentManager(
    PPOAgent,
    train_env,
    fit_budget=N_EPISODES,
    init_kwargs=params_ppo,
    eval_kwargs=eval_kwargs,
    n_fit=4,
)
ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={"comment": "worker_0"})
ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={"comment": "worker_1"})

agent_manager_list = [ppo_stats]

agent_manager_list[0].fit()

agent_manager_list[0].save(
}

params_a2c = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003}

eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20)

# -----------------------------
# Run AgentManager
# -----------------------------
rsucbvi_stats = AgentManager(
    RSUCBVIAgent,
    train_env,
    fit_budget=N_EPISODES,
    init_kwargs=params,
    eval_kwargs=eval_kwargs,
    n_fit=4,
    seed=123,
    enable_tensorboard=True,
    default_writer_kwargs=dict(
        maxlen=N_EPISODES - 10,
        log_interval=5.0,
    ),
)

rskernel_stats = AgentManager(
    RSKernelUCBVIAgent,
    train_env,
    fit_budget=N_EPISODES,
    init_kwargs=params_kernel,
    eval_kwargs=eval_kwargs,
    n_fit=4,
    seed=123,
    enable_tensorboard=True,
# we could also record actions with
# self.env = WriterWrapper(self.env, self.writer,
#                          write_scalar="action")

env_ctor = GridWorld
env_kwargs = dict(
    nrows=3,
    ncols=10,
    reward_at={(1, 1): 0.1, (2, 9): 1.0},
    walls=((1, 4), (2, 4), (1, 5)),
    success_probability=0.7,
)

env = env_ctor(**env_kwargs)
agent = AgentManager(VIAgent, (env_ctor, env_kwargs), fit_budget=10, n_fit=3)
agent.fit(budget=10)
# Comment out the line above if you only want to load data from rlberry_data.


# We use the following preprocessing function to plot the cumulative reward.
def compute_reward(rewards):
    return np.cumsum(rewards)


# Plot of the cumulative reward.
output = plot_writer_data(
    agent, tag="reward", preprocess_func=compute_reward, title="Cumulative Reward"
)
# The output spans 500 global steps, i.e. fit_budget (10) times the horizon.
    env = VecFrameStack(env, n_stack=4)
    env = ScalarizeEnvWrapper(env)
    return env


#
# Testing a single agent
#
if __name__ == "__main__":
    #
    # Training several agents and comparing different hyperparameters
    #
    stats = AgentManager(
        A2CAgent,
        train_env=(env_constructor, None),
        eval_env=(eval_env_constructor, None),
        eval_kwargs=dict(eval_horizon=200),
        agent_name="A2C baseline",
        fit_budget=5000,
        init_kwargs=dict(policy="CnnPolicy", verbose=10),
        n_fit=4,
        parallelization="process",
        output_dir="dev/stable_baselines_atari",
        seed=123,
    )
    stats.fit()
    stats.optimize_hyperparams(timeout=60, n_fit=2)
# env = env_ctor(**env_kwargs)
# agent = A2CAgent(env, 'MlpPolicy', verbose=1)
# agent.fit(budget=1000)

#
# Training several agents and comparing different hyperparameters
#
from rlberry.manager import AgentManager, MultipleManagers, evaluate_agents

stats = AgentManager(
    A2CAgent,
    (env_ctor, env_kwargs),
    agent_name="A2C baseline",
    init_kwargs=dict(policy="MlpPolicy", verbose=1),
    fit_kwargs=dict(log_interval=1000),
    fit_budget=2500,
    eval_kwargs=dict(eval_horizon=400),
    n_fit=4,
    parallelization="process",
    output_dir="dev/stable_baselines",
    seed=123,
)

stats_alternative = AgentManager(
    A2CAgent,
    (env_ctor, env_kwargs),
    agent_name="A2C optimized",
    init_kwargs=dict(policy="MlpPolicy", verbose=1),
    fit_kwargs=dict(log_interval=1000),
    fit_budget=2500,
    eval_kwargs=dict(eval_horizon=400),
"gamma": GAMMA, "horizon": HORIZON, } params_ppo = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) # ----------------------------- # Run AgentManager # ----------------------------- oracle_stats = AgentManager( MBQVIAgent, d_train_env, fit_budget=0, init_kwargs=params_oracle, eval_kwargs=eval_kwargs, n_fit=4, agent_name="Oracle", ) ppo_stats = AgentManager( PPOAgent, train_env, fit_budget=N_EPISODES, init_kwargs=params_ppo, eval_kwargs=eval_kwargs, n_fit=4, agent_name="PPO", ) agent_manager_list = [oracle_stats, ppo_stats]
#
# Create managers
#
if __name__ == "__main__":
    managers = MultipleManagers()

    # Standard DQN
    managers.append(
        AgentManager(
            DQNAgent,
            TRAIN_ENV,
            agent_name="DQN",
            init_kwargs=DQN_PARAMS,
            fit_kwargs=FIT_KWARGS,
            n_fit=N_FIT,
            max_workers=MAX_WORKERS,
            parallelization="process",
            seed=42,
            enable_tensorboard=ENABLE_TENSORBOARD,
            output_dir="temp/dqn_example",
        )
    )

    # DQN with Q(lambda)
    managers.append(
        AgentManager(
            DQNAgent,
            TRAIN_ENV,
            agent_name="DQN + Q($\\lambda$)",
            init_kwargs=DQN_LAMBDA_PARAMS,
            fit_kwargs=FIT_KWARGS,
rewards = switching_rewards(T, rate=5.0)

# Construction of the experiment
env_ctor = AdversarialBandit
env_kwargs = {"rewards": rewards}

Agents_class = [EXP3Agent, BernoulliTSAgent]

agents = [
    AgentManager(
        Agent,
        (env_ctor, env_kwargs),
        init_kwargs={},
        fit_budget=T,
        n_fit=M,
        parallelization="process",
        mp_context="fork",
    )
    for Agent in Agents_class
]  # these parameters should give parallel computing even in notebooks

# Agent training
for agent in agents:
    agent.fit()
# Parameters of the problem
means = np.array([0, 0.9, 1])  # means of the arms
T = 3000  # horizon
M = 20  # number of Monte Carlo simulations

# Construction of the experiment
env_ctor = NormalBandit
env_kwargs = {"means": means, "stds": 2 * np.ones(len(means))}

agent = AgentManager(
    UCBAgent,
    (env_ctor, env_kwargs),
    fit_budget=T,
    init_kwargs={"B": 2},
    n_fit=M,
    parallelization="process",
    mp_context="fork",
)  # these parameters should give parallel computing even in notebooks

# Agent training
agent.fit()


# Compute and plot (pseudo-)regret
def compute_pseudo_regret(actions):
    return np.cumsum(np.max(means) - means[actions.astype(int)])
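# Plot sketch (mirrors the plot_writer_data call used in the GridWorld example
# above; assumes the agent records its actions under the "action" tag, e.g.
# via a WriterWrapper): turn the recorded actions into a pseudo-regret curve.
from rlberry.manager import plot_writer_data

output = plot_writer_data(
    agent,
    tag="action",
    preprocess_func=compute_pseudo_regret,
    title="Cumulative Pseudo-Regret",
)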
"use_bonus": True, "uncertainty_estimator_kwargs": { "uncertainty_estimator_fn": uncertainty_estimator_fn }, } eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) # ----------------------------- # Run AgentManager # ----------------------------- ppo_stats = AgentManager( PPOAgent, env, fit_budget=N_EPISODES, init_kwargs=params_ppo, eval_kwargs=eval_kwargs, n_fit=4, agent_name="PPO", ) ppo_bonus_stats = AgentManager( PPOAgent, env, fit_budget=N_EPISODES, init_kwargs=params_ppo_bonus, eval_kwargs=eval_kwargs, n_fit=4, agent_name="PPO-Bonus", ) agent_manager_list = [ppo_bonus_stats, ppo_stats]
def execute_message(
    message: interface.Message, resources: interface.Resources
) -> interface.Message:
    response = interface.Message.create(command=interface.Command.ECHO)
    # LIST_RESOURCES
    if message.command == interface.Command.LIST_RESOURCES:
        info = {}
        for rr in resources:
            info[rr] = resources[rr]["description"]
        response = interface.Message.create(info=info)
    # AGENT_MANAGER_CREATE_INSTANCE
    elif message.command == interface.Command.AGENT_MANAGER_CREATE_INSTANCE:
        params = message.params
        base_dir = pathlib.Path(metadata_utils.RLBERRY_DEFAULT_DATA_DIR)
        if "output_dir" in params:
            params["output_dir"] = base_dir / "server_data" / params["output_dir"]
        else:
            params["output_dir"] = base_dir / "server_data/"
        agent_manager = AgentManager(**params)
        filename = str(agent_manager.save())
        response = interface.Message.create(
            info=dict(
                filename=filename,
                agent_name=agent_manager.agent_name,
                output_dir=str(agent_manager.output_dir).replace(
                    "server_data/", "client_data/"
                ),
            )
        )
        del agent_manager
    # AGENT_MANAGER_FIT
    elif message.command == interface.Command.AGENT_MANAGER_FIT:
        filename = message.params["filename"]
        budget = message.params["budget"]
        extra_params = message.params["extra_params"]
        agent_manager = AgentManager.load(filename)
        agent_manager.fit(budget, **extra_params)
        agent_manager.save()
        response = interface.Message.create(command=interface.Command.ECHO)
        del agent_manager
    # AGENT_MANAGER_EVAL
    elif message.command == interface.Command.AGENT_MANAGER_EVAL:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        eval_output = agent_manager.eval_agents(message.params["n_simulations"])
        response = interface.Message.create(data=dict(output=eval_output))
        del agent_manager
    # AGENT_MANAGER_CLEAR_OUTPUT_DIR
    elif message.command == interface.Command.AGENT_MANAGER_CLEAR_OUTPUT_DIR:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        agent_manager.clear_output_dir()
        response = interface.Message.create(
            message=f"Cleared output dir: {agent_manager.output_dir}"
        )
        del agent_manager
    # AGENT_MANAGER_CLEAR_HANDLERS
    elif message.command == interface.Command.AGENT_MANAGER_CLEAR_HANDLERS:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        agent_manager.clear_handlers()
        agent_manager.save()
        response = interface.Message.create(message=f"Cleared handlers: {filename}")
        del agent_manager
    # AGENT_MANAGER_SET_WRITER
    elif message.command == interface.Command.AGENT_MANAGER_SET_WRITER:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        agent_manager.set_writer(**message.params["kwargs"])
        agent_manager.save()
        del agent_manager
    # AGENT_MANAGER_OPTIMIZE_HYPERPARAMS
    elif message.command == interface.Command.AGENT_MANAGER_OPTIMIZE_HYPERPARAMS:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        best_params_dict = agent_manager.optimize_hyperparams(
            **message.params["kwargs"]
        )
        agent_manager.save()
        del agent_manager
        response = interface.Message.create(data=best_params_dict)
    # AGENT_MANAGER_GET_WRITER_DATA
    elif message.command == interface.Command.AGENT_MANAGER_GET_WRITER_DATA:
        # writer scalar data
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        writer_data = agent_manager.get_writer_data()
        writer_data = writer_data or dict()
        for idx in writer_data:
            writer_data[idx] = writer_data[idx].to_csv(index=False)
        # tensorboard data
        tensorboard_bin_data = None
        if agent_manager.tensorboard_dir is not None:
            tensorboard_zip_file = rlberry.utils.io.zipdir(
                agent_manager.tensorboard_dir,
                agent_manager.output_dir / "tensorboard_data.zip",
            )
            if tensorboard_zip_file is not None:
                tensorboard_bin_data = open(tensorboard_zip_file, "rb").read()
                tensorboard_bin_data = base64.b64encode(tensorboard_bin_data).decode(
                    "ascii"
                )
        response = interface.Message.create(
            data=dict(
                writer_data=writer_data,
                tensorboard_bin_data=tensorboard_bin_data,
            )
        )
        del agent_manager
    # end
    return response
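# Dispatch sketch (illustrative; assumes interface.Message.create accepts the
# params field read by the branches above, and "manager.pickle" is a
# placeholder filename): ask the handler to fit a previously saved
# AgentManager for 100 extra steps.
if __name__ == "__main__":
    request = interface.Message.create(
        command=interface.Command.AGENT_MANAGER_FIT,
        params=dict(filename="manager.pickle", budget=100, extra_params={}),
    )
    reply = execute_message(request, resources={})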
params["optql"] = { "horizon": HORIZON, "gamma": GAMMA, "bonus_scale_factor": 1.0, } eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) multimanagers = MultipleManagers() multimanagers.append( AgentManager( UCBVIAgent, env, fit_budget=N_EP, init_kwargs=params["ucbvi"], eval_kwargs=eval_kwargs, )) multimanagers.append( AgentManager( OptQLAgent, env, fit_budget=N_EP, init_kwargs=params["optql"], eval_kwargs=eval_kwargs, )) multimanagers.run()
)

params_greedy = dict(
    feature_map_fn=feature_map_fn,
    horizon=horizon,
    bonus_scale_factor=0.0,
    gamma=gamma,
)

params_oracle = dict(horizon=horizon, gamma=gamma)

stats = AgentManager(
    LSVIUCBAgent,
    env,
    init_kwargs=params,
    fit_budget=n_episodes,
    eval_kwargs=eval_kwargs,
    n_fit=4,
    parallelization=parallelization,
)

# UCBVI baseline
stats_ucbvi = AgentManager(
    UCBVIAgent,
    env,
    init_kwargs=params_ucbvi,
    fit_budget=n_episodes,
    eval_kwargs=eval_kwargs,
    n_fit=4,
    parallelization=parallelization,
)