def test_lsvi_random_exploration():
    seeding.set_global_seed(123)
    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n,
                                _env.action_space.n)

    agent = LSVIUCBAgent(env,
                         n_episodes=250,
                         feature_map_fn=feature_map_fn,
                         horizon=20,
                         gamma=0.99,
                         reg_factor=1e-5,
                         bonus_scale_factor=0.0)
    agent.fit()

    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near-optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)
    print("-------")
    print(np.abs(Q - Q_est))

    # Check error
    assert np.abs(Q - Q_est).mean() < 0.1
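# Both LSVI tests in this section rely on a OneHotFeatureMap helper that is
# defined elsewhere in the test module. A minimal sketch, assuming it
# subclasses rlberry's FeatureMap interface (a `dim` attribute plus a
# `map(observation, action)` method); the (state, action) -> index layout
# below is an assumption, not the library's definition.
import numpy as np

from rlberry.agents.features import FeatureMap


class OneHotFeatureMap(FeatureMap):
    def __init__(self, S, A):
        self.S = S
        self.A = A
        self.dim = S * A

    def map(self, observation, action):
        # one-hot vector over all (state, action) pairs
        feat = np.zeros((self.S, self.A))
        feat[observation, action] = 1.0
        return feat.flatten()


# With one-hot features, LSVI's least-squares model reduces to a tabular
# estimate, which is why Q_est is expected to approach value iteration's Q
# up to regularization and sampling error.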
def test_multithread_seeding(give_seed):
    """
    Checks that different seeds are given to different threads,
    even if we don't call set_global_seed in each thread.
    """
    seeding.set_global_seed(123)
    for _ in range(10):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = []
            for seed in seeding.spawn(2):
                if give_seed:
                    futures.append(
                        executor.submit(get_random_number_setting_seed, seed)
                    )
                else:
                    futures.append(
                        executor.submit(get_random_number)
                    )

            results = []
            for future in concurrent.futures.as_completed(futures):
                results.append(future.result())

            assert results[0] != results[1]
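# test_multithread_seeding also calls a get_random_number helper that is not
# shown; its seeded counterpart, get_random_number_setting_seed, appears at
# the end of this section. A plausible sketch, assuming the helper simply
# draws from whatever global rng the thread inherits:
import rlberry.seeding as seeding


def get_random_number():
    # draw without reseeding the thread (hypothetical helper body)
    return seeding.generate_uniform_seed()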
def test_discount_optimization():
    seeding.set_global_seed(42)

    class ValueIterationAgentToOptimize(ValueIterationAgent):
        @classmethod
        def sample_parameters(cls, trial):
            """
            Sample hyperparameters for hyperparam optimization
            using Optuna (https://optuna.org/)
            """
            gamma = trial.suggest_categorical('gamma', [0.1, 0.99])
            return {'gamma': gamma}

    env = GridWorld(nrows=3,
                    ncols=10,
                    reward_at={(1, 1): 0.1, (2, 9): 1.0},
                    walls=((1, 4), (2, 4), (1, 5)),
                    success_probability=0.9)

    vi_params = {'gamma': 0.1, 'epsilon': 1e-3}

    vi_stats = AgentStats(ValueIterationAgentToOptimize,
                          env,
                          eval_horizon=20,
                          init_kwargs=vi_params,
                          n_fit=4,
                          n_jobs=1)

    vi_stats.optimize_hyperparams(n_trials=5,
                                  timeout=30,
                                  n_sim=5,
                                  n_fit=1,
                                  n_jobs=1,
                                  sampler_method='random',
                                  pruner_method='none')

    assert vi_stats.best_hyperparams['gamma'] == 0.99
def test_joblib_seeding_giving_seed(backend):
    """
    Solves the problem of test_joblib_seeding() by setting the global
    seed in each of the subprocesses/threads.
    """
    seeding.set_global_seed(123)
    workers_output = Parallel(n_jobs=4,
                              verbose=5,
                              backend=backend)(
        delayed(get_random_number_setting_seed)(seed)
        for seed in seeding.spawn(2))
    assert workers_output[0] != workers_output[1]
def test_double_wrapper_copy_reseeding(ModelClass):
    seeding.set_global_seed(123)
    env = Wrapper(Wrapper(ModelClass()))
    c_env = deepcopy(env)
    c_env.reseed()
    if deepcopy(env).is_online():
        traj1 = get_env_trajectory(env, 500)
        traj2 = get_env_trajectory(c_env, 500)
        assert not compare_trajectories(traj1, traj2)
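# The reseeding tests in this section compare rollouts through two helpers,
# get_env_trajectory and compare_trajectories, that are not shown. A minimal
# sketch, under the assumption that a trajectory is the observation sequence
# produced by random actions:
import numpy as np


def get_env_trajectory(env, horizon):
    # roll out `horizon` random-action steps and record the observations
    observations = []
    obs = env.reset()
    for _ in range(horizon):
        observations.append(obs)
        obs, _, done, _ = env.step(env.action_space.sample())
        if done:
            obs = env.reset()
    return observations


def compare_trajectories(traj1, traj2):
    # True iff the two observation sequences match exactly
    if len(traj1) != len(traj2):
        return False
    return all(np.array_equal(o1, o2) for o1, o2 in zip(traj1, traj2))


# Under this reading, `assert not compare_trajectories(traj1, traj2)` checks
# that a reseeded copy diverges from the original environment, while equal
# seeds are expected to reproduce identical rollouts.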
def test_copy_reseeding(env_name):
    seeding.set_global_seed(123)
    env = gym_make(env_name)
    c_env = deepcopy(env)
    c_env.reseed()
    if deepcopy(env).is_online():
        traj1 = get_env_trajectory(env, 500)
        traj2 = get_env_trajectory(c_env, 500)
        assert not compare_trajectories(traj1, traj2)
def run_experiment(params, optimize_hyperparams, rlberry_seed):
    """
    Main experiment function
    """
    seeding.set_global_seed(rlberry_seed)

    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)
    # uncomment to disable writer of the 2nd PPO thread
    # stats['ppo'].set_writer(1, None)

    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)
    # uncomment to disable writer of the 1st A2C thread
    # stats['a2c'].set_writer(0, None)

    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            # timeout after 10 seconds
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10, n_fit=2)

    # Fit with best hyperparams and save results
    for agent_stats in agent_stats_list:
        agent_stats.fit()
        agent_stats.save_results()

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
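# run_experiment reads fs_observer.dir, presumably a sacred FileStorageObserver
# created by the surrounding experiment script. A hypothetical sketch of that
# setup (experiment name and output directory are assumptions; `dir` points to
# the run's directory once sacred starts the run):
from sacred import Experiment
from sacred.observers import FileStorageObserver

ex = Experiment('ppo_vs_a2c')  # hypothetical experiment name
fs_observer = FileStorageObserver.create('results')
ex.observers.append(fs_observer)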
def test_gym_copy_reseeding():
    seeding.set_global_seed(123)
    if _GYM_INSTALLED:
        gym_env = gym.make('Acrobot-v1')
        env = Wrapper(gym_env)
        c_env = deepcopy(env)
        c_env.reseed()
        if deepcopy(env).is_online():
            traj1 = get_env_trajectory(env, 500)
            traj2 = get_env_trajectory(c_env, 500)
            assert not compare_trajectories(traj1, traj2)
def test_gym_copy_reseeding_2():
    seeding.set_global_seed(123)
    if _GYM_INSTALLED:
        gym_env = gym.make('Acrobot-v1')
        # nested wrapping
        env = RescaleRewardWrapper(Wrapper(Wrapper(gym_env)), (0, 1))
        c_env = deepcopy(env)
        c_env.reseed()
        if deepcopy(env).is_online():
            traj1 = get_env_trajectory(env, 500)
            traj2 = get_env_trajectory(c_env, 500)
            assert not compare_trajectories(traj1, traj2)
def test_seeding():
    seed = 123
    seeding.set_global_seed(seed)

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding

    assert seeding._GLOBAL_SEED_SEQ.entropy == seed

    _ = seeding.get_rng()
    # counting the global rng generated automatically
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 2

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding

    assert seeding._GLOBAL_SEED_SEQ.entropy == seed

    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 3
def test_agent_stats_seeding():
    seeding.set_global_seed(3456)
    for env in [MountainCar(),
                (gym_make, {'env_name': 'MountainCar-v0'})]:
        agent_stats = AgentStats(RSUCBVIAgent,
                                 env,
                                 init_kwargs={'n_episodes': 2,
                                              'horizon': 10},
                                 n_fit=6)
        agent_stats.fit()

        for ii in range(2, agent_stats.n_fit):
            traj1 = get_env_trajectory(agent_stats.fitted_agents[ii - 2].env,
                                       horizon=10)
            traj2 = get_env_trajectory(agent_stats.fitted_agents[ii - 1].env,
                                       horizon=10)
            traj3 = get_env_trajectory(agent_stats.fitted_agents[ii].env,
                                       horizon=10)
            assert not compare_trajectories(traj1, traj2)
            assert not compare_trajectories(traj1, traj3)
            assert not compare_trajectories(traj2, traj3)
def test_env_seeding(ModelClass):
    seeding.set_global_seed(123)
    env1 = ModelClass()

    seeding.set_global_seed(456)
    env2 = ModelClass()

    seeding.set_global_seed(123)
    env3 = ModelClass()

    seeding.set_global_seed(123)
    env4 = ModelClass()
    seeding.safe_reseed(env4)

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)
        traj4 = get_env_trajectory(env4, 500)

        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
        assert not compare_trajectories(traj3, traj4)
def test_rescale_wrapper_seeding(ModelClass):
    seeding.set_global_seed(123)
    env1 = RescaleRewardWrapper(ModelClass(), (0, 1))

    seeding.set_global_seed(456)
    env2 = RescaleRewardWrapper(ModelClass(), (0, 1))

    seeding.set_global_seed(123)
    env3 = RescaleRewardWrapper(ModelClass(), (0, 1))

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)

        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
def test_env_seeding(env_name):
    seeding.set_global_seed(123)
    env1 = gym_make(env_name)

    seeding.set_global_seed(456)
    env2 = gym_make(env_name)

    seeding.set_global_seed(123)
    env3 = gym_make(env_name)

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)

        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
def test_random_numbers():
    seed = 43
    seeding.set_global_seed(seed)
    rng1 = seeding.get_rng()
    data1 = rng1.integers(100, size=1000)

    seed = 44
    seeding.set_global_seed(seed)
    rng2 = seeding.get_rng()
    data2 = rng2.integers(100, size=1000)

    seed = 44
    seeding.set_global_seed(seed)
    rng3 = seeding.get_rng()
    data3 = rng3.integers(100, size=1000)

    assert (data1 != data2).sum() > 5
    assert (data2 != data3).sum() == 0
def test_lsvi_without_bonus():
    seeding.set_global_seed(123)

    def lsvi_debug_gather_data(agent):
        """
        Function to gather data by sampling states and actions uniformly.
        """
        N = agent.n_episodes * agent.horizon
        count = 0
        while count < N:
            state = agent.env.observation_space.sample()
            action = agent.env.action_space.sample()
            next_state, reward, done, info = agent.env.sample(state, action)

            feat = agent.feature_map.map(state, action)
            outer_prod = np.outer(feat, feat)
            inv = agent.lambda_mat_inv

            agent.lambda_mat += np.outer(feat, feat)
            # update inverse (Sherman-Morrison)
            agent.lambda_mat_inv -= \
                (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat)

            # update history
            agent.reward_hist[count] = reward
            agent.state_hist.append(state)
            agent.action_hist.append(action)
            agent.nstate_hist.append(next_state)

            tt = agent.total_time_steps
            agent.feat_hist[tt, :] = agent.feature_map.map(state, action)
            for aa in range(agent.env.action_space.n):
                agent.feat_ns_all_actions[tt, aa, :] = \
                    agent.feature_map.map(next_state, aa)

            # increments
            agent.total_time_steps += 1
            count += 1

    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n,
                                _env.action_space.n)

    agent = LSVIUCBAgent(env,
                         n_episodes=100,
                         feature_map_fn=feature_map_fn,
                         horizon=20,
                         gamma=0.99,
                         reg_factor=1e-5)

    lsvi_debug_gather_data(agent)

    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near-optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)
    print("-------")
    print(np.abs(Q - Q_est))

    # Check error
    assert Q_est == pytest.approx(Q, rel=0.01)
from rlberry.agents.cem import CEMAgent
from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env
import rlberry.seeding as seeding

seeding.set_global_seed(1234)

env = get_benchmark_env(level=1)

n_episodes = 1000
horizon = 30
gamma = 0.99

params = {
    'n_episodes': n_episodes,
    'horizon': horizon,
    'gamma': gamma,
    'batch_size': 20,
    'percentile': 70,
    'learning_rate': 0.01
}

agent = CEMAgent(env, **params)
agent.fit()

env.enable_rendering()
state = env.reset()
for tt in range(4 * horizon):
    action = agent.policy(state)
    next_state, reward, done, _ = env.step(action)
    state = next_state
# display the frames recorded while rendering was enabled
env.render()
def get_random_number_setting_seed(global_seed):
    # reseed the worker before drawing, so each worker gets its own stream
    seeding.set_global_seed(global_seed)
    return seeding.generate_uniform_seed()