def test_lsvi_random_exploration():

    seeding.set_global_seed(123)

    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env, n_episodes=250,
                         feature_map_fn=feature_map_fn,
                         horizon=20,
                         gamma=0.99,
                         reg_factor=1e-5,
                         bonus_scale_factor=0.0)
    agent.fit()

    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)

    print("-------")
    print(np.abs(Q-Q_est))
    # Check error
    assert np.abs(Q-Q_est).mean() < 0.1
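# OneHotFeatureMap is not defined in this snippet. A minimal sketch of such a
# tabular feature map is given below (an illustration only; the actual rlberry
# class may differ): it maps a (state, action) pair to a one-hot vector of
# dimension S * A, which is what makes LSVI behave like tabular value
# iteration in the test above.
import numpy as np


class OneHotFeatureMapSketch:
    def __init__(self, n_states, n_actions):
        self.S = n_states
        self.A = n_actions
        self.dim = n_states * n_actions  # feature dimension

    def map(self, observation, action):
        feat = np.zeros(self.dim)
        feat[observation * self.A + action] = 1.0
        return feat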
def test_multithread_seeding(give_seed):
    """
    Checks that different seeds are given to different threads,
    even if we don't call set_global_seed in each thread.
    """
    seeding.set_global_seed(123)
    for _ in range(10):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = []
            for seed in seeding.spawn(2):
                if give_seed:
                    futures.append(
                        executor.submit(get_random_number_setting_seed, seed)
                    )
                else:
                    futures.append(
                        executor.submit(get_random_number)
                    )

            results = []
            for future in concurrent.futures.as_completed(futures):
                results.append(
                    future.result()
                )

            assert results[0] != results[1]
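# The helpers used above are not all shown in this snippet:
# get_random_number_setting_seed appears at the end of this page, while
# get_random_number is not shown. A plausible sketch of the latter, assuming
# it simply draws from the thread's current global rng without reseeding
# (hypothetical, for illustration only):
def get_random_number():
    rng = seeding.get_rng()
    return rng.integers(2**32)
# (give_seed is presumably supplied via
#  @pytest.mark.parametrize("give_seed", [True, False]).)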
def test_discount_optimization():
    seeding.set_global_seed(42)

    class ValueIterationAgentToOptimize(ValueIterationAgent):
        @classmethod
        def sample_parameters(cls, trial):
            """
            Sample hyperparameters for hyperparam optimization using Optuna (https://optuna.org/)
            """
            gamma = trial.suggest_categorical('gamma', [0.1, 0.99])
            return {'gamma': gamma}

    env = GridWorld(nrows=3, ncols=10,
                    reward_at={(1, 1): 0.1, (2, 9): 1.0},
                    walls=((1, 4), (2, 4), (1, 5)),
                    success_probability=0.9)

    vi_params = {'gamma': 0.1, 'epsilon': 1e-3}

    vi_stats = AgentStats(ValueIterationAgentToOptimize,
                          env,
                          eval_horizon=20,
                          init_kwargs=vi_params,
                          n_fit=4,
                          n_jobs=1)

    vi_stats.optimize_hyperparams(n_trials=5, timeout=30, n_sim=5, n_fit=1, n_jobs=1,
                                  sampler_method='random', pruner_method='none')

    assert vi_stats.best_hyperparams['gamma'] == 0.99
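# For reference, the sample_parameters hook above plugs into a standard Optuna
# objective. A minimal, self-contained sketch of that plumbing (the dummy
# score below just returns gamma; optimize_hyperparams fits and evaluates the
# agent instead):
import optuna


def objective(trial):
    gamma = trial.suggest_categorical('gamma', [0.1, 0.99])
    # a real objective would fit an agent with this gamma and return its
    # evaluation score; returning gamma itself only illustrates the plumbing
    return gamma


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)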
def test_joblib_seeding_giving_seed(backend):
    """
    Solves the problem of test_joblib_seeding() by setting the global seed
    in each of the subprocesses/threads.
    """
    seeding.set_global_seed(123)
    workers_output = Parallel(n_jobs=4,
                              verbose=5,
                              backend=backend)(
            delayed(get_random_number_setting_seed)(seed) for seed in seeding.spawn(2))
    assert workers_output[0] != workers_output[1]
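# The docstring above refers to a test_joblib_seeding() that is not shown in
# this snippet. A plausible sketch of that problematic version (hypothetical):
# it dispatches a helper that does not reseed, so with process-based backends
# the workers can inherit identical rng state and return the same number.
from joblib import Parallel, delayed


def test_joblib_seeding_sketch(backend):
    seeding.set_global_seed(123)
    workers_output = Parallel(n_jobs=4, verbose=5, backend=backend)(
        delayed(get_random_number)() for _ in range(2))
    # this is the failure mode fixed above: the two outputs may coincide
    print(workers_output)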
def test_double_wrapper_copy_reseeding(ModelClass):

    seeding.set_global_seed(123)
    env = Wrapper(Wrapper(ModelClass()))

    c_env = deepcopy(env)
    c_env.reseed()

    if deepcopy(env).is_online():
        traj1 = get_env_trajectory(env, 500)
        traj2 = get_env_trajectory(c_env, 500)
        assert not compare_trajectories(traj1, traj2)
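# get_env_trajectory and compare_trajectories are helpers shared by the
# seeding tests but not shown in this snippet. A plausible sketch (the actual
# test utilities may differ): roll out random actions and compare the visited
# states element-wise.
import numpy as np


def get_env_trajectory_sketch(env, horizon):
    """Roll out `horizon` random steps and return the visited states."""
    states = []
    state = env.reset()
    for _ in range(horizon):
        state, reward, done, info = env.step(env.action_space.sample())
        states.append(state)
        if done:
            state = env.reset()
    return states


def compare_trajectories_sketch(traj1, traj2):
    """Return True if the two trajectories are identical."""
    if len(traj1) != len(traj2):
        return False
    return all(np.array_equal(s1, s2) for s1, s2 in zip(traj1, traj2))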
def test_copy_reseeding(env_name):

    seeding.set_global_seed(123)
    env = gym_make(env_name)

    c_env = deepcopy(env)
    c_env.reseed()

    if deepcopy(env).is_online():
        traj1 = get_env_trajectory(env, 500)
        traj2 = get_env_trajectory(c_env, 500)
        assert not compare_trajectories(traj1, traj2)
def run_experiment(params,
                   optimize_hyperparams,
                   rlberry_seed):
    """
    Main experiment function
    """
    seeding.set_global_seed(rlberry_seed)

    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)

    # uncomment to disable writer of the 2nd PPO thread
    # stats['ppo'].set_writer(1, None)

    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)

    # uncomment to disable writer of the 1st A2C thread
    # stats['a2c'].set_writer(0, None)

    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            # timeout after 10 seconds
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10, n_fit=2)

    # Fit with best hyperparams and save results
    for agent_stats in agent_stats_list:
        agent_stats.fit()
        agent_stats.save_results()

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
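# A hedged sketch of how run_experiment might be invoked (hypothetical values:
# the real PPO/A2C hyperparameters and the fs_observer used above are defined
# elsewhere, e.g. via a sacred experiment, and are not shown in this snippet):
if __name__ == '__main__':
    params = {
        'ppo': {'n_episodes': 100, 'horizon': 50},  # illustrative only
        'a2c': {'n_episodes': 100, 'horizon': 50},  # illustrative only
    }
    run_experiment(params, optimize_hyperparams=False, rlberry_seed=42)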
def test_gym_copy_reseeding():
    seeding.set_global_seed(123)
    if _GYM_INSTALLED:
        gym_env = gym.make('Acrobot-v1')
        env = Wrapper(gym_env)

        c_env = deepcopy(env)
        c_env.reseed()

        if deepcopy(env).is_online():
            traj1 = get_env_trajectory(env, 500)
            traj2 = get_env_trajectory(c_env, 500)
            assert not compare_trajectories(traj1, traj2)
def test_gym_copy_reseeding_2():
    seeding.set_global_seed(123)
    if _GYM_INSTALLED:
        gym_env = gym.make('Acrobot-v1')
        # nested wrapping
        env = RescaleRewardWrapper(Wrapper(Wrapper(gym_env)), (0, 1))

        c_env = deepcopy(env)
        c_env.reseed()

        if deepcopy(env).is_online():
            traj1 = get_env_trajectory(env, 500)
            traj2 = get_env_trajectory(c_env, 500)
            assert not compare_trajectories(traj1, traj2)
def test_seeding():
    seed = 123
    seeding.set_global_seed(seed)

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    #

    assert seeding._GLOBAL_SEED_SEQ.entropy == seed

    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 1

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    assert seeding._GLOBAL_SEED_SEQ.entropy == seed
    #

    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 2
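# The assertions above rely on numpy's SeedSequence bookkeeping: `entropy`
# stores the seed passed at construction and `n_children_spawned` counts how
# many child sequences have been spawned from it. A minimal numpy-only
# illustration of that behaviour:
import numpy as np

seed_seq = np.random.SeedSequence(123)
assert seed_seq.entropy == 123
child = seed_seq.spawn(1)[0]            # spawn one child sequence
assert seed_seq.n_children_spawned == 1
rng = np.random.default_rng(child)      # each child seeds an independent rng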
def test_seeding():
    seed = 123
    seeding.set_global_seed(seed)

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    #

    assert seeding._GLOBAL_SEED_SEQ.entropy == seed

    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 2  # counting the global rng generated automatically

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    assert seeding._GLOBAL_SEED_SEQ.entropy == seed
    #

    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 3
def test_agent_stats_seeding():
    sd.set_global_seed(3456)
    for env in [MountainCar(), (gym_make, {'env_name': 'MountainCar-v0'})]:
        agent_stats = AgentStats(RSUCBVIAgent,
                                 env,
                                 init_kwargs={
                                     'n_episodes': 2,
                                     'horizon': 10
                                 },
                                 n_fit=6)
        agent_stats.fit()

        for ii in range(2, agent_stats.n_fit):
            traj1 = get_env_trajectory(agent_stats.fitted_agents[ii - 2].env,
                                       horizon=10)
            traj2 = get_env_trajectory(agent_stats.fitted_agents[ii - 1].env,
                                       horizon=10)
            traj3 = get_env_trajectory(agent_stats.fitted_agents[ii].env,
                                       horizon=10)
            assert not compare_trajectories(traj1, traj2)
            assert not compare_trajectories(traj1, traj3)
            assert not compare_trajectories(traj2, traj3)
def test_env_seeding(ModelClass):

    seeding.set_global_seed(123)
    env1 = ModelClass()

    seeding.set_global_seed(456)
    env2 = ModelClass()

    seeding.set_global_seed(123)
    env3 = ModelClass()

    seeding.set_global_seed(123)
    env4 = ModelClass()
    seeding.safe_reseed(env4)

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)
        traj4 = get_env_trajectory(env4, 500)

        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
        assert not compare_trajectories(traj3, traj4)
def test_rescale_wrapper_seeding(ModelClass):

    seeding.set_global_seed(123)
    env1 = RescaleRewardWrapper(ModelClass(), (0, 1))

    seeding.set_global_seed(456)
    env2 = RescaleRewardWrapper(ModelClass(), (0, 1))

    seeding.set_global_seed(123)
    env3 = RescaleRewardWrapper(ModelClass(), (0, 1))

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)

        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
def test_env_seeding(env_name):

    seeding.set_global_seed(123)
    env1 = gym_make(env_name)

    seeding.set_global_seed(456)
    env2 = gym_make(env_name)

    seeding.set_global_seed(123)
    env3 = gym_make(env_name)

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)

        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
def test_random_numbers():
    seed = 43
    seeding.set_global_seed(seed)
    rng1 = seeding.get_rng()
    data1 = rng1.integers(100, size=1000)

    seed = 44
    seeding.set_global_seed(seed)
    rng2 = seeding.get_rng()
    data2 = rng2.integers(100, size=1000)

    seed = 44
    seeding.set_global_seed(seed)
    rng3 = seeding.get_rng()
    data3 = rng3.integers(100, size=1000)

    assert (data1 != data2).sum() > 5
    assert (data2 != data3).sum() == 0
def test_lsvi_without_bonus():
    seeding.set_global_seed(123)

    def lsvi_debug_gather_data(agent):
        """
        Gather data by sampling states and actions uniformly at random,
        updating the agent's design matrix and history buffers.
        """
        N = agent.n_episodes*agent.horizon
        count = 0
        while count < N:
            state = agent.env.observation_space.sample()
            action = agent.env.action_space.sample()
            next_state, reward, done, info = agent.env.sample(state, action)
            # compute features for (state, action)
            feat = agent.feature_map.map(state, action)
            outer_prod = np.outer(feat, feat)
            inv = agent.lambda_mat_inv

            # rank-one update of the design matrix
            agent.lambda_mat += outer_prod
            # update inverse with the Sherman-Morrison formula
            agent.lambda_mat_inv -= \
                (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat)

            # update history
            agent.reward_hist[count] = reward
            agent.state_hist.append(state)
            agent.action_hist.append(action)
            agent.nstate_hist.append(next_state)

            # store features for the current pair and for all actions at next_state
            tt = agent.total_time_steps
            agent.feat_hist[tt, :] = agent.feature_map.map(state, action)
            for aa in range(agent.env.action_space.n):
                agent.feat_ns_all_actions[tt, aa, :] = \
                    agent.feature_map.map(next_state, aa)

            # increments
            agent.total_time_steps += 1
            count += 1

    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env, n_episodes=100,
                         feature_map_fn=feature_map_fn,
                         horizon=20,
                         gamma=0.99,
                         reg_factor=1e-5)

    lsvi_debug_gather_data(agent)
    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)

    print("-------")
    print(np.abs(Q-Q_est))
    # Check error
    assert Q_est == pytest.approx(Q, rel=0.01)
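# The data-gathering helper above updates lambda_mat_inv with the
# Sherman-Morrison identity
#   (A + f f^T)^{-1} = A^{-1} - (A^{-1} f f^T A^{-1}) / (1 + f^T A^{-1} f),
# which keeps the per-sample cost quadratic instead of cubic in the feature
# dimension. A quick numerical sanity check of the identity:
import numpy as np

rng = np.random.default_rng(0)
M = rng.normal(size=(4, 4))
A = M @ M.T + np.eye(4)                 # symmetric positive definite matrix
f = rng.normal(size=4)

inv = np.linalg.inv(A)
updated_inv = inv - (inv @ np.outer(f, f) @ inv) / (1 + f @ inv @ f)
assert np.allclose(updated_inv, np.linalg.inv(A + np.outer(f, f)))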
from rlberry.agents.cem import CEMAgent
from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env
import rlberry.seeding as seeding

seeding.set_global_seed(1234)

env = get_benchmark_env(level=1)

n_episodes = 1000
horizon = 30
gamma = 0.99

params = {
    'n_episodes': n_episodes,
    'horizon': horizon,
    'gamma': gamma,
    'batch_size': 20,
    'percentile': 70,
    'learning_rate': 0.01
}

agent = CEMAgent(env, **params)
agent.fit()

# run the learned policy with rendering enabled
env.enable_rendering()
state = env.reset()
for tt in range(4 * horizon):
    action = agent.policy(state)
    next_state, reward, done, _ = env.step(action)
    state = next_state
# env.render() can be called here to replay the recorded trajectory
def get_random_number_setting_seed(global_seed):
    seeding.set_global_seed(global_seed)
    return seeding.generate_uniform_seed()