from copy import deepcopy

import rlberry.seeding as seeding
from rlberry.seeding import Seeder
from rlberry.envs import Wrapper  # import paths may vary across rlberry versions
from rlberry.wrappers import RescaleRewardWrapper

# gym is an optional dependency
_GYM_INSTALLED = True
try:
    import gym
except ImportError:
    _GYM_INSTALLED = False


def test_gym_copy_reseeding_2():
    seeder = Seeder(123)
    if _GYM_INSTALLED:
        gym_env = gym.make("Acrobot-v1")
        # nested wrapping
        env = RescaleRewardWrapper(Wrapper(Wrapper(gym_env)), (0, 1))
        env.reseed(seeder)

        c_env = deepcopy(env)
        c_env.reseed()  # no argument: the copy gets a fresh seed

        if deepcopy(env).is_online():
            # get_env_trajectory / compare_trajectories are test helpers,
            # sketched after the seeding tests below
            traj1 = get_env_trajectory(env, 500)
            traj2 = get_env_trajectory(c_env, 500)
            # a reseeded copy must not reproduce the original trajectory
            assert not compare_trajectories(traj1, traj2)
def test_rescale_wrapper_seeding(ModelClass):
    # ModelClass is injected by pytest parametrization over environment classes
    seeding.set_global_seed(123)
    env1 = RescaleRewardWrapper(ModelClass(), (0, 1))
    seeding.set_global_seed(456)
    env2 = RescaleRewardWrapper(ModelClass(), (0, 1))
    seeding.set_global_seed(123)
    env3 = RescaleRewardWrapper(ModelClass(), (0, 1))

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)
        # different global seeds -> different trajectories
        assert not compare_trajectories(traj1, traj2)
        # same global seed -> identical trajectories
        assert compare_trajectories(traj1, traj3)
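# Both tests above call get_env_trajectory and compare_trajectories without
# defining them. A minimal sketch of what such helpers could look like,
# assuming the old-style 4-tuple gym step API; names and exact behavior are
# assumptions, not rlberry's actual test utilities:
import numpy as np


def get_env_trajectory(env, horizon):
    """Roll out `horizon` steps with random actions; return visited states."""
    states = []
    ss = env.reset()
    for _ in range(horizon):
        states.append(ss)
        ss, _, done, _ = env.step(env.action_space.sample())
        if done:
            ss = env.reset()
    return states


def compare_trajectories(traj1, traj2):
    """True iff both trajectories match state-by-state."""
    if len(traj1) != len(traj2):
        return False
    return all(np.array_equal(s1, s2) for s1, s2 in zip(traj1, traj2))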
from rlberry.envs import Acrobot
from rlberry.agents import RSKernelUCBVIAgent
from rlberry.utils.logging import configure_logging
from rlberry.wrappers import RescaleRewardWrapper

configure_logging("DEBUG")

env = Acrobot()
# rescale rewards to [0, 1]
env = RescaleRewardWrapper(env, (0.0, 1.0))
agent = RSKernelUCBVIAgent(env,
                           n_episodes=500,
                           gamma=0.99,
                           horizon=300,
                           bonus_scale_factor=0.01,
                           min_dist=0.2,
                           bandwidth=0.05,
                           beta=1.0,
                           kernel_type="gaussian")
agent.fit()

env.enable_rendering()
state = env.reset()
time_before_done = 0
ended = False
for tt in range(4 * agent.horizon):
    action = agent.policy(state)
    next_state, reward, done, _ = env.step(action)
    # loop body completed (assumption): count steps until the first 'done'
    if not done and not ended:
        time_before_done += 1
    if done:
        ended = True
    state = next_state

print("Time before done = ", time_before_done)
env.render()
""" ===================== Demo: demo_gym_wrapper ===================== """ from rlberry.envs import gym_make from rlberry.agents import RSUCBVIAgent from rlberry.wrappers import RescaleRewardWrapper env = gym_make("Acrobot-v1") env.reward_range = (-1.0, 0.0) # missing in gym implementation # rescake rewards to [0, 1] env = RescaleRewardWrapper(env, (0.0, 1.0)) agent = RSUCBVIAgent(env, gamma=0.99, horizon=200, bonus_scale_factor=0.1, min_dist=0.2) agent.fit(budget=10) state = env.reset() for tt in range(200): action = agent.policy(state) next_state, reward, done, _ = env.step(action) state = next_state env.render() env.close()