def __init__(self, reward_free=False, difficulty=0, array_observation=False):
    self.reward_free = reward_free
    self.difficulty = difficulty
    self.array_observation = array_observation

    if difficulty not in [0, 1, 2]:
        raise ValueError("FourRoom difficulty must be in [0, 1, 2]")

    # Common parameters
    nrows = 9
    ncols = 9
    start_coord = (0, 0)
    terminal_states = ((8, 0),)
    success_probability = 0.95
    #
    walls = ()
    for ii in range(9):
        if ii not in [2, 6]:
            walls += ((ii, 4),)
    for jj in range(9):
        if jj != 7:
            walls += ((4, jj),)

    # Default reward according to the difficulty
    if difficulty in [0, 1]:
        default_reward = 0.0
    elif difficulty == 2:
        default_reward = -0.005

    # Rewards according to the difficulty
    if self.reward_free:
        reward_at = {}
    else:
        if difficulty == 0:
            reward_at = {(8, 0): 1.0}
        elif difficulty in [1, 2]:
            reward_at = {
                (8, 0): 1.0,
                (3, 3): 0.1,
            }

    # Init base class
    GridWorld.__init__(
        self,
        nrows=nrows,
        ncols=ncols,
        start_coord=start_coord,
        terminal_states=terminal_states,
        success_probability=success_probability,
        reward_at=reward_at,
        walls=walls,
        default_reward=default_reward,
    )

    # spaces
    if self.array_observation:
        self.observation_space = spaces.Box(0.0, 1.0, shape=(2,))
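# A minimal usage sketch for the FourRoom environment defined above. The import
# path below is an assumption based on rlberry's usual layout and is not
# confirmed by this excerpt; adjust it to your installed version.
from rlberry.envs.benchmarks.grid_exploration.four_room import FourRoom

env = FourRoom(difficulty=1, array_observation=False)
observation = env.reset()
next_observation, reward, done, info = env.step(env.action_space.sample())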
def test_lsvi_random_exploration():
    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)
    env.reseed(123)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(
        env,
        feature_map_fn=feature_map_fn,
        horizon=20,
        gamma=0.99,
        reg_factor=1e-5,
        bonus_scale_factor=0.0,
    )
    agent.reseed(123)
    agent.fit(budget=250)

    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)
    print("-------")
    print(np.abs(Q - Q_est))

    # Check error
    assert np.abs(Q - Q_est).mean() < 0.1
def __init__(self, reward_free=False, array_observation=False):
    self.reward_free = reward_free
    self.array_observation = array_observation

    # Common parameters
    nrows = 13
    ncols = 17
    start_coord = (5, 1)
    terminal_states = ((7, 7),)
    success_probability = 0.95
    #
    walls = ()
    for ii in range(13):
        walls += ((ii, 0),)
        walls += ((ii, 16),)
    for jj in range(17):
        walls += ((0, jj),)
        walls += ((12, jj),)
    for ii in range(13):
        if ii not in [1, 11]:
            walls += ((ii, 6),)
            walls += ((ii, 10),)
    walls += ((11, 6),)
    for jj in range(17):
        if jj not in [1, 15]:
            walls += ((6, jj),)

    # Default reward according to the difficulty
    default_reward = 0

    # Rewards according to the difficulty
    if self.reward_free:
        reward_at = {}
    else:
        reward_at = {
            (7, 7): 10.0,
            (8, 2): 1.0,
            (10, 3): 1.0,
        }
        for jj in range(7, 16):
            for ii in range(1, 12):
                if (ii, jj) not in walls and (ii, jj) != (7, 7):
                    reward_at[(ii, jj)] = -0.05

    # Init base class
    GridWorld.__init__(
        self,
        nrows=nrows,
        ncols=ncols,
        start_coord=start_coord,
        terminal_states=terminal_states,
        success_probability=success_probability,
        reward_at=reward_at,
        walls=walls,
        default_reward=default_reward,
    )

    # spaces
    if self.array_observation:
        self.observation_space = spaces.Box(0.0, 1.0, shape=(2,))
def test_gridworld_aux_functions():
    env = GridWorld(
        nrows=5, ncols=5, walls=((1, 1),), reward_at={(4, 4): 1, (4, 3): -1}
    )
    env.log()  # from FiniteMDP
    env.render_ascii()  # from GridWorld
    vals = np.ones(env.observation_space.n)
    env.display_values(vals)
    env.print_transition_at(0, 0, 'up')
def __init__(self, reward_free=False, array_observation=False):
    self.reward_free = reward_free
    self.array_observation = array_observation

    # Common parameters
    nrows = 11
    ncols = 17
    start_coord = (0, 0)
    terminal_states = ((10, 0),)
    success_probability = 0.95
    #
    walls = ()
    for ii in range(11):
        if ii not in [2, 8]:
            walls += ((ii, 5),)
            walls += ((ii, 11),)
    for jj in range(17):
        if jj != 15:
            walls += ((5, jj),)

    # Default reward according to the difficulty
    default_reward = -0.001

    # Rewards according to the difficulty
    if self.reward_free:
        reward_at = {}
    else:
        reward_at = {
            (10, 0): 10.0,
            (4, 4): 0.1,
        }

    # Init base class
    GridWorld.__init__(
        self,
        nrows=nrows,
        ncols=ncols,
        start_coord=start_coord,
        terminal_states=terminal_states,
        success_probability=success_probability,
        reward_at=reward_at,
        walls=walls,
        default_reward=default_reward,
    )

    # spaces
    if self.array_observation:
        self.observation_space = spaces.Box(0.0, 1.0, shape=(2,))
def test_lsvi_optimism():
    env = GridWorld(nrows=2, ncols=2, walls=())

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(
        env,
        n_episodes=250,
        gamma=0.99,
        feature_map_fn=feature_map_fn,
        horizon=3,
        bonus_scale_factor=3,
        reg_factor=0.000001,
    )
    agent.fit()

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=3)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    # optimistic Q
    S = env.observation_space.n
    A = env.action_space.n
    Q_optimistic = np.zeros((S, A))
    for ss in range(S):
        Q_optimistic[ss, :] = agent._compute_q_vec(
            agent.w_vec[0, :], ss, agent.bonus_scale_factor
        )

    print(Q)
    print(Q_optimistic)
    assert (Q_optimistic - Q).min() >= -1e-5
def test_lsvi_ucb_matrix_inversion(FeatMapClass):
    env = GridWorld(nrows=3, ncols=3, walls=())

    def feature_map_fn(_env):
        return FeatMapClass(_env.observation_space.n, _env.action_space.n)

    reg_factor = 0.1
    agent = LSVIUCBAgent(
        env,
        n_episodes=50,
        feature_map_fn=feature_map_fn,
        horizon=10,
        reg_factor=reg_factor,
    )
    agent.fit()
    assert np.allclose(np.linalg.inv(agent.lambda_mat), agent.lambda_mat_inv)
    assert agent.episode == 50
    agent.policy(env.observation_space.sample())

    # Check counts: with one-hot features, lambda_mat is diagonal with entries
    # N(s, a) + reg_factor, so its inverse has diagonal 1 / (N(s, a) + reg_factor).
    if FeatMapClass != OneHotFeatureMap:
        return
    S = env.observation_space.n
    A = env.action_space.n
    N_sa = np.zeros((S, A))
    for state, action in zip(agent.state_hist, agent.action_hist):
        N_sa[state, action] += 1.0

    assert np.allclose(
        agent.lambda_mat_inv.diagonal(), 1.0 / (N_sa.flatten() + reg_factor)
    )

    for ss in range(S):
        for aa in range(A):
            feat = agent.feature_map.map(ss, aa)
            assert np.allclose(
                feat @ (agent.lambda_mat_inv.T @ feat),
                1.0 / (N_sa[ss, aa] + reg_factor),
            )
def test_rlsvi(gamma, stage_dependent):
    env = GridWorld(walls=(), nrows=5, ncols=5)
    agent = RLSVIAgent(
        env, horizon=11, stage_dependent=stage_dependent, gamma=gamma
    )
    agent.fit(budget=50)
    agent.policy(env.observation_space.sample())
def test_optql():
    env = GridWorld(walls=(), nrows=5, ncols=5)
    agent = OptQLAgent(
        env, n_episodes=50, horizon=11, gamma=0.99, bonus_scale_factor=0.1
    )
    agent.fit()
    agent.policy(env.observation_space.sample())
def test_gridworld_from_layout():
    layout = """
IOOOO # OOOOO O OOOOR
OOOOO # OOOOO # OOOOO
OOOOO O OOOOO # OOTOO
OOOOO # OOOOO # OOOOO
IOOOO # OOOOO # OOOOr"""
    env = GridWorld.from_layout(layout)
    env.reset()
def test_discrete2onehot():
    env = DiscreteToOneHotWrapper(GridWorld())
    env.reseed(123)
    assert isinstance(env.observation_space, spaces.Box)
    for ii in range(env.unwrapped.observation_space.n):
        initial_distr = np.zeros(env.unwrapped.observation_space.n)
        initial_distr[ii] = 1.0
        env.unwrapped.set_initial_state_distribution(initial_distr)
        obs = env.reset()
        assert np.array_equal(obs, initial_distr)
def test_ucbvi(gamma, stage_dependent, real_time_dp):
    env = GridWorld(walls=(), nrows=5, ncols=5)
    agent = UCBVIAgent(
        env,
        n_episodes=50,
        horizon=11,
        stage_dependent=stage_dependent,
        gamma=gamma,
        real_time_dp=real_time_dp,
        bonus_scale_factor=0.1,
    )
    agent.fit()
    agent.policy(env.observation_space.sample())
def test_psrl(gamma, stage_dependent, bernoullized_reward):
    env = GridWorld(walls=(), nrows=5, ncols=5)
    agent = PSRLAgent(
        env,
        horizon=11,
        bernoullized_reward=bernoullized_reward,
        stage_dependent=stage_dependent,
        gamma=gamma,
    )
    agent.fit(budget=50)
    agent.policy(env.observation_space.sample())
def test_lsvi_ucb_matrix_inversion(FeatMapClass):
    env = GridWorld(nrows=3, ncols=3, walls=())

    def feature_map_fn():
        return FeatMapClass(env.observation_space.n, env.action_space.n)

    agent = LSVIUCBAgent(
        env, n_episodes=10, feature_map_fn=feature_map_fn, horizon=10
    )
    agent.fit()
    assert np.allclose(np.linalg.inv(agent.lambda_mat), agent.lambda_mat_inv)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
def test_uncertainty_est_wrapper():
    env = GridWorld()

    def uncertainty_est_fn(observation_space, action_space):
        return DiscreteCounter(observation_space, action_space)

    w_env = UncertaintyEstimatorWrapper(env, uncertainty_est_fn, bonus_scale_factor=1.0)

    for ii in range(10):
        w_env.reset()
        _, _, _, info = w_env.step(0)
        nn = w_env.uncertainty_estimator.count(0, 0)
        assert nn == ii + 1
        assert info["exploration_bonus"] == pytest.approx(1 / np.sqrt(nn))
def test_mc_policy_eval(gamma, horizon, stationary_policy):
    env = GridWorld(
        nrows=3,
        ncols=3,
        start_coord=(0, 0),
        success_probability=1.0,
        walls=(),
        default_reward=0.0,
        reward_at={(2, 2): 1.0},
    )
    agent = ValueIterationAgent(env, gamma=gamma, horizon=horizon)
    agent.fit()

    episode_rewards = mc_policy_evaluation(
        agent, env, n_sim=5, gamma=gamma, stationary_policy=stationary_policy
    )
    # Transitions are deterministic (success_probability=1.0) and the rewarding
    # state (2, 2) is 4 moves away from (0, 0), so every simulated episode
    # returns exactly gamma**4.
    assert episode_rewards.mean() == 1.0 * np.power(gamma, 4)
def _get_filled_replay(max_replay_size):
    """Runs env for ~ 2 * max_replay_size timesteps."""
    env = GridWorld(terminal_states=None)
    env = TimeLimit(env, max_episode_steps=200)
    env.reseed(123)

    rng = np.random.default_rng(456)
    buffer = replay.ReplayBuffer(
        max_replay_size,
        rng,
        max_episode_steps=env._max_episode_steps,
        enable_prioritized=True,
    )
    buffer.setup_entry("observations", np.float32)
    buffer.setup_entry("actions", np.uint32)
    buffer.setup_entry("rewards", np.float32)
    buffer.setup_entry("dones", bool)

    # Fill the replay buffer
    total_time = 0
    while True:
        if total_time > 2 * buffer._max_replay_size:
            break
        done = False
        obs = env.reset()
        while not done:
            total_time += 1
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            buffer.append(
                {
                    "observations": obs,
                    "actions": action,
                    "rewards": reward,
                    "dones": done,
                }
            )
            obs = next_obs
            if done:
                buffer.end_episode()
    return buffer, env
def reset(self):
    self.state = GridWorld.reset(self)
    state_to_return = self.state
    if self.array_observation:
        state_to_return = self._convert_index_to_float_coord(self.state)
    return state_to_return
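# Illustration only: _convert_index_to_float_coord is not shown in this excerpt.
# The hypothetical sketch below shows one way a discrete state index could be
# mapped to a (row, col) pair scaled into [0, 1], consistent with the
# Box(0.0, 1.0, shape=(2,)) observation space used above. It is NOT the
# library's actual implementation.
import numpy as np


def _index_to_float_coord_sketch(state_index, nrows, ncols):
    # recover grid coordinates from the flat index (row-major layout assumed)
    row, col = divmod(state_index, ncols)
    # normalize to the unit square
    return np.array([row / nrows, col / ncols], dtype=np.float32)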
from rlberry.agents.ucbvi import UCBVIAgent
from rlberry.agents.optql import OptQLAgent
from rlberry.envs.finite import GridWorld
from rlberry.stats import AgentStats, plot_episode_rewards
from rlberry.stats import MultipleStats

N_EP = 3000
HORIZON = 20
GAMMA = 1.0

env = GridWorld(nrows=5, ncols=10)

params = {}
params['ucbvi'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'stage_dependent': True,
    'gamma': GAMMA,
    'real_time_dp': True,
    'bonus_scale_factor': 1.0,
}
params['optql'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'gamma': GAMMA,
    'bonus_scale_factor': 1.0,
}

mstats = MultipleStats()
def test_lsvi_without_bonus():
    seeding.set_global_seed(123)

    def lsvi_debug_gather_data(agent):
        """Gather data by sampling states and actions uniformly."""
        N = agent.n_episodes * agent.horizon
        count = 0
        while count < N:
            state = agent.env.observation_space.sample()
            action = agent.env.action_space.sample()
            next_state, reward, done, info = agent.env.sample(state, action)
            #
            feat = agent.feature_map.map(state, action)
            outer_prod = np.outer(feat, feat)
            inv = agent.lambda_mat_inv

            #
            agent.lambda_mat += np.outer(feat, feat)
            # update inverse (Sherman-Morrison formula for a rank-one update)
            agent.lambda_mat_inv -= (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat)

            # update history
            agent.reward_hist[count] = reward
            agent.state_hist.append(state)
            agent.action_hist.append(action)
            agent.nstate_hist.append(next_state)

            #
            tt = agent.total_time_steps
            agent.feat_hist[tt, :] = agent.feature_map.map(state, action)
            for aa in range(agent.env.action_space.n):
                agent.feat_ns_all_actions[tt, aa, :] = agent.feature_map.map(next_state, aa)

            # increments
            agent.total_time_steps += 1
            count += 1

    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(
        env,
        n_episodes=100,
        feature_map_fn=feature_map_fn,
        horizon=20,
        gamma=0.99,
        reg_factor=1e-5,
    )

    lsvi_debug_gather_data(agent)

    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)
    print("-------")
    print(np.abs(Q - Q_est))

    # Check error
    assert Q_est == pytest.approx(Q, rel=0.01)
from rlberry.envs.finite import GridWorld

env = GridWorld(7, 10, walls=((2, 2), (3, 3)))
env.enable_rendering()

for tt in range(50):
    env.step(env.action_space.sample())

env.render()
from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.envs.finite import GridWorld

env = GridWorld(7, 10, walls=((2, 2), (3, 3)))

agent = ValueIterationAgent(env, gamma=0.95)
info = agent.fit()
print(info)

env.enable_rendering()
state = env.reset()
for tt in range(200):
    action = agent.policy(state)
    next_s, _, done, _ = env.step(action)
    if done:
        break
    state = next_s

env.save_video("gridworld.mp4", framerate=5)
import numpy as np

from rlberry.agents.features import FeatureMap
from rlberry.envs.finite import GridWorld
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies
from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.agents.linear import LSVIUCBAgent

# Define environment
env = GridWorld(nrows=2, ncols=4, walls=(), success_probability=1.0)


# Create feature map
class OneHotFeatureMap(FeatureMap):
    def __init__(self, S, A):
        self.S = S
        self.A = A
        self.shape = (S * A,)

    def map(self, observation, action):
        feat = np.zeros((self.S, self.A))
        feat[observation, action] = 1.0
        return feat.flatten()


# Function that returns an instance of a feature map
def feature_map_fn(env):
    return OneHotFeatureMap(env.observation_space.n, env.action_space.n)


params = {
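# Quick sanity check of the feature map defined above (appended after the
# truncated snippet; not part of the original example): each (observation,
# action) pair maps to a distinct one-hot vector of length S * A.
fmap = feature_map_fn(env)
phi = fmap.map(observation=3, action=1)
assert phi.shape == (env.observation_space.n * env.action_space.n,)
assert phi.sum() == 1.0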
def test_gridworld_aux_functions():
    env = GridWorld(
        nrows=5, ncols=8, walls=((1, 1),), reward_at={(4, 4): 1, (4, 3): -1}
    )
    env.log()  # from FiniteMDP
    env.render_ascii()  # from GridWorld
    vals = np.arange(env.observation_space.n)
    env.display_values(vals)
    env.print_transition_at(0, 0, "up")

    layout = env.get_layout_array(vals, fill_walls_with=np.inf)
    for rr in range(env.nrows):
        for cc in range(env.ncols):
            if (rr, cc) in env.walls:
                assert layout[rr, cc] == np.inf
            else:
                assert layout[rr, cc] == vals[env.coord2index[(rr, cc)]]
def __init__(
    self,
    nrooms=7,
    reward_free=False,
    array_observation=False,
    room_size=5,
    success_probability=0.95,
    remove_walls=False,
    initial_state_distribution="center",
    include_traps=False,
):
    assert nrooms > 0, "nrooms must be > 0"
    assert initial_state_distribution in ("center", "uniform")

    self.reward_free = reward_free
    self.array_observation = array_observation
    self.nrooms = nrooms
    self.room_size = room_size
    self.success_probability = success_probability
    self.remove_walls = remove_walls
    self.initial_state_distribution = initial_state_distribution
    self.include_traps = include_traps

    # Max number of rooms/columns per row
    self.max_rooms_per_row = 5

    # Room size (default = 5x5)
    self.room_size = room_size

    # Grid size
    self.room_nrows = math.ceil(nrooms / self.max_rooms_per_row)
    if self.room_nrows > 1:
        self.room_ncols = self.max_rooms_per_row
    else:
        self.room_ncols = nrooms
    nrows = self.room_size * self.room_nrows + (self.room_nrows - 1)
    ncols = self.room_size * self.room_ncols + (self.room_ncols - 1)

    # walls
    walls = []
    for room_col in range(self.room_ncols - 1):
        col = (room_col + 1) * (self.room_size + 1) - 1
        for jj in range(nrows):
            if (jj % (self.room_size + 1)) != (self.room_size // 2):
                walls.append((jj, col))

    for room_row in range(self.room_nrows - 1):
        row = (room_row + 1) * (self.room_size + 1) - 1
        for jj in range(ncols):
            walls.append((row, jj))

    # process each room
    start_coord = None
    terminal_state = None
    self.traps = []
    count = 0
    for room_r in range(self.room_nrows):
        if room_r % 2 == 0:
            cols_iterator = range(self.room_ncols)
        else:
            cols_iterator = reversed(range(self.room_ncols))

        for room_c in cols_iterator:
            # existing rooms
            if count < self.nrooms:
                # remove top wall
                if ((room_c == self.room_ncols - 1) and (room_r % 2 == 0)) or (
                    (room_c == 0) and (room_r % 2 == 1)
                ):
                    if room_r != self.room_nrows - 1:
                        wall_to_remove = self._convert_room_coord_to_global(
                            room_r, room_c, self.room_size, self.room_size // 2
                        )
                        if wall_to_remove in walls:
                            walls.remove(wall_to_remove)
            # rooms to remove
            else:
                for ii in range(-1, self.room_size + 1):
                    for jj in range(-1, self.room_size + 1):
                        wall_to_include = self._convert_room_coord_to_global(
                            room_r, room_c, ii, jj
                        )
                        if (
                            wall_to_include[0] >= 0
                            and wall_to_include[0] < nrows
                            and wall_to_include[1] >= 0
                            and wall_to_include[1] < ncols
                            and (wall_to_include not in walls)
                        ):
                            walls.append(wall_to_include)

            # start coord
            if count == nrooms // 2:
                start_coord = self._convert_room_coord_to_global(
                    room_r, room_c, self.room_size // 2, self.room_size // 2
                )
            # terminal state
            if count == nrooms - 1:
                terminal_state = self._convert_room_coord_to_global(
                    room_r, room_c, self.room_size // 2, self.room_size // 2
                )
            # trap
            if include_traps:
                self.traps.append(
                    self._convert_room_coord_to_global(
                        room_r,
                        room_c,
                        self.room_size // 2 + 1,
                        self.room_size // 2 + 1,
                    )
                )
            count += 1

    terminal_states = (terminal_state,) + tuple(self.traps)

    if self.reward_free:
        reward_at = {}
    else:
        reward_at = {
            terminal_state: 1.0,
            start_coord: 0.01,
            (self.room_size // 2, self.room_size // 2): 0.1,
        }

    # Check remove_walls
    if remove_walls:
        walls = ()

    # Init base class
    GridWorld.__init__(
        self,
        nrows=nrows,
        ncols=ncols,
        start_coord=start_coord,
        terminal_states=terminal_states,
        success_probability=success_probability,
        reward_at=reward_at,
        walls=walls,
        default_reward=0.0,
    )

    # Check initial distribution
    if initial_state_distribution == "uniform":
        distr = np.ones(self.observation_space.n) / self.observation_space.n
        self.set_initial_state_distribution(distr)

    # spaces
    if self.array_observation:
        self.discrete_observation_space = self.observation_space
        self.observation_space = spaces.Box(0.0, 1.0, shape=(2,))
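# Illustration only: _convert_room_coord_to_global is not shown in this excerpt.
# Based on the wall layout above (rooms of room_size cells separated by one wall
# row/column, so inter-room walls sit at (room_index + 1) * (room_size + 1) - 1),
# a consistent conversion would look like the sketch below; the real method may
# differ.
def _convert_room_coord_to_global_sketch(room_row, room_col, room_size, row_in_room, col_in_room):
    # each room occupies room_size cells plus one shared wall line
    global_row = room_row * (room_size + 1) + row_in_room
    global_col = room_col * (room_size + 1) + col_in_room
    return (global_row, global_col)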
Illustration of how to set up an MBQVI algorithm in rlberry.
The environment chosen here is the GridWorld environment.

.. video:: ../../video_plot_mbqvi.mp4
   :width: 600
"""
# sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_mbqvi.jpg'

from rlberry.agents.mbqvi import MBQVIAgent
from rlberry.envs.finite import GridWorld

params = {}
params["n_samples"] = 100  # samples per state-action pair
params["gamma"] = 0.99
params["horizon"] = None

env = GridWorld(7, 10, walls=((2, 2), (3, 3)), success_probability=0.6)
agent = MBQVIAgent(env, **params)
info = agent.fit()
print(info)

# evaluate policy in a deterministic version of the environment
env_eval = GridWorld(7, 10, walls=((2, 2), (3, 3)), success_probability=1.0)
env_eval.enable_rendering()
state = env_eval.reset()
for tt in range(50):
    action = agent.policy(state)
    next_s, _, _, _ = env_eval.step(action)
    state = next_s

video = env_eval.save_video("_video/video_plot_mbqvi.mp4")
from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.envs.finite import GridWorld, Chain

for env in [Chain(), GridWorld(7, 10, walls=((2, 2), (3, 3)))]:
    agent = ValueIterationAgent(env, gamma=0.95)
    info = agent.fit()
    print(info)

    env.enable_rendering()
    state = env.reset()
    for tt in range(50):
        action = agent.policy(state)
        next_s, _, done, _ = env.step(action)
        if done:
            break
        state = next_s
    env.render()