def test_step_limit_with_vectorized_env(batch_size):
    start = 0
    target = 10
    starting_values = [start for i in range(batch_size)]
    targets = [target for i in range(batch_size)]
    env = SyncVectorEnv([
        partial(DummyEnvironment, start=start, target=target, max_value=target * 2)
        for start, target in zip(starting_values, targets)
    ])
    env = ObservationLimit(env, max_steps=3 * batch_size)

    obs = env.reset()
    obs, reward, done, info = env.step(env.action_space.sample())
    # obs, reward, done, info = env.step(env.action_space.sample())
    obs = env.reset()
    assert env.is_closed

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()
    with pytest.raises(gym.error.ClosedEnvironmentError):
        _ = env.step(env.action_space.sample())
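# Several of these tests rely on a DummyEnvironment(start, target, max_value) that is
# not shown here. Below is a minimal sketch of what such a counter environment could
# look like (an assumption for illustration, not the original implementation): the
# observation is an integer counter initialised to `start`, each action increments it,
# and the episode ends once the counter reaches `target`. The reward scheme is a guess.
import gym
import numpy as np
from gym import spaces


class DummyEnvironmentSketch(gym.Env):
    """Illustrative stand-in for the DummyEnvironment used in these tests (assumed)."""

    def __init__(self, start: int = 0, target: int = 10, max_value: int = 20):
        self.start = start
        self.target = target
        self.max_value = max_value
        self.observation_space = spaces.Discrete(max_value + 1)
        self.action_space = spaces.Discrete(2)  # 0: stay, 1: increment the counter
        self.counter = start

    def reset(self):
        self.counter = self.start
        return self.counter

    def step(self, action):
        self.counter = min(self.counter + int(action), self.max_value)
        done = self.counter >= self.target
        # Assumed reward: distance remaining to the target (0 once the target is reached).
        reward = float(self.target - self.counter)
        return self.counter, reward, done, {}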
def test_not_setting_max_steps_per_episode_with_vector_env_raises_warning(
        self, env_name: str, batch_size: int):
    from functools import partial
    from gym.vector import SyncVectorEnv

    env = SyncVectorEnv(
        [partial(gym.make, env_name) for i in range(batch_size)])
    with pytest.warns(UserWarning):
        dataset = self.EnvDataset(env)
    env.close()
def test_episode_limit_with_vectorized_env_dataset(batch_size):
    """ Test that when adding the EpisodeLimit wrapper on top of a vectorized
    environment, the episode limit is with respect to each individual env,
    rather than the batched env.
    """
    start = 0
    target = 10
    starting_values = [start for i in range(batch_size)]
    targets = [target for i in range(batch_size)]
    env = SyncVectorEnv([
        partial(DummyEnvironment, start=start, target=target, max_value=10 * 2)
        for start, target in zip(starting_values, targets)
    ])
    max_episodes = 2
    # TODO: For some reason the reverse order doesn't work!
    env = EpisodeLimit(env, max_episodes=max_episodes * batch_size)
    env = EnvDataset(env)

    for i, obs in enumerate(env):
        print(i, obs)
        actions = np.ones(batch_size)
        reward = env.send(actions)
    assert i == max_episodes * target - 1

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()
    with pytest.raises(gym.error.ClosedEnvironmentError):
        for i, obs in enumerate(env):
            print(i, obs)
            actions = np.ones(batch_size)
            reward = env.send(actions)
def test_shapes_are_correct_env_with_continuous_obs_and_discrete_action_spaces_vector(
        self, _, n_envs, n_steps):
    env = SyncVectorEnv(
        [lambda: gym.make("CartPole-v0") for _ in range(n_envs)])
    observation_shape = env.observation_space.shape
    loop = EnvironmentLoop(env, self._create_discrite_policy(env))
    batch = loop.step()
    for _ in range(1, n_steps):
        batch = loop.step()
    self._assert_has_shapes(
        batch,
        expected={
            SampleBatch.OBSERVATIONS: observation_shape,
            SampleBatch.OBSERVATION_NEXTS: observation_shape,
        },
        default=(n_envs, ),
    )
    self._assert_has_dtype(
        batch,
        expected={
            SampleBatch.ACTIONS: torch.int64,
            SampleBatch.EPS_ID: torch.int64,
        },
        default=torch.float32,
    )
def test_shapes_are_correct_env_with_continuous_action_spaces_vector_sample(
        self, _, n_envs, n_steps):
    env_name = "MountainCarContinuous-v0"
    env = SyncVectorEnv(
        [lambda: gym.make(env_name) for _ in range(n_envs)])
    observation_shape = (n_envs, ) + env.envs[0].observation_space.shape
    action_shape = (n_envs, ) + env.envs[0].action_space.shape
    loop = EnvironmentLoop(env, self._create_continuouse_policy(env))
    batch = loop.sample()
    for _ in range(1, n_steps):
        batch = loop.sample()
    self._assert_has_shapes(
        batch,
        expected={
            SampleBatch.OBSERVATIONS: observation_shape,
            SampleBatch.OBSERVATION_NEXTS: observation_shape,
            SampleBatch.ACTIONS: action_shape,
        },
        default=(n_envs, ),
    )
    self._assert_has_dtype(
        batch,
        expected={
            SampleBatch.EPS_ID: torch.int64,
        },
        default=torch.float32,
    )
def create_env(self, env_kwargs):
    def thunk():
        # Imported for its side effects (presumably registers the custom environment).
        import experiments.test_lstm_a2c
        return RewardCollector(gym.make(**env_kwargs))

    env = AsyncVectorEnv([thunk] * self.num_processes)
    self.validation_env = SyncVectorEnv([thunk])
    return env
def create_unreal_env(num_processes, kwargs):
    def thunk(env_kwargs):
        env = gym.make(**env_kwargs)
        env = RewardCollector(env)
        env = TransposeImage(env)
        env = ScaledFloatFrame(env)
        env = UnrealEnvBaseWrapper(env)
        return env

    return (AsyncVectorEnv([lambda: thunk(kwargs) for _ in range(num_processes)]),
            SyncVectorEnv([lambda: thunk(kwargs)]))
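# A hypothetical usage sketch for create_unreal_env: the helper returns a
# (training env, validation env) pair. The env id and number of processes below
# are illustrative assumptions, not values from the original source.
train_env, validation_env = create_unreal_env(
    num_processes=8, kwargs={"id": "BreakoutNoFrameskip-v4"})
obs = train_env.reset()           # batched observations from 8 worker processes
val_obs = validation_env.reset()  # single-env batch used for evaluation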
def get_multi_task_env(
    batch_size: int = 1,
) -> Environment[RLSetting.Observations, RLSetting.Actions, RLSetting.Rewards]:
    def single_env_fn() -> gym.Env:
        env = gym.make("CartPole-v0")
        env = TimeLimit(env, max_episode_steps=10)
        env = MultiTaskEnvironment(
            env,
            task_schedule={
                0: {"length": 0.1},
                100: {"length": 0.2},
                200: {"length": 0.3},
                300: {"length": 0.4},
                400: {"length": 0.5},
            },
            add_task_id_to_obs=True,
            new_random_task_on_reset=True,
        )
        return env

    batch_size = 1  # NOTE: overrides the `batch_size` argument; only a single env is created.
    env = SyncVectorEnv([single_env_fn for _ in range(batch_size)])

    from sequoia.common.gym_wrappers import AddDoneToObservation
    from sequoia.settings.active import TypedObjectsWrapper

    env = AddDoneToObservation(env)
    # Wrap the observations so they appear as though they are from the given setting.
    env = TypedObjectsWrapper(
        env,
        observations_type=RLSetting.Observations,
        actions_type=RLSetting.Actions,
        rewards_type=RLSetting.Rewards,
    )
    env.seed(123)
    return env
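# A hypothetical driving loop for the env returned by get_multi_task_env. How the
# typed observations expose the task id (here via a `task_labels` field) is an
# assumption based on add_task_id_to_obs=True; the exact attribute names and action
# handling depend on Sequoia's TypedObjectsWrapper and may differ.
env = get_multi_task_env(batch_size=1)
obs = env.reset()
for _ in range(20):
    actions = env.action_space.sample()
    obs, rewards, done, info = env.step(actions)
    # Field name is an assumption; fall back to None if it does not exist.
    print(getattr(obs, "task_labels", None), rewards)
env.close()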
def test_task_sequence_is_reproducible(env: str):
    """Test that the multi-task setup is seeded correctly, i.e. that the task
    sequence is reproducible given the same seed.
    """
    if env == "cartpole":
        env_fn = env_fn_cartpole
    elif env == "monsterkong":
        env_fn = env_fn_monsterkong
    else:
        assert False, f"just testing on cartpole and monsterkong for now, but got env {env}"

    batch_size = 1
    first_results: List[Tuple[int, int]] = []
    n_runs = 5
    n_episodes_per_run = 10
    for run_number in range(n_runs):
        print(f"starting run {run_number} / {n_runs}")
        # For each 'run', we record the task sequence and how long each task lasted for.
        # Then, we want to check that each run was identical, for a given seed.
        env = SyncVectorEnv([env_fn for _ in range(batch_size)])
        env.seed(123)
        task_ids: List[int] = []
        task_lengths: List[int] = []
        for episode in range(n_episodes_per_run):
            print(f"Episode {episode} / {n_episodes_per_run}")
            obs = env.reset()
            task_id: int = obs[1][0]
            task_length = 0
            done = False
            while not done:
                obs, _, done_array, _ = env.step(env.action_space.sample())
                assert len(done_array) == 1
                done = done_array[0]
                task_length += 1
            task_ids.append(task_id)
            task_lengths.append(task_length)

        task_ids_and_lengths = list(zip(task_ids, task_lengths))
        print(f"Task ids and length of each one: {task_ids_and_lengths}")
        assert len(set(task_ids)) > 1, "should have been more than just one task!"

        if not first_results:
            first_results = task_ids_and_lengths
        else:
            # Make sure that the results from this run are equivalent to the others
            # with the same seed:
            assert task_ids_and_lengths == first_results
def test_measure_RL_performance_batched_env():
    batch_size = 3
    start = [i for i in range(batch_size)]
    target = 5
    env = EnvDataset(
        SyncVectorEnv([
            partial(DummyEnvironment, start=start[i], target=target, max_value=target * 2)
            for i in range(batch_size)
        ]))
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations,
    #                           actions_type=ContinualRLSetting.Actions,
    #                           rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    all_episode_rewards = []
    all_episode_steps = []

    for step, obs in enumerate(itertools.islice(env, 100)):
        print(f"step {step} obs: {obs}")
        action = np.ones(batch_size)  # always increment the counter
        reward = env.send(action)
        print(env.done_)
        # print(obs, reward, done, info)
    assert step == 99

    from collections import defaultdict
    from sequoia.common.metrics import Metrics

    expected_metrics = defaultdict(Metrics)
    for i in range(101):
        for env_index in range(batch_size):
            if i and i % target == 0:
                expected_metrics[i] += EpisodeMetrics(
                    n_samples=1,
                    mean_episode_reward=10.,  # ? FIXME: Actually understand this condition
                    mean_episode_length=target,
                )
            # FIXME: This test is a bit too complicated, hard to follow. I'll keep the
            # batches synced-up for now.
            # if i > 0 and (i + env_index) % target == 0:
            #     expected_metrics[i] += EpisodeMetrics(
            #         n_samples=1,
            #         mean_episode_reward=sum(target - (i + env_index % target)
            #                                 for j in range(start[env_index], target)),
            #         mean_episode_length=target - start[env_index] - 1,
            #     )

    assert env.get_online_performance() == expected_metrics
def create_env(self, env):
    class W(gym.ObservationWrapper):
        def observation(self, o):
            return o.astype(np.float32)

    env_kwargs = env

    def _thunk():
        env = gym.make(**env_kwargs)
        env = RewardCollector(env)
        env = gym.wrappers.TransformReward(env, lambda r: 0.01 * r)
        env = W(env)
        return env

    self.validation_environment = SyncVectorEnv([_thunk])
    return AsyncVectorEnv([_thunk for _ in range(self.num_processes)])
def test_step_limit_with_vectorized_env_partial_final_batch(batch_size):
    """ In the case where the max number of observations isn't a multiple of the
    batch size, the env returns ceil(max_obs / batch_size) * batch_size
    observations in total.

    TODO: If we ever get to few-shot learning or something like that, we might
    have to care about this.
    """
    start = 0
    target = 10
    starting_values = [start for i in range(batch_size)]
    targets = [target for i in range(batch_size)]
    env = SyncVectorEnv([
        partial(DummyEnvironment, start=start, target=target, max_value=target * 2)
        for start, target in zip(starting_values, targets)
    ])
    env = ObservationLimit(env, max_steps=3 * batch_size + 1)

    obs = env.reset()
    assert not env.is_closed
    obs, reward, done, info = env.step(env.action_space.sample())
    obs, reward, done, info = env.step(env.action_space.sample())
    assert not env.is_closed
    # obs, reward, done, info = env.step(env.action_space.sample())
    obs = env.reset()
    assert env.is_closed

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()
    with pytest.raises(gym.error.ClosedEnvironmentError):
        _ = env.step(env.action_space.sample())
def _create_env_loop(self, env_name, n_envs=None, fetch_agent_info=None):
    if n_envs is None:
        env = gym.make(env_name)
    else:
        env = SyncVectorEnv(
            [lambda: gym.make(env_name) for _ in range(n_envs)])
    if env_name == "MountainCarContinuous-v0":
        return EnvironmentLoop(env,
                               self._create_continuouse_policy(env),
                               fetch_agent_info=fetch_agent_info)
    if env_name == "Taxi-v3" or "CartPole" in env_name:
        return EnvironmentLoop(env,
                               self._create_discrite_policy(env),
                               fetch_agent_info=fetch_agent_info)
    raise RuntimeError("Unknown env", env_name)
def test_shapes_are_correct_env_with_discrete_obs_and_action_spaces_vector_env(
        self, _, n_envs, n_steps):
    env = SyncVectorEnv(
        [lambda: gym.make("Taxi-v3") for _ in range(n_envs)])
    loop = EnvironmentLoop(env, self._create_discrite_policy(env))
    batch = loop.step()
    for _ in range(1, n_steps):
        batch = loop.step()
    self._assert_has_shapes(batch, default=(n_envs, ))
    self._assert_has_dtype(
        batch,
        expected={
            SampleBatch.REWARDS: torch.float32,
            SampleBatch.DONES: torch.float32,
        },
        default=torch.int64,
    )
def test_episode_limit_with_vectorized_env(batch_size):
    """ Test that when adding the EpisodeLimit wrapper on top of a vectorized
    environment, the episode limit is with respect to each individual env,
    rather than the batched env.
    """
    starting_values = [0 for i in range(batch_size)]
    targets = [10 for i in range(batch_size)]
    env = SyncVectorEnv([
        partial(DummyEnvironment, start=start, target=target, max_value=10 * 2)
        for start, target in zip(starting_values, targets)
    ])
    env = EpisodeLimit(env, max_episodes=2 * batch_size)

    obs = env.reset()
    assert obs.tolist() == starting_values
    print("reset obs: ", obs)
    for i in range(10):
        print(i, obs)
        actions = np.ones(batch_size)
        obs, reward, done, info = env.step(actions)
    # all episodes end at step 10
    assert all(done)
    # Because of how VectorEnvs work, the obs are the new 'reset' obs, rather
    # than the final obs in the episode.
    assert obs.tolist() == starting_values

    assert obs.tolist() == starting_values
    print("reset obs: ", obs)
    for i in range(10):
        print(i, obs)
        actions = np.ones(batch_size)
        obs, reward, done, info = env.step(actions)
    # all episodes end at step 10
    assert all(done)
    assert env.is_closed
    assert obs.tolist() == starting_values

    with pytest.raises(gym.error.ClosedEnvironmentError):
        actions = np.ones(batch_size)
        obs, reward, done, info = env.step(actions)
def test_reset_vectorenv_with_unfinished_episodes_raises_warning(batch_size):
    """ Test that resetting a vectorized environment while some episodes are
    still in progress raises a UserWarning.
    """
    start = 0
    target = 10
    starting_values = [start for i in range(batch_size)]
    targets = [target for i in range(batch_size)]
    env = SyncVectorEnv([
        partial(DummyEnvironment, start=start, target=target, max_value=10 * 2)
        for start, target in zip(starting_values, targets)
    ])
    env = EpisodeLimit(env, max_episodes=3 * batch_size)

    obs = env.reset()
    _ = env.step(env.action_space.sample())
    _ = env.step(env.action_space.sample())
    with pytest.warns(UserWarning) as record:
        env.reset()
def test_space_with_tuple_observations(batch_size: int, n_workers: Optional[int]):
    def make_env():
        env = gym.make("Breakout-v0")
        env = MultiTaskEnvironment(
            env, add_task_id_to_obs=True, add_task_dict_to_info=True
        )
        return env

    env_fn = make_env
    env_fns = [env_fn for _ in range(batch_size)]
    # from gym.vector.utils import batch_space
    # env = BatchedVectorEnv(env_fns, n_workers=n_workers)
    from gym.vector import SyncVectorEnv
    env = SyncVectorEnv(env_fns)  # FIXME: debugging
    # env = AsyncVectorEnv(env_fns)
    env.seed(123)

    assert env.observation_space == spaces.Dict(
        x=spaces.Box(0, 255, (batch_size, 210, 160, 3), np.uint8),
        task_labels=spaces.MultiDiscrete(np.ones(batch_size)),
    )
    assert env.single_observation_space == spaces.Dict(
        x=spaces.Box(0, 255, (210, 160, 3), np.uint8),
        task_labels=spaces.Discrete(1),
    )

    obs = env.reset()
    assert obs["x"].shape == env.observation_space["x"].shape
    assert obs["task_labels"].shape == env.observation_space["task_labels"].shape
    assert obs in env.observation_space

    actions = env.action_space.sample()
    step_obs, rewards, done, info = env.step(actions)
    assert step_obs in env.observation_space
    assert len(rewards) == batch_size
    assert len(done) == batch_size
    assert all([isinstance(v, bool) for v in done.tolist()]), [type(v) for v in done]
    assert len(info) == batch_size
def _build_env(env_builder: EnvBuilder, n_envs: int) -> gym.Env:
    if n_envs > 1:
        return SyncVectorEnv([env_builder for _ in range(n_envs)])
    else:
        return env_builder()
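# A small usage sketch for _build_env. The env id and n_envs values are illustrative
# assumptions; EnvBuilder is taken to be any zero-argument callable returning a gym.Env.
make_cartpole = lambda: gym.make("CartPole-v0")

single_env = _build_env(make_cartpole, n_envs=1)   # plain gym.Env
vector_env = _build_env(make_cartpole, n_envs=4)   # SyncVectorEnv with 4 copies
obs = vector_env.reset()                           # batched observations, shape (4, 4) for CartPole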
def main(cfg):
    random.seed(cfg.exp.seed)
    np.random.seed(cfg.exp.seed)
    torch.manual_seed(cfg.exp.seed)
    torch.backends.cudnn.deterministic = cfg.exp.torch_deterministic

    # so that the environment automatically resets
    env = SyncVectorEnv([
        lambda: RecordEpisodeStatistics(gym.make('CartPole-v1'))
    ])

    actor, critic = Actor(), Critic()
    actor_optim = Adam(actor.parameters(), eps=1e-5, lr=cfg.params.actor_lr)
    critic_optim = Adam(critic.parameters(), eps=1e-5, lr=cfg.params.critic_lr)
    memory = Memory(mini_batch_size=cfg.params.mini_batch_size, batch_size=cfg.params.batch_size)

    obs = env.reset()
    global_rewards = []
    NUM_UPDATES = (cfg.params.total_timesteps // cfg.params.batch_size) * cfg.params.epochs
    cur_timestep = 0

    def calc_factor(cur_timestep: int) -> float:
        """Calculates the factor to be multiplied with the learning rate to update it."""
        update_number = cur_timestep // cfg.params.batch_size
        total_updates = cfg.params.total_timesteps // cfg.params.batch_size
        fraction = 1.0 - update_number / total_updates
        return fraction

    actor_scheduler = LambdaLR(actor_optim, lr_lambda=calc_factor, verbose=True)
    critic_scheduler = LambdaLR(critic_optim, lr_lambda=calc_factor, verbose=True)

    while cur_timestep < cfg.params.total_timesteps:
        # keep playing the game
        obs = torch.as_tensor(obs, dtype=torch.float32)
        with torch.no_grad():
            dist = actor(obs)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            value = critic(obs)
        action = action.cpu().numpy()
        value = value.cpu().numpy()
        log_prob = log_prob.cpu().numpy()

        obs_, reward, done, info = env.step(action)
        if done[0]:
            tqdm.write(f'Reward: {info[0]["episode"]["r"]}, Avg Reward: {np.mean(global_rewards[-10:]):.3f}')
            global_rewards.append(info[0]['episode']['r'])
            wandb.log({'Avg_Reward': np.mean(global_rewards[-10:]), 'Reward': info[0]['episode']['r']})

        memory.remember(obs.squeeze(0).cpu().numpy(), action.item(), log_prob.item(),
                        reward.item(), done.item(), value.item())
        obs = obs_
        cur_timestep += 1

        # if the current timestep is a multiple of the batch size, then we need to update the model
        if cur_timestep % cfg.params.batch_size == 0:
            for epoch in tqdm(range(cfg.params.epochs),
                              desc=f'Num updates: {cfg.params.epochs * (cur_timestep // cfg.params.batch_size)} / {NUM_UPDATES}'):
                # sample a batch from memory of experiences
                old_states, old_actions, old_log_probs, old_rewards, old_dones, old_values, batch_indices = memory.sample()
                old_log_probs = torch.tensor(old_log_probs, dtype=torch.float32)
                old_actions = torch.tensor(old_actions, dtype=torch.float32)
                advantage = calculate_advantage(old_rewards, old_values, old_dones,
                                                gae_gamma=cfg.params.gae_gamma,
                                                gae_lambda=cfg.params.gae_lambda)
                advantage = torch.tensor(advantage, dtype=torch.float32)
                old_rewards = torch.tensor(old_rewards, dtype=torch.float32)
                old_values = torch.tensor(old_values, dtype=torch.float32)

                # for each mini batch from batch, calculate advantage using GAE
                for mini_batch_index in batch_indices:
                    # remember: Normalization of advantage is done on mini batch, not the entire batch
                    advantage[mini_batch_index] = (
                        (advantage[mini_batch_index] - advantage[mini_batch_index].mean())
                        / (advantage[mini_batch_index].std() + 1e-8))
                    dist = actor(torch.tensor(old_states[mini_batch_index], dtype=torch.float32).unsqueeze(0))
                    # actions = dist.sample()
                    log_probs = dist.log_prob(old_actions[mini_batch_index]).squeeze(0)
                    entropy = dist.entropy().squeeze(0)
                    log_ratio = log_probs - old_log_probs[mini_batch_index]
                    ratio = torch.exp(log_ratio)

                    with torch.no_grad():
                        # approx_kl = ((ratio-1)-log_ratio).mean()
                        approx_kl = ((old_log_probs[mini_batch_index] - log_probs)**2).mean()
                        wandb.log({'Approx_KL': approx_kl})

                    actor_loss = -torch.min(
                        ratio * advantage[mini_batch_index],
                        torch.clamp(ratio, 1 - cfg.params.actor_loss_clip,
                                    1 + cfg.params.actor_loss_clip) * advantage[mini_batch_index]
                    ).mean()

                    values = critic(torch.tensor(old_states[mini_batch_index],
                                                 dtype=torch.float32).unsqueeze(0)).squeeze(-1)
                    returns = old_values[mini_batch_index] + advantage[mini_batch_index]
                    critic_loss = torch.max(
                        (values - returns)**2,
                        (old_values[mini_batch_index] + torch.clamp(
                            values - old_values[mini_batch_index],
                            -cfg.params.critic_loss_clip,
                            cfg.params.critic_loss_clip
                        ) - returns)**2
                    ).mean()
                    # critic_loss = F.mse_loss(values, returns)

                    wandb.log({'Actor_Loss': actor_loss.item(),
                               'Critic_Loss': critic_loss.item(),
                               'Entropy': entropy.mean().item()})
                    loss = actor_loss + 0.25 * critic_loss - 0.01 * entropy.mean()

                    actor_optim.zero_grad()
                    critic_optim.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(actor.parameters(), cfg.params.max_grad_norm)
                    nn.utils.clip_grad_norm_(critic.parameters(), cfg.params.max_grad_norm)
                    actor_optim.step()
                    critic_optim.step()

            memory.reset()
            actor_scheduler.step(cur_timestep)
            critic_scheduler.step(cur_timestep)

            y_pred, y_true = old_values.cpu().numpy(), (old_values + advantage).cpu().numpy()
            var_y = np.var(y_true)
            explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
            wandb.log({'Explained_Var': explained_var})

    if cfg.exp.save_weights:
        torch.save(actor.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/actor.pth'))
        torch.save(critic.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/critic.pth'))
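# The training loop above calls a calculate_advantage helper that is not shown here.
# As a reference, the following is a minimal sketch of a Generalized Advantage
# Estimation (GAE) routine matching that call signature. It is an illustrative
# assumption, not the original helper: it treats dones[t] as "the episode terminated
# at step t" and bootstraps the value after the last step as 0.
import numpy as np


def calculate_advantage_sketch(rewards, values, dones, gae_gamma=0.99, gae_lambda=0.95):
    """Illustrative GAE(lambda) computation over a rollout of length T (assumed conventions)."""
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)

    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    last_gae = 0.0
    for t in reversed(range(T)):
        next_value = 0.0 if t == T - 1 else values[t + 1]
        next_non_terminal = 1.0 - dones[t]
        # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t), with the bootstrap zeroed past episode ends.
        delta = rewards[t] + gae_gamma * next_value * next_non_terminal - values[t]
        # Recursive accumulation of discounted TD residuals.
        last_gae = delta + gae_gamma * gae_lambda * next_non_terminal * last_gae
        advantages[t] = last_gae
    return advantages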