def test_set_env(model_class, policy_class):
    """
    Test that the ``set_env`` function works correctly.

    :param model_class: (BaseAlgorithm) A RL model
    """
    # use discrete for DQN
    env = DummyVecEnv([lambda: select_env(model_class)])
    env2 = DummyVecEnv([lambda: select_env(model_class)])
    env3 = select_env(model_class)

    kwargs = {}
    if model_class in {DQN, DDPG, SAC, TD3}:
        kwargs = dict(learning_starts=0, train_freq=4)
    elif model_class in {A2C, PPO}:
        kwargs = dict(n_steps=64)

    # create model
    model = model_class(policy_class, env, policy_kwargs=dict(net_arch=[16]), **kwargs)
    # learn
    model.learn(total_timesteps=N_STEPS_SMALL)

    # change env
    model.set_env(env2)
    # learn again
    model.learn(total_timesteps=N_STEPS_SMALL)

    # change env test wrapping
    model.set_env(env3)
    # learn again
    model.learn(total_timesteps=N_STEPS_SMALL)
def test_get_max_episode_length():
    dict_env = DummyVecEnv([lambda: BitFlippingEnv()])

    # Cannot infer max episode length
    with pytest.raises(ValueError):
        get_time_limit(dict_env, current_max_episode_length=None)

    default_length = 10
    assert get_time_limit(dict_env, current_max_episode_length=default_length) == default_length

    env = gym.make("CartPole-v1")
    vec_env = DummyVecEnv([lambda: env])
    assert get_time_limit(vec_env, current_max_episode_length=None) == 500
    # Overwrite max_episode_steps
    assert get_time_limit(vec_env, current_max_episode_length=default_length) == default_length

    # Set max_episode_steps to None
    env.spec.max_episode_steps = None
    vec_env = DummyVecEnv([lambda: env])
    with pytest.raises(ValueError):
        get_time_limit(vec_env, current_max_episode_length=None)

    # Initialize HER and specify max_episode_length, should not raise an issue
    HER(MlpPolicyDQN, dict_env, DQN, max_episode_length=5)

    with pytest.raises(ValueError):
        HER(MlpPolicyDQN, dict_env, DQN)

    # Wrapped in a timelimit, should be fine
    # Note: it requires env.spec to be defined
    env = DummyVecEnv([lambda: gym.wrappers.TimeLimit(BitFlippingEnv(), 10)])
    HER(MlpPolicyDQN, env, DQN)
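# Background for the test above, as a minimal illustration using plain gym only:
# the time limit lives on ``env.spec.max_episode_steps``, which is why clearing
# it in the test makes the episode length impossible to infer.
import gym

_env = gym.make("CartPole-v1")
assert _env.spec.max_episode_steps == 500  # registered horizon for CartPole-v1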
def test_offpolicy_normalization(model_class, policy_class):
    make_env_ = make_dict_env if model_class == HER else make_env
    env = DummyVecEnv([make_env_])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)

    eval_env = DummyVecEnv([make_env_])
    eval_env = VecNormalize(
        eval_env, training=False, norm_obs=True, norm_reward=False, clip_obs=10.0, clip_reward=10.0
    )

    kwargs = dict(model_class=SAC, max_episode_length=200, online_sampling=True) if model_class == HER else {}
    model = model_class(
        policy_class, env, verbose=1, learning_starts=100, policy_kwargs=dict(net_arch=[64]), **kwargs
    )
    model.learn(total_timesteps=500, eval_env=eval_env, eval_freq=250)
    # Check getter
    assert isinstance(model.get_vec_normalize_env(), VecNormalize)
def _make_warmstart_dict_env():
    """Warm-start VecNormalize by stepping through BitFlippingEnv"""
    venv = DummyVecEnv([make_dict_env])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
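# Hedged usage sketch for the warm-start helpers above: the running statistics
# gathered during the random rollout can be serialized and restored with the
# public ``VecNormalize.save`` / ``VecNormalize.load`` API (the path below is
# purely illustrative).
import gym
import numpy as np

from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

_venv = _make_warmstart_cartpole()
_venv.save("/tmp/vec_normalize.pkl")
_restored = VecNormalize.load("/tmp/vec_normalize.pkl", DummyVecEnv([lambda: gym.make("CartPole-v1")]))
assert np.allclose(_restored.obs_rms.mean, _venv.obs_rms.mean)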
def test_vec_env(tmp_path, make_env):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(
        orig_venv, norm_obs=True, norm_reward=True, clip_obs=clip_obs, clip_reward=clip_reward
    )
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        if isinstance(obs, dict):
            for key in obs.keys():
                assert np.max(np.abs(obs[key])) <= clip_obs
        else:
            assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = tmp_path / "vec_normalize"
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
def _wrap_env(env: "GymEnv", verbose: int = 0, monitor_wrapper: bool = True) -> VecEnv:
    """
    Wrap environment with the appropriate wrappers if needed.
    For instance, to have a vectorized environment
    or to re-order the image channels.

    :param env:
    :param verbose:
    :param monitor_wrapper: Whether to wrap the env in a ``Monitor`` when possible.
    :return: The wrapped environment.
    """
    if not isinstance(env, VecEnv):
        if not is_wrapped(env, Monitor) and monitor_wrapper:
            if verbose >= 1:
                print("Wrapping the env with a `Monitor` wrapper")
            env = Monitor(env)
        if verbose >= 1:
            print("Wrapping the env in a DummyVecEnv.")
        env = DummyVecEnv([lambda: env])

    if (
        is_image_space(env.observation_space)
        and not is_vecenv_wrapped(env, VecTransposeImage)
        and not is_image_space_channels_first(env.observation_space)
    ):
        if verbose >= 1:
            print("Wrapping the env in a VecTransposeImage.")
        env = VecTransposeImage(env)

    # check if wrapper for dict support is needed when using HER
    if isinstance(env.observation_space, spaces.Dict):
        env = ObsDictWrapper(env)

    return env
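# Hedged usage sketch for ``_wrap_env`` above (called directly, as the excerpt
# defines it as a standalone function), assuming a plain non-image, non-dict
# gym env: the result should be a vectorized env whose single worker is
# wrapped in a ``Monitor``.
import gym

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecEnv

_wrapped = _wrap_env(gym.make("CartPole-v1"), verbose=1)
assert isinstance(_wrapped, VecEnv)
assert _wrapped.env_is_wrapped(Monitor) == [True]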
def test_predict(model_class, policy_class, env_id, device):
    if device == "cuda" and not th.cuda.is_available():
        pytest.skip("CUDA not available")

    if env_id == "CartPole-v1":
        if model_class in [SAC, TD3]:
            return
    elif model_class in [DQN]:
        return

    # Test detection of different shapes by the predict method
    model = model_class(policy_class, env_id, device=device)
    # Check that the policy is on the right device
    assert get_device(device).type == model.policy.device.type

    env = gym.make(env_id)
    vec_env = DummyVecEnv([lambda: gym.make(env_id), lambda: gym.make(env_id)])

    obs = env.reset()
    action, _ = model.predict(obs)
    assert action.shape == env.action_space.shape
    assert env.action_space.contains(action)

    vec_env_obs = vec_env.reset()
    action, _ = model.predict(vec_env_obs)
    assert action.shape[0] == vec_env_obs.shape[0]

    # Special case for DQN to check the epsilon greedy exploration
    if model_class == DQN:
        model.exploration_rate = 1.0
        action, _ = model.predict(obs, deterministic=False)
        assert action.shape == env.action_space.shape
        assert env.action_space.contains(action)

        action, _ = model.predict(vec_env_obs, deterministic=False)
        assert action.shape[0] == vec_env_obs.shape[0]
def test_vecenv_wrapper_getattr():
    def make_env():
        return CustomGymEnv(spaces.Box(low=np.zeros(2), high=np.ones(2)))

    vec_env = DummyVecEnv([make_env for _ in range(N_ENVS)])
    wrapped = CustomWrapperA(CustomWrapperBB(vec_env))
    assert wrapped.var_a == "a"
    assert wrapped.var_b == "b"
    assert wrapped.var_bb == "bb"
    assert wrapped.func_b() == "b"
    assert wrapped.name_test() == CustomWrapperBB

    double_wrapped = CustomWrapperA(CustomWrapperB(wrapped))
    _ = double_wrapped.var_a  # should not raise as it is directly defined here
    with pytest.raises(AttributeError):  # should raise due to ambiguity
        _ = double_wrapped.var_b
    with pytest.raises(AttributeError):  # should raise as does not exist
        _ = double_wrapped.nonexistent_attribute
def __init__(
    self,
    eval_env: Union[gym.Env, VecEnv],
    callback_on_new_best: Optional[BaseCallback] = None,
    n_eval_episodes: int = 5,
    eval_freq: int = 10000,
    log_path: str = None,
    best_model_save_path: str = None,
    deterministic: bool = True,
    render: bool = False,
    verbose: int = 1,
    warn: bool = True,
):
    super(EvalCallback, self).__init__(callback_on_new_best, verbose=verbose)
    self.n_eval_episodes = n_eval_episodes
    self.eval_freq = eval_freq
    self.best_mean_reward = -np.inf
    self.last_mean_reward = -np.inf
    self.deterministic = deterministic
    self.render = render
    self.warn = warn

    # Convert to VecEnv for consistency
    if not isinstance(eval_env, VecEnv):
        eval_env = DummyVecEnv([lambda: eval_env])

    if isinstance(eval_env, VecEnv):
        assert eval_env.num_envs == 1, "You must pass only one environment for evaluation"

    self.eval_env = eval_env
    self.best_model_save_path = best_model_save_path
    # Logs will be written in ``evaluations.npz``
    if log_path is not None:
        log_path = os.path.join(log_path, "evaluations")
    self.log_path = log_path
    self.evaluations_results = []
    self.evaluations_timesteps = []
    self.evaluations_length = []
    # For computing success rate
    self._is_success_buffer = []
    self.evaluations_successes = []
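# Hedged usage sketch for the ``EvalCallback`` constructor above, using the
# standard SB3 entry points; the paths and PPO hyperparameters are illustrative.
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback

_eval_callback = EvalCallback(
    gym.make("CartPole-v1"),        # converted to a DummyVecEnv internally
    n_eval_episodes=5,
    eval_freq=1000,
    log_path="./logs/",             # evaluations.npz is written here
    best_model_save_path="./logs/",
    deterministic=True,
)
_model = PPO("MlpPolicy", "CartPole-v1", verbose=0)
_model.learn(total_timesteps=2000, callback=_eval_callback)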
def test_eval_success_logging(tmp_path):
    n_bits = 2
    env = BitFlippingEnv(n_bits=n_bits)
    eval_env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=n_bits)])
    eval_callback = EvalCallback(
        ObsDictWrapper(eval_env),
        eval_freq=250,
        log_path=tmp_path,
        warn=False,
    )
    model = HER(MlpPolicyDQN, env, DQN, learning_starts=100, seed=0, max_episode_length=n_bits)
    model.learn(500, callback=eval_callback)
    assert len(eval_callback._is_success_buffer) > 0
    # More than 50% success rate
    assert np.mean(eval_callback._is_success_buffer) > 0.5
def test_exclude_include_saved_params(tmp_path, model_class, policy_class):
    """
    Test if the exclude and include parameters of save() work.

    :param model_class: (BaseAlgorithm) A RL model
    """
    env = DummyVecEnv([lambda: select_env(model_class)])

    # create model, set verbose as 2, which is not standard
    model = model_class(policy_class, env, policy_kwargs=dict(net_arch=[16]), verbose=2)

    # Check if exclude works
    model.save(tmp_path / "test_save", exclude=["verbose"])
    del model
    model = model_class.load(str(tmp_path / "test_save.zip"))
    # check if verbose was not saved
    assert model.verbose != 2

    # set verbose to something different than the standard setting
    model.verbose = 2
    # Check if include works
    model.save(tmp_path / "test_save", exclude=["verbose"], include=["verbose"])
    # Load with custom objects
    custom_objects = dict(learning_rate=2e-5, dummy=1.0)
    model = model_class.load(str(tmp_path / "test_save.zip"), custom_objects=custom_objects)
    assert model.verbose == 2
    # Check that the custom object was taken into account
    assert model.learning_rate == custom_objects["learning_rate"]
    # Check that only parameters that are here already are replaced
    assert not hasattr(model, "dummy")

    model = model_class.load(str(tmp_path / "test_save.zip"))
    assert model.verbose == 2

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
def test_discrete(model_class, policy_class, env):
    env_ = DummyVecEnv([lambda: env])
    kwargs = {}
    n_steps = 2000
    if model_class == DQN:
        kwargs = dict(learning_starts=0)
        n_steps = 2000
        # DQN only supports discrete actions
        if isinstance(env, (IdentityEnvMultiDiscrete, IdentityEnvMultiBinary)):
            return

    model = model_class(policy_class, env_, gamma=0.4, seed=1, **kwargs).learn(n_steps)

    evaluate_policy(model, env_, n_eval_episodes=20, reward_threshold=70, warn=False)
    obs = env.reset()

    assert np.shape(model.predict(obs)[0]) == np.shape(obs)
def test_vec_env_is_wrapped():
    # Test is_wrapped call of subproc workers
    def make_env():
        return CustomGymEnv(spaces.Box(low=np.zeros(2), high=np.ones(2)))

    def make_monitored_env():
        return Monitor(CustomGymEnv(spaces.Box(low=np.zeros(2), high=np.ones(2))))

    # One with monitor, one without
    vec_env = SubprocVecEnv([make_env, make_monitored_env])

    assert vec_env.env_is_wrapped(Monitor) == [False, True]

    vec_env.close()

    # One with monitor, one without
    vec_env = DummyVecEnv([make_env, make_monitored_env])

    assert vec_env.env_is_wrapped(Monitor) == [False, True]

    vec_env = VecFrameStack(vec_env, n_stack=2)
    assert vec_env.env_is_wrapped(Monitor) == [False, True]
def test_check_nan():
    """Test VecCheckNan Object"""
    env = DummyVecEnv([NanAndInfEnv])
    env = VecCheckNan(env, raise_exception=True)

    env.step([[0]])

    with pytest.raises(ValueError):
        env.step([[float("NaN")]])

    with pytest.raises(ValueError):
        env.step([[float("inf")]])

    with pytest.raises(ValueError):
        env.step([[-1]])

    with pytest.raises(ValueError):
        env.step([[1]])

    env.step(np.array([[0, 1], [0, 1]]))

    env.reset()
def test_save_load(tmp_path, model_class, policy_class):
    """
    Test that 'save' and 'load' save and load the model correctly,
    and that 'get_parameters' and 'set_parameters' work correctly.

    .. warning::
        This does not test loading of the optimizer parameters.

    :param model_class: (BaseAlgorithm) A RL model
    """
    env = DummyVecEnv([lambda: select_env(model_class)])

    # create model
    model = model_class(policy_class, env, policy_kwargs=dict(net_arch=[16]), verbose=1)
    model.learn(total_timesteps=N_STEPS_SMALL)

    env.reset()
    observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    # Get parameters of different objects
    # deepcopy to avoid referencing to tensors we are about to modify
    original_params = deepcopy(model.get_parameters())

    # Test different error cases of set_parameters.
    # Test that invalid object names throw errors
    invalid_object_params = deepcopy(original_params)
    invalid_object_params["I_should_not_be_a_valid_object"] = "and_I_am_an_invalid_tensor"
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=True)
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=False)

    # Test that exact_match catches when something was missed.
    missing_object_params = dict((k, v) for k, v in list(original_params.items())[:-1])
    with pytest.raises(ValueError):
        model.set_parameters(missing_object_params, exact_match=True)

    # Test that exact_match catches when something inside state-dict
    # is missing but we have exact_match.
    missing_state_dict_tensor_params = {}
    for object_name in original_params:
        object_params = {}
        missing_state_dict_tensor_params[object_name] = object_params
        # Skip last item in state-dict
        for k, v in list(original_params[object_name].items())[:-1]:
            object_params[k] = v
    with pytest.raises(RuntimeError):
        # PyTorch load_state_dict throws RuntimeError if strict but
        # invalid state-dict.
        model.set_parameters(missing_state_dict_tensor_params, exact_match=True)

    # Test that parameters do indeed change.
    random_params = {}
    for object_name, params in original_params.items():
        # Do not randomize optimizer parameters (custom layout)
        if "optim" in object_name:
            random_params[object_name] = params
        else:
            # Again, skip the last item in state-dict
            random_params[object_name] = OrderedDict(
                (param_name, th.rand_like(param)) for param_name, param in list(params.items())[:-1]
            )

    # Update model parameters with the new random values
    model.set_parameters(random_params, exact_match=False)

    new_params = model.get_parameters()
    # Check that all params except the final item in each state-dict are different.
    for object_name in original_params:
        # Skip optimizers (no valid comparison with just th.allclose)
        if "optim" in object_name:
            continue
        # state-dicts use ordered dictionaries, so key order
        # is guaranteed.
        last_key = list(original_params[object_name].keys())[-1]
        for k in original_params[object_name]:
            if k == last_key:
                # Should be same as before
                assert th.allclose(
                    original_params[object_name][k], new_params[object_name][k]
                ), "Parameter changed despite not included in the loaded parameters."
            else:
                # Should be different
                assert not th.allclose(
                    original_params[object_name][k], new_params[object_name][k]
                ), "Parameters did not change as expected."
    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # Check if the model loads as expected for every possible choice of device:
    for device in ["auto", "cpu", "cuda"]:
        model = model_class.load(str(tmp_path / "test_save.zip"), env=env, device=device)

        # check if the model was loaded to the correct device
        assert model.device.type == get_device(device).type
        assert model.policy.device.type == get_device(device).type

        # check if params are still the same after load
        new_params = model.get_parameters()

        # Check that all params are the same as before save load procedure now
        for object_name in new_params:
            # Skip optimizers (no valid comparison with just th.allclose)
            if "optim" in object_name:
                continue
            for key in params[object_name]:
                assert new_params[object_name][key].device.type == get_device(device).type
                assert th.allclose(
                    params[object_name][key].to("cpu"), new_params[object_name][key].to("cpu")
                ), "Model parameters not the same after save and load."

        # check if model still selects the same actions
        new_selected_actions, _ = model.predict(observations, deterministic=True)
        assert np.allclose(selected_actions, new_selected_actions, 1e-4)

        # check if learn still works
        model.learn(total_timesteps=N_STEPS_SMALL)

        del model

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
def test_save_load_q_net(tmp_path, model_class, policy_class):
    """
    Test saving and loading q-network/quantile net only.

    :param model_class: (BaseAlgorithm) A RL model
    :param policy_class: Policy class to use.
    """
    kwargs = dict(policy_kwargs=dict(net_arch=[16]))
    if "Cnn" not in str(policy_class):
        # MlpPolicy
        env = select_env(model_class)
    else:
        if model_class in [DQN]:
            # Avoid memory error when using replay buffer
            # Reduce the size of the features
            kwargs = dict(
                buffer_size=250,
                learning_starts=100,
                policy_kwargs=dict(features_extractor_kwargs=dict(features_dim=32)),
            )
        env = FakeImageEnv(
            screen_height=40, screen_width=40, n_channels=2, discrete=model_class == DQN
        )

    env = DummyVecEnv([lambda: env])

    # create model
    model = model_class(policy_class, env, verbose=1, **kwargs)
    model.learn(total_timesteps=N_STEPS_SMALL)

    env.reset()
    observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    q_net = model.q_net
    q_net_class = q_net.__class__

    # Get dictionary of current parameters
    params = deepcopy(q_net.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())

    # Update model parameters with the new random values
    q_net.load_state_dict(random_params)

    new_params = q_net.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = q_net.predict(observations, deterministic=True)

    # Save and load q_net
    q_net.save(tmp_path / "q_net.pkl")

    del q_net

    q_net = q_net_class.load(tmp_path / "q_net.pkl")

    # check if params are still the same after load
    new_params = q_net.state_dict()

    # Check that all params are the same as before save load procedure now
    for key in params:
        assert th.allclose(params[key], new_params[key]), "Policy parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = q_net.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # clear file from os
    os.remove(tmp_path / "q_net.pkl")
def test_framestack_vecenv():
    """Test that framestack environment stacks on desired axis"""

    image_space_shape = [12, 8, 3]
    zero_acts = np.zeros([N_ENVS] + image_space_shape)

    transposed_image_space_shape = image_space_shape[::-1]
    transposed_zero_acts = np.zeros([N_ENVS] + transposed_image_space_shape)

    def make_image_env():
        return CustomGymEnv(
            spaces.Box(
                low=np.zeros(image_space_shape),
                high=np.ones(image_space_shape) * 255,
                dtype=np.uint8,
            )
        )

    def make_transposed_image_env():
        return CustomGymEnv(
            spaces.Box(
                low=np.zeros(transposed_image_space_shape),
                high=np.ones(transposed_image_space_shape) * 255,
                dtype=np.uint8,
            )
        )

    def make_non_image_env():
        return CustomGymEnv(spaces.Box(low=np.zeros((2,)), high=np.ones((2,))))

    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2)
    obs, _, _, _ = vec_env.step(zero_acts)
    vec_env.close()

    # Should be stacked on the last dimension
    assert obs.shape[-1] == (image_space_shape[-1] * 2)

    # Try automatic stacking on first dimension now
    vec_env = DummyVecEnv([make_transposed_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2)
    obs, _, _, _ = vec_env.step(transposed_zero_acts)
    vec_env.close()

    # Should be stacked on the first dimension (note the transposing in make_transposed_image_env)
    assert obs.shape[1] == (image_space_shape[-1] * 2)

    # Try forcing dimensions
    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2, channels_order="last")
    obs, _, _, _ = vec_env.step(zero_acts)
    vec_env.close()

    # Should be stacked on the last dimension
    assert obs.shape[-1] == (image_space_shape[-1] * 2)

    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2, channels_order="first")
    obs, _, _, _ = vec_env.step(zero_acts)
    vec_env.close()

    # Should be stacked on the first dimension
    assert obs.shape[1] == (image_space_shape[0] * 2)

    # Test invalid channels_order
    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    with pytest.raises(AssertionError):
        vec_env = VecFrameStack(vec_env, n_stack=2, channels_order="not_valid")

    # Test that it works with non-image envs when no channels_order is given
    vec_env = DummyVecEnv([make_non_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2)
def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(
        eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0
    )

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()
    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
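# The pattern the test above exercises, in hedged sketch form: a training env
# keeps updating its normalization statistics while a frozen evaluation copy is
# synced to it right before evaluation (the env id below is illustrative).
import gym

from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, sync_envs_normalization

_train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
_eval_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]), training=False, norm_reward=False)
# ... collect experience on _train_env ...
sync_envs_normalization(_train_env, _eval_env)  # copy running mean/std to the eval env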
def _check_nan(env: gym.Env) -> None:
    """Check for Inf and NaN using the VecWrapper."""
    vec_env = VecCheckNan(DummyVecEnv([lambda: env]))
    for _ in range(10):
        action = np.array([env.action_space.sample()])
        _, _, _, _ = vec_env.step(action)
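# Hedged usage sketch for the helper above: wrap any env and step it a few
# times; ``VecCheckNan`` raises a ValueError if a NaN or Inf shows up and
# passes silently otherwise.
import gym

_check_nan(gym.make("CartPole-v1"))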