def test_erwr_cartpole(self):
    """Test ERWR with Cartpole-v1 environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        deterministic.set_seed(1)
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = ERWR(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99)
        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 60
        env.close()
def test_cma_es_cartpole(self):
    """Test CMAES with Cartpole-v1 environment."""
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        n_samples = 20
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     sampler=sampler,
                     n_samples=n_samples)
        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=1000)
        # No assertion on return because CMAES is not stable.
        env.close()
def test_reps_cartpole(self):
    """Test REPS with gym Cartpole environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = GymEnv('CartPole-v0')
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99)
        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5
        env.close()
def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum-v2 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = GymEnv('InvertedDoublePendulum-v2')
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e5))
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            max_episode_length=100,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(5e3),
            exploration_policy=exploration_policy,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 60
        env.close()
class TestDiscretePolicies(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = GymEnv(DummyDiscreteEnv())

    def teardown_method(self):
        self.env.close()
        super().teardown_method()

    def test_categorical_gru_policy(self):
        categorical_gru_policy = CategoricalGRUPolicy(
            env_spec=self.env, hidden_dim=1, state_include_action=False)
        categorical_gru_policy.reset()

        obs = self.env.observation_space.high
        assert categorical_gru_policy.get_action(obs)

    def test_categorical_lstm_policy(self):
        categorical_lstm_policy = CategoricalLSTMPolicy(
            env_spec=self.env, hidden_dim=1, state_include_action=False)
        categorical_lstm_policy.reset()

        obs = self.env.observation_space.high
        assert categorical_lstm_policy.get_action(obs)

    def test_categorical_mlp_policy(self):
        categorical_mlp_policy = CategoricalMLPPolicy(env_spec=self.env,
                                                      hidden_sizes=(1, ))
        obs = self.env.observation_space.high
        assert categorical_mlp_policy.get_action(obs)
def test_cem_cartpole(self):
    """Test CEM with Cartpole-v1 environment."""
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        n_samples = 10
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   sampler=sampler,
                   best_frac=0.1,
                   n_samples=n_samples)
        trainer.setup(algo, env)
        rtn = trainer.train(n_epochs=10, batch_size=2048)
        assert rtn > 40
        env.close()
def test_dqn_cartpole_pickle(self):
    """Test that DQN on CartPole survives a pickle round trip."""
    deterministic.set_seed(100)
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = GymEnv('CartPole-v0')
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
        epsilon_greedy_policy = EpsilonGreedyPolicy(
            env_spec=env.spec,
            policy=policy,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        sampler = LocalSampler(
            agents=epsilon_greedy_policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=FragmentWorker)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=epsilon_greedy_policy,
                   replay_buffer=replay_buffer,
                   sampler=sampler,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=False,
                   n_train_steps=500,
                   grad_norm_clipping=5.0,
                   steps_per_epoch=steps_per_epoch,
                   target_network_update_freq=1,
                   buffer_batch_size=32)
        trainer.setup(algo, env)
        with tf.compat.v1.variable_scope(
                'DiscreteMLPQFunction/mlp/hidden_0', reuse=True):
            bias = tf.compat.v1.get_variable('bias')
            # Assign all-ones to the bias so the round trip is observable.
            old_bias = tf.ones_like(bias).eval()
            bias.load(old_bias)

        h = pickle.dumps(algo)
        with tf.compat.v1.Session(graph=tf.Graph()):
            pickle.loads(h)
            with tf.compat.v1.variable_scope(
                    'DiscreteMLPQFunction/mlp/hidden_0', reuse=True):
                new_bias = tf.compat.v1.get_variable('bias')
                new_bias = new_bias.eval()
                assert np.array_equal(old_bias, new_bias)

        env.close()
def test_baseline(self):
    """Test the baseline initialization."""
    box_env = GymEnv(DummyBoxEnv())
    deterministic_mlp_baseline = ContinuousMLPBaseline(env_spec=box_env)
    gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)

    self.sess.run(tf.compat.v1.global_variables_initializer())
    deterministic_mlp_baseline.get_param_values()
    gaussian_mlp_baseline.get_param_values()

    box_env.close()
def test_process_env_argument():
    env = GymEnv(env=gym.make('MountainCar-v0'))
    env.close()
    env = GymEnv(env='MountainCar-v0')
    env.close()
    env = GymEnv(gym.make('MountainCar-v0'))
    env.close()
    env = GymEnv('MountainCar-v0')
    env.close()
    with pytest.raises(ValueError, match='GymEnv can take env'):
        env = GymEnv(1)
        env.close()
def dqn_cartpole(ctxt=None, seed=24):
    """Train DQN with CartPole-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    trainer = Trainer(ctxt)

    n_epochs = 100
    steps_per_epoch = 10
    sampler_batch_size = 512
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
    env = GymEnv('CartPole-v0')
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
    exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                             policy=policy,
                                             total_timesteps=num_timesteps,
                                             max_epsilon=1.0,
                                             min_epsilon=0.01,
                                             decay_ratio=0.4)
    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    algo = DQN(env_spec=env.spec,
               policy=policy,
               qf=qf,
               exploration_policy=exploration_policy,
               replay_buffer=replay_buffer,
               sampler=sampler,
               steps_per_epoch=steps_per_epoch,
               qf_lr=5e-5,
               discount=0.9,
               min_buffer_size=int(1e4),
               n_train_steps=500,
               target_update_freq=30,
               buffer_batch_size=64)

    trainer.setup(algo, env)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
    env.close()
def test_dqn_cartpole_grad_clip(self):
    """Test DQN with gradient clipping on CartPole environment."""
    deterministic.set_seed(100)
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = GymEnv('CartPole-v0')
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
        epsilon_greedy_policy = EpsilonGreedyPolicy(
            env_spec=env.spec,
            policy=policy,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        sampler = LocalSampler(
            agents=epsilon_greedy_policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=FragmentWorker)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=epsilon_greedy_policy,
                   replay_buffer=replay_buffer,
                   sampler=sampler,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=False,
                   n_train_steps=500,
                   grad_norm_clipping=5.0,
                   steps_per_epoch=steps_per_epoch,
                   target_network_update_freq=1,
                   buffer_batch_size=32)
        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=n_epochs,
                                     batch_size=sampler_batch_size)
        assert last_avg_ret > 8.8
        env.close()
def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum-v2 environment."""
    deterministic.set_seed(0)
    trainer = Trainer(snapshot_config)
    env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)
    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)
    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                   policy,
                                                   sigma=0.2)
    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                sampler=sampler,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9)
    trainer.setup(algo, env)
    last_avg_ret = trainer.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 45
    env.close()
def test_cem_cartpole(self):
    """Test CEM with Cartpole-v1 environment."""
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        n_samples = 10
        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   best_frac=0.1,
                   n_samples=n_samples)
        trainer.setup(algo, env, sampler_cls=LocalSampler)
        rtn = trainer.train(n_epochs=10, batch_size=2048)
        assert rtn > 40
        env.close()
def test_reps_cartpole(self):
    """Test REPS with gym Cartpole environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = GymEnv('CartPole-v0')
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99)
        runner.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5
        env.close()
def test_cma_es_cartpole(self):
    """Test CMAES with Cartpole-v1 environment."""
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        n_samples = 20
        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     n_samples=n_samples)
        trainer.setup(algo, env, sampler_cls=LocalSampler)
        trainer.train(n_epochs=1, batch_size=1000)
        # No assertion on return because CMAES is not stable.
        env.close()
def test_dqn_cartpole_double_q(self):
    """Test double DQN with CartPole environment."""
    deterministic.set_seed(100)
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = GymEnv('CartPole-v0')
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        epsilon_greedy_policy = EpsilonGreedyPolicy(
            env_spec=env.spec,
            policy=policy,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=epsilon_greedy_policy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=True,
                   n_train_steps=500,
                   steps_per_epoch=steps_per_epoch,
                   target_network_update_freq=1,
                   buffer_batch_size=32)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=n_epochs,
                                    batch_size=sampler_batch_size)
        assert last_avg_ret > 8.8
        env.close()
class TestContinuousPolicies(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = GymEnv(DummyBoxEnv())

    def teardown_method(self):
        self.env.close()
        super().teardown_method()

    def test_continuous_mlp_policy(self):
        continuous_mlp_policy = ContinuousMLPPolicy(env_spec=self.env,
                                                    hidden_sizes=(1, ))
        obs = self.env.observation_space.high
        assert continuous_mlp_policy.get_action(obs)

    def test_gaussian_gru_policy(self):
        gaussian_gru_policy = GaussianGRUPolicy(env_spec=self.env,
                                                hidden_dim=1,
                                                state_include_action=False)
        gaussian_gru_policy.reset()

        obs = self.env.observation_space.high
        assert gaussian_gru_policy.get_action(obs)

    def test_gaussian_lstm_policy(self):
        gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=self.env,
                                                  hidden_dim=1,
                                                  state_include_action=False)
        gaussian_lstm_policy.reset()

        obs = self.env.observation_space.high
        assert gaussian_lstm_policy.get_action(obs)

    def test_gaussian_mlp_policy(self):
        gaussian_mlp_policy = GaussianMLPPolicy(env_spec=self.env,
                                                hidden_sizes=(1, ))
        obs = self.env.observation_space.high
        assert gaussian_mlp_policy.get_action(obs)
def test_vpg_cartpole(self):
    """Test VPG with CartPole-v1 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        runner.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 90
        env.close()
def test_erwr_cartpole(self):
    """Test ERWR with Cartpole-v1 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        deterministic.set_seed(1)
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = ERWR(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99)
        runner.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 60
        env.close()
def test_cem_cartpole(self):
    """Test CEM with Cartpole-v1 environment."""
    with LocalTFRunner(snapshot_config) as runner:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        n_samples = 10
        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.1,
                   max_episode_length=100,
                   n_samples=n_samples)
        runner.setup(algo, env, sampler_cls=LocalSampler)
        rtn = runner.train(n_epochs=10, batch_size=2048)
        assert rtn > 40
        env.close()
class TestVPG:
    """Test class for VPG."""

    @classmethod
    def setup_class(cls):
        """Setup method which is called once before all tests in this class."""
        deterministic.set_seed(0)

    def setup_method(self):
        """Setup method which is called before every test."""
        self._env = GymEnv('InvertedDoublePendulum-v2')
        self._runner = LocalRunner(snapshot_config)
        self._policy = GaussianMLPPolicy(env_spec=self._env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=torch.tanh,
                                         output_nonlinearity=None)
        self._params = {
            'env_spec': self._env.spec,
            'policy': self._policy,
            'value_function':
            GaussianMLPValueFunction(env_spec=self._env.spec),
            'max_episode_length': 100,
            'discount': 0.99,
        }

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self._env.close()

    @pytest.mark.mujoco
    def test_vpg_no_entropy(self):
        """Test VPG with no_entropy."""
        self._params['positive_adv'] = True
        self._params['use_softplus_entropy'] = True

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env, sampler_cls=LocalSampler)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    @pytest.mark.mujoco
    def test_vpg_max(self):
        """Test VPG with maximum entropy."""
        self._params['center_adv'] = False
        self._params['stop_entropy_gradient'] = True
        self._params['entropy_method'] = 'max'

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env, sampler_cls=LocalSampler)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    @pytest.mark.mujoco
    def test_vpg_regularized(self):
        """Test VPG with entropy_regularized."""
        self._params['entropy_method'] = 'regularized'

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env, sampler_cls=LocalSampler)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    @pytest.mark.mujoco
    @pytest.mark.parametrize('algo_param, error, msg', INVALID_ENTROPY_CONFIG)
    def test_invalid_entropy_config(self, algo_param, error, msg):
        """Test VPG with invalid entropy config."""
        self._params.update(algo_param)
        with pytest.raises(error, match=msg):
            VPG(**self._params)
#!/usr/bin/env python3
"""Example of how to load, step, and visualize an environment."""
import argparse

from garage.envs import GymEnv

parser = argparse.ArgumentParser()
parser.add_argument('--n_steps',
                    type=int,
                    default=1000,
                    help='Number of steps to run')
args = parser.parse_args()

# Construct the environment
env = GymEnv('MountainCar-v0')

# Reset the environment and launch the viewer
env.reset()
env.visualize()

# Step the environment with random actions until the episode ends or the
# step budget is exhausted.
step_count = 0
es = env.step(env.action_space.sample())
while not es.last and step_count < args.n_steps:
    es = env.step(env.action_space.sample())
    step_count += 1

env.close()
def test_get_time_limit_max_ep_len_not_equal_timeout():
    env = GymEnv('CartPoleBulletEnv-v1', max_episode_length=1)
    assert env.spec.max_episode_length == 1
    env.close()
def test_closes_mujoco():
    garage_env = GymEnv('Ant-v2')
    garage_env.visualize()
    assert garage_env._env.viewer is not None
    garage_env.close()
    assert garage_env._env.viewer is None
def test_closes_box2d():
    garage_env = GymEnv('CarRacing-v0')
    garage_env.visualize()
    assert garage_env._env.viewer is not None
    garage_env.close()
    assert garage_env._env.viewer is None
def test_get_time_limit_detects_inconsistency():
    with pytest.raises(RuntimeError):
        env = GymEnv('CartPoleBulletEnv-v1', max_episode_length=math.inf)
        env.close()