def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum-v2 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e5),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(5e3),
            exploration_strategy=action_noise,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 60
        env.close()
def test_trpo_cnn_cubecrash(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('CubeCrash-v0')))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64),
                                      conv_filter_sizes=(8, 4),
                                      conv_strides=(4, 2),
                                      conv_pad='VALID',
                                      hidden_sizes=(32, 32))
        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64),
                                           filter_dims=(8, 4),
                                           strides=(4, 2),
                                           padding='VALID',
                                           hidden_sizes=(32, 32),
                                           use_trust_region=True))
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.98,
                    max_kl_step=0.01,
                    policy_ent_coeff=0.0,
                    flatten_input=False)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > -0.9
        env.close()
def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(DmControlEnv.from_suite(*task))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=5,
            discount=0.99,
            max_kl_step=0.01,
        )
        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=10)
        env.close()
def test_ppo_pendulum_recurrent_continuous_baseline(self):
    """Test PPO with InvertedDoublePendulum-v2 and a recurrent policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 100
        env.close()
def test_cem_cartpole(self):
    """Test CEM with CartPole-v1 environment."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        n_samples = 10
        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.1,
                   max_path_length=100,
                   n_samples=n_samples)
        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        rtn = runner.train(n_epochs=10, batch_size=2048)
        assert rtn > 40
        env.close()
def test_vpg_cartpole(self):
    """Test VPG with CartPole-v1 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(env_name='CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01)))
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 90
        env.close()
class TestSampler:
    """Uses a mock (scripted) policy on a 4x4 GridWorldEnv.

    '4x4': [
        'SFFF',
        'FHFH',
        'FFFH',
        'HFFG'
    ]

    Actions:
        0: left, 1: down, 2: right, 3: up, -1: no move

    Tiles:
        'S': starting point
        'F' or '.': free space
        'W' or 'x': wall
        'H' or 'o': hole (terminates episode)
        'G': goal

    Scripted actions: [2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1]
    """

    def setup_method(self):
        self.env = TfEnv(GridWorldEnv(desc='4x4'))
        self.policy = ScriptedPolicy(
            scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
        self.algo = Mock(env_spec=self.env.spec,
                         policy=self.policy,
                         max_path_length=16)

    def teardown_method(self):
        self.env.close()

    def test_ray_batch_sampler(self):
        # NOTE: the original call was truncated ('SimpleSampler(self.)');
        # the (algo, env) arguments are assumed to mirror
        # OnPolicyVectorizedSampler's below.
        sampler1 = SimpleSampler(self.algo, self.env)
        sampler1.start_worker()
        sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
        sampler2.start_worker()
        trajs1 = sampler1.obtain_samples(0, 16)
        trajs2 = sampler2.obtain_samples(0, 1)
        assert (trajs1[0]['observations'].shape == np.array(
            trajs2[0]['observations']).shape == (6, ))
        traj2_action_shape = np.array(trajs2[0]['actions']).shape
        assert (trajs1[0]['actions'].shape == traj2_action_shape == (6, ))
        assert (sum(trajs1[0]['rewards']) == sum(trajs2[0]['rewards']) == 1)

        true_obs = np.array([0, 1, 2, 6, 10, 14])
        true_actions = np.array([2, 2, 1, 1, 1, 2])
        true_rewards = np.array([0, 0, 0, 0, 0, 1])
        for trajectory in trajs1:
            assert np.array_equal(trajectory['observations'], true_obs)
            assert np.array_equal(trajectory['actions'], true_actions)
            assert np.array_equal(trajectory['rewards'], true_rewards)
        sampler1.shutdown_worker()
        sampler2.shutdown_worker()
def test_baseline(self):
    """Test the baseline initialization."""
    box_env = TfEnv(DummyBoxEnv())
    deterministic_mlp_baseline = ContinuousMLPBaseline(env_spec=box_env)
    gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)
    self.sess.run(tf.compat.v1.global_variables_initializer())
    deterministic_mlp_baseline.get_param_values()
    gaussian_mlp_baseline.get_param_values()
    box_env.close()
def test_dqn_cartpole_pickle(self):
    """Test DQN pickling with CartPole environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = TfEnv(gym.make('CartPole-v0'))
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e4),
                                           time_horizon=1)
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        epsilon_greedy_strategy = EpsilonGreedyStrategy(
            env_spec=env.spec,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_strategy=epsilon_greedy_strategy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=False,
                   n_train_steps=500,
                   grad_norm_clipping=5.0,
                   steps_per_epoch=steps_per_epoch,
                   target_network_update_freq=1,
                   buffer_batch_size=32)
        runner.setup(algo, env)
        with tf.compat.v1.variable_scope(
                'DiscreteMLPQFunction/MLPModel/mlp/hidden_0', reuse=True):
            bias = tf.compat.v1.get_variable('bias')
            # assign the bias to all ones
            old_bias = tf.ones_like(bias).eval()
            bias.load(old_bias)
        h = pickle.dumps(algo)
        with tf.compat.v1.Session(graph=tf.Graph()):
            pickle.loads(h)
            with tf.compat.v1.variable_scope(
                    'DiscreteMLPQFunction/MLPModel/mlp/hidden_0',
                    reuse=True):
                new_bias = tf.compat.v1.get_variable('bias')
                new_bias = new_bias.eval()
                assert np.array_equal(old_bias, new_bias)
        env.close()
class TestContinuousPolicies(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(DummyBoxEnv())

    def teardown_method(self):
        self.env.close()
        super().teardown_method()

    def test_continuous_mlp_policy(self):
        continuous_mlp_policy = ContinuousMLPPolicy(env_spec=self.env,
                                                    hidden_sizes=(1, ))
        self.sess.run(tf.compat.v1.global_variables_initializer())
        obs = self.env.observation_space.high
        assert continuous_mlp_policy.get_action(obs)

    def test_gaussian_gru_policy(self):
        gaussian_gru_policy = GaussianGRUPolicy(env_spec=self.env,
                                                hidden_dim=1)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        gaussian_gru_policy.reset()
        obs = self.env.observation_space.high
        assert gaussian_gru_policy.get_action(obs)

    def test_gaussian_lstm_policy(self):
        gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=self.env,
                                                  hidden_dim=1)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        gaussian_lstm_policy.reset()
        obs = self.env.observation_space.high
        assert gaussian_lstm_policy.get_action(obs)

    def test_gaussian_mlp_policy(self):
        gaussian_mlp_policy = GaussianMLPPolicy(env_spec=self.env,
                                                hidden_sizes=(1, ))
        self.sess.run(tf.compat.v1.global_variables_initializer())
        obs = self.env.observation_space.high
        assert gaussian_mlp_policy.get_action(obs)
class TestRaySamplerTF:
    """Uses a mock (scripted) policy on a 4x4 GridWorldEnv.

    '4x4': [
        'SFFF',
        'FHFH',
        'FFFH',
        'HFFG'
    ]

    Actions:
        0: left, 1: down, 2: right, 3: up, -1: no move

    Tiles:
        'S': starting point
        'F' or '.': free space
        'W' or 'x': wall
        'H' or 'o': hole (terminates episode)
        'G': goal

    Scripted actions: [2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1]
    """

    def setup_method(self):
        ray.init(local_mode=True, ignore_reinit_error=True)
        self.env = TfEnv(GridWorldEnv(desc='4x4'))
        self.policy = ScriptedPolicy(
            scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
        self.algo = Mock(env_spec=self.env.spec,
                         policy=self.policy,
                         max_path_length=16)

    def teardown_method(self):
        self.env.close()

    def test_ray_batch_sampler(self):
        workers = WorkerFactory(seed=100,
                                max_path_length=self.algo.max_path_length)
        sampler1 = RaySampler(workers, self.policy, self.env)
        sampler1.start_worker()
        sampler1.shutdown_worker()
class TestNormalizedGym:

    def setup_method(self):
        self.env = TfEnv(
            normalize(gym.make('Pendulum-v0'),
                      normalize_reward=True,
                      normalize_obs=True,
                      flatten_obs=True))

    def teardown_method(self):
        self.env.close()

    def test_does_not_modify_action(self):
        a = self.env.action_space.sample()
        # Copy the array; aliasing it would make the assertion vacuous.
        a_copy = a.copy()
        self.env.reset()
        self.env.step(a)
        assert np.array_equal(a, a_copy)

    def test_flatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                self.env.render()
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                assert next_obs.shape == self.env.observation_space.low.shape
                if done:
                    break

    def test_unflatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                # yapf: disable
                assert (self.env.observation_space.flatten(next_obs).shape
                        == (self.env.observation_space.flat_dim, ))
                # yapf: enable
                if done:
                    break
def test_dqn_cartpole_grad_clip(self):
    """Test DQN with CartPole environment and gradient clipping."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = TfEnv(gym.make('CartPole-v0'))
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e4),
                                           time_horizon=1)
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        epsilon_greedy_strategy = EpsilonGreedyStrategy(
            env_spec=env.spec,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_strategy=epsilon_greedy_strategy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=False,
                   n_train_steps=500,
                   grad_norm_clipping=5.0,
                   steps_per_epoch=steps_per_epoch,
                   target_network_update_freq=1,
                   buffer_batch_size=32)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=n_epochs,
                                    batch_size=sampler_batch_size)
        assert last_avg_ret > 15
        env.close()
def test_reps_cartpole(self):
    """Test REPS with gym CartPole environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(gym.make('CartPole-v0'))
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5
        env.close()
def test_trpo_lstm_cartpole(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('CartPole-v1')))
        policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)))
        snapshotter.snapshot_dir = './'
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
        env.close()
def test_erwr_cartpole(self):
    """Test ERWR with CartPole-v1 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(env_name='CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = ERWR(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 80
        env.close()
def test_on_policy_vectorized_sampler_n_envs(self, cpus, n_envs,
                                             expected_n_envs):
    with LocalTFRunner(snapshot_config, sess=self.sess,
                       max_cpus=cpus) as runner:
        env = TfEnv(gym.make('CartPole-v0'))
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)
        runner.setup(algo, env, sampler_args=dict(n_envs=n_envs))
        assert isinstance(runner._sampler, OnPolicyVectorizedSampler)
        assert runner._sampler._n_envs == expected_n_envs
        env.close()
class TestDiscretePolicies(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(DummyDiscreteEnv())

    def teardown_method(self):
        self.env.close()
        super().teardown_method()

    def test_categorical_gru_policy(self):
        categorical_gru_policy = CategoricalGRUPolicy(env_spec=self.env,
                                                      hidden_dim=1)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        categorical_gru_policy.reset()
        obs = self.env.observation_space.high
        assert categorical_gru_policy.get_action(obs)

    def test_categorical_lstm_policy(self):
        categorical_lstm_policy = CategoricalLSTMPolicy(env_spec=self.env,
                                                        hidden_dim=1)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        categorical_lstm_policy.reset()
        obs = self.env.observation_space.high
        assert categorical_lstm_policy.get_action(obs)

    def test_categorical_mlp_policy(self):
        categorical_mlp_policy = CategoricalMLPPolicy(env_spec=self.env,
                                                      hidden_sizes=(1, ))
        self.sess.run(tf.compat.v1.global_variables_initializer())
        obs = self.env.observation_space.high
        assert categorical_mlp_policy.get_action(obs)
def test_tnpg_inverted_pendulum(self):
    """Test TNPG with InvertedPendulum-v2 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TNPG(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    optimizer_args=dict(reg_coeff=5e-1))
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 15
        env.close()
def test_categorical_policies(self, policy_cls):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('CartPole-v0')))
        policy = policy_cls(name='policy', env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                base_eps=1e-5)),
        )
        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=4000)
        env.close()
def test_cma_es_cartpole(self):
    """Test CMAES with CartPole-v1 environment."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        n_samples = 20
        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     max_path_length=100,
                     n_samples=n_samples)
        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=1, batch_size=1000)
        # No assertion on return because CMAES is not stable.
        env.close()
class TestPPO(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        self.lstm_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
        self.gru_policy = GaussianGRUPolicy(env_spec=self.env.spec)
        self.baseline = GaussianMLPBaseline(
            env_spec=self.env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )

    # large marker removed to balance test jobs
    # @pytest.mark.large
    def test_ppo_pendulum(self):
        """Test PPO with InvertedDoublePendulum-v2 environment."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = PPO(env_spec=self.env.spec,
                       policy=self.policy,
                       baseline=self.baseline,
                       max_path_length=100,
                       discount=0.99,
                       lr_clip_range=0.01,
                       optimizer_args=dict(batch_size=32, max_epochs=10))
            runner.setup(algo, self.env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 35

    # large marker removed to balance test jobs
    # @pytest.mark.large
    def test_ppo_with_maximum_entropy(self):
        """Test PPO with maximum entropy method."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = PPO(env_spec=self.env.spec,
                       policy=self.policy,
                       baseline=self.baseline,
                       max_path_length=100,
                       discount=0.99,
                       lr_clip_range=0.01,
                       optimizer_args=dict(batch_size=32, max_epochs=10),
                       stop_entropy_gradient=True,
                       entropy_method='max',
                       policy_ent_coeff=0.02,
                       center_adv=False)
            runner.setup(algo, self.env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 35

    # large marker removed to balance test jobs
    # @pytest.mark.large
    def test_ppo_with_neg_log_likeli_entropy_estimation_and_max(self):
        """Test PPO with negative log likelihood entropy estimation and
        max entropy method.
        """
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = PPO(env_spec=self.env.spec,
                       policy=self.policy,
                       baseline=self.baseline,
                       max_path_length=100,
                       discount=0.99,
                       lr_clip_range=0.01,
                       optimizer_args=dict(batch_size=32, max_epochs=10),
                       stop_entropy_gradient=True,
                       use_neg_logli_entropy=True,
                       entropy_method='max',
                       policy_ent_coeff=0.02,
                       center_adv=False)
            runner.setup(algo, self.env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 35

    # large marker removed to balance test jobs
    # @pytest.mark.large
    def test_ppo_with_neg_log_likeli_entropy_estimation_and_regularized(self):
        """Test PPO with negative log likelihood entropy estimation and
        regularized entropy method.
""" with LocalTFRunner(snapshot_config, sess=self.sess) as runner: algo = PPO(env_spec=self.env.spec, policy=self.policy, baseline=self.baseline, max_path_length=100, discount=0.99, lr_clip_range=0.01, optimizer_args=dict(batch_size=32, max_epochs=10), stop_entropy_gradient=True, use_neg_logli_entropy=True, entropy_method='regularized', policy_ent_coeff=0.0, center_adv=True) runner.setup(algo, self.env) last_avg_ret = runner.train(n_epochs=10, batch_size=2048) assert last_avg_ret > 35 # large marker removed to balance test jobs # @pytest.mark.large def test_ppo_with_regularized_entropy(self): """Test PPO with regularized entropy method.""" with LocalTFRunner(snapshot_config, sess=self.sess) as runner: algo = PPO(env_spec=self.env.spec, policy=self.policy, baseline=self.baseline, max_path_length=100, discount=0.99, lr_clip_range=0.01, optimizer_args=dict(batch_size=32, max_epochs=10), stop_entropy_gradient=False, entropy_method='regularized', policy_ent_coeff=0.02, center_adv=True, flatten_input=False) runner.setup(algo, self.env) last_avg_ret = runner.train(n_epochs=10, batch_size=2048) assert last_avg_ret > 35 def test_ppo_pendulum_flatten_input(self): """Test PPO with CartPole to test observation flattening.""" with LocalTFRunner(snapshot_config, sess=self.sess) as runner: env = TfEnv( normalize(ReshapeObservation(gym.make('CartPole-v1'), (2, 2)))) policy = CategoricalMLPPolicy( env_spec=env.spec, hidden_nonlinearity=tf.nn.tanh, ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = PPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_epochs=10, tf_optimizer_args=dict(learning_rate=1e-3), )) runner.setup(algo, env) last_avg_ret = runner.train(n_epochs=10, batch_size=2048) assert last_avg_ret > 80 def teardown_method(self): self.env.close() super().teardown_method()
class TestNPO(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        self.baseline = GaussianMLPBaseline(
            env_spec=self.env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )

    @pytest.mark.large
    def test_npo_pendulum(self):
        """Test NPO with InvertedDoublePendulum-v2 environment."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = NPO(env_spec=self.env.spec,
                       policy=self.policy,
                       baseline=self.baseline,
                       max_path_length=100,
                       discount=0.99,
                       gae_lambda=0.98,
                       policy_ent_coeff=0.0)
            runner.setup(algo, self.env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 20

    def test_npo_with_unknown_pg_loss(self):
        """Test NPO with unknown pg loss."""
        with pytest.raises(ValueError, match='Invalid pg_loss'):
            NPO(
                env_spec=self.env.spec,
                policy=self.policy,
                baseline=self.baseline,
                pg_loss='random pg_loss',
            )

    def test_npo_with_invalid_entropy_method(self):
        """Test NPO with invalid entropy method."""
        with pytest.raises(ValueError, match='Invalid entropy_method'):
            NPO(
                env_spec=self.env.spec,
                policy=self.policy,
                baseline=self.baseline,
                entropy_method=None,
            )

    def test_npo_with_max_entropy_and_center_adv(self):
        """Test NPO with max entropy and center_adv."""
        with pytest.raises(ValueError):
            NPO(
                env_spec=self.env.spec,
                policy=self.policy,
                baseline=self.baseline,
                entropy_method='max',
                center_adv=True,
            )

    def test_npo_with_max_entropy_and_no_stop_entropy_gradient(self):
        """Test NPO with max entropy and false stop_entropy_gradient."""
        with pytest.raises(ValueError):
            NPO(
                env_spec=self.env.spec,
                policy=self.policy,
                baseline=self.baseline,
                entropy_method='max',
                stop_entropy_gradient=False,
            )

    def test_npo_with_invalid_no_entropy_configuration(self):
        """Test NPO with invalid no_entropy configuration."""
        with pytest.raises(ValueError):
            NPO(
                env_spec=self.env.spec,
                policy=self.policy,
                baseline=self.baseline,
                entropy_method='no_entropy',
                policy_ent_coeff=0.02,
            )

    def teardown_method(self):
        self.env.close()
        super().teardown_method()
class TestTRPO(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        self.baseline = GaussianMLPBaseline(
            env_spec=self.env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )

    @pytest.mark.large
    def test_trpo_pendulum(self):
        """Test TRPO with InvertedDoublePendulum-v2 environment."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = TRPO(env_spec=self.env.spec,
                        policy=self.policy,
                        baseline=self.baseline,
                        max_path_length=100,
                        discount=0.99,
                        gae_lambda=0.98,
                        policy_ent_coeff=0.0)
            runner.setup(algo, self.env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 50

    def test_trpo_unknown_kl_constraint(self):
        """Test TRPO with unknown KL constraint."""
        with pytest.raises(ValueError, match='Invalid kl_constraint'):
            TRPO(
                env_spec=self.env.spec,
                policy=self.policy,
                baseline=self.baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.98,
                policy_ent_coeff=0.0,
                kl_constraint='random kl_constraint',
            )

    @pytest.mark.large
    def test_trpo_soft_kl_constraint(self):
        """Test TRPO with soft KL constraint."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = TRPO(env_spec=self.env.spec,
                        policy=self.policy,
                        baseline=self.baseline,
                        max_path_length=100,
                        discount=0.99,
                        gae_lambda=0.98,
                        policy_ent_coeff=0.0,
                        kl_constraint='soft')
            runner.setup(algo, self.env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 45

    @pytest.mark.large
    def test_trpo_lstm_cartpole(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('CartPole-v1')))
            policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01,
                        optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                            base_eps=1e-5)))
            snapshotter.snapshot_dir = './'
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 80
            env.close()

    @pytest.mark.large
    def test_trpo_gru_cartpole(self):
        deterministic.set_seed(2)
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('CartPole-v1')))
            policy = CategoricalGRUPolicy(name='policy', env_spec=env.spec)
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01,
                        optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                            base_eps=1e-5)))
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 80
            env.close()

    def teardown_method(self):
        self.env.close()
        super().teardown_method()
class TestSampler:
    """Uses a mock (scripted) policy on a 4x4 GridWorldEnv.

    '4x4': [
        'SFFF',
        'FHFH',
        'FFFH',
        'HFFG'
    ]

    Actions:
        0: left, 1: down, 2: right, 3: up, -1: no move

    Tiles:
        'S': starting point
        'F' or '.': free space
        'W' or 'x': wall
        'H' or 'o': hole (terminates episode)
        'G': goal

    Scripted actions: [2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1]
    """

    def setup_method(self):
        self.env = TfEnv(GridWorldEnv(desc='4x4'))
        self.policy = ScriptedPolicy(
            scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
        self.algo = Mock(env_spec=self.env.spec,
                         policy=self.policy,
                         max_path_length=16)

    def teardown_method(self):
        self.env.close()

    def test_local_batch_sampler(self):
        workers = WorkerFactory(seed=100,
                                max_path_length=self.algo.max_path_length)
        sampler1 = LocalSampler.from_worker_factory(workers, self.policy,
                                                    self.env)
        sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
        sampler2.start_worker()
        trajs1 = sampler1.obtain_samples(
            0, 1000, tuple(self.algo.policy.get_param_values()))
        trajs2 = sampler2.obtain_samples(0, 1000)
        # pylint: disable=superfluous-parens
        assert trajs1.observations.shape[0] >= 1000
        assert trajs1.actions.shape[0] >= 1000
        assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
            trajs2[0]['rewards']) == 1)

        true_obs = np.array([0, 1, 2, 6, 10, 14])
        true_actions = np.array([2, 2, 1, 1, 1, 2])
        true_rewards = np.array([0, 0, 0, 0, 0, 1])
        start = 0
        for length in trajs1.lengths:
            observations = trajs1.observations[start:start + length]
            actions = trajs1.actions[start:start + length]
            rewards = trajs1.rewards[start:start + length]
            assert np.array_equal(observations, true_obs)
            assert np.array_equal(actions, true_actions)
            assert np.array_equal(rewards, true_rewards)
            start += length
        sampler1.shutdown_worker()
        sampler2.shutdown_worker()