def test_trpo_unknown_kl_constraint(self):
    """Test TRPO with an unknown KL constraint."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    with self.assertRaises(NotImplementedError) as context:
        TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            gae_lambda=0.98,
            policy_ent_coeff=0.0,
            plot=False,
            kl_constraint="random kl_constraint",
        )
    assert "Unknown KLConstraint" in str(context.exception)
    env.close()
def test_npo_pendulum(self):
    """Test NPO with Pendulum environment."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = NPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 20
def test_trpo_pendulum(self):
    """Test TRPO with Pendulum environment."""
    with LocalRunner(self.sess) as runner:
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.98,
                    policy_ent_coeff=0.0)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 50
        env.close()
def test_reps_cartpole(self):
    """Test REPS with gym Cartpole environment."""
    with LocalRunner(self.sess) as runner:
        logger.reset()
        env = TfEnv(gym.make("CartPole-v0"))
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = REPS(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=4000,
                    max_path_length=100,
                    n_itr=10,
                    discount=0.99,
                    max_kl_step=1e6)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5
        env.close()
def test_ppo_pendulum_with_model(self):
    """Test PPO with GaussianMLPPolicyWithModel and Pendulum environment."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicyWithModel(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        lr_clip_range=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 40
    env.close()
def test_gaussian_policies(self, policy_cls):
    """Test TRPO with Pendulum environment and different Gaussian policies."""
    with LocalRunner(self.sess) as runner:
        logger.reset()
        env = TfEnv(normalize(gym.make("Pendulum-v0")))
        policy = policy_cls(name="policy", env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            step_size=0.01,
            plot=True,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        )
        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=4000)
        env.close()
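# test_gaussian_policies receives a policy_cls argument, so it is presumably
# driven by a parameterizing decorator defined elsewhere in the suite. A
# hypothetical sketch of that driver (the decorator name and the policy list
# below are assumptions, not taken from this section):
#
#     @params(GaussianMLPPolicy, GaussianGRUPolicy, GaussianLSTMPolicy)
#     def test_gaussian_policies(self, policy_cls):
#         ...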
def test_ddpg_pendulum(self):
    """Test DDPG with Pendulum environment."""
    logger.reset()
    env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
    action_noise = OUStrategy(env.spec, sigma=0.2)
    policy = ContinuousMLPPolicy(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=tf.nn.relu,
                                 output_nonlinearity=tf.nn.tanh)
    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=tf.nn.relu)
    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)
    algo = DDPG(
        env,
        policy=policy,
        policy_lr=1e-4,
        qf_lr=1e-3,
        qf=qf,
        replay_buffer=replay_buffer,
        plot=False,
        target_update_tau=1e-2,
        n_epochs=10,
        n_epoch_cycles=20,
        max_path_length=100,
        n_train_steps=50,
        discount=0.9,
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 60
def test_ppo_pendulum_recurrent(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalRunner() as runner:
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 40
        env.close()
def test_erwr_cartpole(self):
    """Test ERWR with Cartpole environment."""
    logger.reset()
    env = TfEnv(normalize(CartpoleEnv()))
    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = ERWR(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        max_path_length=100,
        n_itr=10,
        discount=0.99)
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 100
def test_npo_unknown_pg_loss(self):
    """Test NPO with an unknown policy gradient loss."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    with self.assertRaises(NotImplementedError) as context:
        NPO(
            env=env,
            policy=policy,
            baseline=baseline,
            pg_loss="random pg_loss",
        )
    assert "Unknown PGLoss" in str(context.exception)
def test_tnpg_cartpole(self):
    """Test TNPG with Cartpole environment."""
    logger.reset()
    env = TfEnv(normalize(CartpoleEnv()))
    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TNPG(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=10000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                optimizer_args=dict(reg_coeff=5e-2))
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 40
def test_gaussian_policies(self, policy_cls):
    """Test TRPO with Cartpole environment and different Gaussian policies."""
    logger.reset()
    env = TfEnv(normalize(CartpoleEnv()))
    policy = policy_cls(name="policy", env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
        plot=True,
        optimizer=ConjugateGradientOptimizer,
        optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)),
    )
    algo.train(sess=self.sess)
def setUp(self):
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)
    self.sess.__enter__()
    logger.reset()
    deterministic.set_seed(1)
def setUp(self):
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)
    self.sess.__enter__()
    logger.reset()
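# Neither setUp above is paired with a tearDown in this section. A minimal
# sketch of the counterpart (an assumption; the actual suite may clean up
# elsewhere) would pop the session context installed by __enter__ and close
# the session so each test starts from a fresh graph:
def tearDown(self):
    self.sess.__exit__(None, None, None)
    self.sess.close()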