def test_npo_pendulum(self):
    """Test NPO with the InvertedDoublePendulum-v2 environment."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = NPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 20
def test_npo_pendulum(self):
    """Test NPO with the InvertedDoublePendulum-v2 environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = NPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.98,
                   policy_ent_coeff=0.0)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 20
        env.close()
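# The validation tests below reference self.env, self.policy, and
# self.baseline fixtures. A minimal setup_method sketch, assuming the same
# TfEnv/GaussianMLP components used in the tests above; the original suite's
# setup may differ, and self.sess is assumed to come from its TF test base
# class.
def setup_method(self):
    self.env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    self.baseline = GaussianMLPBaseline(
        env_spec=self.env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )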
def test_npo_with_invalid_entropy_method(self):
    """Test NPO with invalid entropy method."""
    with pytest.raises(ValueError, match='Invalid entropy_method'):
        NPO(
            env_spec=self.env.spec,
            policy=self.policy,
            baseline=self.baseline,
            entropy_method=None,
        )
def test_npo_with_unknown_pg_loss(self):
    """Test NPO with unknown pg_loss."""
    with pytest.raises(ValueError, match='Invalid pg_loss'):
        NPO(
            env_spec=self.env.spec,
            policy=self.policy,
            baseline=self.baseline,
            pg_loss='random pg_loss',
        )
def test_npo_with_invalid_no_entropy_configuration(self):
    """Test NPO with invalid no-entropy configuration."""
    with pytest.raises(ValueError):
        NPO(
            env_spec=self.env.spec,
            policy=self.policy,
            baseline=self.baseline,
            entropy_method='no_entropy',
            policy_ent_coeff=0.02,
        )
def test_npo_with_max_entropy_and_no_stop_entropy_gradient(self):
    """Test NPO with entropy_method='max' and stop_entropy_gradient=False."""
    with pytest.raises(ValueError):
        NPO(
            env_spec=self.env.spec,
            policy=self.policy,
            baseline=self.baseline,
            entropy_method='max',
            stop_entropy_gradient=False,
        )
def test_npo_with_max_entropy_and_center_adv(self):
    """Test NPO with max entropy and center_adv."""
    with pytest.raises(ValueError):
        NPO(
            env_spec=self.env.spec,
            policy=self.policy,
            baseline=self.baseline,
            entropy_method='max',
            center_adv=True,
        )
def test_npo_pendulum(self):
    """Test NPO with the InvertedDoublePendulum-v2 environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        algo = NPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   discount=0.99,
                   gae_lambda=0.98,
                   policy_ent_coeff=0.0)
        trainer.setup(algo, self.env, sampler_cls=LocalSampler)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 20
def test_npo_pendulum(self):
    """Test NPO with the InvertedDoublePendulum-v2 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = NPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.98,
                   policy_ent_coeff=0.0)
        runner.setup(algo, self.env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 20
def tf_gym_music(ctxt=None, seed=1):
    """Train a policy-gradient LSTM with the Music-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter;
            created by @wrap_experiment.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv(MusicEnv(monitor=HeartMonitor('DC:39:39:66:26:1F')),
                     max_episode_length=35)
        policy = GaussianLSTMPolicy(name='policy',
                                    env_spec=env.spec,
                                    hidden_dim=32)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )
        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=False,
                               n_workers=1)
        algo = NPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler)
        trainer.setup(algo, env)
        trainer.train(n_epochs=120, batch_size=1, store_episodes=True)
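# A minimal launcher sketch for the experiment above, assuming tf_gym_music
# is meant to be wrapped with garage's wrap_experiment (as its ctxt docstring
# suggests), so calling it without ctxt lets garage create and inject the
# ExperimentContext:
from garage import wrap_experiment

tf_gym_music = wrap_experiment(tf_gym_music)
tf_gym_music(seed=1)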
def test_npo_unknown_pg_loss(self):
    """Test NPO with unknown policy gradient loss."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    with self.assertRaises(NotImplementedError) as context:
        NPO(
            env=env,
            policy=policy,
            baseline=baseline,
            pg_loss="random pg_loss",
        )
    assert "Unknown PGLoss" in str(context.exception)
seed = 2021
set_seed(seed)

# ctxt is the ExperimentContext normally injected by @wrap_experiment
# (see tf_gym_music above); it must be defined before this snippet runs.
trainer = TFTrainer(snapshot_config=ctxt)
env = GymEnv(MusicEnv(monitor=HeartMonitor('DC:39:39:66:26:1F')),
             max_episode_length=25)
policy = GaussianLSTMPolicy(name='policy', env_spec=env.spec, hidden_dim=32)
baseline = GaussianMLPBaseline(
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)
sampler = LocalSampler(
    agents=policy,
    envs=env,
    max_episode_length=env.spec.max_episode_length,
    is_tf_worker=False,
    n_workers=1,
)
algo = NPO(
    env_spec=env.spec,
    policy=policy,
    baseline=baseline,
    sampler=sampler,
)
trainer.setup(algo, env)
# The original wrapped this call in print(...), which only echoes the string;
# invoking it directly actually runs training.
trainer.train(n_epochs=120, batch_size=1)