def vpg_pendulum(ctxt=None, seed=1):
    """Train VPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    env = MetaRLEnv(env_name='InvertedDoublePendulum-v2')

    runner = LocalRunner(ctxt)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = VPG(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               max_path_length=100,
               discount=0.99,
               center_adv=False)

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
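# Launch sketch (an assumption, not part of the original file): if metarl
# mirrors garage's wrap_experiment API, the launcher above is wrapped with
# wrap_experiment and then called directly; the wrapper supplies ctxt.
from metarl import wrap_experiment

vpg_pendulum = wrap_experiment(vpg_pendulum)  # equivalent to @wrap_experiment
vpg_pendulum(seed=1)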
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    env = TfEnv(env_name='InvertedDoublePendulum-v2')

    runner = LocalRunner(snapshot_config)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(env_spec=env.spec,
               policy=policy,
               baseline=baseline,
               max_path_length=100,
               discount=0.99,
               center_adv=False)

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
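# Launch sketch for the older snapshot_config-style entry point above (an
# assumption mirroring garage's classic run_experiment helper, not verified
# against metarl): run_experiment builds the SnapshotConfig and then calls
# run_task with it.
from metarl.experiment import run_experiment

run_experiment(run_task, snapshot_mode='last', seed=1)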
def test_vpg_regularized(self):
    """Test VPG with entropy_regularized."""
    self._params['entropy_method'] = 'regularized'
    algo = VPG(**self._params)
    self._runner.setup(algo, self._env)
    last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 0
def test_vpg_no_entropy(self):
    """Test VPG with no_entropy."""
    self._params['positive_adv'] = True
    self._params['use_softplus_entropy'] = True
    algo = VPG(**self._params)
    self._runner.setup(algo, self._env)
    last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 0
def test_vpg_max(self):
    """Test VPG with maximum entropy."""
    self._params['center_adv'] = False
    self._params['stop_entropy_gradient'] = True
    self._params['entropy_method'] = 'max'
    algo = VPG(**self._params)
    self._runner.setup(algo, self._env)
    last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 0
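# Illustrative fixture sketch (assumed, not part of the original test class):
# the three entropy tests above rely on self._env, self._runner and a shared
# self._params dict of roughly this shape; concrete values are placeholders.
def setup_method(self):
    self._env = MetaRLEnv(env_name='InvertedDoublePendulum-v2')
    # Per the run_task docstring above, passing None makes LocalRunner fall
    # back to a default snapshot configuration.
    self._runner = LocalRunner(None)
    policy = GaussianMLPPolicy(self._env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    value_function = GaussianMLPValueFunction(env_spec=self._env.spec)
    self._params = dict(env_spec=self._env.spec,
                        policy=policy,
                        value_function=value_function,
                        max_path_length=100,
                        discount=0.99)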
def __init__(self,
             env,
             policy,
             value_function,
             inner_lr=_Default(1e-2),
             outer_lr=1e-3,
             max_kl_step=0.01,
             max_path_length=500,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=40,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    policy_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), policy)
    vf_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), value_function)

    inner_algo = VPG(env.spec,
                     policy,
                     value_function,
                     policy_optimizer=policy_optimizer,
                     vf_optimizer=vf_optimizer,
                     max_path_length=max_path_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    meta_optimizer = (ConjugateGradientOptimizer,
                      dict(max_constraint_value=max_kl_step))

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     meta_optimizer=meta_optimizer,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
def __init__(self,
             env,
             policy,
             baseline,
             inner_lr=_Default(1e-1),
             outer_lr=1e-3,
             max_path_length=100,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=20,
             num_grad_updates=1):
    inner_algo = VPG(env.spec,
                     policy,
                     baseline,
                     optimizer=torch.optim.Adam,
                     policy_lr=inner_lr,
                     max_path_length=max_path_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     baseline=baseline,
                     meta_optimizer=torch.optim.Adam,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates)
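# Hypothetical instantiation sketch for the two constructors above, assuming
# they belong to MAML-style wrappers that use VPG as the inner adaptation
# algorithm; the class name MAMLVPG is illustrative, and env, policy,
# value_function and runner are reused from the sketches above rather than
# taken from the original source.
algo = MAMLVPG(env=env,
               policy=policy,
               value_function=value_function,
               meta_batch_size=20,
               num_grad_updates=1)
runner.setup(algo, env)
runner.train(n_epochs=300, batch_size=4000)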
def test_invalid_entropy_config(self, algo_param, error, msg):
    """Test VPG with invalid entropy config."""
    self._params.update(algo_param)
    with pytest.raises(error, match=msg):
        VPG(**self._params)
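# Illustrative parametrization (placeholder values, not the original table):
# the test above receives its (algo_param, error, msg) arguments from a
# pytest.mark.parametrize decorator shaped like this, each row describing a
# configuration that the VPG constructor should reject.
@pytest.mark.parametrize('algo_param, error, msg', [
    (dict(entropy_method='unknown_method'), ValueError, 'entropy_method'),
    (dict(entropy_method='max', center_adv=True), ValueError, 'center_adv'),
])
def test_invalid_entropy_config(self, algo_param, error, msg):
    ...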