Example 1
# NOTE: the imports and the decorator below assume metarl mirrors garage's
# module layout; they are added so the snippet runs on its own.
import torch

from metarl import wrap_experiment
from metarl.envs import MetaRLEnv
from metarl.experiment import LocalRunner
from metarl.experiment.deterministic import set_seed
from metarl.torch.algos import VPG
from metarl.torch.policies import GaussianMLPPolicy
from metarl.torch.value_functions import GaussianMLPValueFunction


@wrap_experiment
def vpg_pendulum(ctxt=None, seed=1):
    """Train VPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    env = MetaRLEnv(env_name='InvertedDoublePendulum-v2')

    runner = LocalRunner(ctxt)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = VPG(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               max_path_length=100,
               discount=0.99,
               center_adv=False)

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
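
Assuming metarl mirrors garage's wrap_experiment API (as the ctxt/ExperimentContext docstring suggests), the decorated function is launched by calling it directly; the decorator builds the ExperimentContext that arrives as ctxt:

vpg_pendulum(seed=1)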
Example 2
# NOTE: the import paths below are assumptions that metarl mirrors garage's
# module layout; they are added so the snippet runs on its own.
import torch

from metarl.experiment import LocalRunner
from metarl.np.baselines import LinearFeatureBaseline
from metarl.tf.envs import TfEnv
from metarl.torch.algos import VPG
from metarl.torch.policies import GaussianMLPPolicy


def run_task(snapshot_config, *_):
    """Set up the environment and algorithm, then run the task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    env = TfEnv(env_name='InvertedDoublePendulum-v2')

    runner = LocalRunner(snapshot_config)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(env_spec=env.spec,
               policy=policy,
               baseline=baseline,
               max_path_length=100,
               discount=0.99,
               center_adv=False)

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
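
In the older launcher API this snippet follows, run_task is not called directly; it is handed to an experiment launcher. A sketch of that call, assuming metarl provides the same run_experiment entry point as the garage versions this code style comes from:

from metarl.experiment import run_experiment

run_experiment(run_task, snapshot_mode='last', seed=1)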
Example 3
    def test_vpg_regularized(self):
        """Test VPG with entropy_regularized."""
        self._params['entropy_method'] = 'regularized'

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0
Example 4
    def test_vpg_no_entropy(self):
        """Test VPG with no_entropy."""
        self._params['positive_adv'] = True
        self._params['use_softplus_entropy'] = True

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0
Example 5
    def test_vpg_max(self):
        """Test VPG with maximum entropy."""
        self._params['center_adv'] = False
        self._params['stop_entropy_gradient'] = True
        self._params['entropy_method'] = 'max'

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0
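
Examples 3-5 assume a test class that prepares self._env, self._runner and self._params before each test; that setup is not part of the snippets. A minimal sketch of such a fixture, assuming metarl mirrors garage's module layout and that VPG accepts the keyword arguments shown in Example 1 (the class name, snapshot settings and parameter dictionary are illustrative assumptions):

import torch

from metarl.envs import MetaRLEnv
from metarl.experiment import LocalRunner, SnapshotConfig
from metarl.experiment.deterministic import set_seed
from metarl.torch.policies import GaussianMLPPolicy
from metarl.torch.value_functions import GaussianMLPValueFunction


class TestVPG:

    def setup_method(self):
        set_seed(0)
        self._env = MetaRLEnv(env_name='InvertedDoublePendulum-v2')
        snapshot_config = SnapshotConfig(snapshot_dir='/tmp/metarl_test',
                                         snapshot_mode='last',
                                         snapshot_gap=1)
        self._runner = LocalRunner(snapshot_config)
        policy = GaussianMLPPolicy(self._env.spec,
                                   hidden_sizes=[64, 64],
                                   hidden_nonlinearity=torch.tanh,
                                   output_nonlinearity=None)
        self._params = dict(env_spec=self._env.spec,
                            policy=policy,
                            value_function=GaussianMLPValueFunction(
                                env_spec=self._env.spec),
                            max_path_length=100,
                            discount=0.99)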
Example 6
    def __init__(self,
                 env,
                 policy,
                 value_function,
                 inner_lr=_Default(1e-2),
                 outer_lr=1e-3,
                 max_kl_step=0.01,
                 max_path_length=500,
                 discount=0.99,
                 gae_lambda=1,
                 center_adv=True,
                 positive_adv=False,
                 policy_ent_coeff=0.0,
                 use_softplus_entropy=False,
                 stop_entropy_gradient=False,
                 entropy_method='no_entropy',
                 meta_batch_size=40,
                 num_grad_updates=1,
                 meta_evaluator=None,
                 evaluate_every_n_epochs=1):

        policy_optimizer = OptimizerWrapper(
            (torch.optim.Adam, dict(lr=inner_lr)), policy)
        vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)),
                                        value_function)

        inner_algo = VPG(env.spec,
                         policy,
                         value_function,
                         policy_optimizer=policy_optimizer,
                         vf_optimizer=vf_optimizer,
                         max_path_length=max_path_length,
                         num_train_per_epoch=1,
                         discount=discount,
                         gae_lambda=gae_lambda,
                         center_adv=center_adv,
                         positive_adv=positive_adv,
                         policy_ent_coeff=policy_ent_coeff,
                         use_softplus_entropy=use_softplus_entropy,
                         stop_entropy_gradient=stop_entropy_gradient,
                         entropy_method=entropy_method)

        meta_optimizer = (ConjugateGradientOptimizer,
                          dict(max_constraint_value=max_kl_step))

        super().__init__(inner_algo=inner_algo,
                         env=env,
                         policy=policy,
                         meta_optimizer=meta_optimizer,
                         meta_batch_size=meta_batch_size,
                         inner_lr=inner_lr,
                         outer_lr=outer_lr,
                         num_grad_updates=num_grad_updates,
                         meta_evaluator=meta_evaluator,
                         evaluate_every_n_epochs=evaluate_every_n_epochs)
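
This constructor composes a MAML-style meta-algorithm: VPG is the inner (adaptation) algorithm and a KL-constrained conjugate-gradient step is the meta-optimizer. Below is a minimal, illustrative sketch of the inner/outer differentiation pattern involved, using a toy regression loss as a stand-in for the VPG surrogate and plain Adam for the meta step; none of this is metarl code.

import torch

policy = torch.nn.Linear(4, 2)      # stand-in for the real policy network
inner_lr, outer_lr = 1e-2, 1e-3
meta_opt = torch.optim.Adam(policy.parameters(), lr=outer_lr)

def task_loss(params, batch):
    """Stand-in for the VPG surrogate loss on one task's batch."""
    obs, target = batch
    return ((torch.nn.functional.linear(obs, params[0], params[1]) - target) ** 2).mean()

tasks = [(torch.randn(8, 4), torch.randn(8, 2)) for _ in range(3)]
meta_opt.zero_grad()
for batch in tasks:
    theta = list(policy.parameters())
    # Inner (adaptation) step: one gradient update, kept differentiable
    # with create_graph=True so the meta-gradient can flow through it.
    grads = torch.autograd.grad(task_loss(theta, batch), theta, create_graph=True)
    adapted = [p - inner_lr * g for p, g in zip(theta, grads)]
    # Outer (meta) loss: evaluated with the adapted parameters, backprops
    # into theta. MAML proper uses fresh post-adaptation data here; the same
    # batch is reused only to keep the sketch short.
    task_loss(adapted, batch).backward()
meta_opt.step()                     # meta update on the pre-adaptation parameters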
Example 7
    def __init__(self,
                 env,
                 policy,
                 baseline,
                 inner_lr=_Default(1e-1),
                 outer_lr=1e-3,
                 max_path_length=100,
                 discount=0.99,
                 gae_lambda=1,
                 center_adv=True,
                 positive_adv=False,
                 policy_ent_coeff=0.0,
                 use_softplus_entropy=False,
                 stop_entropy_gradient=False,
                 entropy_method='no_entropy',
                 meta_batch_size=20,
                 num_grad_updates=1):
        inner_algo = VPG(env.spec,
                         policy,
                         baseline,
                         optimizer=torch.optim.Adam,
                         policy_lr=inner_lr,
                         max_path_length=max_path_length,
                         num_train_per_epoch=1,
                         discount=discount,
                         gae_lambda=gae_lambda,
                         center_adv=center_adv,
                         positive_adv=positive_adv,
                         policy_ent_coeff=policy_ent_coeff,
                         use_softplus_entropy=use_softplus_entropy,
                         stop_entropy_gradient=stop_entropy_gradient,
                         entropy_method=entropy_method)

        super().__init__(inner_algo=inner_algo,
                         env=env,
                         policy=policy,
                         baseline=baseline,
                         meta_optimizer=torch.optim.Adam,
                         meta_batch_size=meta_batch_size,
                         inner_lr=inner_lr,
                         outer_lr=outer_lr,
                         num_grad_updates=num_grad_updates)
Example 8
    def test_invalid_entropy_config(self, algo_param, error, msg):
        """Test VPG with invalid entropy config."""
        self._params.update(algo_param)
        with pytest.raises(error, match=msg):
            VPG(**self._params)
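
The test above expects its arguments from a @pytest.mark.parametrize decorator that is not part of the snippet. A hypothetical set of cases such a decorator could supply, based on the entropy settings exercised in Examples 3-5; the configurations and the substrings matched against the error messages are illustrative assumptions, not copied from metarl's test suite.

# Hypothetical cases; pair with
# @pytest.mark.parametrize('algo_param, error, msg', INVALID_ENTROPY_CONFIGS)
INVALID_ENTROPY_CONFIGS = [
    # unknown entropy method name
    (dict(entropy_method='unknown_method'), ValueError, 'entropy_method'),
    # 'max' entropy is expected with uncentered advantages (cf. Example 5)
    (dict(entropy_method='max', center_adv=True), ValueError, 'center_adv'),
    # a non-zero entropy coefficient is meaningless with 'no_entropy'
    (dict(entropy_method='no_entropy', policy_ent_coeff=0.1), ValueError,
     'policy_ent_coeff'),
]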