def test_rl2_trpo_pendulum(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = RL2TRPO(
            rl2_max_path_length=self.max_path_length,
            meta_batch_size=self.meta_batch_size,
            task_sampler=self.tasks,
            env_spec=self.env_spec,
            policy=self.policy,
            baseline=self.baseline,
            max_path_length=self.max_path_length * self.episode_per_task,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        runner.setup(algo,
                     self.tasks.sample(self.meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=self.meta_batch_size,
                     worker_class=RL2Worker)

        last_avg_ret = runner.train(n_epochs=1,
                                    batch_size=self.episode_per_task *
                                    self.max_path_length *
                                    self.meta_batch_size)
        assert last_avg_ret > -40
def rl2_trpo_halfcheetah(ctxt, seed, max_episode_length, meta_batch_size,
                         n_epochs, episode_per_task):
    """Train TRPO with HalfCheetah environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        tasks = task_sampler.SetTaskSampler(
            HalfCheetahVelEnv,
            wrapper=lambda env, _: RL2Env(
                GymEnv(env, max_episode_length=max_episode_length)))

        env_spec = RL2Env(
            GymEnv(HalfCheetahVelEnv(),
                   max_episode_length=max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2TRPO(meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       episodes_per_trial=episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(
                           hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)))

        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task * max_episode_length *
                      meta_batch_size)
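# Usage sketch (an assumption for illustration, not taken from the snippet
# above): garage launchers of this shape are normally decorated with
# @wrap_experiment (e.g. `from garage import wrap_experiment`) so that `ctxt`
# is supplied automatically when the function is called. With that assumption,
# a direct invocation could look like this; the hyperparameter values below
# are placeholders, not recommended settings.
if __name__ == '__main__':
    rl2_trpo_halfcheetah(seed=1,
                         max_episode_length=150,
                         meta_batch_size=10,
                         n_epochs=10,
                         episode_per_task=4)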
def test_rl2_trpo_pendulum_invalid_kl_constraint(self):
    with LocalTFRunner(snapshot_config, sess=self.sess):
        with pytest.raises(ValueError):
            RL2TRPO(meta_batch_size=self.meta_batch_size,
                    task_sampler=self.tasks,
                    env_spec=self.env_spec,
                    policy=self.policy,
                    baseline=self.baseline,
                    kl_constraint='xyz',
                    episodes_per_trial=self.episode_per_task,
                    discount=0.99,
                    max_kl_step=0.01)
def test_rl2_trpo_pendulum_default_optimizer2(self):
    with LocalTFRunner(snapshot_config, sess=self.sess):
        algo = RL2TRPO(meta_batch_size=self.meta_batch_size,
                       task_sampler=self.tasks,
                       env_spec=self.env_spec,
                       policy=self.policy,
                       baseline=self.baseline,
                       kl_constraint='soft',
                       episodes_per_trial=self.episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01)
        assert isinstance(algo._inner_algo._optimizer,
                          PenaltyLbfgsOptimizer)
def test_rl2_trpo_pendulum_default_optimizer(self):
    with TFTrainer(snapshot_config, sess=self.sess):
        algo = RL2TRPO(meta_batch_size=self.meta_batch_size,
                       task_sampler=self.tasks,
                       env_spec=self.env_spec,
                       policy=self.policy,
                       baseline=self.baseline,
                       kl_constraint='hard',
                       episodes_per_trial=self.episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01)
        assert isinstance(algo._inner_algo._optimizer,
                          ConjugateGradientOptimizer)
def rl2_trpo_halfcheetah(ctxt=None, seed=1):
    """Train TRPO with HalfCheetah environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=HalfCheetahVelEnv()))

        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2TRPO(rl2_max_path_length=max_path_length,
                       meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_path_length * episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(
                           hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
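# Usage sketch (an assumption for illustration): in the older garage API shown
# above, the launcher takes only `ctxt` and `seed`, with all hyperparameters
# fixed inside the function body. Assuming it carries garage's @wrap_experiment
# decorator, it can be run directly with just a seed:
if __name__ == '__main__':
    rl2_trpo_halfcheetah(seed=1)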
def test_rl2_trpo_pendulum(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        algo = RL2TRPO(
            meta_batch_size=self.meta_batch_size,
            task_sampler=self.tasks,
            env_spec=self.env_spec,
            policy=self.policy,
            baseline=self.baseline,
            sampler=self.sampler,
            episodes_per_trial=self.episode_per_task,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)))

        trainer.setup(algo, self.tasks.sample(self.meta_batch_size))

        last_avg_ret = trainer.train(n_epochs=1,
                                     batch_size=self.episode_per_task *
                                     self.max_episode_length *
                                     self.meta_batch_size)
        assert last_avg_ret > -40
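# Fixture sketch (an assumption for illustration; the real test fixture is not
# part of these snippets). The tests above reference attributes such as
# self.tasks, self.env_spec, self.policy, self.baseline, self.sampler and
# self.sess; a setup along the following lines, reusing only constructors that
# already appear above, would provide them. The environment choice, sizes and
# the sampler/session wiring are placeholders.
def setup_method(self):
    self.max_episode_length = 100
    self.meta_batch_size = 10
    self.episode_per_task = 4
    self.tasks = task_sampler.SetTaskSampler(
        HalfCheetahVelEnv,
        wrapper=lambda env, _: RL2Env(
            GymEnv(env, max_episode_length=self.max_episode_length)))
    self.env_spec = RL2Env(
        GymEnv(HalfCheetahVelEnv(),
               max_episode_length=self.max_episode_length)).spec
    self.policy = GaussianGRUPolicy(name='policy',
                                    hidden_dim=64,
                                    env_spec=self.env_spec,
                                    state_include_action=False)
    self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)
    # self.sampler: a LocalSampler configured with worker_class=RL2Worker and
    # n_workers=self.meta_batch_size (exact constructor call omitted here).
    # self.sess: the TensorFlow session opened by the surrounding test fixture.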