def test_rl2_ppo_pendulum(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = RL2PPO(rl2_max_episode_length=self.max_episode_length,
                      meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_episode_length=self.max_episode_length *
                      self.episode_per_task)

        runner.setup(
            algo,
            self.tasks.sample(self.meta_batch_size),
            sampler_cls=LocalSampler,
            n_workers=self.meta_batch_size,
            worker_class=RL2Worker,
            worker_args=dict(n_paths_per_trial=self.episode_per_task))

        last_avg_ret = runner.train(n_epochs=1,
                                    batch_size=self.episode_per_task *
                                    self.max_episode_length *
                                    self.meta_batch_size)
        assert last_avg_ret > -40
def test_rl2_ppo_pendulum_adapted_policy(self):
    with LocalTFRunner(snapshot_config, sess=self.sess):
        algo = RL2PPO(rl2_max_episode_length=self.max_episode_length,
                      meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_episode_length=self.max_episode_length *
                      self.episode_per_task)
        exploration_policy = algo.get_exploration_policy()
        adapted_policy = algo.adapt_policy(exploration_policy, [])
        (params, hidden) = adapted_policy.get_param_values()
        expected_new_params = np.zeros_like(params)
        expected_hidden = np.zeros_like(hidden)
        adapted_policy.set_param_values(
            (expected_new_params, expected_hidden))
        (new_params, new_hidden) = adapted_policy.get_param_values()
        assert np.array_equal(expected_new_params, new_params)
        assert np.array_equal(expected_hidden, new_hidden)
def test_rl2_ppo_pendulum_wrong_worker(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        with pytest.raises(ValueError):
            algo = RL2PPO(rl2_max_episode_length=self.max_episode_length,
                          meta_batch_size=self.meta_batch_size,
                          task_sampler=self.tasks,
                          env_spec=self.env_spec,
                          policy=self.policy,
                          baseline=self.baseline,
                          discount=0.99,
                          gae_lambda=0.95,
                          lr_clip_range=0.2,
                          optimizer_args=dict(
                              batch_size=32,
                              max_optimization_epochs=10,
                          ),
                          stop_entropy_gradient=True,
                          entropy_method='max',
                          policy_ent_coeff=0.02,
                          center_adv=False,
                          max_episode_length=self.max_episode_length *
                          self.episode_per_task)

            runner.setup(algo,
                         self.tasks.sample(self.meta_batch_size),
                         sampler_cls=LocalSampler,
                         n_workers=self.meta_batch_size)

            runner.train(n_epochs=10,
                         batch_size=self.episode_per_task *
                         self.max_episode_length * self.meta_batch_size)
def test_rl2_ppo_pendulum_exploration_policy(self):
    with LocalTFRunner(snapshot_config, sess=self.sess):
        algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=self.episode_per_task)
        exploration_policy = algo.get_exploration_policy()
        params = exploration_policy.get_param_values()
        new_params = np.zeros_like(params)
        exploration_policy.set_param_values(new_params)
        assert np.array_equal(new_params,
                              exploration_policy.get_param_values())
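# The tests in this file reference attributes (self.sess, self.tasks,
# self.env_spec, self.policy, self.baseline, self.max_episode_length,
# self.meta_batch_size, self.episode_per_task) that are created in the test
# class's setup_method, which is not shown here. A minimal sketch of what such
# a fixture might look like, assuming the class derives from garage's
# TfGraphTestCase (whose setup_method creates self.sess), HalfCheetahVelEnv as
# the task distribution, and the concrete values 100 / 10 / 4 chosen purely
# for illustration:
def setup_method(self):
    super().setup_method()  # creates the TF graph and self.sess
    self.max_episode_length = 100
    self.meta_batch_size = 10
    self.episode_per_task = 4
    # Task sampler and env spec mirror the HalfCheetah launcher below.
    self.tasks = task_sampler.SetTaskSampler(
        HalfCheetahVelEnv,
        wrapper=lambda env, _: RL2Env(
            GymEnv(env, max_episode_length=self.max_episode_length)))
    self.env_spec = RL2Env(
        GymEnv(HalfCheetahVelEnv(),
               max_episode_length=self.max_episode_length)).spec
    self.policy = GaussianGRUPolicy(env_spec=self.env_spec,
                                    hidden_dim=64,
                                    state_include_action=False)
    self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)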
def rl2_ppo_halfcheetah(ctxt, seed, max_episode_length, meta_batch_size,
                        n_epochs, episode_per_task):
    """Train PPO with HalfCheetah environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        tasks = task_sampler.SetTaskSampler(
            HalfCheetahVelEnv,
            wrapper=lambda env, _: RL2Env(
                GymEnv(env, max_episode_length=max_episode_length)))

        env_spec = RL2Env(
            GymEnv(HalfCheetahVelEnv(),
                   max_episode_length=max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      episodes_per_trial=episode_per_task,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False)

        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task * max_episode_length *
                      meta_batch_size)
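# Launchers like rl2_ppo_halfcheetah above are normally exposed as
# command-line entry points by stacking click options and @wrap_experiment
# directly above the def and calling the function at the bottom of the module.
# A sketch of that wiring is below; the default values are illustrative
# assumptions, not values taken from this file.
import click

from garage import wrap_experiment


@click.command()
@click.option('--seed', default=1)
@click.option('--max_episode_length', default=100)
@click.option('--meta_batch_size', default=10)
@click.option('--n_epochs', default=10)
@click.option('--episode_per_task', default=4)
@wrap_experiment
def rl2_ppo_halfcheetah(ctxt, seed, max_episode_length, meta_batch_size,
                        n_epochs, episode_per_task):
    ...  # body as defined above; wrap_experiment supplies ctxt


rl2_ppo_halfcheetah()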
def test_rl2_ppo_pendulum_meta_test(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        meta_evaluator = MetaEvaluator(test_task_sampler=self.tasks,
                                       n_exploration_eps=10,
                                       n_test_episodes=10,
                                       n_test_tasks=1)

        algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      sampler=self.sampler,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=self.episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)

        trainer.setup(algo, self.tasks.sample(self.meta_batch_size))

        last_avg_ret = trainer.train(n_epochs=1,
                                     batch_size=self.episode_per_task *
                                     self.max_episode_length *
                                     self.meta_batch_size)
        assert last_avg_ret > -40
def rl2_ppo_metaworld_ml1_push(ctxt, seed, meta_batch_size, n_epochs,
                               episode_per_task):
    """Train PPO with ML1 environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~LocalRunner` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_episode_length = 150
        inner_max_episode_length = max_episode_length * episode_per_task

        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(GymEnv(mwb.ML1.get_train_tasks('push-v1'))))

        env_spec = RL2Env(
            GymEnv(mwb.ML1.get_train_tasks('push-v1'),
                   max_episode_length=inner_max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_episodes_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_episode_length *
                     meta_batch_size)
def rl2_ppo_halfcheetah(ctxt=None, seed=1):
    """Train PPO with HalfCheetah environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
            env=HalfCheetahVelEnv()))

        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      pg_loss='surrogate_clip',
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=max_path_length * episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
def test_rl2_ppo_ml10(self):
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import ML10
    ML_train_envs = [
        RL2Env(ML10.from_task(task_name))
        for task_name in ML10.get_train_tasks().all_task_names
    ]
    tasks = task_sampler.EnvPoolSampler(ML_train_envs)
    tasks.grow_pool(self.meta_batch_size)

    env_spec = ML_train_envs[0].spec
    policy = GaussianGRUPolicy(env_spec=env_spec,
                               hidden_dim=64,
                               state_include_action=False,
                               name='policy')
    baseline = LinearFeatureBaseline(env_spec=env_spec)
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = RL2PPO(rl2_max_path_length=self.max_path_length,
                      meta_batch_size=self.meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=self.max_path_length *
                      self.episode_per_task)

        # Sample from the ML10 env pool built above, not the fixture's
        # task sampler, so the envs match env_spec.
        runner.setup(
            algo,
            tasks.sample(self.meta_batch_size),
            sampler_cls=LocalSampler,
            n_workers=self.meta_batch_size,
            worker_class=RL2Worker,
            worker_args=dict(n_paths_per_trial=self.episode_per_task))

        runner.train(n_epochs=1,
                     batch_size=self.episode_per_task *
                     self.max_path_length * self.meta_batch_size)
def test_rl2_ppo_pendulum_meta_test(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        meta_evaluator = MetaEvaluator(
            test_task_sampler=self.tasks,
            n_exploration_traj=10,
            n_test_rollouts=10,
            max_path_length=self.max_path_length,
            n_test_tasks=1)

        algo = RL2PPO(rl2_max_path_length=self.max_path_length,
                      meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      pg_loss='surrogate_clip',
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=self.max_path_length *
                      self.episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)

        runner.setup(algo,
                     self.tasks.sample(self.meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=self.meta_batch_size,
                     worker_class=RL2Worker)

        last_avg_ret = runner.train(n_epochs=1,
                                    batch_size=self.episode_per_task *
                                    self.max_path_length *
                                    self.meta_batch_size)
        assert last_avg_ret > -40
def rl2_ppo_metaworld_ml1_push(ctxt, seed, meta_batch_size, n_epochs,
                               episode_per_task):
    """Train RL2 PPO with ML1 environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    ml1 = metaworld.ML1('push-v1')

    task_sampler = MetaWorldTaskSampler(ml1, 'train',
                                        lambda env, _: RL2Env(env))

    env = task_sampler.sample(1)[0]()
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=MetaWorldSetTaskEnv(ml1, 'test'),
                                       wrapper=lambda env, _: RL2Env(env))
    env_spec = env.spec

    with TFTrainer(snapshot_config=ctxt) as trainer:
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=task_sampler,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32,
                                          max_optimization_epochs=10),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      meta_evaluator=meta_evaluator,
                      episodes_per_trial=episode_per_task)

        trainer.setup(algo,
                      task_sampler.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task *
                      env_spec.max_episode_length * meta_batch_size)
def rl2_ppo_metaworld_ml45(ctxt, seed, max_path_length, meta_batch_size,
                           n_epochs, episode_per_task):
    """Train PPO with ML45 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        ml45_train_tasks = mwb.ML45.get_train_tasks()
        ml45_train_envs = [
            RL2Env(mwb.ML45.from_task(task_name))
            for task_name in ml45_train_tasks.all_task_names
        ]
        tasks = task_sampler.EnvPoolSampler(ml45_train_envs)
        tasks.grow_pool(meta_batch_size)

        env_spec = ml45_train_envs[0].spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=max_path_length * episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
def rl2_ppo_halfcheetah_meta_test(ctxt, seed, max_path_length,
                                  meta_batch_size, n_epochs,
                                  episode_per_task):
    """Perform meta-testing on RL2PPO with HalfCheetah environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=HalfCheetahVelEnv()))

        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        meta_evaluator = MetaEvaluator(test_task_sampler=tasks,
                                       n_exploration_traj=10,
                                       n_test_rollouts=10,
                                       max_path_length=max_path_length,
                                       n_test_tasks=5)

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=max_path_length * episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
def rl2_ppo_metaworld_ml10_meta_test(ctxt, seed, meta_batch_size, n_epochs,
                                     episode_per_task):
    """Train PPO with ML10 environment with meta-test.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~LocalRunner` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_episode_length = 150
        inner_max_episode_length = max_episode_length * episode_per_task

        ml10_train_envs = [
            RL2Env(GymEnv(mwb.ML10.from_task(task_name)))
            for task_name in mwb.ML10.get_train_tasks().all_task_names
        ]
        tasks = task_sampler.EnvPoolSampler(ml10_train_envs)
        tasks.grow_pool(meta_batch_size)

        ml10_test_envs = [
            RL2Env(
                GymEnv(mwb.ML10.from_task(task_name),
                       max_episode_length=inner_max_episode_length))
            for task_name in mwb.ML10.get_test_tasks().all_task_names
        ]
        test_tasks = task_sampler.EnvPoolSampler(ml10_test_envs)

        env_spec = ml10_train_envs[0].spec
        max_episode_length = env_spec.max_episode_length

        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        meta_evaluator = MetaEvaluator(test_task_sampler=test_tasks,
                                       n_exploration_eps=10,
                                       n_test_episodes=10,
                                       max_episode_length=max_episode_length,
                                       n_test_tasks=5)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_episodes_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_episode_length *
                     meta_batch_size)