def test_ppo_pendulum(self):
    """Test MAML-VPG with the HalfCheetahDir environment."""
    deterministic.set_seed(0)
    episodes_per_task = 5
    max_episode_length = self.env.spec.max_episode_length
    task_sampler = SetTaskSampler(
        HalfCheetahDirEnv,
        wrapper=lambda env, _: normalize(GymEnv(
            env, max_episode_length=max_episode_length),
                                         expected_action_scale=10.))
    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   n_test_tasks=1,
                                   n_test_episodes=10)
    trainer = Trainer(snapshot_config)
    algo = MAMLVPG(env=self.env,
                   policy=self.policy,
                   task_sampler=self.task_sampler,
                   value_function=self.value_function,
                   meta_batch_size=5,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)
    trainer.setup(algo, self.env, sampler_cls=LocalSampler)
    last_avg_ret = trainer.train(n_epochs=10,
                                 batch_size=episodes_per_task *
                                 max_episode_length)
    assert last_avg_ret > -5
def maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task,
                                 meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    ml1 = metaworld.ML1('push-v1')
    tasks = MetaWorldTaskSampler(ml1, 'train')
    env = tasks.sample(1)[0]()
    test_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                  env=MetaWorldSetTaskEnv(ml1, 'test'))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=[32, 32],
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   n_test_tasks=1,
                                   n_exploration_eps=rollouts_per_task)
    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=meta_batch_size)
    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    sampler=sampler,
                    task_sampler=tasks,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=rollouts_per_task * env.spec.max_episode_length)
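# Hedged usage sketch: how a launcher like the one above is typically exposed
# as a command-line entry point, with click parsing the hyperparameters and
# garage's wrap_experiment injecting `ctxt`. click and wrap_experiment are
# real APIs; the option defaults below are illustrative assumptions, not
# values taken from this file.
import click

from garage import wrap_experiment


@click.command()
@click.option('--seed', default=1)
@click.option('--epochs', default=300)
@click.option('--rollouts_per_task', default=10)
@click.option('--meta_batch_size', default=20)
@wrap_experiment(snapshot_mode='all')
def run_maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task,
                                     meta_batch_size):
    """Forward the injected ExperimentContext to the launcher above."""
    maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task,
                                 meta_batch_size)


if __name__ == '__main__':
    # pylint: disable=no-value-for-parameter
    run_maml_trpo_metaworld_ml1_push()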
def test_maml_trpo_dummy_named_env():
    """Test with dummy environment that has env_name."""
    env = normalize(GymEnv(DummyMultiTaskBoxEnv(), max_episode_length=100),
                    expected_action_scale=10.)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32))
    task_sampler = SetTaskSampler(
        DummyMultiTaskBoxEnv,
        wrapper=lambda env, _: normalize(GymEnv(env, max_episode_length=100),
                                         expected_action_scale=10.))
    episodes_per_task = 2
    max_episode_length = env.spec.max_episode_length
    trainer = Trainer(snapshot_config)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    task_sampler=task_sampler,
                    value_function=value_function,
                    meta_batch_size=5,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1)
    trainer.setup(algo, env, sampler_cls=LocalSampler)
    trainer.train(n_epochs=2,
                  batch_size=episodes_per_task * max_episode_length)
def setup_method(self):
    """Setup method which is called before every test."""
    self.env = normalize(GymEnv(HalfCheetahDirEnv(), max_episode_length=100),
                         expected_action_scale=10.)
    task_sampler = SetTaskSampler(lambda: normalize(
        GymEnv(HalfCheetahDirEnv()), expected_action_scale=10.))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec,
                                                   hidden_sizes=(32, 32))
    self.algo = MAMLPPO(env=self.env,
                        policy=self.policy,
                        sampler=None,
                        task_sampler=task_sampler,
                        value_function=self.value_function,
                        meta_batch_size=5,
                        discount=0.99,
                        gae_lambda=1.,
                        inner_lr=0.1,
                        num_grad_updates=1)
def maml_trpo_metaworld_ml45(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    ml45 = metaworld.ML45()

    # pylint: disable=missing-return-doc,missing-return-type-doc
    def wrap(env, _):
        return normalize(env, expected_action_scale=10.0)

    train_task_sampler = MetaWorldTaskSampler(ml45, 'train', wrap)
    test_env = wrap(MetaWorldSetTaskEnv(ml45, 'test'), None)
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=test_env,
                                       wrapper=wrap)
    env = train_task_sampler.sample(45)[0]()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)
    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    task_sampler=train_task_sampler,
                    policy=policy,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)
    trainer.setup(algo, env, n_workers=meta_batch_size)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)
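# Hedged sketch of resuming the ML45 run above from its snapshots using
# garage's Trainer.restore / Trainer.resume; treat the exact resume
# semantics as an assumption, and note that `saved_dir` is only a
# placeholder path, not one produced by this file.
from garage import wrap_experiment
from garage.trainer import Trainer


@wrap_experiment
def resume_maml_trpo_metaworld_ml45(
        ctxt, saved_dir='data/local/experiment/maml_trpo_metaworld_ml45'):
    """Restore the snapshotted algorithm and continue training."""
    trainer = Trainer(ctxt)
    trainer.restore(saved_dir)
    trainer.resume()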
def test_maml_trpo_pendulum():
    """Test MAML-TRPO with the HalfCheetahDir environment."""
    episodes_per_task = 5
    max_episode_length = 100
    env = normalize(GymEnv(HalfCheetahDirEnv(),
                           max_episode_length=max_episode_length),
                    expected_action_scale=10.)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32))
    task_sampler = SetTaskSampler(
        HalfCheetahDirEnv,
        wrapper=lambda env, _: normalize(GymEnv(
            env, max_episode_length=max_episode_length),
                                         expected_action_scale=10.))
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length)
    trainer = Trainer(snapshot_config)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    sampler=sampler,
                    task_sampler=task_sampler,
                    value_function=value_function,
                    meta_batch_size=5,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1)
    trainer.setup(algo, env)
    last_avg_ret = trainer.train(n_epochs=5,
                                 batch_size=episodes_per_task *
                                 max_episode_length)
    assert last_avg_ret > -5

    env.close()
def setup_method(self):
    """Setup method which is called before every test."""
    self.env = normalize(GymEnv(HalfCheetahDirEnv(), max_episode_length=100),
                         expected_action_scale=10.)
    self.task_sampler = SetTaskSampler(
        HalfCheetahDirEnv,
        wrapper=lambda env, _: normalize(GymEnv(env, max_episode_length=100),
                                         expected_action_scale=10.))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec,
                                                   hidden_sizes=(32, 32))
def rl2_ppo_metaworld_ml1_push(ctxt, seed, meta_batch_size, n_epochs,
                               episode_per_task):
    """Train RL2 PPO with ML1 environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    ml1 = metaworld.ML1('push-v1')
    task_sampler = MetaWorldTaskSampler(ml1, 'train',
                                        lambda env, _: RL2Env(env))
    env = task_sampler.sample(1)[0]()
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=MetaWorldSetTaskEnv(ml1, 'test'),
                                       wrapper=lambda env, _: RL2Env(env))
    env_spec = env.spec

    with TFTrainer(snapshot_config=ctxt) as trainer:
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=task_sampler,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32,
                                          max_optimization_epochs=10),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      meta_evaluator=meta_evaluator,
                      episodes_per_trial=episode_per_task)

        trainer.setup(algo,
                      task_sampler.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task *
                      env_spec.max_episode_length * meta_batch_size)
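# Worked example of the batch_size arithmetic passed to trainer.train above:
# it counts environment steps per epoch across the whole meta-batch. The
# numbers below are illustrative assumptions, not the benchmark's defaults.
episode_per_task = 10      # episodes collected per task per trial
max_episode_length = 500   # assumed horizon of the wrapped ML1 env
meta_batch_size = 20       # tasks sampled per meta-batch
batch_size = episode_per_task * max_episode_length * meta_batch_size
print(batch_size)          # 100000 environment steps per epoch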