def _prepare_meta_env(env):
    """Return a sampled environment and its task sampler for the benchmark.

    Relies on the module-level flags ``ML`` and ``env_ind``; only
    ``env_ind`` values 2, 3 and 4 are handled when ``ML`` is set.
    """
    if ML:
        if env_ind == 2:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('push-v1'),
                               random_init=False))
        elif env_ind == 3:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('reach-v1'),
                               random_init=False))
        elif env_ind == 4:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('pick-place-v1'),
                               random_init=False))
    else:
        task_samplers = task_sampler.SetTaskSampler(lambda: RL2Env(env()))
    return task_samplers.sample(1)[0](), task_samplers
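
# A minimal usage sketch for the helper above, assuming (as the code implies)
# that `ML` and `env_ind` are module-level globals set by the surrounding
# benchmark script. The values below are illustrative, not from the source.
ML = True
env_ind = 2  # 2 -> push-v1, 3 -> reach-v1, 4 -> pick-place-v1 when ML is True

# When ML is True the `env` argument is ignored; otherwise it must be a
# zero-argument callable returning the (non-Meta-World) environment.
meta_env, task_samplers = _prepare_meta_env(env=None)
env_spec = meta_env.spec  # spec of the concrete RL2Env, used to build policies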
def rl2_ppo_metaworld_ml1_push(ctxt, seed, max_path_length, meta_batch_size,
                               n_epochs, episode_per_task):
    """Train RL2-PPO on the Meta-World ML1 push-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=mwb.ML1.get_train_tasks('push-v1')))

        env_spec = RL2Env(env=mwb.ML1.get_train_tasks('push-v1')).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=max_path_length * episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
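
# Hedged launch sketch for the launcher above. It assumes metarl exposes a
# wrap_experiment decorator (as garage does) that injects the
# ExperimentContext as `ctxt`; the import path and the hyperparameter values
# below are assumptions, not taken from the source.
from metarl import wrap_experiment  # assumed import path

experiment = wrap_experiment(rl2_ppo_metaworld_ml1_push)
experiment(seed=1,
           max_path_length=150,
           meta_batch_size=50,
           n_epochs=10,
           episode_per_task=10)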
def test_set_task_task_sampler_half_cheetah_vel_env():
    """SetTaskSampler over HalfCheetahVelEnv yields varied tasks and reuses envs."""
    tasks = task_sampler.SetTaskSampler(HalfCheetahVelEnv)
    assert tasks.n_tasks is None
    updates = tasks.sample(10)
    envs = [update() for update in updates]
    action = envs[0].action_space.sample()
    rewards = [env.step(action)[1] for env in envs]
    # Different velocity targets should give different rewards for the same action.
    assert np.var(rewards) > 0
    env = envs[0]
    env.close = unittest.mock.MagicMock(name='env.close')
    # Applying an update to an existing environment must reuse it, not close it.
    updates[-1](env)
    env.close.assert_not_called()
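
# The last three lines of the test capture the sampler's contract: a sampled
# update either constructs a fresh environment or mutates an existing one in
# place (setting the new task) without closing it. A minimal sketch of both
# call forms, mirroring the test above:
updates = task_sampler.SetTaskSampler(HalfCheetahVelEnv).sample(2)
fresh_env = updates[0]()             # no argument: build a new environment
reused_env = updates[1](fresh_env)   # pass an env: reuse it for the new task
                                     # (close() is never called on it)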
def setup_method(self):
    super().setup_method()
    self.max_path_length = 100
    self.meta_batch_size = 10
    self.episode_per_task = 4
    self.tasks = task_sampler.SetTaskSampler(
        lambda: RL2Env(env=normalize(HalfCheetahDirEnv())))
    self.env_spec = RL2Env(env=normalize(HalfCheetahDirEnv())).spec
    self.policy = GaussianGRUPolicy(env_spec=self.env_spec,
                                    hidden_dim=64,
                                    state_include_action=False)
    self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)
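
# Hypothetical test body built on the fixtures above, sketched from the
# launcher pattern used elsewhere in this section. `snapshot_config` is an
# assumed test fixture; the single epoch makes this only a smoke check.
def test_rl2_ppo_smoke(self):
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        algo = RL2PPO(rl2_max_path_length=self.max_path_length,
                      meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      discount=0.99,
                      max_path_length=self.max_path_length *
                      self.episode_per_task)
        runner.setup(algo,
                     self.tasks.sample(self.meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=self.meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(
                         n_paths_per_trial=self.episode_per_task))
        runner.train(n_epochs=1,
                     batch_size=self.episode_per_task *
                     self.max_path_length * self.meta_batch_size)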
def rl2_trpo_halfcheetah(ctxt, seed, max_path_length, meta_batch_size,
                         n_epochs, episode_per_task):
    """Train RL2-TRPO on the HalfCheetahVel environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=HalfCheetahVelEnv()))

        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2TRPO(rl2_max_path_length=max_path_length,
                       meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_path_length * episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(
                           hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
def test_set_task_task_sampler_ml10():
    """SetTaskSampler over Meta-World ML10 exposes its 10 training tasks."""
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import ML10
    tasks = task_sampler.SetTaskSampler(ML10.get_train_tasks)
    assert tasks.n_tasks == 10
    updates = tasks.sample(3)
    envs = [update() for update in updates]
    action = envs[0].action_space.sample()
    rewards = [env.step(action)[1] for env in envs]
    # Different tasks should give different rewards for the same action.
    assert np.var(rewards) > 0
    env = envs[0]
    env.close = unittest.mock.MagicMock(name='env.close')
    # Applying an update to an existing environment must reuse it, not close it.
    updates[-1](env)
    env.close.assert_not_called()
def rl2_ppo_halfcheetah(ctxt=None, seed=1):
    """Train RL2-PPO on a meta-RL environment.

    By default this launcher uses the Meta-World ML1 push-v1 task set; the
    HalfCheetahVel alternative is left commented out in the body.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        # ---- For ML1-push
        from metaworld.benchmarks import ML1
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=ML1.get_train_tasks('push-v1')))

        # ---- For HalfCheetahVel
        # tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
        #     env=HalfCheetahVelEnv()))

        env_spec = tasks.sample(1)[0]().spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        inner_algo = RL2PPO(env_spec=env_spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=max_path_length *
                            episode_per_task,
                            discount=0.99,
                            gae_lambda=0.95,
                            lr_clip_range=0.2,
                            optimizer_args=dict(
                                batch_size=32,
                                max_epochs=10,
                            ),
                            stop_entropy_gradient=True,
                            entropy_method='max',
                            policy_ent_coeff=0.02,
                            center_adv=False)

        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   task_sampler=tasks)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
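
# Arithmetic implied by the hard-coded hyperparameters in the launcher above:
# each epoch gathers episode_per_task * max_path_length * meta_batch_size
# transitions, i.e. 4 * 100 * 10 = 4000 environment steps, with every one of
# the 10 sampled tasks contributing 4 episodes of at most 100 steps.
batch_size = 4 * 100 * 10  # = 4000 steps per training epoch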