Esempio n. 1
0
def test_env_pool_sampler():
    """Exercise EnvPoolSampler over the medium-mode training environments.

    Checks the reported number of tasks, that sampling the whole pool hands
    back every constructed environment, that the two unsupported requests
    raise ValueError, and that a larger sample works after growing the pool.
    """
    # The metaworld import and the environment construction both happen
    # inside the test so no resources are used unless this test runs.
    # pylint: disable=import-outside-toplevel
    from metaworld.envs.mujoco.env_dict import MEDIUM_MODE_ARGS_KWARGS
    from metaworld.envs.mujoco.env_dict import MEDIUM_MODE_CLS_DICT

    # Build one environment per training task from its class and the
    # matching positional / keyword constructor arguments.
    train_envs = []
    for task_name, env_cls in MEDIUM_MODE_CLS_DICT['train'].items():
        ctor_spec = MEDIUM_MODE_ARGS_KWARGS['train'][task_name]
        train_envs.append(env_cls(*ctor_spec['args'], **ctor_spec['kwargs']))

    sampler = task_sampler.EnvPoolSampler(train_envs)
    assert sampler.n_tasks == 10

    # Each update is callable and returns an environment; every original
    # environment object must be returned by at least one of them.
    env_updates = sampler.sample(10)
    for expected in train_envs:
        assert any(make_env() is expected for make_env in env_updates)

    # Sampling with replacement is expected to be rejected.
    with pytest.raises(ValueError):
        sampler.sample(10, with_replacement=True)

    # Asking for more environments than the pool holds is rejected too.
    with pytest.raises(ValueError):
        sampler.sample(11)

    # After growing the pool, the larger request goes through.
    sampler.grow_pool(20)
    sampler.sample(20)
Esempio n. 2
0
def test_env_pool_sampler():
    """Exercise EnvPoolSampler over a pool of five HalfCheetahVelEnv objects.

    Checks the reported number of tasks, that sampling the whole pool hands
    back every constructed environment, that the two unsupported requests
    raise ValueError, and that a larger sample works after growing the pool.
    """
    pool = []
    for _ in range(5):
        pool.append(HalfCheetahVelEnv())

    sampler = task_sampler.EnvPoolSampler(pool)
    assert sampler.n_tasks == 5

    # Each update is callable and returns an environment; every original
    # environment object must be returned by at least one of them.
    env_updates = sampler.sample(5)
    for expected in pool:
        assert any(make_env() is expected for make_env in env_updates)

    # Sampling with replacement is expected to be rejected.
    with pytest.raises(ValueError):
        sampler.sample(5, with_replacement=True)

    # Asking for more environments than the pool holds is rejected too.
    with pytest.raises(ValueError):
        sampler.sample(6)

    # After growing the pool, the larger request goes through.
    sampler.grow_pool(10)
    sampler.sample(10)
Esempio n. 3
0
    def test_rl2_ppo_ml10(self):
        """Run RL2PPO for a single epoch on the ML10 training environments.

        One RL2Env is built per ML10 training task and pooled in an
        EnvPoolSampler, which is grown to ``self.meta_batch_size`` so that a
        full meta batch can be sampled from it. The same sampler is given to
        the algorithm and used to provide the runner's initial environments.
        """
        # Imported here so metaworld is only loaded when this test runs.
        # pylint: disable=import-outside-toplevel
        from metaworld.benchmarks import ML10
        ml10_train_envs = [
            RL2Env(ML10.from_task(task_name))
            for task_name in ML10.get_train_tasks().all_task_names
        ]
        tasks = task_sampler.EnvPoolSampler(ml10_train_envs)
        tasks.grow_pool(self.meta_batch_size)

        # The policy and baseline are sized from the first ML10 environment.
        env_spec = ml10_train_envs[0].spec
        policy = GaussianGRUPolicy(env_spec=env_spec,
                                   hidden_dim=64,
                                   state_include_action=False,
                                   name='policy')
        baseline = LinearFeatureBaseline(env_spec=env_spec)
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = RL2PPO(rl2_max_path_length=self.max_path_length,
                          meta_batch_size=self.meta_batch_size,
                          task_sampler=tasks,
                          env_spec=env_spec,
                          policy=policy,
                          baseline=baseline,
                          discount=0.99,
                          gae_lambda=0.95,
                          lr_clip_range=0.2,
                          stop_entropy_gradient=True,
                          entropy_method='max',
                          policy_ent_coeff=0.02,
                          center_adv=False,
                          max_path_length=self.max_path_length *
                          self.episode_per_task)

            # Use the ML10 sampler built above rather than ``self.tasks``:
            # the environments handed to the runner then match ``env_spec``
            # and the sampler the algorithm was constructed with.
            runner.setup(
                algo,
                tasks.sample(self.meta_batch_size),
                sampler_cls=LocalSampler,
                n_workers=self.meta_batch_size,
                worker_class=RL2Worker,
                worker_args=dict(n_paths_per_trial=self.episode_per_task))

            runner.train(n_epochs=1,
                         batch_size=self.episode_per_task *
                         self.max_path_length * self.meta_batch_size)
Esempio n. 4
0
def test_env_pool_sampler():
    """Exercise EnvPoolSampler over the ML10 benchmark training environments.

    Checks the reported number of tasks, that sampling the whole pool hands
    back every constructed environment, that the two unsupported requests
    raise ValueError, and that a larger sample works after growing the pool.
    """
    # The metaworld import and the environment construction both happen
    # inside the test so no resources are used unless this test runs.
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import ML10

    task_names = ML10.get_train_tasks().all_task_names
    train_envs = []
    for task_name in task_names:
        train_envs.append(ML10.from_task(task_name))

    sampler = task_sampler.EnvPoolSampler(train_envs)
    assert sampler.n_tasks == 10

    # Each update is callable and returns an environment; every original
    # environment object must be returned by at least one of them.
    env_updates = sampler.sample(10)
    for expected in train_envs:
        assert any(make_env() is expected for make_env in env_updates)

    # Sampling with replacement is expected to be rejected.
    with pytest.raises(ValueError):
        sampler.sample(10, with_replacement=True)

    # Asking for more environments than the pool holds is rejected too.
    with pytest.raises(ValueError):
        sampler.sample(11)

    # After growing the pool, the larger request goes through.
    sampler.grow_pool(20)
    sampler.sample(20)
Esempio n. 5
0
def rl2_ppo_metaworld_ml45(ctxt, seed, max_path_length, meta_batch_size,
                           n_epochs, episode_per_task):
    """Train RL2PPO on the ML45 training environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episode per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        # One RL2-wrapped environment per ML45 training task.
        train_task_names = mwb.ML45.get_train_tasks().all_task_names
        train_envs = []
        for task_name in train_task_names:
            train_envs.append(RL2Env(mwb.ML45.from_task(task_name)))

        # Grow the pool so a full meta batch can be sampled from it.
        tasks = task_sampler.EnvPoolSampler(train_envs)
        tasks.grow_pool(meta_batch_size)

        # The policy and baseline are sized from the first environment.
        env_spec = train_envs[0].spec

        policy = GaussianGRUPolicy(env_spec=env_spec,
                                   hidden_dim=64,
                                   state_include_action=False,
                                   name='policy')

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        optimizer_settings = {'batch_size': 32, 'max_epochs': 10}
        # A trial spans every episode collected for one task.
        trial_length = max_path_length * episode_per_task

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=optimizer_settings,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=trial_length)

        worker_settings = {'n_paths_per_trial': episode_per_task}
        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=worker_settings)

        # Per-epoch budget: one full trial for each task in the meta batch.
        steps_per_epoch = (episode_per_task * max_path_length *
                           meta_batch_size)
        runner.train(n_epochs=n_epochs, batch_size=steps_per_epoch)
def rl2_ppo_metaworld_ml10_meta_test(ctxt, seed, meta_batch_size, n_epochs,
                                     episode_per_task):
    """Train RL2PPO on ML10 training tasks, evaluating on ML10 test tasks.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~LocalRunner` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episode per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        # Length of one episode, and of a whole trial made of
        # ``episode_per_task`` such episodes.
        single_episode_length = 150
        inner_max_episode_length = single_episode_length * episode_per_task

        # NOTE(review): the training environments are wrapped without an
        # explicit max_episode_length, unlike the test environments below
        # -- confirm this asymmetry is intended.
        train_envs = []
        for task_name in mwb.ML10.get_train_tasks().all_task_names:
            train_envs.append(RL2Env(GymEnv(mwb.ML10.from_task(task_name))))

        # Grow the pool so a full meta batch can be sampled from it.
        tasks = task_sampler.EnvPoolSampler(train_envs)
        tasks.grow_pool(meta_batch_size)

        test_envs = []
        for task_name in mwb.ML10.get_test_tasks().all_task_names:
            wrapped = GymEnv(mwb.ML10.from_task(task_name),
                             max_episode_length=inner_max_episode_length)
            test_envs.append(RL2Env(wrapped))
        test_tasks = task_sampler.EnvPoolSampler(test_envs)

        # From here on the episode length is whatever the first training
        # environment's spec reports, not the constant used above.
        env_spec = train_envs[0].spec
        max_episode_length = env_spec.max_episode_length

        policy = GaussianGRUPolicy(env_spec=env_spec,
                                   hidden_dim=64,
                                   state_include_action=False,
                                   name='policy')

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        meta_evaluator = MetaEvaluator(test_task_sampler=test_tasks,
                                       n_exploration_eps=10,
                                       n_test_episodes=10,
                                       max_episode_length=max_episode_length,
                                       n_test_tasks=5)

        optimizer_settings = {'batch_size': 32, 'max_optimization_epochs': 10}

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=optimizer_settings,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)

        worker_settings = {'n_episodes_per_trial': episode_per_task}
        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=worker_settings)

        # Per-epoch budget: one full trial for each task in the meta batch.
        steps_per_epoch = (episode_per_task * max_episode_length *
                           meta_batch_size)
        runner.train(n_epochs=n_epochs, batch_size=steps_per_epoch)