Code Example #1
def _prepare_meta_env(env):
    """Build a task sampler and instantiate one environment from it.

    Relies on module-level globals: ``ML`` selects a Meta-World ML1
    benchmark and ``env_ind`` picks the ML1 task.
    """
    if ML:
        if env_ind == 2:
            task_samplers = task_sampler.SetTaskSampler(lambda: RL2Env(
                ML1.get_train_tasks('push-v1'), random_init=False))
        elif env_ind == 3:
            task_samplers = task_sampler.SetTaskSampler(lambda: RL2Env(
                ML1.get_train_tasks('reach-v1'), random_init=False))
        elif env_ind == 4:
            task_samplers = task_sampler.SetTaskSampler(lambda: RL2Env(
                ML1.get_train_tasks('pick-place-v1'), random_init=False))
        else:
            raise ValueError('Unsupported env_ind: {}'.format(env_ind))
    else:
        task_samplers = task_sampler.SetTaskSampler(lambda: RL2Env(env()))
    return task_samplers.sample(1)[0](), task_samplers
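The helper returns both a concrete environment (built by sampling one task and calling its update) and the sampler itself, so a caller can read off an env spec and keep drawing fresh tasks. A minimal usage sketch, assuming an environment constructor such as HalfCheetahVelEnv is in scope (the names below are illustrative, not from the source):

env, task_samplers = _prepare_meta_env(HalfCheetahVelEnv)
env_spec = env.spec  # spec of the environment built for one sampled task
more_envs = [update() for update in task_samplers.sample(4)]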
Code Example #2
def rl2_ppo_metaworld_ml1_push(ctxt, seed, max_path_length, meta_batch_size,
                               n_epochs, episode_per_task):
    """Train PPO with ML1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
            env=mwb.ML1.get_train_tasks('push-v1')))

        env_spec = RL2Env(env=mwb.ML1.get_train_tasks('push-v1')).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=max_path_length * episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
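Note the sample-budget arithmetic in the final runner.train() call: each of the meta_batch_size tasks contributes episode_per_task rollouts of up to max_path_length steps, so one epoch consumes episode_per_task * max_path_length * meta_batch_size transitions. With the values used in Code Example #7 (4 episodes per task, path length 100, meta batch size 10) that works out to 4 * 100 * 10 = 4000 samples per epoch.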
Code Example #3
def test_set_task_task_sampler_half_cheetah_vel_env():
    # HalfCheetahVelEnv draws its goal velocity at random and exposes no
    # fixed task count, so the sampler reports n_tasks as None.
    tasks = task_sampler.SetTaskSampler(HalfCheetahVelEnv)
    assert tasks.n_tasks is None
    updates = tasks.sample(10)
    envs = [update() for update in updates]
    # The same action should earn different rewards under different tasks.
    action = envs[0].action_space.sample()
    rewards = [env.step(action)[1] for env in envs]
    assert np.var(rewards) > 0
    # Re-tasking an existing environment must not close it.
    env = envs[0]
    env.close = unittest.mock.MagicMock(name='env.close')
    updates[-1](env)
    env.close.assert_not_called()
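The test pins down the contract of the update callables returned by sample(): invoked with no argument they construct a new environment for their task; invoked with an existing environment they swap the task in place instead of closing and rebuilding it. A minimal sketch of that contract (illustrative only):

update = tasks.sample(1)[0]
env = update()  # no argument: build a fresh environment for the task
update(env)     # existing env: re-tasked in place; close() is never called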
Code Example #4
def setup_method(self):
    super().setup_method()
    self.max_path_length = 100
    self.meta_batch_size = 10
    self.episode_per_task = 4
    self.tasks = task_sampler.SetTaskSampler(
        lambda: RL2Env(env=normalize(HalfCheetahDirEnv())))
    self.env_spec = RL2Env(env=normalize(HalfCheetahDirEnv())).spec
    self.policy = GaussianGRUPolicy(env_spec=self.env_spec,
                                    hidden_dim=64,
                                    state_include_action=False)
    self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)
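These fixtures mirror the launcher in Code Example #2, factored out for reuse across tests. A test method built on them might assemble and run the algorithm the same way; the sketch below is an assumption based on that launcher (snapshot_config stands in for whatever snapshot configuration the test harness provides, and RL2PPO's remaining arguments are left at their defaults):

def test_rl2_ppo(self):
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        algo = RL2PPO(rl2_max_path_length=self.max_path_length,
                      meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      discount=0.99,
                      max_path_length=self.max_path_length *
                      self.episode_per_task)
        runner.setup(algo,
                     self.tasks.sample(self.meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=self.meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(
                         n_paths_per_trial=self.episode_per_task))
        runner.train(n_epochs=1,
                     batch_size=self.episode_per_task *
                     self.max_path_length * self.meta_batch_size)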
Code Example #5
def rl2_trpo_halfcheetah(ctxt, seed, max_path_length, meta_batch_size,
                         n_epochs, episode_per_task):
    """Train TRPO with HalfCheetah environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=HalfCheetahVelEnv()))

        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2TRPO(rl2_max_path_length=max_path_length,
                       meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_path_length * episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                           base_eps=1e-5)))

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
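The structure is identical to the PPO launcher in Code Example #2; what changes are the trust-region pieces: RL2TRPO takes a max_kl_step bound and a ConjugateGradientOptimizer whose Hessian-vector products are approximated by finite differences (FiniteDifferenceHvp with base_eps=1e-5), in place of PPO's clipped surrogate objective and batched first-order optimizer.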
Code Example #6
def test_set_task_task_sampler_ml10():
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import ML10
    tasks = task_sampler.SetTaskSampler(ML10.get_train_tasks)
    assert tasks.n_tasks == 10
    updates = tasks.sample(3)
    envs = [update() for update in updates]
    action = envs[0].action_space.sample()
    rewards = [env.step(action)[1] for env in envs]
    assert np.var(rewards) > 0
    env = envs[0]
    env.close = unittest.mock.MagicMock(name='env.close')
    updates[-1](env)
    env.close.assert_not_called()
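In contrast to the HalfCheetahVelEnv sampler of Code Example #3, ML10 exposes a fixed set of ten training tasks, so n_tasks reports 10 instead of None; the rest of the contract being tested (distinct rewards across sampled tasks, in-place re-tasking without a close() call) is the same.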
Code Example #7
def rl2_ppo_halfcheetah(ctxt=None, seed=1):
    """Train PPO with HalfCheetah environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        # ---- For ML1-push
        from metaworld.benchmarks import ML1
        tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
            env=ML1.get_train_tasks('push-v1')))

        # ---- For HalfCheetahVel
        # tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
        #     env=HalfCheetahVelEnv()))

        env_spec = tasks.sample(1)[0]().spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        inner_algo = RL2PPO(
            env_spec=env_spec,
            policy=policy,
            baseline=baseline,
            max_path_length=max_path_length * episode_per_task,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   task_sampler=tasks)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
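Unlike Code Example #2, which passes the meta-learning arguments directly to RL2PPO, this launcher uses a compositional form of the same API: RL2PPO is configured as an inner_algo and wrapped by RL2, which owns the per-trial max_path_length, the meta_batch_size, and the task sampler. Here env_spec is also obtained more economically, by sampling a single task rather than constructing a separate throwaway environment. The epoch budget works out as before: 4 * 100 * 10 = 4000 transitions per epoch.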