Example #1
def maml_trpo_metaworld_ml10(ctxt, seed, epochs, rollouts_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = GarageEnv(
        normalize(mwb.ML10.get_train_tasks(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_episode_length = 100

    test_task_names = mwb.ML10.get_test_tasks().all_task_names
    test_tasks = [
        GarageEnv(
            normalize(mwb.ML10.from_task(task), expected_action_scale=10.))
        for task in test_task_names
    ]
    test_sampler = EnvPoolSampler(test_tasks)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_episode_length=max_episode_length,
                                   n_test_tasks=len(test_task_names))

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_episode_length=max_episode_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_episode_length)
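
The function above only defines the experiment; it still needs an entry point that supplies the ctxt argument. Below is a minimal launch sketch assuming garage's wrap_experiment decorator; the hyperparameter values are illustrative and not taken from the original example.

# Minimal launch sketch (assumed entry point; values are illustrative and the
# wrap_experiment import path can differ between garage versions).
from garage import wrap_experiment


@wrap_experiment
def launch_maml_trpo_ml10(ctxt=None):
    maml_trpo_metaworld_ml10(ctxt,
                             seed=1,
                             epochs=300,
                             rollouts_per_task=10,
                             meta_batch_size=20)


if __name__ == '__main__':
    launch_maml_trpo_ml10()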
Example #2
def test_init_with_env_updates(policy, envs):
    """Check that one VecWorker batching N_TRAJ envs matches N_TRAJ LocalSampler workers."""
    task_sampler = EnvPoolSampler(envs)
    envs = task_sampler.sample(N_TRAJ)
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, [policy],
                                                   [envs])
    n_samples = 100
    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)

    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
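
The test above relies on module-level constants (N_TRAJ, MAX_PATH_LENGTH) and an assert_trajs_eq helper defined elsewhere in the test module. A minimal sketch of what such a helper might check over garage TrajectoryBatch objects follows; it is an assumption for readability, not the repository's actual helper.

import numpy as np


def assert_trajs_eq(true_trajs, test_trajs):
    """Illustrative check that two TrajectoryBatch objects hold the same rollouts."""
    # Same number of trajectories with identical per-trajectory lengths.
    assert np.array_equal(true_trajs.lengths, test_trajs.lengths)
    # Identical transition data across the batch.
    assert np.allclose(true_trajs.observations, test_trajs.observations)
    assert np.allclose(true_trajs.actions, test_trajs.actions)
    assert np.allclose(true_trajs.rewards, test_trajs.rewards)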
def pearl_metaworld_ml10(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=10,
                         num_test_tasks=5,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         max_path_length=150,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL with ML10 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task before
            training.
        num_tasks_sample (int): Number of random tasks to obtain data for each
            iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        max_path_length (int): Maximum path length.
        reward_scale (float): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    ML_train_envs = [
        GarageEnv(normalize(mwb.ML10.from_task(task_name)))
        for task_name in mwb.ML10.get_train_tasks().all_task_names
    ]

    ML_test_envs = [
        GarageEnv(normalize(mwb.ML10.from_task(task_name)))
        for task_name in mwb.ML10.get_test_tasks().all_task_names
    ]

    env_sampler = EnvPoolSampler(ML_train_envs)
    env_sampler.grow_pool(num_train_tasks)
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = EnvPoolSampler(ML_test_envs)
    test_env_sampler.grow_pool(num_test_tasks)

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=num_epochs, batch_size=batch_size)
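
One detail worth noting: EnvPoolSampler.sample returns a list of environment-update callables rather than environment instances, which is why the code above calls env[0]() before building the network specs. A small self-contained sketch of that behaviour is shown below; the import paths are assumptions matching the garage/metaworld versions this snippet appears to target.

# Sketch: EnvPoolSampler hands back callables that materialize environments on demand.
import metaworld.benchmarks as mwb
from garage.envs import GarageEnv, normalize
from garage.experiment.task_sampler import EnvPoolSampler

train_envs = [
    GarageEnv(normalize(mwb.ML10.from_task(name)))
    for name in mwb.ML10.get_train_tasks().all_task_names
]
updates = EnvPoolSampler(train_envs).sample(len(train_envs))
first_env = updates[0]()              # calling the update yields a usable environment
print(first_env.spec.action_space)    # this spec is what PEARL.augment_env_spec consumes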
Example #4
def diayn_pearl_half_cheeth(
        ctxt=None,
        seed=1,
        num_epochs=param_num_epoches,
        num_train_tasks=param_train_tasks_num,
        num_test_tasks=param_test_tasks_num,
        latent_size=param_latent_size,
        encoder_hidden_size=param_encoder_hidden_size,
        net_size=param_net_size,
        meta_batch_size=param_meta_batch_size,
        num_steps_per_epoch=param_num_steps_per_epoch,
        num_initial_steps=param_num_initial_steps,
        num_tasks_sample=param_num_tasks_sample,
        num_steps_prior=param_num_steps_prior,
        num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
        batch_size=param_batch_size,
        embedding_batch_size=param_embedding_batch_size,
        embedding_mini_batch_size=param_embedding_mini_batch_size,
        max_path_length=param_max_path_length,
        reward_scale=param_reward_scale,
        use_gpu=param_use_gpu):
    """Train PEARL on DIAYN skill environments derived from HalfCheetahVelEnv."""
    if task_proposer is None:
        raise ValueError("Task proposer is empty")

    assert num_train_tasks == skills_num

    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks

    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    # train_trajs_dist = [train_env.get_training_traj(diayn_trained_agent)
    #               for train_env in ML_train_envs]

    # ML_test_envs = [
    #     GarageEnv(normalize(
    #         DiaynEnvWrapper(env, task_proposer, skills_num, task_name)))
    #     for task_name in random.sample(range(skills_num), test_tasks_num)
    # ]

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
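
Both this example and the next one read their hyperparameters and DIAYN artifacts from module-level globals that are not part of the snippet. The preamble below shows what those definitions might look like; the names come from the code above, but every value is an assumption.

# Assumed module-level configuration; all values are illustrative, not from the source.
skills_num = 10                      # number of DIAYN skills, reused as the training-task count
task_proposer = None                 # presumably a trained DIAYN discriminator consumed by
                                     # DiaynEnvWrapper; must be set before calling the function

param_num_epoches = 500
param_train_tasks_num = skills_num
param_test_tasks_num = 5
param_latent_size = 5
param_encoder_hidden_size = 200
param_net_size = 300
param_meta_batch_size = 16
param_num_steps_per_epoch = 2000
param_num_initial_steps = 2000
param_num_tasks_sample = 5
param_num_steps_prior = 400
param_num_extra_rl_steps_posterior = 600
param_batch_size = 256
param_embedding_batch_size = 100
param_embedding_mini_batch_size = 100
param_max_path_length = 200
param_reward_scale = 5.
param_use_gpu = False

# The next example additionally expects: seed, skill_env, skill_actor,
# param_num_skills_sample, param_num_skills_reason_steps,
# param_skills_reason_reward_scale and param_tasks_adapt_reward_scale.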
def meta_kant_cheetah_vel(
        ctxt=None,
        seed=seed,
        num_skills=skills_num,
        num_epochs=param_num_epoches,
        num_train_tasks=param_train_tasks_num,
        num_test_tasks=param_test_tasks_num,
        is_encoder_recurrent=False,
        latent_size=param_latent_size,
        encoder_hidden_size=param_encoder_hidden_size,
        net_size=param_net_size,
        meta_batch_size=param_meta_batch_size,
        num_steps_per_epoch=param_num_steps_per_epoch,
        num_initial_steps=param_num_initial_steps,
        num_tasks_sample=param_num_tasks_sample,
        num_steps_prior=param_num_steps_prior,
        num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
        num_skills_sample=param_num_skills_sample,
        num_skills_reason_steps=param_num_skills_reason_steps,
        batch_size=param_batch_size,
        embedding_batch_size=param_embedding_batch_size,
        embedding_mini_batch_size=param_embedding_mini_batch_size,
        max_path_length=param_max_path_length,
        skills_reason_reward_scale=param_skills_reason_reward_scale,
        tasks_adapt_reward_scale=param_tasks_adapt_reward_scale,
        use_gpu=param_use_gpu):
    """Train MetaKant on DIAYN skill environments derived from HalfCheetahVelEnv."""
    assert num_train_tasks == skills_num

    set_seed(seed)

    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)

    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]

    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    qf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, "qf")

    qf = ContinuousMLPQFunction(env_spec=qf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    controller_policy_env = MetaKant.get_env_spec(env[0](),
                                                  latent_size,
                                                  module="controller_policy",
                                                  num_skills=num_skills)

    controller_policy = CategoricalMLPPolicy(
        env_spec=controller_policy_env,
        hidden_sizes=[net_size, net_size],
        hidden_nonlinearity=functional.relu)

    metakant = MetaKant(
        env=env,
        skill_env=skill_env,
        controller_policy=controller_policy,
        skill_actor=skill_actor,
        qf=qf,
        vf=vf,
        num_skills=num_skills,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        sampler_class=LocalSkillSampler,
        is_encoder_recurrent=is_encoder_recurrent,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_per_epoch=num_steps_per_epoch,
        num_steps_prior=num_steps_prior,  # num_steps_posterior
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        num_skills_reason_steps=num_skills_reason_steps,
        num_skills_sample=num_skills_sample,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        skills_reason_reward_scale=skills_reason_reward_scale,
        tasks_adapt_reward_scale=tasks_adapt_reward_scale)

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        metakant.to()

    worker_args = dict(num_skills=num_skills,
                       skill_actor_class=type(skill_actor),
                       controller_class=OpenContextConditionedControllerPolicy,
                       deterministic=False,
                       accum_context=True)

    runner.setup(algo=metakant,
                 env=env[0](),
                 sampler_cls=LocalSkillSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=KantWorker,
                 worker_args=worker_args)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
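
The worker_args dict above is forwarded by garage's WorkerFactory to the worker constructor, so KantWorker has to accept those names as keyword arguments. The sketch below shows the constructor shape this implies; it is modeled on garage's PEARLWorker and is an assumption, not the project's actual KantWorker.

# Hypothetical constructor shape for a worker receiving the worker_args above,
# following garage's keyword-only worker convention (seed, max_path_length, worker_number).
from garage.sampler import DefaultWorker


class KantWorkerSketch(DefaultWorker):
    def __init__(self, *, seed, max_path_length, worker_number,
                 num_skills, skill_actor_class, controller_class,
                 deterministic=False, accum_context=True):
        super().__init__(seed=seed,
                         max_path_length=max_path_length,
                         worker_number=worker_number)
        # Extra options supplied through worker_args in runner.setup(...).
        self._num_skills = num_skills
        self._skill_actor_class = skill_actor_class
        self._controller_class = controller_class
        self._deterministic = deterministic
        self._accum_context = accum_context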