def maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task,
                                 meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)

    ml1 = metaworld.ML1('push-v1')
    tasks = MetaWorldTaskSampler(ml1, 'train')
    env = tasks.sample(1)[0]()
    test_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                  env=MetaWorldSetTaskEnv(ml1, 'test'))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=[32, 32],
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   n_test_tasks=1,
                                   n_exploration_eps=rollouts_per_task)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=meta_batch_size)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    sampler=sampler,
                    task_sampler=tasks,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=rollouts_per_task * env.spec.max_episode_length)
def maml_trpo_metaworld_ml45(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    ml45 = metaworld.ML45()

    # pylint: disable=missing-return-doc,missing-return-type-doc
    def wrap(env, _):
        return normalize(env, expected_action_scale=10.0)

    train_task_sampler = MetaWorldTaskSampler(ml45, 'train', wrap)
    test_env = wrap(MetaWorldSetTaskEnv(ml45, 'test'), None)
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=test_env,
                                       wrapper=wrap)
    env = train_task_sampler.sample(45)[0]()

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    task_sampler=train_task_sampler,
                    policy=policy,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env, n_workers=meta_batch_size)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)
Exemple #3
0
def mtppo_metaworld_mt50(ctxt, seed, epochs, batch_size, n_workers, n_tasks):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 50.

    """
    set_seed(seed)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)
    assert n_tasks % 50 == 0
    assert n_tasks <= 2500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers)

    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               sampler=sampler,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def sparse_mlp_mtsac_metaworld_mt1_pick_place(ctxt=None, *, seed, timesteps, _gpu):
    """Train MTSAC with the MT1 pick-place-v1 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        timesteps (int): Number of timesteps to run.
    """
    deterministic.set_seed(seed)
    mt1 = metaworld.MT1('pick-place-v1')
    mt1_test = metaworld.MT1('pick-place-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    test_task_sampler = MetaWorldTaskSampler(mt1_test, 'train',
                                             lambda env, _: normalize(env))
    n_tasks = 50
    train_envs = train_task_sampler.sample(n_tasks)
    env = train_envs[0]()
    test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    trainer = Trainer(ctxt)

    policy = TanhGaussianSparseMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
        mean_nonlinearity=None,
        std_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousSparseMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(400, 400, 400),
        linear_activity_percent_on=(0.1, 0.1, 0.1,),
        linear_weight_percent_on=(0.4, 0.4, 0.4,),
    )

    qf2 = ContinuousSparseMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(400, 400, 400),
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
    )

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    sampler = LocalSampler(agents=policy,
                           envs=train_envs,
                           max_episode_length=env.spec.max_episode_length,
                           n_workers=n_tasks,
                           worker_class=FragmentWorker)

    batch_size = int(env.spec.max_episode_length * n_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = SparseWeightsMTSAC(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        sampler=sampler,
        gradient_steps_per_itr=150,
        eval_env=test_envs,
        env_spec=env.spec,
        num_tasks=1,
        steps_per_epoch=epoch_cycles,
        replay_buffer=replay_buffer,
        min_buffer_size=1500,
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=train_envs)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def rl2_ppo_metaworld_ml1_push(ctxt, seed, meta_batch_size, n_epochs,
                               episode_per_task):
    """Train RL2 PPO with ML1 environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episode per task.

    """
    set_seed(seed)
    ml1 = metaworld.ML1('push-v1')

    task_sampler = MetaWorldTaskSampler(ml1, 'train',
                                        lambda env, _: RL2Env(env))
    env = task_sampler.sample(1)[0]()
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=MetaWorldSetTaskEnv(ml1, 'test'),
                                       wrapper=lambda env, _: RL2Env(env))
    env_spec = env.spec

    with TFTrainer(snapshot_config=ctxt) as trainer:
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=task_sampler,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32,
                                          max_optimization_epochs=10),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      meta_evaluator=meta_evaluator,
                      episodes_per_trial=episode_per_task)

        trainer.setup(algo,
                      task_sampler.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task *
                      env_spec.max_episode_length * meta_batch_size)
Exemple #6
0
def te_ppo_mt10(ctxt, seed, n_epochs, batch_size_per_task, n_tasks):
    """Train Task Embedding PPO with PointEnv.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.

    """
    set_seed(seed)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=False)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    latent_length = 4
    inference_window = 6
    batch_size = batch_size_per_task * len(envs)
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with TFTrainer(snapshot_config=ctxt) as trainer:

        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=TaskEmbeddingWorker)

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     sampler=sampler,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo, env)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
def mtppo_metaworld_mt10(ctxt, experiment_name, config_pth, seed, n_workers, n_tasks, use_wandb,
                         wandb_username, use_gpu):
    """Set up environment and algorithm and run the task.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
    """
    params = get_params(config_pth)
    set_seed(seed)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)

    if use_wandb == 'True':
        use_wandb = True
        wandb.init(
            name=experiment_name,
            entity=wandb_username,
            project="mt10",
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    assert n_tasks % 10 == 0
    assert n_tasks <= 500

    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = envs[0]

    policy = create_policy_net(env_spec=env.spec,
                               net_params=params["net"]
                               )
    value_function = create_vf_net(env_spec=env.spec,
                                   net_params=params["net"]
                                   )

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers,
                         worker_class=DefaultWorker)

    gpu_training = True if use_gpu else False

    algo = CustomMTPPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       sampler=sampler,
                       train_task_sampler=train_task_sampler,
                       num_tasks=n_tasks,
                       task_update_frequency=params["training"]["task_update_frequency"],
                       num_eval_eps=params["general_setting"]["eval_episodes"],
                       policy_lr=params["training"]["policy_lr"],
                       vf_lr=params["training"]["vf_lr"],
                       ppo_eps=params["training"]["ppo_eps"],
                       minibatch_size=params["training"]["minibatch_size"],
                       ppo_epochs=params["training"]["ppo_epochs"],
                       num_train_per_epoch=params["training"]["num_train_per_epoch"],
                       discount=params["general_setting"]["discount"],
                       gae_lambda=params["training"]["gae_lambda"],
                       center_adv=False,
                       wandb_logging=use_wandb,
                       eval_freq=params["general_setting"]["eval_freq"],
                       stop_entropy_gradient=True,
                       entropy_method='max',
                       gpu_training=gpu_training
                       )

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=params["training"]["epochs"],
                  batch_size=params["training"]["batch_episodes_per_task"],
                  plot=False)