Example #1
0
def test_identical_environments():
    """Benchmarks built with the same seed must yield identical task goal
    vectors; benchmarks built with different seeds must not.
    """

    def _rand_vecs(benchmark):
        # task.data is a pickle blob produced by metaworld itself, so
        # unpickling here is safe (not untrusted input).
        return [pickle.loads(task.data)['rand_vec']
                for task in benchmark.train_tasks]

    def helper(env, env_2):
        """Assert every paired train task shares an identical goal vector."""
        vecs_1, vecs_2 = _rand_vecs(env), _rand_vecs(env_2)
        assert len(vecs_1) == len(vecs_2)
        for rand_vec_1, rand_vec_2 in zip(vecs_1, vecs_2):
            np.testing.assert_equal(rand_vec_1, rand_vec_2)

    def helper_neq(env, env_2):
        """Assert no paired train task shares a complete goal vector."""
        vecs_1, vecs_2 = _rand_vecs(env), _rand_vecs(env_2)
        assert len(vecs_1) == len(vecs_2)
        for rand_vec_1, rand_vec_2 in zip(vecs_1, vecs_2):
            assert not (rand_vec_1 == rand_vec_2).all()

    # testing MT1
    helper(metaworld.MT1('sweep-into-v2', seed=10),
           metaworld.MT1('sweep-into-v2', seed=10))

    # testing ML1
    helper(metaworld.ML1('sweep-into-v2', seed=10),
           metaworld.ML1('sweep-into-v2', seed=10))

    # testing MT10
    helper(metaworld.MT10(seed=10), metaworld.MT10(seed=10))

    # testing ML10
    helper(metaworld.ML10(seed=10), metaworld.ML10(seed=10))

    # testing ML45
    helper(metaworld.ML45(seed=10), metaworld.ML45(seed=10))

    # testing MT50
    mt50_1 = metaworld.MT50(seed=10)
    helper(mt50_1, metaworld.MT50(seed=10))

    # test that 2 benchmarks with different seeds have different goals
    helper_neq(mt50_1, metaworld.MT50(seed=50))
def maml_trpo_metaworld_ml45(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size):
    """Run MAML-TRPO on the Meta-World ML45 benchmark.

    Args:
        ctxt (ExperimentContext): Experiment configuration consumed by
            :class:`~Trainer` when creating the :class:`~Snapshotter`.
        seed (int): Random seed, for determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Training episodes per epoch per task.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    benchmark = metaworld.ML45()

    # pylint: disable=missing-return-doc,missing-return-type-doc
    def wrap_env(env, _):
        # Normalize with a widened action scale; second arg (task) is unused.
        return normalize(env, expected_action_scale=10.0)

    train_sampler = MetaWorldTaskSampler(benchmark, 'train', wrap_env)
    test_env = wrap_env(MetaWorldSetTaskEnv(benchmark, 'test'), None)
    test_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                  env=test_env,
                                  wrapper=wrap_env)
    # One concrete environment instance, used for spec extraction and setup.
    env = train_sampler.sample(45)[0]()

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(100, 100),
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    evaluator = MetaEvaluator(test_task_sampler=test_sampler)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    task_sampler=train_sampler,
                    policy=policy,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=evaluator)

    trainer.setup(algo, env, n_workers=meta_batch_size)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)
Example #3
0
def rl2_ppo_metaworld_ml45(ctxt, seed, meta_batch_size, n_epochs,
                           episode_per_task):
    """Train RL2-PPO on the Meta-World ML45 benchmark.

    Args:
        ctxt (ExperimentContext): Experiment configuration consumed by
            :class:`~Trainer` when creating the :class:`~Snapshotter`.
        seed (int): Random seed, for determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of training epochs.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    benchmark = metaworld.ML45()
    train_sampler = MetaWorldTaskSampler(benchmark, 'train',
                                         lambda env, _: RL2Env(env))
    test_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                  env=MetaWorldSetTaskEnv(benchmark, 'test'),
                                  wrapper=lambda env, _: RL2Env(env))
    with TFTrainer(snapshot_config=ctxt) as trainer:
        # One concrete environment instance, used only for its spec.
        env = train_sampler.sample(45)[0]()
        spec = env.spec

        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=spec)

        evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                  n_exploration_eps=10,
                                  n_test_episodes=10,
                                  n_test_tasks=5)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=train_sampler,
                      env_spec=spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      meta_evaluator=evaluator,
                      episodes_per_trial=10)

        trainer.setup(algo,
                      train_sampler.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=(episode_per_task * spec.max_episode_length *
                                  meta_batch_size))
Example #4
0
from metaworld.policies.sawyer_door_lock_v1_policy import SawyerDoorLockV1Policy
import metaworld
import random
from utils import test_policy

benchmark = metaworld.ML45()

task_name = "door-lock-v1"
env_cls = benchmark.test_classes[task_name]
policy = SawyerDoorLockV1Policy()

# Every held-out task belonging to the door-lock environment.
door_lock_tasks = [t for t in benchmark.test_tasks if t.env_name == task_name]

env = env_cls()
# Pick one task from the later portion of the list and activate it.
task = random.choice(door_lock_tasks[25:])
env.set_task(task)
env.max_path_length = 200
test_policy(env, policy, render=True, stop=False)
Example #5
0
        type=str,
        default='./models',
        help='Path to the output folder for saving the model (optional).')
    parser.add_argument(
        '--batch-size',
        type=int,
        default=16,
        help='Number of tasks in a mini-batch of tasks (default: 16).')
    parser.add_argument('--use-cuda',
                        action='store_true',
                        help='Use CUDA if available.')
    args = parser.parse_args()
    args.device = torch.device(
        'cuda' if args.use_cuda and torch.cuda.is_available() else 'cpu')

    ml45 = metaworld.ML45()  # Construct the benchmark, sampling tasks

    # Test tasks
    # custom_tasks = ["bin-picking-v1", "box-close-v1", "hand-insert-v1", "door-lock-v1", "door-unlock-v1"]
    # policies = {"bin-picking-v1": SawyerBinPickingV2Policy(),
    #             "box-close-v1": SawyerBoxCloseV1Policy(),
    #             "hand-insert-v1":SawyerHandInsertPolicy(),
    #             "door-lock-v1": SawyerDoorLockV1Policy(),
    #             "door-unlock-v1": SawyerDoorUnlockV1Policy()}
    # ml_custom = {name: ml45.test_classes[name] for name in custom_tasks if name in ml45.test_classes}

    # Define model
    model = MIL()
    model.to(device=args.device)
    load_model(model, "./models/mil_499.th")
    model.train()
Example #6
0
def pearl_metaworld_ml45(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=45,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         reward_scale=10.,
                         use_gpu=False):
    """Run PEARL on the Meta-World ML45 benchmark.

    Args:
        ctxt (garage.experiment.ExperimentContext): Experiment configuration
            consumed by Trainer when creating the snapshotter.
        seed (int): Random seed, for determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        latent_size (int): Size of the latent context vector.
        encoder_hidden_size (int): Output dimension of each dense layer of
            the context encoder.
        net_size (int): Output dimension of each dense layer of the
            Q-function and value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Iterations per epoch.
        num_initial_steps (int): Transitions obtained per task before
            training.
        num_tasks_sample (int): Random tasks to obtain data for each
            iteration.
        num_steps_prior (int): Transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Additional transitions obtained
            per task with z ~ posterior, used only to train the policy and
            NOT the encoder.
        batch_size (int): Transitions in an RL batch.
        embedding_batch_size (int): Transitions in a context batch.
        embedding_mini_batch_size (int): Transitions in a mini context batch;
            should equal embedding_batch_size for a non-recurrent encoder.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether to train on GPU.

    """
    set_seed(seed)
    # Three encoder layers, all of the same width.
    encoder_sizes = (encoder_hidden_size,) * 3
    benchmark = metaworld.ML45()
    train_sampler = SetTaskSampler(
        MetaWorldSetTaskEnv,
        env=MetaWorldSetTaskEnv(benchmark, 'train'),
        wrapper=lambda env, _: normalize(env))
    envs = train_sampler.sample(num_train_tasks)
    test_sampler = SetTaskSampler(
        MetaWorldSetTaskEnv,
        env=MetaWorldSetTaskEnv(benchmark, 'test'),
        wrapper=lambda env, _: normalize(env))

    trainer = Trainer(ctxt)

    # Instantiate networks against an env spec augmented with the latent
    # context vector.
    augmented_spec = PEARL.augment_env_spec(envs[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_spec,
                                hidden_sizes=[net_size] * 3)

    vf_spec = PEARL.get_env_spec(envs[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_spec,
                                hidden_sizes=[net_size] * 3)

    inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_spec,
                                         hidden_sizes=[net_size] * 3)

    sampler = LocalSampler(agents=None,
                           envs=envs[0](),
                           max_episode_length=envs[0]().spec.max_episode_length,
                           n_workers=1,
                           worker_class=PEARLWorker)

    algo = PEARL(
        env=envs,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        sampler=sampler,
        num_train_tasks=num_train_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_sizes,
        test_env_sampler=test_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        reward_scale=reward_scale,
    )

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        algo.to()

    trainer.setup(algo=algo, env=envs[0]())

    trainer.train(n_epochs=num_epochs, batch_size=batch_size)
Example #7
0
    def __init__(
        self,
        benchmark_name: str,
        save_memory: bool = False,
        add_observability: bool = False,
    ) -> None:
        """Build a multi-task wrapper around a Meta-World benchmark.

        Args:
            benchmark_name: Which benchmark/split to construct. Recognized
                forms: "MT1_<env>", "MT10", "MT50", "ML1_train_<env>",
                "ML10_train", "ML45_train", "ML1_test_<env>", "ML10_test",
                "ML45_test". Anything else raises NotImplementedError.
            save_memory: If True, store env classes and instantiate lazily;
                otherwise construct one instance per env up front.
            add_observability: If True, rewrite each task's pickled data with
                ``partially_observable = False`` (i.e. goals become part of
                the observation).
        """

        # We import here so that we avoid importing metaworld if possible, since it is
        # dependent on mujoco.
        import metaworld
        from metaworld import Task

        # Set config for each benchmark. Each branch decides:
        #   env_dict         - mapping of env name -> env class for the split
        #   tasks            - task list for the split
        #   resample_tasks   - keep ALL tasks per env (True) or only the
        #                      first one encountered per env (False)
        #   self.augment_obs - whether to augment observations (set only for
        #                      the multi-env benchmarks)
        if benchmark_name.startswith("MT1_"):
            # "MT1_<env-name>": single-env multi-task benchmark.
            env_name = benchmark_name[4:]
            benchmark = metaworld.MT1(env_name)
            env_dict = {env_name: benchmark.train_classes[env_name]}
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = False

        elif benchmark_name == "MT10":
            benchmark = metaworld.MT10()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = True

        elif benchmark_name == "MT50":
            benchmark = metaworld.MT50()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = True

        elif benchmark_name.startswith("ML1_train_"):
            # "ML1_train_<env-name>": train split of a single-env ML1.
            env_name = benchmark_name[10:]
            benchmark = metaworld.ML1(env_name)
            env_dict = {env_name: benchmark.train_classes[env_name]}
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = False

        elif benchmark_name == "ML10_train":
            benchmark = metaworld.ML10()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name == "ML45_train":
            benchmark = metaworld.ML45()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name.startswith("ML1_test_"):
            # "ML1_test_<env-name>": test split of a single-env ML1.
            env_name = benchmark_name[9:]
            benchmark = metaworld.ML1(env_name)
            env_dict = {env_name: benchmark.test_classes[env_name]}
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = False

        elif benchmark_name == "ML10_test":
            benchmark = metaworld.ML10()
            env_dict = benchmark.test_classes
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name == "ML45_test":
            benchmark = metaworld.ML45()
            env_dict = benchmark.test_classes
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = True

        else:
            raise NotImplementedError

        # Construct list of tasks for each environment, adding observability
        # to tasks if necessary. Tasks are repacked via a pickle round-trip
        # because Task.data is an opaque pickled blob (trusted, produced by
        # metaworld itself).
        env_tasks = {}
        for task in tasks:
            if add_observability:
                task_data = dict(pickle.loads(task.data))
                task_data["partially_observable"] = False
                task = Task(env_name=task.env_name,
                            data=pickle.dumps(task_data))

            # When resample_tasks is False, only the first task seen for
            # each env is kept; later tasks for that env are dropped.
            if task.env_name in env_tasks:
                if resample_tasks:
                    env_tasks[task.env_name].append(task)
            else:
                env_tasks[task.env_name] = [task]

        # Construct list of environment classes or class instances; with
        # save_memory the class is stored ("env_cls") and instantiated on
        # demand, otherwise a live instance is stored ("env").
        self.save_memory = save_memory
        if self.save_memory:
            self.envs_info = [{
                "env_name": env_name,
                "env_cls": env_cls,
                "tasks": env_tasks[env_name]
            } for (env_name, env_cls) in env_dict.items()]
        else:
            self.envs_info = [{
                "env_name": env_name,
                "env": env_cls(),
                "tasks": env_tasks[env_name]
            } for (env_name, env_cls) in env_dict.items()]

        self.num_tasks = len(self.envs_info)

        # Sample environment. NOTE(review): _sample_environment is defined
        # elsewhere in this class — presumably it picks the active env/task.
        self._sample_environment()
Example #8
0
            # Environment steps.
            for step in range(EPISODE_LEN):
                a = env.action_space.sample()
                obs, reward, done, info = env.step(a)

    return goals, hand_poses, obj_poses


# Seed both RNGs for reproducibility.
random.seed(SEED)
np.random.seed(SEED)

# Build one kwargs dict for the ML45 train split and one for the ML45 test
# split, in that order.
benchmark = metaworld.ML45()
kwargs_list = []
for split_env_dict, split_tasks in (
        (benchmark.train_classes, benchmark.train_tasks),
        (benchmark.test_classes, benchmark.test_tasks)):
    kwargs_list.append({
        "env_dict": split_env_dict,
        "tasks": split_tasks,
        "resample_tasks": True,
        "add_observability": True,
    })

# Goals, initial hand positions, and initial object positions, per task.
goals = {}