import pickle

import numpy as np

import metaworld


def test_identical_environments():

    def helper(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            np.testing.assert_equal(rand_vec_1, rand_vec_2)

    def helper_neq(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            assert not (rand_vec_1 == rand_vec_2).all()

    # Testing MT1
    mt1_1 = metaworld.MT1('sweep-into-v2', seed=10)
    mt1_2 = metaworld.MT1('sweep-into-v2', seed=10)
    helper(mt1_1, mt1_2)

    # Testing ML1
    ml1_1 = metaworld.ML1('sweep-into-v2', seed=10)
    ml1_2 = metaworld.ML1('sweep-into-v2', seed=10)
    helper(ml1_1, ml1_2)

    # Testing MT10
    mt10_1 = metaworld.MT10(seed=10)
    mt10_2 = metaworld.MT10(seed=10)
    helper(mt10_1, mt10_2)

    # Testing ML10
    ml10_1 = metaworld.ML10(seed=10)
    ml10_2 = metaworld.ML10(seed=10)
    helper(ml10_1, ml10_2)

    # Testing ML45
    ml45_1 = metaworld.ML45(seed=10)
    ml45_2 = metaworld.ML45(seed=10)
    helper(ml45_1, ml45_2)

    # Testing MT50
    mt50_1 = metaworld.MT50(seed=10)
    mt50_2 = metaworld.MT50(seed=10)
    helper(mt50_1, mt50_2)

    # Test that two benchmarks with different seeds have different goals.
    mt50_3 = metaworld.MT50(seed=50)
    helper_neq(mt50_1, mt50_3)
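# Usage sketch (not part of the test above; a minimal, assumption-laden
# example): the 'rand_vec' compared in the test is the task parameterization
# that set_task applies to an environment instance. Building a seeded
# benchmark and rolling out one step might look like this, using the classic
# 4-tuple step API seen elsewhere in these snippets.
mt1 = metaworld.MT1('sweep-into-v2', seed=10)
env = mt1.train_classes['sweep-into-v2']()
env.set_task(mt1.train_tasks[0])
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())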
def maml_trpo_metaworld_ml45(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task for
            training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    ml45 = metaworld.ML45()

    # pylint: disable=missing-return-doc,missing-return-type-doc
    def wrap(env, _):
        return normalize(env, expected_action_scale=10.0)

    train_task_sampler = MetaWorldTaskSampler(ml45, 'train', wrap)
    test_env = wrap(MetaWorldSetTaskEnv(ml45, 'test'), None)
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=test_env,
                                       wrapper=wrap)
    env = train_task_sampler.sample(45)[0]()

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    task_sampler=train_task_sampler,
                    policy=policy,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env, n_workers=meta_batch_size)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)
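# Launch sketch (an assumption, not part of the excerpt above): garage
# experiment functions of this shape are typically wrapped with
# garage.wrap_experiment, which creates the ExperimentContext and passes it in
# as ctxt. The hyperparameter values below are illustrative, not prescribed.
from garage import wrap_experiment

run_maml_trpo_ml45 = wrap_experiment(maml_trpo_metaworld_ml45)
run_maml_trpo_ml45(seed=1,
                   epochs=300,
                   episodes_per_task=45,
                   meta_batch_size=45)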
def rl2_ppo_metaworld_ml45(ctxt, seed, meta_batch_size, n_epochs,
                           episode_per_task):
    """Train PPO with ML45 environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    ml45 = metaworld.ML45()

    tasks = MetaWorldTaskSampler(ml45, 'train', lambda env, _: RL2Env(env))
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=MetaWorldSetTaskEnv(ml45, 'test'),
                                       wrapper=lambda env, _: RL2Env(env))

    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = tasks.sample(45)[0]()
        env_spec = env.spec

        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler,
                                       n_exploration_eps=10,
                                       n_test_episodes=10,
                                       n_test_tasks=5)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32, ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      meta_evaluator=meta_evaluator,
                      episodes_per_trial=10)

        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(
                          n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task *
                      env_spec.max_episode_length * meta_batch_size)
import random

import metaworld
from metaworld.policies.sawyer_door_lock_v1_policy import SawyerDoorLockV1Policy

from utils import test_policy

ml45 = metaworld.ML45()

name = "door-lock-v1"
env_cls = ml45.test_classes[name]
policy = SawyerDoorLockV1Policy()

all_tasks = [task for task in ml45.test_tasks if task.env_name == name]

env = env_cls()
query_task = random.choice(all_tasks[25:])
env.set_task(query_task)
env.max_path_length = 200

test_policy(env, policy, render=True, stop=False)
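# Reference-only sketch: `test_policy` is imported from a local utils module
# that is not shown here. A helper with the same call signature might look
# roughly like the hypothetical implementation below, assuming the scripted
# policy's get_action API and the 4-tuple step interface used elsewhere in
# these snippets; the real utils.test_policy may differ.
def test_policy(env, policy, render=False, stop=True):
    obs = env.reset()
    for _ in range(env.max_path_length):
        action = policy.get_action(obs)
        obs, reward, done, info = env.step(action)
        if render:
            env.render()
        if stop and info.get('success', 0.0) > 0.0:
            break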
    type=str,
    default='./models',
    help='Path to the output folder for saving the model (optional).')
parser.add_argument(
    '--batch-size',
    type=int,
    default=16,
    help='Number of tasks in a mini-batch of tasks (default: 16).')
parser.add_argument('--use-cuda',
                    action='store_true',
                    help='Use CUDA if available.')

args = parser.parse_args()
args.device = torch.device(
    'cuda' if args.use_cuda and torch.cuda.is_available() else 'cpu')

ml45 = metaworld.ML45()  # Construct the benchmark, sampling tasks

# Test tasks
# custom_tasks = ["bin-picking-v1", "box-close-v1", "hand-insert-v1",
#                 "door-lock-v1", "door-unlock-v1"]
# policies = {"bin-picking-v1": SawyerBinPickingV2Policy(),
#             "box-close-v1": SawyerBoxCloseV1Policy(),
#             "hand-insert-v1": SawyerHandInsertPolicy(),
#             "door-lock-v1": SawyerDoorLockV1Policy(),
#             "door-unlock-v1": SawyerDoorUnlockV1Policy()}
# ml_custom = {name: ml45.test_classes[name]
#              for name in custom_tasks if name in ml45.test_classes}

# Define model
model = MIL()
model.to(device=args.device)
load_model(model, "./models/mil_499.th")
model.train()
def pearl_metaworld_ml45(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=45,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL with ML45 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    ml45 = metaworld.ML45()
    train_env = MetaWorldSetTaskEnv(ml45, 'train')
    env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                 env=train_env,
                                 wrapper=lambda env, _: normalize(env))
    env = env_sampler.sample(num_train_tasks)
    test_env = MetaWorldSetTaskEnv(ml45, 'test')
    test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                      env=test_env,
                                      wrapper=lambda env, _: normalize(env))

    trainer = Trainer(ctxt)

    # Instantiate networks.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    sampler = LocalSampler(agents=None,
                           envs=env[0](),
                           max_episode_length=env[0]().spec.max_episode_length,
                           n_workers=1,
                           worker_class=PEARLWorker)

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        sampler=sampler,
        num_train_tasks=num_train_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        reward_scale=reward_scale,
    )

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    trainer.setup(algo=pearl, env=env[0]())

    trainer.train(n_epochs=num_epochs, batch_size=batch_size)
def __init__(
    self,
    benchmark_name: str,
    save_memory: bool = False,
    add_observability: bool = False,
) -> None:
    """ Init function for environment wrapper. """

    # We import here so that we avoid importing metaworld if possible, since
    # it is dependent on mujoco.
    import metaworld
    from metaworld import Task

    # Set config for each benchmark.
    if benchmark_name.startswith("MT1_"):
        env_name = benchmark_name[4:]
        benchmark = metaworld.MT1(env_name)
        env_dict = {env_name: benchmark.train_classes[env_name]}
        tasks = benchmark.train_tasks
        resample_tasks = False
        self.augment_obs = False

    elif benchmark_name == "MT10":
        benchmark = metaworld.MT10()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = False
        self.augment_obs = True

    elif benchmark_name == "MT50":
        benchmark = metaworld.MT50()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = False
        self.augment_obs = True

    elif benchmark_name.startswith("ML1_train_"):
        env_name = benchmark_name[10:]
        benchmark = metaworld.ML1(env_name)
        env_dict = {env_name: benchmark.train_classes[env_name]}
        tasks = benchmark.train_tasks
        resample_tasks = True
        self.augment_obs = False

    elif benchmark_name == "ML10_train":
        benchmark = metaworld.ML10()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = True
        self.augment_obs = True

    elif benchmark_name == "ML45_train":
        benchmark = metaworld.ML45()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = True
        self.augment_obs = True

    elif benchmark_name.startswith("ML1_test_"):
        env_name = benchmark_name[9:]
        benchmark = metaworld.ML1(env_name)
        env_dict = {env_name: benchmark.test_classes[env_name]}
        tasks = benchmark.test_tasks
        resample_tasks = True
        self.augment_obs = False

    elif benchmark_name == "ML10_test":
        benchmark = metaworld.ML10()
        env_dict = benchmark.test_classes
        tasks = benchmark.test_tasks
        resample_tasks = True
        self.augment_obs = True

    elif benchmark_name == "ML45_test":
        benchmark = metaworld.ML45()
        env_dict = benchmark.test_classes
        tasks = benchmark.test_tasks
        resample_tasks = True
        self.augment_obs = True

    else:
        raise NotImplementedError

    # Construct list of tasks for each environment, adding observability to
    # tasks if necessary.
    env_tasks = {}
    for task in tasks:
        if add_observability:
            task_data = dict(pickle.loads(task.data))
            task_data["partially_observable"] = False
            task = Task(env_name=task.env_name, data=pickle.dumps(task_data))

        if task.env_name in env_tasks:
            if resample_tasks:
                env_tasks[task.env_name].append(task)
        else:
            env_tasks[task.env_name] = [task]

    # Construct list of environment classes or class instances.
    self.save_memory = save_memory
    if self.save_memory:
        self.envs_info = [{
            "env_name": env_name,
            "env_cls": env_cls,
            "tasks": env_tasks[env_name]
        } for (env_name, env_cls) in env_dict.items()]
    else:
        self.envs_info = [{
            "env_name": env_name,
            "env": env_cls(),
            "tasks": env_tasks[env_name]
        } for (env_name, env_cls) in env_dict.items()]
    self.num_tasks = len(self.envs_info)

    # Sample environment.
    self._sample_environment()
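# Usage sketch (hypothetical: only __init__ is shown above, so the class name
# MetaworldEnv is an assumption). It illustrates the benchmark_name strings the
# constructor parses, e.g. "MT1_<env-name>", "ML1_train_<env-name>", "MT10",
# "ML45_train", or "ML45_test".
env = MetaworldEnv("ML45_train", save_memory=True, add_observability=True)
print(env.num_tasks)  # One entry in envs_info per ML45 training environment.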
    # Environment steps.
    for step in range(EPISODE_LEN):
        a = env.action_space.sample()
        obs, reward, done, info = env.step(a)

    return goals, hand_poses, obj_poses


# Set random seed.
random.seed(SEED)
np.random.seed(SEED)

# Create kwargs list for ML45_train and ML45_test.
kwargs_list = []
benchmark = metaworld.ML45()
kwargs_list.append({
    "env_dict": benchmark.train_classes,
    "tasks": benchmark.train_tasks,
    "resample_tasks": True,
    "add_observability": True,
})
kwargs_list.append({
    "env_dict": benchmark.test_classes,
    "tasks": benchmark.test_tasks,
    "resample_tasks": True,
    "add_observability": True,
})

# Get list of goals, initial hand positions, and initial object positions
# for each task.
goals = {}