def maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task,
                                 meta_batch_size):
    """Run MAML-TRPO on the MetaWorld ML1 ``push-v1`` benchmark.

    Args:
        ctxt (garage.experiment.ExperimentContext): Experiment configuration
            handed to the Trainer.
        seed (int): Value passed to ``set_seed`` before anything else is
            constructed.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task for
            training. Also used as the meta evaluator's number of
            exploration episodes.
        meta_batch_size (int): Number of tasks sampled per batch. Also used
            as the sampler's worker count.

    """
    set_seed(seed)

    benchmark = metaworld.ML1('push-v1')
    train_tasks = MetaWorldTaskSampler(benchmark, 'train')
    # Each sampled entry is a callable that builds an environment.
    env = train_tasks.sample(1)[0]()
    test_tasks = SetTaskSampler(MetaWorldSetTaskEnv,
                                env=MetaWorldSetTaskEnv(benchmark, 'test'))

    # Both networks share the same non-linearities and differ only in size.
    shared_net_kwargs = dict(hidden_nonlinearity=torch.tanh,
                             output_nonlinearity=None)
    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(100, 100),
                               **shared_net_kwargs)
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=[32, 32],
                                              **shared_net_kwargs)

    evaluator = MetaEvaluator(test_task_sampler=test_tasks,
                              n_test_tasks=1,
                              n_exploration_eps=rollouts_per_task)

    episode_sampler = RaySampler(
        agents=policy,
        envs=env,
        max_episode_length=env.spec.max_episode_length,
        n_workers=meta_batch_size)

    trainer = Trainer(ctxt)
    maml = MAMLTRPO(env=env,
                    policy=policy,
                    sampler=episode_sampler,
                    task_sampler=train_tasks,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=evaluator)

    trainer.setup(maml, env)
    steps_per_batch = rollouts_per_task * env.spec.max_episode_length
    trainer.train(n_epochs=epochs, batch_size=steps_per_batch)
def test_identical_environments():
    """Check that benchmark task goals depend only on the seed.

    Two benchmarks built with the same seed must carry equal ``rand_vec``
    entries for every training task. Two MT50 benchmarks built with
    different seeds must differ on every training task.
    """

    def rand_vec_pairs(bench_a, bench_b):
        # Yield the unpickled ``rand_vec`` of corresponding train tasks.
        tasks_a, tasks_b = bench_a.train_tasks, bench_b.train_tasks
        # zip() truncates silently, so insist on equal task counts first.
        assert len(tasks_a) == len(tasks_b)
        for task_a, task_b in zip(tasks_a, tasks_b):
            yield (pickle.loads(task_a.data)['rand_vec'],
                   pickle.loads(task_b.data)['rand_vec'])

    def helper(env, env_2):
        for rand_vec_1, rand_vec_2 in rand_vec_pairs(env, env_2):
            np.testing.assert_equal(rand_vec_1, rand_vec_2)

    def helper_neq(env, env_2):
        for rand_vec_1, rand_vec_2 in rand_vec_pairs(env, env_2):
            # array_equal also returns False (rather than failing) when the
            # two vectors have different shapes.
            assert not np.array_equal(rand_vec_1, rand_vec_2)

    # Benchmarks checked for reproducibility, with their positional args.
    seeded_benchmarks = (
        (metaworld.MT1, ('sweep-into-v2', )),
        (metaworld.ML1, ('sweep-into-v2', )),
        (metaworld.MT10, ()),
        (metaworld.ML10, ()),
        (metaworld.ML45, ()),
    )
    for bench_cls, args in seeded_benchmarks:
        helper(bench_cls(*args, seed=10), bench_cls(*args, seed=10))

    # MT50 is kept around because it is reused for the different-seed check.
    mt50_1 = metaworld.MT50(seed=10)
    mt50_2 = metaworld.MT50(seed=10)
    helper(mt50_1, mt50_2)

    # test that 2 benchmarks with different seeds have different goals
    mt50_3 = metaworld.MT50(seed=50)
    helper_neq(mt50_1, mt50_3)
def test_forbidden_cases():
    """Check that MetaWorldSetTaskEnv raises ValueError on bad arguments.

    Covers ``add_env_onehot=True`` with the ML1 ``'train'`` kind, and the
    kind string ``'Test'``.
    """
    # Import, construct environments here to avoid using up too much
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld

    benchmark = metaworld.ML1('push-v1')
    # (extra positional args, keyword args) for each rejected construction.
    rejected_calls = (
        (('train', ), {'add_env_onehot': True}),
        (('Test', ), {}),
    )
    for args, kwargs in rejected_calls:
        with pytest.raises(ValueError):
            MetaWorldSetTaskEnv(benchmark, *args, **kwargs)
def test_metaworld_sample_and_step():
    """Sample ML1 train environments, step one of them, and resample."""
    # Import, construct environments here to avoid using up too much
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld

    n_updates = 100
    benchmark = metaworld.ML1('push-v1')
    sampler = task_sampler.MetaWorldTaskSampler(benchmark, 'train')

    env_updates = sampler.sample(n_updates)
    assert len(env_updates) == n_updates

    # Build one environment and step it twice with the same action.
    env = env_updates[0]()
    action = env.action_space.sample()
    env.reset()
    for _ in range(2):
        env.step(action)
    env.close()

    env_updates = sampler.sample(n_updates, with_replacement=True)
    assert len(env_updates) == n_updates
def test_sample_and_step():
    """Sample tasks from MetaWorldSetTaskEnv and step under a set task.

    A task sampled from an ML1 ``'train'`` environment is applied to that
    environment and to a second one constructed without arguments.
    """
    # Import, construct environments here to avoid using up too much
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld

    benchmark = metaworld.ML1('push-v1')
    env = MetaWorldSetTaskEnv(benchmark, 'train')
    assert env.num_tasks == 50

    sampled = env.sample_tasks(1)
    task = sampled[0]
    env.set_task(task)
    first_action = env.action_space.sample()
    env.step(first_action)
    env.close()

    env2 = MetaWorldSetTaskEnv()
    env2.set_task(task)
    # NOTE(review): the action is drawn from ``env`` (already closed above),
    # not from ``env2`` -- confirm this is intended.
    second_action = env.action_space.sample()
    env2.step(second_action)
    env2.close()

    many_tasks = env.sample_tasks(100)
    assert len(many_tasks) == 100
def test_pearl_ml1_push(self):
    """Test PEARL with ML1 Push environment."""
    params = {
        'seed': 1,
        'num_epochs': 1,
        'num_train_tasks': 5,
        'latent_size': 7,
        'encoder_hidden_sizes': [10, 10, 10],
        'net_size': 30,
        'meta_batch_size': 16,
        'num_steps_per_epoch': 40,
        'num_initial_steps': 40,
        'num_tasks_sample': 15,
        'num_steps_prior': 15,
        'num_extra_rl_steps_posterior': 15,
        'batch_size': 256,
        'embedding_batch_size': 8,
        'embedding_mini_batch_size': 8,
        'reward_scale': 10.,
        'use_information_bottleneck': True,
        'use_next_obs_in_context': False,
        'use_gpu': False,
    }

    net_size = params['net_size']
    latent_size = params['latent_size']
    set_seed(params['seed'])

    def normalized(env, _):
        # Wrapper applied by both task samplers.
        return normalize(env)

    # create multi-task environment and sample tasks
    ml1 = metaworld.ML1('push-v1')
    train_env = MetaWorldSetTaskEnv(ml1, 'train')
    train_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                   env=train_env,
                                   wrapper=normalized)
    env = train_sampler.sample(params['num_train_tasks'])
    test_env = MetaWorldSetTaskEnv(ml1, 'test')
    test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                      env=test_env,
                                      wrapper=normalized)

    # Every ``env[0]()`` call builds a fresh environment.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size] * 3)

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size] * 3)

    inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_env,
                                         hidden_sizes=[net_size] * 3)

    # Settings whose PEARL keyword matches their key in ``params``.
    forwarded_keys = (
        'num_train_tasks',
        'encoder_hidden_sizes',
        'meta_batch_size',
        'num_steps_per_epoch',
        'num_initial_steps',
        'num_tasks_sample',
        'num_steps_prior',
        'num_extra_rl_steps_posterior',
        'batch_size',
        'embedding_batch_size',
        'embedding_mini_batch_size',
        'reward_scale',
    )
    forwarded = {key: params[key] for key in forwarded_keys}

    pearl = PEARL(env=env,
                  policy_class=ContextConditionedPolicy,
                  encoder_class=MLPEncoder,
                  inner_policy=inner_policy,
                  qf=qf,
                  vf=vf,
                  latent_dim=latent_size,
                  test_env_sampler=test_env_sampler,
                  **forwarded)

    use_gpu = params['use_gpu']
    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    trainer = Trainer(snapshot_config)
    trainer.setup(algo=pearl,
                  env=env[0](),
                  sampler_cls=LocalSampler,
                  n_workers=1,
                  worker_class=PEARLWorker)

    trainer.train(n_epochs=params['num_epochs'],
                  batch_size=params['batch_size'])
def rl2_ppo_metaworld_ml1_push(ctxt, seed, meta_batch_size, n_epochs,
                               episode_per_task):
    """Run RL2 PPO on the MetaWorld ML1 ``push-v1`` benchmark.

    Args:
        ctxt (ExperimentContext): Experiment configuration handed to
            :class:`~TFTrainer` as its snapshot configuration.
        seed (int): Value passed to ``set_seed`` before anything else is
            constructed.
        meta_batch_size (int): Meta batch size. Also used as the number of
            sampled task environments and sampler workers.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)

    def as_rl2_env(env, _):
        # Wrapper applied to every sampled train and test environment.
        return RL2Env(env)

    benchmark = metaworld.ML1('push-v1')
    train_tasks = MetaWorldTaskSampler(benchmark, 'train', as_rl2_env)
    env = train_tasks.sample(1)[0]()
    test_tasks = SetTaskSampler(MetaWorldSetTaskEnv,
                                env=MetaWorldSetTaskEnv(benchmark, 'test'),
                                wrapper=as_rl2_env)
    env_spec = env.spec

    # Everything below stays inside the trainer's context manager.
    with TFTrainer(snapshot_config=ctxt) as trainer:
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        meta_evaluator = MetaEvaluator(test_task_sampler=test_tasks)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        optimizer_args = dict(batch_size=32, max_optimization_epochs=10)
        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=train_tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=optimizer_args,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      meta_evaluator=meta_evaluator,
                      episodes_per_trial=episode_per_task)

        worker_args = dict(n_episodes_per_trial=episode_per_task)
        trainer.setup(algo,
                      train_tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=worker_args)

        steps_per_batch = (episode_per_task * env_spec.max_episode_length *
                           meta_batch_size)
        trainer.train(n_epochs=n_epochs, batch_size=steps_per_batch)
def tcl_pearl_ml1(ctxt=None,
                  seed=1,
                  num_epochs=200,
                  num_train_tasks=50,
                  num_test_tasks=10,
                  latent_size=7,
                  encoder_hidden_size=200,
                  net_size=300,
                  meta_batch_size=16,
                  num_steps_per_epoch=4000,
                  num_initial_steps=4000,
                  num_tasks_sample=15,
                  num_steps_prior=750,
                  num_extra_rl_steps_posterior=750,
                  batch_size=256,
                  embedding_batch_size=64,
                  embedding_mini_batch_size=64,
                  max_path_length=200,
                  reward_scale=10.,
                  replay_buffer_size=1000000,
                  use_next_obs=False,
                  in_sequence_path_aug=True,
                  emphasized_network=False,
                  use_kl_loss=True,
                  use_q_loss=True,
                  encoder_common_net=True,
                  single_alpha=False,
                  use_task_index_label=False,
                  use_wasserstein_distance=True,
                  gpu_id=0,
                  name='push-v1',
                  prefix='curl_fine_tune',
                  use_gpu=True):
    """Train TCL-PEARL on a MetaWorld ML1 benchmark.

    Args:
        ctxt (garage.experiment.ExperimentContext): Experiment configuration
            handed to the Trainer.
        seed (int): Value passed to ``set_seed`` before anything else is
            constructed.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Repeated three times to form the context
            encoder's hidden sizes.
        net_size (int): Repeated three times to form the hidden sizes of
            both Q-functions and the inner policy.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to
            train the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini
            context batch; should be same as embedding_batch_size for
            non-recurrent encoder.
        max_path_length (int): Maximum path length. Used as the sampler's
            ``max_episode_length`` and forwarded to TCLPEARL.
        reward_scale (float): Reward scale.
        replay_buffer_size (int): Forwarded to TCLPEARL unchanged.
        use_next_obs (bool): Forwarded to TCLPEARL as
            ``use_next_obs_in_context``.
        in_sequence_path_aug (bool): Forwarded to TCLPEARL as
            ``embedding_batch_in_sequence``.
        emphasized_network (bool): Accepted but not used by this function.
        use_kl_loss (bool): Forwarded to TCLPEARL unchanged.
        use_q_loss (bool): Forwarded to TCLPEARL unchanged.
        encoder_common_net (bool): Forwarded to TCLPEARL unchanged.
        single_alpha (bool): Forwarded to TCLPEARL unchanged.
        use_task_index_label (bool): Forwarded to TCLPEARL unchanged.
        use_wasserstein_distance (bool): Forwarded to TCLPEARL unchanged.
        gpu_id (int): GPU id passed to ``set_gpu_mode``.
        name (str): Environment name passed to ``metaworld.ML1``.
        prefix (str): Only used in the start-up message printed to stdout.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, ) * 3
    print("Running experiences on {}/{}".format(prefix, name))

    def normalized(env, _):
        # Wrapper applied by both task samplers.
        return normalize(env)

    # create multi-task environment and sample tasks
    benchmark = metaworld.ML1(name)
    train_env = MetaWorldSetTaskEnv(benchmark, 'train')
    train_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                   env=train_env,
                                   wrapper=normalized)
    env = train_sampler.sample(num_train_tasks)
    test_env = MetaWorldSetTaskEnv(benchmark, 'test')
    test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                      env=test_env,
                                      wrapper=normalized)

    # Every ``env[0]()`` call builds a fresh environment.
    sampler = LocalSampler(agents=None,
                           envs=env[0](),
                           max_episode_length=max_path_length,
                           n_workers=1,
                           worker_class=TCLPEARLWorker)

    trainer = Trainer(ctxt)

    # instantiate networks
    augmented_env = TCLPEARL.augment_env_spec(env[0](), latent_size)
    qf_1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                  hidden_sizes=[net_size] * 3)
    qf_2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                  hidden_sizes=[net_size] * 3)
    inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_env,
                                         hidden_sizes=[net_size] * 3)

    # Settings forwarded to TCLPEARL under the same keyword name.
    passthrough = dict(
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        encoder_hidden_sizes=encoder_hidden_sizes,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
        replay_buffer_size=replay_buffer_size,
        use_kl_loss=use_kl_loss,
        use_q_loss=use_q_loss,
        encoder_common_net=encoder_common_net,
        single_alpha=single_alpha,
        use_task_index_label=use_task_index_label,
        use_wasserstein_distance=use_wasserstein_distance,
    )

    tcl_pearl = TCLPEARL(env=env,
                         policy_class=TCLPolicy,
                         encoder_class=ContrastiveEncoder,
                         inner_policy=inner_policy,
                         qf1=qf_1,
                         qf2=qf_2,
                         sampler=sampler,
                         latent_dim=latent_size,
                         test_env_sampler=test_env_sampler,
                         use_next_obs_in_context=use_next_obs,
                         embedding_batch_in_sequence=in_sequence_path_aug,
                         **passthrough)

    set_gpu_mode(use_gpu, gpu_id=gpu_id)
    if use_gpu:
        tcl_pearl.to()

    trainer.setup(algo=tcl_pearl, env=env[0]())

    trainer.train(n_epochs=num_epochs, batch_size=batch_size)
import random

import metaworld

# Demo script: build one environment per training class of the selected
# Meta-World benchmark, assign each a random training task, and take a
# single random step in every environment.
bench_name = 'ML1'
if bench_name == 'ML1':
    print(metaworld.ML1.ENV_NAMES)
    bench = metaworld.ML1('pick-place-v1')
elif bench_name == 'ML10':
    bench = metaworld.ML10()
else:
    # Fail here with a clear message rather than with a NameError on
    # ``bench`` further down.
    raise ValueError(
        "Unsupported bench_name {!r}; expected 'ML1' or 'ML10'".format(
            bench_name))

training_envs = []
for name, env_cls in bench.train_classes.items():
    env = env_cls()
    # Only tasks generated for this environment class are candidates.
    task = random.choice(
        [task for task in bench.train_tasks if task.env_name == name])
    env.set_task(task)
    training_envs.append(env)

print(training_envs)

for env in training_envs:
    obs = env.reset()
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
def __init__(
    self,
    benchmark_name: str,
    save_memory: bool = False,
    add_observability: bool = False,
) -> None:
    """
    Build the per-environment task table for a Meta-World benchmark.

    ``benchmark_name`` is one of "MT10", "MT50", "ML10_train", "ML45_train",
    "ML10_test", "ML45_test", or a prefix "MT1_", "ML1_train_", "ML1_test_"
    followed by an environment name. Any other value raises
    ``NotImplementedError``.

    When ``save_memory`` is true, each ``envs_info`` entry stores the
    environment class under "env_cls". Otherwise it stores a constructed
    instance under "env". When ``add_observability`` is true, every task is
    rebuilt with ``partially_observable`` set to False in its pickled data.
    """

    # We import here so that we avoid importing metaworld if possible, since it is
    # dependent on mujoco.
    import metaworld
    from metaworld import Task

    # Single-environment benchmarks: the environment name follows the prefix.
    # Entries are (prefix, metaworld class name, split, resample_tasks).
    single_env_specs = (
        ("MT1_", "MT1", "train", False),
        ("ML1_train_", "ML1", "train", True),
        ("ML1_test_", "ML1", "test", True),
    )
    # Multi-environment benchmarks, keyed by their exact name.
    # Values are (metaworld class name, split, resample_tasks).
    multi_env_specs = {
        "MT10": ("MT10", "train", False),
        "MT50": ("MT50", "train", False),
        "ML10_train": ("ML10", "train", True),
        "ML45_train": ("ML45", "train", True),
        "ML10_test": ("ML10", "test", True),
        "ML45_test": ("ML45", "test", True),
    }

    # Set config for each benchmark.
    for prefix, cls_name, split, resample_tasks in single_env_specs:
        if benchmark_name.startswith(prefix):
            single_env_name = benchmark_name[len(prefix):]
            break
    else:
        single_env_name = None
        if benchmark_name not in multi_env_specs:
            raise NotImplementedError
        cls_name, split, resample_tasks = multi_env_specs[benchmark_name]

    # Look the benchmark class up by name so that only the selected one is
    # touched on the metaworld module.
    benchmark_cls = getattr(metaworld, cls_name)
    if single_env_name is None:
        benchmark = benchmark_cls()
        env_dict = getattr(benchmark, split + "_classes")
    else:
        benchmark = benchmark_cls(single_env_name)
        split_classes = getattr(benchmark, split + "_classes")
        env_dict = {single_env_name: split_classes[single_env_name]}
    tasks = getattr(benchmark, split + "_tasks")
    # Set to True for the multi-environment benchmarks only.
    self.augment_obs = single_env_name is None

    # Construct list of tasks for each environment, adding observability to tasks if
    # necessary.
    env_tasks = {}
    for task in tasks:
        if add_observability:
            task_data = dict(pickle.loads(task.data))
            task_data["partially_observable"] = False
            task = Task(env_name=task.env_name, data=pickle.dumps(task_data))

        # The first task of an environment is always kept; later ones only
        # when tasks are resampled.
        kept = env_tasks.setdefault(task.env_name, [])
        if resample_tasks or not kept:
            kept.append(task)

    # Construct list of environment classes or class instances.
    self.save_memory = save_memory
    self.envs_info = []
    for name, env_cls in env_dict.items():
        if self.save_memory:
            entry = {
                "env_name": name,
                "env_cls": env_cls,
                "tasks": env_tasks[name],
            }
        else:
            entry = {
                "env_name": name,
                "env": env_cls(),
                "tasks": env_tasks[name],
            }
        self.envs_info.append(entry)

    self.num_tasks = len(self.envs_info)

    # Sample environment.
    self._sample_environment()
def pearl_metaworld_ml1_push(ctxt=None,
                             seed=1,
                             num_epochs=1000,
                             num_train_tasks=50,
                             latent_size=7,
                             encoder_hidden_size=200,
                             net_size=300,
                             meta_batch_size=16,
                             num_steps_per_epoch=4000,
                             num_initial_steps=4000,
                             num_tasks_sample=15,
                             num_steps_prior=750,
                             num_extra_rl_steps_posterior=750,
                             batch_size=256,
                             embedding_batch_size=64,
                             embedding_mini_batch_size=64,
                             reward_scale=10.,
                             use_gpu=False):
    """Train PEARL on the MetaWorld ML1 ``push-v1`` benchmark.

    Args:
        ctxt (garage.experiment.ExperimentContext): Experiment configuration
            handed to the Trainer.
        seed (int): Value passed to ``set_seed`` before anything else is
            constructed.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Repeated three times to form the context
            encoder's hidden sizes.
        net_size (int): Repeated three times to form the hidden sizes of the
            Q-function, the value function and the inner policy.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to
            train the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini
            context batch; should be same as embedding_batch_size for
            non-recurrent encoder.
        reward_scale (float): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, ) * 3

    def normalized(env, _):
        # Wrapper applied by both task samplers.
        return normalize(env)

    # create multi-task environment and sample tasks
    benchmark = metaworld.ML1('push-v1')
    train_env = MetaWorldSetTaskEnv(benchmark, 'train')
    train_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                   env=train_env,
                                   wrapper=normalized)
    env = train_sampler.sample(num_train_tasks)
    test_env = MetaWorldSetTaskEnv(benchmark, 'test')
    test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                      env=test_env,
                                      wrapper=normalized)

    trainer = Trainer(ctxt)

    # instantiate networks
    # Every ``env[0]()`` call builds a fresh environment.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size] * 3)

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size] * 3)

    inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_env,
                                         hidden_sizes=[net_size] * 3)

    # One environment is given to the sampler and another one is built just
    # to read the episode length from its spec.
    sampler_env = env[0]()
    episode_length = env[0]().spec.max_episode_length
    sampler = LocalSampler(agents=None,
                           envs=sampler_env,
                           max_episode_length=episode_length,
                           n_workers=1,
                           worker_class=PEARLWorker)

    # Settings forwarded to PEARL under the same keyword name.
    passthrough = dict(
        num_train_tasks=num_train_tasks,
        encoder_hidden_sizes=encoder_hidden_sizes,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        reward_scale=reward_scale,
    )

    pearl = PEARL(env=env,
                  policy_class=ContextConditionedPolicy,
                  encoder_class=MLPEncoder,
                  inner_policy=inner_policy,
                  qf=qf,
                  vf=vf,
                  sampler=sampler,
                  latent_dim=latent_size,
                  test_env_sampler=test_env_sampler,
                  **passthrough)

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    trainer.setup(algo=pearl, env=env[0]())

    trainer.train(n_epochs=num_epochs, batch_size=batch_size)