def setup_class(self):
    """Init Wrapper with MT10."""
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import MT10
    tasks = MT10.get_train_tasks().all_task_names
    envs = []
    for task in tasks:
        envs.append(MT10.from_task(task))
    self.env = MultiEnvWrapper(envs,
                               sample_strategy=round_robin_strategy,
                               metaworld_mt=True)
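# Note: `round_robin_strategy` comes from garage.envs.multi_env_wrapper.
# The sketch below is an illustrative reimplementation of the behavior the
# fixture relies on (cycle through task indices on every episode), not the
# library source.
def _round_robin_strategy_sketch(num_tasks, last_task=None):
    """Return the next task index, wrapping around after the last one."""
    if last_task is None:
        return 0
    return (last_task + 1) % num_tasks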
def setup_class(self):
    """Init Wrapper with MT10."""
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import MT10
    tasks = MT10.get_train_tasks().all_task_names
    envs = []
    for task in tasks:
        envs.append(GymEnv(MT10.from_task(task)))
    self.task_names = tasks
    self.env = MultiEnvWrapper(envs,
                               sample_strategy=round_robin_strategy,
                               mode='vanilla',
                               env_names=tasks)
    self.env_no_onehot = MultiEnvWrapper(envs,
                                         sample_strategy=round_robin_strategy,
                                         mode='del-onehot')
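# A hedged sanity check of the two wrappers set up above, assuming the
# garage convention that 'vanilla' returns the Metaworld observation
# unchanged (including its trailing one-hot task id) while 'del-onehot'
# strips that one-hot. `_check_onehot_dims` and `num_tasks` are
# illustrative names, not part of the test suite.
def _check_onehot_dims(env, env_no_onehot, num_tasks):
    """Expect 'del-onehot' observations to be `num_tasks` dims smaller."""
    full_dim = env.observation_space.shape[0]
    stripped_dim = env_no_onehot.observation_space.shape[0]
    assert full_dim - stripped_dim == num_tasks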
def te_ppo_mt10(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO on the MT10 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    tasks = MT10.get_train_tasks().all_task_names
    envs = [normalize(GarageEnv(MT10.from_task(task))) for task in tasks]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='del-onehot')

    latent_length = 4
    inference_window = 6
    batch_size = batch_size_per_task * len(tasks)
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    max_episode_length = 100
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with LocalTFRunner(snapshot_config=ctxt) as runner:
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     max_episode_length=max_episode_length,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_episode_length=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_episode_length=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)

        runner.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
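# A hedged sketch of the CLI boilerplate that usually surrounds a garage
# example launcher: `click` supplies command-line options and
# `garage.wrap_experiment` builds the ExperimentContext passed in as `ctxt`.
# The option defaults and the `te_ppo_mt10_cli` wrapper name below are
# illustrative assumptions, not taken from the source; in the real example
# script the decorators would sit directly above the launcher definition.
import click

from garage import wrap_experiment


@click.command()
@click.option('--seed', default=1)
@click.option('--n_epochs', default=600)
@click.option('--batch_size_per_task', default=1024)
@wrap_experiment
def te_ppo_mt10_cli(ctxt, seed, n_epochs, batch_size_per_task):
    """Thin CLI wrapper that forwards to the launcher defined above."""
    te_ppo_mt10(ctxt, seed, n_epochs, batch_size_per_task)


if __name__ == '__main__':
    te_ppo_mt10_cli()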