def te_ppo_mt50(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with MetaWorld MT50.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalTFRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    # Build one normalized, time-limited environment per MT50 task and wrap
    # them into a single multi-task environment.
    tasks = MT50.get_train_tasks().all_task_names
    envs = [
        normalize(GymEnv(MT50.from_task(task), max_episode_length=150))
        for task in tasks
    ]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='del-onehot')

    latent_length = 6
    inference_window = 6
    batch_size = batch_size_per_task * len(tasks)
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with LocalTFRunner(snapshot_config=ctxt) as runner:
        # Task encoder: maps the one-hot task id to a latent embedding.
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        # Inference network: infers the latent embedding from a window of
        # recent transitions in the trajectory.
        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)

        runner.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)


def te_ppo_mt1_push(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with the MetaWorld MT1 push task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    # Sample 50 goal variations of push-v1 and wrap them into a single
    # multi-task environment.
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    task_sampler = MetaWorldTaskSampler(mt1,
                                        'train',
                                        lambda env, _: normalize(env),
                                        add_env_onehot=False)
    envs = [env_up() for env_up in task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * n_tasks
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with TFTrainer(snapshot_config=ctxt) as trainer:
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo,
                      env,
                      sampler_cls=LocalSampler,
                      sampler_args=None,
                      worker_class=TaskEmbeddingWorker)

        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)


def te_ppo_pointenv(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with PointEnv.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    tasks = TASKS
    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * len(TASKS)
    policy_ent_coeff = 1e-3
    encoder_ent_coeff = 1e-3
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = 2.0
    policy_min_std = None

    task_names = sorted(tasks.keys())
    task_args = [tasks[t]['args'] for t in task_names]
    task_kwargs = [tasks[t]['kwargs'] for t in task_names]

    with TFTrainer(snapshot_config=ctxt) as trainer:
        task_envs = [
            PointEnv(*t_args, **t_kwargs, max_episode_length=100)
            for t_args, t_kwargs in zip(task_args, task_kwargs)
        ]
        env = MultiEnvWrapper(task_envs, round_robin_strategy, mode='vanilla')

        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            std_output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=0.1,
            output_nonlinearity=tf.nn.tanh,
            std_output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=TaskEmbeddingWorker)

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     sampler=sampler,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo, env)

        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
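

# The launchers above all take an ExperimentContext as their first argument,
# so they are not called directly. In garage they are typically decorated with
# wrap_experiment, which supplies ``ctxt`` and sets up the snapshot directory.
# Below is a minimal launch sketch for te_ppo_pointenv, assuming the standard
# garage imports; the entry-point name and the default hyperparameter values
# are illustrative assumptions, not values taken from the experiments above.
from garage import wrap_experiment


@wrap_experiment
def run_te_ppo_pointenv(ctxt, seed=1, n_epochs=600, batch_size_per_task=1024):
    """Hypothetical entry point that forwards the snapshot context."""
    te_ppo_pointenv(ctxt, seed, n_epochs, batch_size_per_task)


if __name__ == '__main__':
    run_te_ppo_pointenv()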