def maml_trpo_metaworld_ml10(ctxt, seed, epochs, rollouts_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = GarageEnv(
        normalize(mwb.ML10.get_train_tasks(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_episode_length = 100

    test_task_names = mwb.ML10.get_test_tasks().all_task_names
    test_tasks = [
        GarageEnv(
            normalize(mwb.ML10.from_task(task), expected_action_scale=10.))
        for task in test_task_names
    ]
    test_sampler = EnvPoolSampler(test_tasks)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_episode_length=max_episode_length,
                                   n_test_tasks=len(test_task_names))

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_episode_length=max_episode_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_episode_length)
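# Launch sketch (hedged): in garage's bundled examples, a launcher like the one
# above is typically wrapped with `@click.command()`, per-argument
# `@click.option(...)` declarations, and `@wrap_experiment(...)`, which supplies
# the `ctxt` argument automatically. Under that assumption, the module would end
# with a bare call and hyperparameters would come from the command line:
#
#     maml_trpo_metaworld_ml10()
#
# e.g. `python maml_trpo_metaworld_ml10.py --epochs 300 --meta_batch_size 20`.
# The flag names and values shown here are illustrative, not prescribed by this
# file.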
def test_init_with_env_updates(policy, envs):
    task_sampler = EnvPoolSampler(envs)
    envs = task_sampler.sample(N_TRAJ)
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, [policy],
                                                   [envs])
    n_samples = 100
    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)
    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)
    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
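# Note (assumptions): this test relies on module-level pieces defined elsewhere
# in its test file. The values below are illustrative only, not the real ones:
#
#     N_TRAJ = 5             # number of parallel trajectories / vectorized envs
#     MAX_PATH_LENGTH = 100  # per-trajectory step limit
#
# `policy` and `envs` are pytest fixtures providing a policy and a list of
# environments, and `assert_trajs_eq` is a helper that checks the two sampled
# trajectory batches are equivalent.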
def pearl_metaworld_ml10(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=10,
                         num_test_tasks=5,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         max_path_length=150,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL with ML10 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        max_path_length (int): Maximum path length.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    ML_train_envs = [
        GarageEnv(normalize(mwb.ML10.from_task(task_name)))
        for task_name in mwb.ML10.get_train_tasks().all_task_names
    ]
    ML_test_envs = [
        GarageEnv(normalize(mwb.ML10.from_task(task_name)))
        for task_name in mwb.ML10.get_test_tasks().all_task_names
    ]

    env_sampler = EnvPoolSampler(ML_train_envs)
    env_sampler.grow_pool(num_train_tasks)
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = EnvPoolSampler(ML_test_envs)
    test_env_sampler.grow_pool(num_test_tasks)

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=num_epochs, batch_size=batch_size)
def diayn_pearl_half_cheeth(ctxt=None,
                            seed=1,
                            num_epochs=param_num_epoches,
                            num_train_tasks=param_train_tasks_num,
                            num_test_tasks=param_test_tasks_num,
                            latent_size=param_latent_size,
                            encoder_hidden_size=param_encoder_hidden_size,
                            net_size=param_net_size,
                            meta_batch_size=param_meta_batch_size,
                            num_steps_per_epoch=param_num_steps_per_epoch,
                            num_initial_steps=param_num_initial_steps,
                            num_tasks_sample=param_num_tasks_sample,
                            num_steps_prior=param_num_steps_prior,
                            num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
                            batch_size=param_batch_size,
                            embedding_batch_size=param_embedding_batch_size,
                            embedding_mini_batch_size=param_embedding_mini_batch_size,
                            max_path_length=param_max_path_length,
                            reward_scale=param_reward_scale,
                            use_gpu=param_use_gpu):
    """Train PEARL on DIAYN-wrapped HalfCheetahVel environments."""
    if task_proposer is None:
        raise ValueError("Task proposer is empty")

    assert num_train_tasks == skills_num

    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    # train_trajs_dist = [train_env.get_training_traj(diayn_trained_agent)
    #                     for train_env in ML_train_envs]

    # ML_test_envs = [
    #     GarageEnv(normalize(
    #         DiaynEnvWrapper(env, task_proposer, skills_num, task_name)))
    #     for task_name in random.sample(range(skills_num), test_tasks_num)
    # ]

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
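# Note (assumptions): unlike the ML10 launchers above, this one depends on
# several module-level names that must be defined before it runs: the `param_*`
# default hyperparameters, `skills_num` (the number of DIAYN skills, which must
# equal `num_train_tasks`), and `task_proposer` (presumably the trained DIAYN
# skill discriminator that DiaynEnvWrapper uses to relabel rewards per skill).
# None of these are defined inside the function itself.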
def meta_kant_cheetah_vel(ctxt=None,
                          seed=seed,
                          num_skills=skills_num,
                          num_epochs=param_num_epoches,
                          num_train_tasks=param_train_tasks_num,
                          num_test_tasks=param_test_tasks_num,
                          is_encoder_recurrent=False,
                          latent_size=param_latent_size,
                          encoder_hidden_size=param_encoder_hidden_size,
                          net_size=param_net_size,
                          meta_batch_size=param_meta_batch_size,
                          num_steps_per_epoch=param_num_steps_per_epoch,
                          num_initial_steps=param_num_initial_steps,
                          num_tasks_sample=param_num_tasks_sample,
                          num_steps_prior=param_num_steps_prior,
                          num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
                          num_skills_sample=param_num_skills_sample,
                          num_skills_reason_steps=param_num_skills_reason_steps,
                          batch_size=param_batch_size,
                          embedding_batch_size=param_embedding_batch_size,
                          embedding_mini_batch_size=param_embedding_mini_batch_size,
                          max_path_length=param_max_path_length,
                          skills_reason_reward_scale=param_skills_reason_reward_scale,
                          tasks_adapt_reward_scale=param_tasks_adapt_reward_scale,
                          use_gpu=param_use_gpu):
    """Train MetaKant on DIAYN-wrapped HalfCheetahVel environments."""
    assert num_train_tasks == skills_num

    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)

    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    qf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, "qf")
    qf = ContinuousMLPQFunction(env_spec=qf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    controller_policy_env = MetaKant.get_env_spec(env[0](),
                                                  latent_size,
                                                  module="controller_policy",
                                                  num_skills=num_skills)
    controller_policy = CategoricalMLPPolicy(
        env_spec=controller_policy_env,
        hidden_sizes=[net_size, net_size],
        hidden_nonlinearity=functional.relu)

    metakant = MetaKant(
        env=env,
        skill_env=skill_env,
        controller_policy=controller_policy,
        skill_actor=skill_actor,
        qf=qf,
        vf=vf,
        num_skills=num_skills,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        sampler_class=LocalSkillSampler,
        is_encoder_recurrent=is_encoder_recurrent,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_per_epoch=num_steps_per_epoch,
        num_steps_prior=num_steps_prior,  # num_steps_posterior
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        num_skills_reason_steps=num_skills_reason_steps,
        num_skills_sample=num_skills_sample,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        skills_reason_reward_scale=skills_reason_reward_scale,
        tasks_adapt_reward_scale=tasks_adapt_reward_scale)

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        metakant.to()

    worker_args = dict(num_skills=num_skills,
                       skill_actor_class=type(skill_actor),
                       controller_class=OpenContextConditionedControllerPolicy,
                       deterministic=False,
                       accum_context=True)

    runner.setup(algo=metakant,
                 env=env[0](),
                 sampler_cls=LocalSkillSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=KantWorker,
                 worker_args=worker_args)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
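# Note (assumptions): as with diayn_pearl_half_cheeth, this launcher expects
# module-level definitions for `seed`, `skills_num`, `task_proposer`,
# `skill_env`, `skill_actor`, and the `param_*` defaults before it can run.
# On completion it snapshots the final epoch via runner.save(num_epochs - 1)
# and returns the average return reported by runner.train(...), so a wrapping
# script can log or compare runs, e.g.:
#
#     avg = meta_kant_cheetah_vel()
#     print(f'final average return: {avg}')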