def test_meta_evaluator_with_tf():
    """Check that a TF algorithm using MetaEvaluator survives cloudpickle.

    Runs one evaluation pass, pickles the algorithm, then restores it inside
    a fresh TF graph/runner and resumes training from the restored state.
    """
    set_seed(100)
    task_sampler = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    horizon = 200
    exploration_traj = 3
    env = GarageEnv(PointEnv())
    with tempfile.TemporaryDirectory() as snapshot_dir:
        config = SnapshotConfig(snapshot_dir=snapshot_dir,
                                snapshot_mode='none',
                                snapshot_gap=1)
        with LocalTFRunner(config) as runner:
            evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                      max_path_length=horizon,
                                      n_test_tasks=10,
                                      n_exploration_traj=exploration_traj)
            algo = MockAlgo(env, GaussianMLPPolicy(env.spec), horizon,
                            exploration_traj, evaluator)
            runner.setup(algo, env)
            # Capture evaluation output in a throwaway CSV logger sink.
            csv_file = tempfile.NamedTemporaryFile()
            logger.add_output(CsvOutput(csv_file.name))
            evaluator.evaluate(algo)
            pickled_algo = cloudpickle.dumps(algo)
            # Restore in a brand-new graph to prove nothing leaks from the
            # original TF session into the pickle.
            with tf.Graph().as_default():
                with LocalTFRunner(config) as fresh_runner:
                    restored = cloudpickle.loads(pickled_algo)
                    fresh_runner.setup(restored, env)
                    fresh_runner.train(10, 0)
def test_pickle_meta_evaluator():
    """Check that a MetaEvaluator round-tripped through cloudpickle works.

    After one evaluation, the evaluator is serialized and deserialized, and
    the restored copy must be able to evaluate the same algorithm again.
    """
    set_seed(100)
    task_sampler = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    horizon = 200
    exploration_traj = 3
    env = GarageEnv(PointEnv())
    with tempfile.TemporaryDirectory() as snapshot_dir:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=snapshot_dir,
                           snapshot_mode='last',
                           snapshot_gap=1))
        evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                  max_path_length=horizon,
                                  n_test_tasks=10,
                                  n_exploration_traj=exploration_traj)
        policy = RandomPolicy(env.spec.action_space)
        algo = MockAlgo(env, policy, horizon, exploration_traj, evaluator)
        runner.setup(algo, env)
        # Route evaluation tabular output to a throwaway CSV file.
        csv_file = tempfile.NamedTemporaryFile()
        logger.add_output(CsvOutput(csv_file.name))
        evaluator.evaluate(algo)
        # Serialize, restore, and evaluate again with the restored copy.
        restored = cloudpickle.loads(cloudpickle.dumps(evaluator))
        restored.evaluate(algo)
def test_rl2_ppo_pendulum_meta_test(self):
    """Smoke-test RL2PPO training with meta-testing enabled.

    Builds a MetaEvaluator over the test-task sampler, wires it into
    RL2PPO, trains for a single epoch, and checks the average return
    clears a loose lower bound.
    """
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        # One test task is enough here; 10 exploration/test episodes each.
        meta_evaluator = MetaEvaluator(test_task_sampler=self.tasks,
                                       n_exploration_eps=10,
                                       n_test_episodes=10,
                                       n_test_tasks=1)
        algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      sampler=self.sampler,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=self.episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)
        trainer.setup(algo, self.tasks.sample(self.meta_batch_size))
        # Batch size covers every episode of every task in the meta-batch.
        last_avg_ret = trainer.train(n_epochs=1,
                                     batch_size=self.episode_per_task *
                                     self.max_episode_length *
                                     self.meta_batch_size)
        # Loose sanity bound — one epoch cannot guarantee much more.
        assert last_avg_ret > -40
def test_meta_evaluator():
    """Check the statistics MetaEvaluator writes to CSV across iterations.

    Evaluates twice, dumps the CSV log, and verifies one row per iteration
    with internally consistent return statistics.
    """
    set_seed(100)
    task_sampler = SetTaskSampler(PointEnv)
    horizon = 200
    with tempfile.TemporaryDirectory() as snapshot_dir:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=snapshot_dir,
                           snapshot_mode='last',
                           snapshot_gap=1))
        env = GarageEnv(PointEnv())
        algo = OptimalActionInference(env=env, max_path_length=horizon)
        runner.setup(algo, env)
        evaluator = MetaEvaluator(runner,
                                  test_task_sampler=task_sampler,
                                  max_path_length=horizon,
                                  n_test_tasks=10)
        csv_file = tempfile.NamedTemporaryFile()
        logger.add_output(CsvOutput(csv_file.name))
        # Two evaluate/log rounds -> two CSV rows, iterations 0 and 1.
        for _ in range(2):
            evaluator.evaluate(algo)
            logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
        with open(csv_file.name, 'r') as handle:
            rows = list(csv.DictReader(handle))
        assert len(rows) == 2
        first, second = rows
        assert float(first['MetaTest/CompletionRate']) < 1.0
        assert float(first['MetaTest/Iteration']) == 0
        # Max/average/min returns must be consistently ordered.
        max_ret = float(first['MetaTest/MaxReturn'])
        avg_ret = float(first['MetaTest/AverageReturn'])
        min_ret = float(first['MetaTest/MinReturn'])
        assert max_ret >= avg_ret >= min_ret
        assert float(second['MetaTest/Iteration']) == 1
def test_meta_evaluator_n_traj():
    """Check MetaEvaluator honors the requested exploration-trajectory count.

    MockAlgo is constructed with the same n_traj the evaluator is given, so
    the mock can assert internally that it receives that many trajectories.
    """
    set_seed(100)
    task_sampler = SetTaskSampler(PointEnv)
    horizon = 200
    exploration_traj = 3
    env = GarageEnv(PointEnv())
    with tempfile.TemporaryDirectory() as snapshot_dir:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=snapshot_dir,
                           snapshot_mode='last',
                           snapshot_gap=1))
        algo = MockAlgo(env, horizon, exploration_traj)
        runner.setup(algo, env)
        evaluator = MetaEvaluator(runner,
                                  test_task_sampler=task_sampler,
                                  max_path_length=horizon,
                                  n_test_tasks=10,
                                  n_exploration_traj=exploration_traj)
        # Send tabular output to a throwaway CSV sink, then evaluate.
        csv_file = tempfile.NamedTemporaryFile()
        logger.add_output(CsvOutput(csv_file.name))
        evaluator.evaluate(algo)
def test_rl2_ppo_pendulum_meta_test(self):
    """Smoke-test RL2PPO with meta-testing (legacy LocalTFRunner API).

    Wires a MetaEvaluator into RL2PPO, trains one epoch with RL2Worker
    sampling, and checks the average return clears a loose lower bound.
    """
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        meta_evaluator = MetaEvaluator(
            test_task_sampler=self.tasks,
            n_exploration_traj=10,
            n_test_rollouts=10,
            max_path_length=self.max_path_length,
            n_test_tasks=1)
        algo = RL2PPO(rl2_max_path_length=self.max_path_length,
                      meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      pg_loss='surrogate_clip',
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      # RL2 concatenates episodes: the outer path spans
                      # every per-task episode in one trial.
                      max_path_length=self.max_path_length *
                      self.episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)
        # One RL2Worker per meta-batch task.
        runner.setup(algo,
                     self.tasks.sample(self.meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=self.meta_batch_size,
                     worker_class=RL2Worker)
        last_avg_ret = runner.train(n_epochs=1,
                                    batch_size=self.episode_per_task *
                                    self.max_path_length *
                                    self.meta_batch_size)
        # Loose sanity bound — one epoch cannot guarantee much more.
        assert last_avg_ret > -40
def rl2_ppo_halfcheetah_meta_test(ctxt, seed, max_path_length,
                                  meta_batch_size, n_epochs,
                                  episode_per_task):
    """Perform meta-testing on RL2PPO with HalfCheetah environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episode per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        # Each sampled task is a HalfCheetahVelEnv wrapped for RL2.
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=HalfCheetahVelEnv()))
        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env_spec)
        meta_evaluator = MetaEvaluator(test_task_sampler=tasks,
                                       n_exploration_traj=10,
                                       n_test_rollouts=10,
                                       max_path_length=max_path_length,
                                       n_test_tasks=5)
        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      # RL2 concatenates episodes: the outer path spans
                      # every per-task episode in one trial.
                      max_path_length=max_path_length * episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)
        # One RL2Worker per meta-batch task, each running
        # episode_per_task paths per trial.
        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))
        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
def rl2_ppo_metaworld_ml10_meta_test(ctxt, seed, max_episode_length,
                                     meta_batch_size, n_epochs,
                                     episode_per_task):
    """Train PPO with ML10 environment with meta-test.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~LocalRunner` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episode per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        # Training tasks come from the ML10 train split; the pool is grown
        # so it can serve a full meta-batch.
        ml10_train_envs = [
            RL2Env(GymEnv(mwb.ML10.from_task(task_name)))
            for task_name in mwb.ML10.get_train_tasks().all_task_names
        ]
        tasks = task_sampler.EnvPoolSampler(ml10_train_envs)
        tasks.grow_pool(meta_batch_size)
        # Meta-testing uses the held-out ML10 test split.
        ml10_test_envs = [
            RL2Env(GymEnv(mwb.ML10.from_task(task_name)))
            for task_name in mwb.ML10.get_test_tasks().all_task_names
        ]
        test_tasks = task_sampler.EnvPoolSampler(ml10_test_envs)
        env_spec = ml10_train_envs[0].spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env_spec)
        meta_evaluator = MetaEvaluator(test_task_sampler=test_tasks,
                                       n_exploration_eps=10,
                                       n_test_episodes=10,
                                       max_episode_length=max_episode_length,
                                       n_test_tasks=5)
        algo = RL2PPO(rl2_max_episode_length=max_episode_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      # FIX: this previously passed max_episode_length=10,
                      # which is not an optimizer argument — collateral from
                      # the project-wide max_path_length rename. The sibling
                      # RL2 launchers use max_optimization_epochs=10 here.
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      # RL2 concatenates episodes: one trial spans every
                      # per-task episode.
                      max_episode_length=max_episode_length *
                      episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)
        # One RL2Worker per meta-batch task.
        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_episodes_per_trial=episode_per_task))
        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_episode_length *
                     meta_batch_size)
def torch_pearl_ml1_push(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=50,
                         num_test_tasks=10,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         max_path_length=150,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL with ML1 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini
            context batch; should be same as embedding_batch_size for
            non-recurrent encoder.
        max_path_length (int): Maximum path length.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    # Three-layer context encoder of uniform width.
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_test_tasks('push-v1'))))

    runner = LocalRunner(ctxt)

    # instantiate networks
    # The Q-function sees observations augmented with the latent context.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    # Meta-test workers act deterministically and accumulate context.
    worker_args = dict(deterministic=True, accum_context=True)
    meta_evaluator = MetaEvaluator(test_task_sampler=test_env_sampler,
                                   max_path_length=max_path_length,
                                   worker_class=PEARLWorker,
                                   worker_args=worker_args,
                                   n_test_tasks=num_test_tasks)
    # NOTE(review): the evaluator is attached after construction rather than
    # passed to PEARL's constructor — confirm this matches PEARL's API.
    pearl.evaluator = meta_evaluator

    runner.train(n_epochs=num_epochs, batch_size=batch_size)
def rl2_ppo_halfcheetah_meta_test(ctxt, seed, meta_batch_size, n_epochs,
                                  episode_per_task):
    """Perform meta-testing on RL2PPO with HalfCheetah environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episode per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        max_episode_length = 150
        # Each sampled task is a HalfCheetahVelEnv wrapped for RL2.
        tasks = task_sampler.SetTaskSampler(
            HalfCheetahVelEnv,
            wrapper=lambda env, _: RL2Env(
                GymEnv(env, max_episode_length=max_episode_length)))
        env_spec = RL2Env(
            GymEnv(HalfCheetahVelEnv(),
                   max_episode_length=max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env_spec)
        meta_evaluator = MetaEvaluator(test_task_sampler=tasks,
                                       n_exploration_eps=10,
                                       n_test_episodes=10,
                                       n_test_tasks=5)
        envs = tasks.sample(meta_batch_size)
        # One RL2Worker per meta-batch task, each running
        # episode_per_task episodes per trial.
        sampler = LocalSampler(
            agents=policy,
            envs=envs,
            max_episode_length=env_spec.max_episode_length,
            is_tf_worker=True,
            n_workers=meta_batch_size,
            worker_class=RL2Worker,
            worker_args=dict(n_episodes_per_trial=episode_per_task))
        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      sampler=sampler,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)
        trainer.setup(algo, envs)
        # Batch size covers every episode of every task in the meta-batch.
        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task * max_episode_length *
                      meta_batch_size)
def test_pearl_ml1_push(self):
    """Test PEARL with ML1 Push environment.

    Builds the full PEARL stack (Q-function, value function, context-
    conditioned policy), attaches a MetaEvaluator, and trains for a
    single short epoch as a smoke test.
    """
    # Hyperparameters scaled down from the launcher defaults for test speed.
    params = dict(seed=1,
                  num_epochs=1,
                  num_train_tasks=50,
                  num_test_tasks=10,
                  latent_size=7,
                  encoder_hidden_sizes=[200, 200, 200],
                  net_size=300,
                  meta_batch_size=16,
                  num_steps_per_epoch=400,
                  num_initial_steps=400,
                  num_tasks_sample=15,
                  num_steps_prior=150,
                  num_extra_rl_steps_posterior=150,
                  batch_size=256,
                  embedding_batch_size=64,
                  embedding_mini_batch_size=64,
                  max_path_length=50,
                  reward_scale=10.,
                  # NOTE(review): these two are never passed to PEARL
                  # below — confirm whether they should be.
                  use_information_bottleneck=True,
                  use_next_obs_in_context=False,
                  use_gpu=False)

    net_size = params['net_size']
    set_seed(params['seed'])
    env_sampler = SetTaskSampler(lambda: GarageEnv(
        normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])

    test_env_sampler = SetTaskSampler(lambda: GarageEnv(
        normalize(ML1.get_test_tasks('push-v1'))))

    # The Q-function sees observations augmented with the latent context.
    augmented_env = PEARL.augment_env_spec(env[0](), params['latent_size'])
    qf = ContinuousMLPQFunction(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), params['latent_size'], 'vf')
    vf = ContinuousMLPQFunction(
        env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        encoder_hidden_sizes=params['encoder_hidden_sizes'],
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params[
            'num_extra_rl_steps_posterior'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
    )

    tu.set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearl.to()

    runner = LocalRunner(snapshot_config)
    runner.setup(
        algo=pearl,
        env=env[0](),
        sampler_cls=LocalSampler,
        sampler_args=dict(max_path_length=params['max_path_length']),
        n_workers=1,
        worker_class=PEARLWorker)

    # Meta-test workers act deterministically and accumulate context.
    worker_args = dict(deterministic=True, accum_context=True)
    meta_evaluator = MetaEvaluator(
        test_task_sampler=test_env_sampler,
        max_path_length=params['max_path_length'],
        worker_class=PEARLWorker,
        worker_args=worker_args,
        n_test_tasks=params['num_test_tasks'])
    pearl.evaluator = meta_evaluator

    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])