def test_rl2_ppo_pendulum(self): with LocalTFRunner(snapshot_config, sess=self.sess) as runner: algo = RL2PPO(rl2_max_path_length=self.max_path_length, meta_batch_size=self.meta_batch_size, task_sampler=self.tasks, env_spec=self.env_spec, policy=self.policy, baseline=self.baseline, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.02, center_adv=False, max_path_length=self.max_path_length * self.episode_per_task) runner.setup( algo, self.tasks.sample(self.meta_batch_size), sampler_cls=LocalSampler, n_workers=self.meta_batch_size, worker_class=RL2Worker, worker_args=dict(n_paths_per_trial=self.episode_per_task)) last_avg_ret = runner.train(n_epochs=1, batch_size=self.episode_per_task * self.max_path_length * self.meta_batch_size) assert last_avg_ret > -40
def test_rl2_ppo_pendulum_wrong_worker(self): with LocalTFRunner(snapshot_config, sess=self.sess) as runner: with pytest.raises(ValueError): algo = RL2PPO(rl2_max_path_length=self.max_path_length, meta_batch_size=self.meta_batch_size, task_sampler=self.tasks, env_spec=self.env_spec, policy=self.policy, baseline=self.baseline, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, optimizer_args=dict( batch_size=32, max_epochs=10, ), stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.02, center_adv=False, max_path_length=self.max_path_length * self.episode_per_task, flatten_input=False) runner.setup(algo, self.tasks.sample(self.meta_batch_size), sampler_cls=LocalSampler, n_workers=self.meta_batch_size) runner.train(n_epochs=10, batch_size=self.episode_per_task * self.max_path_length * self.meta_batch_size)
def test_rl2_ppo_pendulum_adapted_policy(self): with LocalTFRunner(snapshot_config, sess=self.sess): algo = RL2PPO(rl2_max_path_length=self.max_path_length, meta_batch_size=self.meta_batch_size, task_sampler=self.tasks, env_spec=self.env_spec, policy=self.policy, baseline=self.baseline, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, optimizer_args=dict( batch_size=32, max_epochs=10, ), stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.02, center_adv=False, max_path_length=self.max_path_length * self.episode_per_task) exploration_policy = algo.get_exploration_policy() adapted_policy = algo.adapt_policy(exploration_policy, []) (params, hidden) = adapted_policy.get_param_values() expected_new_params = np.zeros_like(params) expected_hidden = np.zeros_like(hidden) adapted_policy.set_param_values( (expected_new_params, expected_hidden)) (new_params, new_hidden) = adapted_policy.get_param_values() assert np.array_equal(expected_new_params, new_params) assert np.array_equal(expected_hidden, new_hidden)
def rl2_ppo_metaworld_ml1_push(ctxt, seed, max_path_length, meta_batch_size, n_epochs, episode_per_task): """Train PPO with ML1 environment. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. max_path_length (int): Maximum length of a single rollout. meta_batch_size (int): Meta batch size. n_epochs (int): Total number of epochs for training. episode_per_task (int): Number of training episode per task. """ set_seed(seed) with LocalTFRunner(snapshot_config=ctxt) as runner: tasks = task_sampler.SetTaskSampler(lambda: RL2Env( env=mwb.ML1.get_train_tasks('push-v1'))) env_spec = RL2Env(env=mwb.ML1.get_train_tasks('push-v1')).spec policy = GaussianGRUPolicy(name='policy', hidden_dim=64, env_spec=env_spec, state_include_action=False) baseline = LinearFeatureBaseline(env_spec=env_spec) algo = RL2PPO(rl2_max_path_length=max_path_length, meta_batch_size=meta_batch_size, task_sampler=tasks, env_spec=env_spec, policy=policy, baseline=baseline, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, optimizer_args=dict( batch_size=32, max_epochs=10, ), stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.02, center_adv=False, max_path_length=max_path_length * episode_per_task) runner.setup(algo, tasks.sample(meta_batch_size), sampler_cls=LocalSampler, n_workers=meta_batch_size, worker_class=RL2Worker, worker_args=dict(n_paths_per_trial=episode_per_task)) runner.train(n_epochs=n_epochs, batch_size=episode_per_task * max_path_length * meta_batch_size)
def tf_ppo_pendulum(ctxt=None, seed=1): """Train PPO with InvertedDoublePendulum-v2 environment. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with LocalTFRunner(snapshot_config=ctxt) as runner: env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2'))) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = GaussianMLPBaseline( env_spec=env.spec, regressor_args=dict( hidden_sizes=(64, 64), use_trust_region=True, ), ) # NOTE: make sure when setting entropy_method to 'max', set # center_adv to False and turn off policy gradient. See # tf.algos.NPO for detailed documentation. algo = RL2PPO( env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, optimizer_args=dict( batch_size=32, max_epochs=10, learning_rate=1e-3, ), stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.002, center_adv=False, ) runner.setup(algo, env) runner.train(n_epochs=120, batch_size=4096, plot=False)
def test_rl2_ppo_pendulum_meta_test(self): with LocalTFRunner(snapshot_config, sess=self.sess) as runner: meta_evaluator = MetaEvaluator( test_task_sampler=self.tasks, n_exploration_traj=10, n_test_rollouts=10, max_path_length=self.max_path_length, n_test_tasks=1) algo = RL2PPO(rl2_max_path_length=self.max_path_length, meta_batch_size=self.meta_batch_size, task_sampler=self.tasks, env_spec=self.env_spec, policy=self.policy, baseline=self.baseline, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, optimizer_args=dict( batch_size=32, max_epochs=10, ), stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.02, center_adv=False, max_path_length=self.max_path_length * self.episode_per_task, meta_evaluator=meta_evaluator, n_epochs_per_eval=10) runner.setup(algo, self.tasks.sample(self.meta_batch_size), sampler_cls=LocalSampler, n_workers=self.meta_batch_size, worker_class=RL2Worker) last_avg_ret = runner.train(n_epochs=1, batch_size=self.episode_per_task * self.max_path_length * self.meta_batch_size) assert last_avg_ret > -40
def run_metarl(env, envs, tasks, seed, log_dir): """Create metarl Tensorflow PPO model and training. Args: env (dict): Environment of the task. seed (int): Random positive integer for the trial. log_dir (str): Log dir path. Returns: str: Path to output csv file """ deterministic.set_seed(seed) snapshot_config = SnapshotConfig(snapshot_dir=log_dir, snapshot_mode='gap', snapshot_gap=10) with LocalTFRunner(snapshot_config) as runner: policy = GaussianGRUPolicy( hidden_dims=hyper_parameters['hidden_sizes'], env_spec=env.spec, state_include_action=False) baseline = MetaRLLinearFeatureBaseline(env_spec=env.spec) inner_algo = RL2PPO( env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=hyper_parameters['max_path_length'] * hyper_parameters['rollout_per_task'], discount=hyper_parameters['discount'], gae_lambda=hyper_parameters['gae_lambda'], lr_clip_range=hyper_parameters['lr_clip_range'], optimizer_args=dict( max_epochs=hyper_parameters['optimizer_max_epochs'], tf_optimizer_args=dict( learning_rate=hyper_parameters['optimizer_lr'], ), ) ) # Need to pass this if meta_batch_size < num_of_tasks task_names = list(ML45_ENVS['train'].keys()) algo = RL2( policy=policy, inner_algo=inner_algo, max_path_length=hyper_parameters['max_path_length'], meta_batch_size=hyper_parameters['meta_batch_size'], task_sampler=tasks, task_names=None if hyper_parameters['meta_batch_size'] >= len(task_names) else task_names) # Set up logger since we are not using run_experiment tabular_log_file = osp.join(log_dir, 'progress.csv') text_log_file = osp.join(log_dir, 'debug.log') dowel_logger.add_output(dowel.TextOutput(text_log_file)) dowel_logger.add_output(dowel.CsvOutput(tabular_log_file)) dowel_logger.add_output(dowel.StdOutput()) dowel_logger.add_output(dowel.TensorBoardOutput(log_dir)) runner.setup( algo, envs, sampler_cls=hyper_parameters['sampler_cls'], n_workers=hyper_parameters['meta_batch_size'], worker_class=RL2Worker, sampler_args=dict( use_all_workers=hyper_parameters['use_all_workers']), worker_args=dict( n_paths_per_trial=hyper_parameters['rollout_per_task'])) # meta evaluator env_obs_dim = [env().observation_space.shape[0] for (_, env) in ML45_ENVS['test'].items()] max_obs_dim = max(env_obs_dim) ML_test_envs = [ TaskIdWrapper(NormalizedRewardEnv(RL2Env(env(*ML45_ARGS['test'][task]['args'], **ML45_ARGS['test'][task]['kwargs']), max_obs_dim)), task_id=task_id, task_name=task) for (task_id, (task, env)) in enumerate(ML45_ENVS['test'].items()) ] test_tasks = task_sampler.EnvPoolSampler(ML_test_envs) test_tasks.grow_pool(hyper_parameters['n_test_tasks']) test_task_names = list(ML45_ENVS['test'].keys()) runner.setup_meta_evaluator(test_task_sampler=test_tasks, n_exploration_traj=hyper_parameters['rollout_per_task'], n_test_rollouts=hyper_parameters['test_rollout_per_task'], n_test_tasks=hyper_parameters['n_test_tasks'], n_workers=hyper_parameters['n_test_tasks'], test_task_names=None if hyper_parameters['n_test_tasks'] >= len(test_task_names) else test_task_names) runner.train(n_epochs=hyper_parameters['n_itr'], batch_size=hyper_parameters['meta_batch_size'] * hyper_parameters['rollout_per_task'] * hyper_parameters['max_path_length']) dowel_logger.remove_all() return tabular_log_file
def run_metarl(env, seed, log_dir): """Create metarl Tensorflow PPO model and training. Args: env (dict): Environment of the task. seed (int): Random positive integer for the trial. log_dir (str): Log dir path. Returns: str: Path to output csv file """ deterministic.set_seed(seed) snapshot_config = SnapshotConfig(snapshot_dir=log_dir, snapshot_mode='gap', snapshot_gap=10) with LocalTFRunner(snapshot_config) as runner: env, task_samplers = _prepare_meta_env(env) policy = GaussianGRUPolicy( hidden_dims=hyper_parameters['hidden_sizes'], env_spec=env.spec, state_include_action=False) baseline = MetaRLLinearFeatureBaseline(env_spec=env.spec) inner_algo = RL2PPO( env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=hyper_parameters['max_path_length'] * hyper_parameters['rollout_per_task'], discount=hyper_parameters['discount'], gae_lambda=hyper_parameters['gae_lambda'], lr_clip_range=hyper_parameters['lr_clip_range'], optimizer_args=dict( max_epochs=hyper_parameters['optimizer_max_epochs'], tf_optimizer_args=dict( learning_rate=hyper_parameters['optimizer_lr'], ), )) algo = RL2(policy=policy, inner_algo=inner_algo, max_path_length=hyper_parameters['max_path_length'], meta_batch_size=hyper_parameters['meta_batch_size'], task_sampler=task_samplers) # Set up logger since we are not using run_experiment tabular_log_file = osp.join(log_dir, 'progress.csv') text_log_file = osp.join(log_dir, 'debug.log') dowel_logger.add_output(dowel.TextOutput(text_log_file)) dowel_logger.add_output(dowel.CsvOutput(tabular_log_file)) dowel_logger.add_output(dowel.StdOutput()) dowel_logger.add_output(dowel.TensorBoardOutput(log_dir)) runner.setup( algo, task_samplers.sample(hyper_parameters['meta_batch_size']), sampler_cls=hyper_parameters['sampler_cls'], n_workers=hyper_parameters['meta_batch_size'], worker_class=RL2Worker, sampler_args=dict( use_all_workers=hyper_parameters['use_all_workers']), worker_args=dict( n_paths_per_trial=hyper_parameters['rollout_per_task'])) runner.setup_meta_evaluator( test_task_sampler=task_samplers, n_exploration_traj=hyper_parameters['rollout_per_task'], n_test_rollouts=hyper_parameters['test_rollout_per_task'], n_test_tasks=hyper_parameters['n_test_tasks'], n_workers=hyper_parameters['n_test_tasks']) runner.train(n_epochs=hyper_parameters['n_itr'], batch_size=hyper_parameters['meta_batch_size'] * hyper_parameters['rollout_per_task'] * hyper_parameters['max_path_length']) dowel_logger.remove_all() return tabular_log_file
def rl2_ppo_halfcheetah(ctxt=None, seed=1): """Train PPO with HalfCheetah environment. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with LocalTFRunner(snapshot_config=ctxt) as runner: max_path_length = 100 meta_batch_size = 10 n_epochs = 50 episode_per_task = 4 # ---- For ML1-push from metaworld.benchmarks import ML1 tasks = task_sampler.SetTaskSampler(lambda: RL2Env( env=ML1.get_train_tasks('push-v1'))) # ---- For HalfCheetahVel # tasks = task_sampler.SetTaskSampler(lambda: RL2Env( # env=HalfCheetahVelEnv())) env_spec = tasks.sample(1)[0]().spec policy = GaussianGRUPolicy(name='policy', hidden_dim=64, env_spec=env_spec, state_include_action=False) baseline = LinearFeatureBaseline(env_spec=env_spec) inner_algo = RL2PPO( env_spec=env_spec, policy=policy, baseline=baseline, max_path_length=max_path_length * episode_per_task, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, optimizer_args=dict( batch_size=32, max_epochs=10, ), stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.02, center_adv=False, ) algo = RL2(policy=policy, inner_algo=inner_algo, max_path_length=max_path_length, meta_batch_size=meta_batch_size, task_sampler=tasks) runner.setup(algo, tasks.sample(meta_batch_size), sampler_cls=LocalSampler, n_workers=meta_batch_size, worker_class=RL2Worker) runner.train(n_epochs=n_epochs, batch_size=episode_per_task * max_path_length * meta_batch_size)
def rl2_ppo_metaworld_ml10_meta_test(ctxt, seed, max_path_length, meta_batch_size, n_epochs, episode_per_task): """Train PPO with ML10 environment with meta-test. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. max_path_length (int): Maximum length of a single rollout. meta_batch_size (int): Meta batch size. n_epochs (int): Total number of epochs for training. episode_per_task (int): Number of training episode per task. """ set_seed(seed) with LocalTFRunner(snapshot_config=ctxt) as runner: ml10_train_envs = [ RL2Env(mwb.ML10.from_task(task_name)) for task_name in mwb.ML10.get_train_tasks().all_task_names ] tasks = task_sampler.EnvPoolSampler(ml10_train_envs) tasks.grow_pool(meta_batch_size) ml10_test_envs = [ RL2Env(mwb.ML10.from_task(task_name)) for task_name in mwb.ML10.get_test_tasks().all_task_names ] test_tasks = task_sampler.EnvPoolSampler(ml10_test_envs) env_spec = ml10_train_envs[0].spec policy = GaussianGRUPolicy(name='policy', hidden_dim=64, env_spec=env_spec, state_include_action=False) baseline = LinearFeatureBaseline(env_spec=env_spec) meta_evaluator = MetaEvaluator(test_task_sampler=test_tasks, n_exploration_traj=10, n_test_rollouts=10, max_path_length=max_path_length, n_test_tasks=5) algo = RL2PPO(rl2_max_path_length=max_path_length, meta_batch_size=meta_batch_size, task_sampler=tasks, env_spec=env_spec, policy=policy, baseline=baseline, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, optimizer_args=dict( batch_size=32, max_epochs=10, ), stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.02, center_adv=False, max_path_length=max_path_length * episode_per_task, meta_evaluator=meta_evaluator, n_epochs_per_eval=10) runner.setup(algo, tasks.sample(meta_batch_size), sampler_cls=LocalSampler, n_workers=meta_batch_size, worker_class=RL2Worker, worker_args=dict(n_paths_per_trial=episode_per_task)) runner.train(n_epochs=n_epochs, batch_size=episode_per_task * max_path_length * meta_batch_size)