def ppo_pendulum(ctxt=None, seed=1): """Train PPO with InvertedDoublePendulum-v2 environment. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) env = MetaRLEnv(env_name='InvertedDoublePendulum-v2') runner = LocalRunner(ctxt) policy = GaussianMLPPolicy(env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=torch.tanh, output_nonlinearity=None) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) algo = PPO(env_spec=env.spec, policy=policy, value_function=value_function, max_path_length=100, discount=0.99, center_adv=False) runner.setup(algo, env) runner.train(n_epochs=100, batch_size=10000)
def __init__(self, env, policy, value_function, inner_lr=_Default(1e-1), outer_lr=1e-3, lr_clip_range=5e-1, max_path_length=100, discount=0.99, gae_lambda=1.0, center_adv=True, positive_adv=False, policy_ent_coeff=0.0, use_softplus_entropy=False, stop_entropy_gradient=False, entropy_method='no_entropy', meta_batch_size=20, num_grad_updates=1, meta_evaluator=None, evaluate_every_n_epochs=1): policy_optimizer = OptimizerWrapper( (torch.optim.Adam, dict(lr=inner_lr)), policy) vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = PPO(env.spec, policy, value_function, policy_optimizer=policy_optimizer, vf_optimizer=vf_optimizer, lr_clip_range=lr_clip_range, max_path_length=max_path_length, num_train_per_epoch=1, discount=discount, gae_lambda=gae_lambda, center_adv=center_adv, positive_adv=positive_adv, policy_ent_coeff=policy_ent_coeff, use_softplus_entropy=use_softplus_entropy, stop_entropy_gradient=stop_entropy_gradient, entropy_method=entropy_method) super().__init__(inner_algo=inner_algo, env=env, policy=policy, meta_optimizer=torch.optim.Adam, meta_batch_size=meta_batch_size, inner_lr=inner_lr, outer_lr=outer_lr, num_grad_updates=num_grad_updates, meta_evaluator=meta_evaluator, evaluate_every_n_epochs=evaluate_every_n_epochs)
def test_ppo_pendulum(self): """Test PPO with Pendulum environment.""" deterministic.set_seed(0) runner = LocalRunner(snapshot_config) algo = PPO(env_spec=self.env.spec, policy=self.policy, value_function=self.value_function, max_path_length=100, discount=0.99, gae_lambda=0.97, lr_clip_range=2e-1) runner.setup(algo, self.env) last_avg_ret = runner.train(n_epochs=10, batch_size=100) assert last_avg_ret > 0
def mtppo_metaworld_mt10(ctxt, seed, epochs, batch_size, n_worker): """Set up environment and algorithm and run the task. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. epochs (int): Number of training epochs. batch_size (int): Number of environment steps in one batch. n_worker (int): The number of workers the sampler should use. """ set_seed(seed) tasks = mwb.MT10.get_train_tasks().all_task_names envs = [] for task in tasks: envs.append(normalize(MetaRLEnv(mwb.MT10.from_task(task)))) env = MultiEnvWrapper(envs, sample_strategy=round_robin_strategy, mode='vanilla') policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) algo = PPO(env_spec=env.spec, policy=policy, value_function=value_function, max_path_length=128, discount=0.99, gae_lambda=0.95, center_adv=True, lr_clip_range=0.2) runner = LocalRunner(ctxt) runner.setup(algo, env, n_workers=n_worker) runner.train(n_epochs=epochs, batch_size=batch_size)
def test_set_plot(self): deterministic.set_seed(0) runner = LocalRunner(snapshot_config) algo = PPO(env_spec=self.env.spec, policy=self.policy, value_function=self.value_function, max_path_length=100, discount=0.99, gae_lambda=0.97, lr_clip_range=2e-1) runner.setup(algo, self.env) runner.train(n_epochs=1, batch_size=100, plot=True) assert isinstance( runner._plotter, Plotter), ('self.plotter in LocalRunner should be set to Plotter.')
def __init__(self, env, policy, baseline, inner_lr=_Default(1e-1), outer_lr=1e-3, lr_clip_range=5e-1, max_path_length=100, discount=0.99, gae_lambda=1.0, center_adv=True, positive_adv=False, policy_ent_coeff=0.0, use_softplus_entropy=False, stop_entropy_gradient=False, entropy_method='no_entropy', meta_batch_size=20, num_grad_updates=1): inner_algo = PPO(env.spec, policy, baseline, optimizer=torch.optim.Adam, policy_lr=inner_lr, lr_clip_range=lr_clip_range, max_path_length=max_path_length, num_train_per_epoch=1, discount=discount, gae_lambda=gae_lambda, center_adv=center_adv, positive_adv=positive_adv, policy_ent_coeff=policy_ent_coeff, use_softplus_entropy=use_softplus_entropy, stop_entropy_gradient=stop_entropy_gradient, entropy_method=entropy_method) super().__init__(inner_algo=inner_algo, env=env, policy=policy, baseline=baseline, meta_optimizer=torch.optim.Adam, meta_batch_size=meta_batch_size, inner_lr=inner_lr, outer_lr=outer_lr, num_grad_updates=num_grad_updates)
def mtppo_metaworld_ml1_push(ctxt, seed, epochs, batch_size): """Set up environment and algorithm and run the task. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. epochs (int): Number of training epochs. batch_size (int): Number of environment steps in one batch. """ set_seed(seed) env = MetaRLEnv(normalize(mwb.ML1.get_train_tasks('push-v1'))) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_nonlinearity=torch.tanh, output_nonlinearity=None) algo = PPO(env_spec=env.spec, policy=policy, value_function=value_function, max_path_length=128, discount=0.99, gae_lambda=0.95, center_adv=True, lr_clip_range=0.2) runner = LocalRunner(ctxt) runner.setup(algo, env) runner.train(n_epochs=epochs, batch_size=batch_size)