def ppo_setup(env, trainer, args):
    """Set up a PPO algorithm from parsed arguments and register it with
    the trainer.

    Returns:
        PPO: The configured algorithm.

    """
    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[args.hidden_dim] * args.depth,
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    value_function = GaussianMLPValueFunction(
        env_spec=env.spec,
        hidden_sizes=[args.hidden_dim] * args.depth,
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               policy_optimizer=OptimizerWrapper(
                   (torch.optim.Adam, dict(lr=args.policy_lr)),
                   policy,
                   max_optimization_epochs=args.n_minibatches,
                   minibatch_size=args.minibatch_size),
               vf_optimizer=OptimizerWrapper(
                   (torch.optim.Adam, dict(lr=args.vf_lr)),
                   value_function,
                   max_optimization_epochs=args.n_minibatches,
                   minibatch_size=args.minibatch_size),
               **convert_kwargs(args, PPO))
    trainer.setup(algo,
                  env,
                  sampler_cls=LocalSampler,
                  worker_class=VecWorker,
                  worker_args={'n_envs': 8})
    return algo
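# A hedged usage sketch (not from the original source): one way ppo_setup
# might be driven from a garage experiment entry point. The `args` fields
# (hidden_dim, depth, policy_lr, vf_lr, n_minibatches, minibatch_size) mirror
# the attributes referenced above; `convert_kwargs` is assumed to be defined
# elsewhere in this module, and the environment choice is illustrative.
import argparse

from garage import wrap_experiment
from garage.envs import GymEnv, normalize
from garage.experiment.deterministic import set_seed
from garage.trainer import Trainer


@wrap_experiment
def run_ppo_setup(ctxt=None):
    """Hypothetical entry point wiring ppo_setup into a Trainer."""
    set_seed(1)
    args = argparse.Namespace(hidden_dim=64,
                              depth=2,
                              policy_lr=3e-4,
                              vf_lr=3e-4,
                              n_minibatches=10,
                              minibatch_size=64)
    env = normalize(GymEnv('InvertedDoublePendulum-v2'))
    trainer = Trainer(ctxt)
    ppo_setup(env, trainer, args)
    trainer.train(n_epochs=100, batch_size=10000)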
def ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    env = GymEnv('InvertedDoublePendulum-v2')
    runner = LocalRunner(ctxt)
    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               discount=0.99,
               center_adv=False)
    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
def torch_ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    env = TfEnv(env_name='InvertedDoublePendulum-v2')
    runner = LocalRunner(ctxt)
    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               baseline=baseline,
               max_path_length=100,
               discount=0.99,
               center_adv=False)
    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    env = TfEnv(env_name='InvertedDoublePendulum-v2')
    runner = LocalRunner(snapshot_config)
    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               baseline=baseline,
               max_path_length=100,
               discount=0.99,
               center_adv=False)
    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
def load_ppo(env_name="CartPole-v0"):
    """Return an instance of the PPO algorithm."""
    env = GarageEnv(env_name=env_name)
    policy = DeterministicMLPPolicy(name='policy',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    vfunc = GaussianMLPValueFunction(env_spec=env.spec)
    algo = PPO(env_spec=env.spec, policy=policy, value_function=vfunc)
    return algo
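# A hedged note on the fixture above: DeterministicMLPPolicy exposes no
# action distribution, so load_ppo is presumably meant for construction and
# serialization tests rather than actual PPO training. A minimal sketch of
# such a test (the pickle round-trip and test name are assumptions, not from
# the original source):
import pickle


def test_load_ppo_is_picklable():
    """Hypothetical check that the fixture survives a pickle round-trip."""
    algo = load_ppo(env_name='CartPole-v0')
    restored = pickle.loads(pickle.dumps(algo))
    assert restored.policy is not None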
def mtppo_metaworld_mt50(ctxt, seed, epochs, batch_size, n_workers, n_tasks):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 50.

    """
    set_seed(seed)
    # The function name, docstring, and the assertions below all refer to
    # MT50, so the benchmark constructed here must be MT50 (not MT10).
    mt50 = metaworld.MT50()
    train_task_sampler = MetaWorldTaskSampler(mt50,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)
    assert n_tasks % 50 == 0
    assert n_tasks <= 2500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               sampler=sampler,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)
    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def __init__(self,
             env,
             policy,
             value_function,
             inner_lr=_Default(1e-1),
             outer_lr=1e-3,
             lr_clip_range=5e-1,
             max_episode_length=100,
             discount=0.99,
             gae_lambda=1.0,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=20,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    policy_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), policy)
    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)),
                                    value_function)
    inner_algo = PPO(env.spec,
                     policy,
                     value_function,
                     policy_optimizer=policy_optimizer,
                     vf_optimizer=vf_optimizer,
                     lr_clip_range=lr_clip_range,
                     max_episode_length=max_episode_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)
    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     meta_optimizer=torch.optim.Adam,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
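# The constructor above matches garage's torch MAMLPPO, which wraps PPO as
# the inner (adaptation) algorithm of MAML. A minimal instantiation sketch,
# using only parameters that appear in that signature; the environment choice
# and hyperparameter values are illustrative assumptions, not from the
# original source.
import torch

from garage.envs import GymEnv, normalize
from garage.torch.algos import MAMLPPO
from garage.torch.policies import GaussianMLPPolicy
from garage.torch.value_functions import GaussianMLPValueFunction

env = normalize(GymEnv('InvertedDoublePendulum-v2'))
policy = GaussianMLPPolicy(env_spec=env.spec,
                           hidden_sizes=(64, 64),
                           hidden_nonlinearity=torch.tanh,
                           output_nonlinearity=None)
value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                          hidden_sizes=(32, 32))
algo = MAMLPPO(env=env,
               policy=policy,
               value_function=value_function,
               inner_lr=0.1,
               outer_lr=1e-3,
               meta_batch_size=20,
               discount=0.99,
               gae_lambda=1.0,
               num_grad_updates=1)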
def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    algo = PPO(env_spec=self.env.spec,
               policy=self.policy,
               value_function=self.value_function,
               discount=0.99,
               gae_lambda=0.97,
               lr_clip_range=2e-1)
    runner.setup(algo, self.env, sampler_cls=LocalSampler)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 0
def mtppo_metaworld_mt1_push(ctxt, seed, epochs, batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               sampler=sampler,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)
    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
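# In garage's examples, experiment functions like mtppo_metaworld_mt1_push
# are typically exposed as click CLI entry points decorated with
# wrap_experiment. A hedged sketch of that wiring; the option defaults,
# snapshot mode, and wrapper name are assumptions, not from the original
# source.
import click

from garage import wrap_experiment


@click.command()
@click.option('--seed', default=0, help='Random seed.')
@click.option('--epochs', default=500, help='Number of training epochs.')
@click.option('--batch_size', default=1024, help='Env steps per batch.')
@wrap_experiment(snapshot_mode='all')
def mtppo_mt1_push_entry(ctxt, seed, epochs, batch_size):
    """Hypothetical CLI wrapper around mtppo_metaworld_mt1_push."""
    mtppo_metaworld_mt1_push(ctxt, seed, epochs, batch_size)


if __name__ == '__main__':
    mtppo_mt1_push_entry()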
def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    algo = PPO(env_spec=self.env.spec,
               policy=self.policy,
               baseline=self.baseline,
               max_path_length=100,
               discount=0.99,
               gae_lambda=0.97,
               lr_clip_range=2e-1)
    runner.setup(algo, self.env)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 0
def test_set_plot(self):
    """Test that training with plot=True attaches a Plotter to the Trainer."""
    trainer = Trainer(snapshot_config)
    algo = PPO(env_spec=self.env.spec,
               policy=self.policy,
               value_function=self.value_function,
               sampler=self.sampler,
               discount=0.99,
               gae_lambda=0.97,
               lr_clip_range=2e-1)
    trainer.setup(algo, self.env)
    trainer.train(n_epochs=1, batch_size=100, plot=True)
    assert isinstance(trainer._plotter, Plotter), (
        'self.plotter in Trainer should be set to Plotter.')
def mtppo_metaworld_mt10(ctxt, seed, epochs, batch_size, n_worker):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_worker (int): The number of workers the sampler should use.

    """
    set_seed(seed)
    tasks = mwb.MT10.get_train_tasks().all_task_names
    envs = []
    for task in tasks:
        envs.append(
            normalize(
                GymEnv(mwb.MT10.from_task(task), max_episode_length=150)))
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)
    runner = LocalRunner(ctxt)
    runner.setup(algo, env, n_workers=n_worker)
    runner.train(n_epochs=epochs, batch_size=batch_size)
def test_set_plot(self):
    """Test that training with plot=True attaches a Plotter to the runner."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    algo = PPO(env_spec=self.env.spec,
               policy=self.policy,
               value_function=self.value_function,
               discount=0.99,
               gae_lambda=0.97,
               lr_clip_range=2e-1)
    runner.setup(algo, self.env, sampler_cls=LocalSampler)
    runner.train(n_epochs=1, batch_size=100, plot=True)
    assert isinstance(runner._plotter, Plotter), (
        'self.plotter in LocalRunner should be set to Plotter.')
def test_set_plot(self):
    """Test that training with plot=True attaches a Plotter to the runner."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    algo = PPO(env_spec=self.env.spec,
               policy=self.policy,
               baseline=self.baseline,
               max_path_length=100,
               discount=0.99,
               gae_lambda=0.97,
               lr_clip_range=2e-1)
    runner.setup(algo, self.env)
    runner.train(n_epochs=1, batch_size=100, plot=True)
    assert isinstance(runner._plotter, Plotter), (
        'self.plotter in LocalRunner should be set to Plotter.')
def __init__(self,
             env,
             policy,
             baseline,
             inner_lr=_Default(1e-1),
             outer_lr=1e-3,
             lr_clip_range=5e-1,
             max_path_length=100,
             discount=0.99,
             gae_lambda=1.0,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=20,
             num_grad_updates=1):
    inner_algo = PPO(env.spec,
                     policy,
                     baseline,
                     optimizer=torch.optim.Adam,
                     policy_lr=inner_lr,
                     lr_clip_range=lr_clip_range,
                     max_path_length=max_path_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)
    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     baseline=baseline,
                     meta_optimizer=torch.optim.Adam,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates)
def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    deterministic.set_seed(0)
    sampler = LocalSampler(
        agents=self.policy,
        envs=self.env,
        max_episode_length=self.env.spec.max_episode_length)
    trainer = Trainer(snapshot_config)
    algo = PPO(env_spec=self.env.spec,
               policy=self.policy,
               value_function=self.value_function,
               sampler=sampler,
               discount=0.99,
               gae_lambda=0.97,
               lr_clip_range=2e-1)
    trainer.setup(algo, self.env)
    last_avg_ret = trainer.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 0
def mtppo_metaworld_ml1_push(ctxt, seed, epochs, batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.

    """
    set_seed(seed)
    env = GarageEnv(normalize(mwb.ML1.get_train_tasks('push-v1')))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               max_episode_length=128,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)
    runner = LocalRunner(ctxt)
    runner.setup(algo, env)
    runner.train(n_epochs=epochs, batch_size=batch_size)