def trpo_pendulum(ctxt=None, seed=1):
    """Train TRPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    env = GymEnv('InvertedDoublePendulum-v2')

    trainer = Trainer(ctxt)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[32, 32],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                discount=0.99,
                center_adv=False)

    trainer.setup(algo, env)
    trainer.train(n_epochs=100, batch_size=1024)

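# The experiment functions in this collection are normally launched through
# garage's `wrap_experiment` decorator, which supplies the `ctxt`
# (ExperimentContext) argument and sets up the log/snapshot directory. A
# minimal launcher sketch follows; the name `run_trpo_pendulum` and the
# snapshot_mode value are illustrative assumptions, not part of the function
# above.
from garage import wrap_experiment


@wrap_experiment(snapshot_mode='last')
def run_trpo_pendulum(ctxt=None, seed=1):
    """Hypothetical launcher; wrap_experiment injects `ctxt` when called."""
    trpo_pendulum(ctxt=ctxt, seed=seed)


# Usage: run_trpo_pendulum(seed=1)
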
def maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task,
                                 meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.
    """
    set_seed(seed)

    ml1 = metaworld.ML1('push-v1')
    tasks = MetaWorldTaskSampler(ml1, 'train')
    env = tasks.sample(1)[0]()
    test_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                  env=MetaWorldSetTaskEnv(ml1, 'test'))

    policy = GaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(
        env_spec=env.spec, hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   n_test_tasks=1,
                                   n_exploration_eps=rollouts_per_task)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=meta_batch_size)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    sampler=sampler,
                    task_sampler=tasks,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=rollouts_per_task * env.spec.max_episode_length)

def bc_point(ctxt=None, loss='log_prob'):
    """Run Behavioral Cloning on garage.envs.PointEnv.

    Args:
        ctxt (ExperimentContext): Provided by wrap_experiment.
        loss (str): Either 'log_prob' or 'mse'.
    """
    trainer = Trainer(ctxt)
    goal = np.array([1., 1.])
    env = PointEnv(goal=goal, max_episode_length=200)
    expert = OptimalPolicy(env.spec, goal=goal)
    policy = GaussianMLPPolicy(env.spec, [8, 8])
    batch_size = 1000
    sampler = RaySampler(agents=expert,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)
    algo = BC(env.spec,
              policy,
              batch_size=batch_size,
              source=expert,
              sampler=sampler,
              policy_lr=1e-2,
              loss=loss)
    trainer.setup(algo, env)
    trainer.train(100, batch_size=batch_size)

def sac_half_cheetah_batch(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    deterministic.set_seed(seed)
    trainer = Trainer(snapshot_config=ctxt)
    env = normalize(GymEnv('HalfCheetah-v2'))

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU, output_nonlinearity=None,
        min_std=np.exp(-20.), max_std=np.exp(2.))

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=1000,
              max_episode_length_eval=1000,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.,
              steps_per_epoch=1)

    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    trainer.setup(algo=sac, env=env)
    trainer.train(n_epochs=1000, batch_size=1000)

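# The CUDA branch above is garage's usual device-selection idiom. The library
# also exposes prefer_gpu() (used directly in the TD3 test further down),
# which performs the same check. A minimal sketch, assuming only that the
# algorithm exposes .to() as SAC does; `_to_preferred_device` is an
# illustrative helper name, not a garage API.
from garage.torch import prefer_gpu


def _to_preferred_device(algo):
    """Move `algo` to the GPU when one is available, otherwise stay on CPU."""
    prefer_gpu()  # effectively set_gpu_mode(torch.cuda.is_available())
    algo.to()     # relocate the algorithm's modules to the chosen device
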
def alg_train(ctxt=None):
    # NOTE: `parser`, `get_args`, `use_prefix`, `base_args`, `get_algo`,
    # `StdLogger` and `WbOutput` are assumed to be defined elsewhere in the
    # surrounding module; they are not part of garage itself.
    get_args(parser)
    args = parser.parse_args()
    args.prefix = use_prefix
    set_seed(args.seed)

    env = GymEnv(args.env_name)
    if args.env_norm:
        env = normalize(env)

    trainer = Trainer(ctxt)

    logger.remove_all()
    logger.add_output(StdLogger(args.log_interval))
    if not args.no_wb:
        wb_logger = WbOutput(args.log_interval, base_args)
        logger.add_output(wb_logger)

    algo = get_algo(env, trainer, args)

    if args.cuda:
        set_gpu_mode(True)
        algo.to()
    else:
        set_gpu_mode(False)

    trainer.train(n_epochs=args.n_epochs, batch_size=args.batch_size)

def train_sac(ctxt=None):
    # NOTE: `gym_env`, `n_eps`, `MyGymEnv`, `LoggedSAC` and the `self`
    # reference are assumed to come from the enclosing class/scope this
    # snippet was extracted from; they are not defined here.
    trainer = Trainer(ctxt)
    env = MyGymEnv(gym_env, max_episode_length=100)

    policy = CategoricalGRUPolicy(name='policy',
                                  env_spec=env.spec,
                                  state_include_action=False).to(
                                      global_device())

    qf1 = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    qf2 = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    self.algo = LoggedSAC(env=env,
                          env_spec=env.spec,
                          policy=policy,
                          qf1=qf1,
                          qf2=qf2,
                          sampler=sampler,
                          gradient_steps_per_itr=1000,
                          max_episode_length_eval=100,
                          replay_buffer=replay_buffer,
                          min_buffer_size=1e4,
                          target_update_tau=5e-3,
                          discount=0.99,
                          buffer_batch_size=256,
                          reward_scale=1.,
                          steps_per_epoch=1)

    trainer.setup(self.algo, env)
    trainer.train(n_epochs=n_eps, batch_size=4000)
    return self.algo.rew_chkpts

def test_maml_trpo_dummy_named_env():
    """Test with dummy environment that has env_name."""
    env = normalize(GymEnv(DummyMultiTaskBoxEnv(), max_episode_length=100),
                    expected_action_scale=10.)

    policy = GaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32))

    task_sampler = SetTaskSampler(
        DummyMultiTaskBoxEnv,
        wrapper=lambda env, _: normalize(
            GymEnv(env, max_episode_length=100), expected_action_scale=10.))

    episodes_per_task = 2
    max_episode_length = env.spec.max_episode_length

    trainer = Trainer(snapshot_config)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    task_sampler=task_sampler,
                    value_function=value_function,
                    meta_batch_size=5,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1)

    trainer.setup(algo, env, sampler_cls=LocalSampler)
    trainer.train(n_epochs=2,
                  batch_size=episodes_per_task * max_episode_length)

def test_fixed_alpha():
    """Test if using fixed_alpha ensures that alpha is non differentiable."""
    env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2']
    task_envs = [GymEnv(name, max_episode_length=100) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    trainer = Trainer(snapshot_config=snapshot_config)

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None,
        min_std=np.exp(-20.), max_std=np.exp(2.))

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    num_tasks = 2
    buffer_batch_size = 128
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=100,
                  eval_env=[test_envs],
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=1,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size,
                  fixed_alpha=np.exp(0.5))

    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    mtsac.to()

    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    trainer.setup(mtsac, env)
    trainer.train(n_epochs=1, batch_size=128, plot=False)
    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    assert not mtsac._use_automatic_entropy_tuning

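# Note on the assertions above: MTSAC stores the temperature internally as
# log(alpha) per task, so passing fixed_alpha=np.exp(0.5) makes each of the
# num_tasks entries of _log_alpha equal 0.5. Because the value is fixed it
# receives no gradient update during training, which is why _log_alpha is
# unchanged after trainer.train() and _use_automatic_entropy_tuning is False.
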
def maml_trpo_metaworld_ml45(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.
    """
    set_seed(seed)
    ml45 = metaworld.ML45()

    # pylint: disable=missing-return-doc,missing-return-type-doc
    def wrap(env, _):
        return normalize(env, expected_action_scale=10.0)

    train_task_sampler = MetaWorldTaskSampler(ml45, 'train', wrap)
    test_env = wrap(MetaWorldSetTaskEnv(ml45, 'test'), None)
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=test_env,
                                       wrapper=wrap)
    env = train_task_sampler.sample(45)[0]()

    policy = GaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(
        env_spec=env.spec, hidden_sizes=(32, 32),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    task_sampler=train_task_sampler,
                    policy=policy,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env, n_workers=meta_batch_size)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)

def maml_trpo_half_cheetah_dir(ctxt, seed, epochs, episodes_per_task,
                               meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.
    """
    set_seed(seed)
    max_episode_length = 100

    env = normalize(GymEnv(HalfCheetahDirEnv(),
                           max_episode_length=max_episode_length),
                    expected_action_scale=10.)

    policy = GaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=[64, 64],
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(
        env_spec=env.spec, hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    task_sampler = SetTaskSampler(
        HalfCheetahDirEnv,
        wrapper=lambda env, _: normalize(
            GymEnv(env, max_episode_length=max_episode_length),
            expected_action_scale=10.))

    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   n_test_tasks=1,
                                   n_test_episodes=10)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    task_sampler=task_sampler,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)

def mtppo_metaworld_mt50(ctxt, seed, epochs, batch_size, n_workers, n_tasks):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 50.
    """
    set_seed(seed)
    mt50 = metaworld.MT50()
    train_task_sampler = MetaWorldTaskSampler(mt50, 'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)
    assert n_tasks % 50 == 0
    assert n_tasks <= 2500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(
        env_spec=env.spec, hidden_sizes=(32, 32),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers)

    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               sampler=sampler,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)

def ppo_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch PPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.
    """
    deterministic.set_seed(seed)

    trainer = Trainer(ctxt)

    env = normalize(GymEnv(env_id))

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=(32, 32),
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(
        env_spec=env.spec, hidden_sizes=(32, 32),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        policy,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)
    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                    value_function,
                                    max_optimization_epochs=10,
                                    minibatch_size=64)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)

    algo = PyTorch_PPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       sampler=sampler,
                       policy_optimizer=policy_optimizer,
                       vf_optimizer=vf_optimizer,
                       discount=0.99,
                       gae_lambda=0.95,
                       center_adv=True,
                       lr_clip_range=0.2)

    trainer.setup(algo, env)
    trainer.train(n_epochs=hyper_parameters['n_epochs'],
                  batch_size=hyper_parameters['batch_size'])

def test_setup_no_sampler():
    trainer = Trainer(snapshot_config)

    class SupervisedAlgo:

        def train(self, trainer):
            # pylint: disable=undefined-loop-variable
            for epoch in trainer.step_epochs():
                pass
            assert epoch == 4

    trainer.setup(SupervisedAlgo(), None)
    trainer.train(n_epochs=5)

def ddpg_pendulum(ctxt=None, seed=1, lr=1e-4):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        lr (float): Learning rate for policy optimization.
    """
    set_seed(seed)
    trainer = Trainer(ctxt)
    env = normalize(GymEnv('InvertedDoublePendulum-v2'))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                   policy,
                                                   sigma=0.2)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    policy_optimizer = (torch.optim.Adagrad, {'lr': lr, 'lr_decay': 0.99})

    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                sampler=sampler,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    trainer.setup(algo=ddpg, env=env)
    trainer.train(n_epochs=500, batch_size=100)

def test_sac_to():
    """Test moving Sac between CPU and GPU."""
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None,
        min_std=np.exp(-20.), max_std=np.exp(2.))

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    trainer.setup(sac, env)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    trainer.setup(algo=sac, env=env)
    trainer.train(n_epochs=1, batch_size=100)
    log_alpha = torch.clone(sac._log_alpha).cpu()
    set_gpu_mode(False)
    sac.to()
    assert torch.allclose(log_alpha, sac._log_alpha)

def tutorial_vpg(ctxt=None):
    """Train VPG with PointEnv environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
    """
    set_seed(100)
    trainer = Trainer(ctxt)
    env = PointEnv()
    policy = GaussianMLPPolicy(env.spec)
    algo = SimpleVPG(env.spec, policy)
    trainer.setup(algo, env)
    trainer.train(n_epochs=200, batch_size=4000)

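# tutorial_vpg above relies on a hand-written SimpleVPG algorithm. The only
# contract Trainer imposes on such an algorithm is sketched below:
# trainer.setup() stores the object and trainer.train() calls its
# train(trainer) method, inside which trainer.step_epochs() drives the epoch
# loop (the same pattern the SupervisedAlgo test above exercises). The class
# name and body are an illustrative sketch, not the tutorial's actual
# SimpleVPG implementation.
class MinimalAlgo:
    """Bare-bones algorithm accepted by Trainer.setup()/Trainer.train()."""

    def __init__(self, env_spec, policy):
        self.env_spec = env_spec
        self.policy = policy

    def train(self, trainer):
        """Called once by trainer.train(); iterate over the granted epochs."""
        for epoch in trainer.step_epochs():
            # Collect data and update self.policy here, e.g. with
            # trainer.obtain_episodes(epoch) when a sampler is configured.
            pass
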
def dqn_cartpole(ctxt=None, seed=24):
    """Train DQN with CartPole-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    trainer = Trainer(ctxt)

    n_epochs = 100
    steps_per_epoch = 10
    sampler_batch_size = 512
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
    env = GymEnv('CartPole-v0')
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
    exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                             policy=policy,
                                             total_timesteps=num_timesteps,
                                             max_epsilon=1.0,
                                             min_epsilon=0.01,
                                             decay_ratio=0.4)
    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    algo = DQN(env_spec=env.spec,
               policy=policy,
               qf=qf,
               exploration_policy=exploration_policy,
               replay_buffer=replay_buffer,
               sampler=sampler,
               steps_per_epoch=steps_per_epoch,
               qf_lr=5e-5,
               discount=0.9,
               min_buffer_size=int(1e4),
               n_train_steps=500,
               target_update_freq=30,
               buffer_batch_size=64)

    trainer.setup(algo, env)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
    env.close()

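# Worked sizing note for dqn_cartpole above: the exploration schedule is
# scaled to the total number of environment steps the trainer will request,
#     num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
#                   = 100 * 10 * 512 = 512,000
# and with decay_ratio=0.4 the epsilon-greedy policy anneals epsilon from
# max_epsilon=1.0 down to min_epsilon=0.01 over the first 40% of those steps
# (about 204,800 steps), assuming EpsilonGreedyPolicy's linear decay.
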
def mttrpo_metaworld_mt1_push(ctxt, seed, epochs, batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(
        env_spec=env.spec, hidden_sizes=(32, 32),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                sampler=sampler,
                discount=0.99,
                gae_lambda=0.95)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)

def test_set_plot(self):
    trainer = Trainer(snapshot_config)
    algo = PPO(env_spec=self.env.spec,
               policy=self.policy,
               value_function=self.value_function,
               sampler=self.sampler,
               discount=0.99,
               gae_lambda=0.97,
               lr_clip_range=2e-1)

    trainer.setup(algo, self.env)
    trainer.train(n_epochs=1, batch_size=100, plot=True)

    assert isinstance(trainer._plotter, Plotter), (
        'self.plotter in Trainer should be set to Plotter.')

def test_ppo_pendulum(self):
    """Test MAMLVPG with the HalfCheetahDir environment."""
    deterministic.set_seed(0)
    episodes_per_task = 5
    max_episode_length = self.env.spec.max_episode_length

    task_sampler = SetTaskSampler(
        HalfCheetahDirEnv,
        wrapper=lambda env, _: normalize(
            GymEnv(env, max_episode_length=max_episode_length),
            expected_action_scale=10.))

    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   n_test_tasks=1,
                                   n_test_episodes=10)

    trainer = Trainer(snapshot_config)
    algo = MAMLVPG(env=self.env,
                   policy=self.policy,
                   task_sampler=self.task_sampler,
                   value_function=self.value_function,
                   meta_batch_size=5,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)

    trainer.setup(algo, self.env, sampler_cls=LocalSampler)
    last_avg_ret = trainer.train(n_epochs=10,
                                 batch_size=episodes_per_task *
                                 max_episode_length)

    assert last_avg_ret > -5

def test_ppo_pendulum(self):
    """Test MAMLPPO."""
    deterministic.set_seed(0)
    episodes_per_task = 5
    max_episode_length = self.env.spec.max_episode_length

    trainer = Trainer(snapshot_config)
    algo = MAMLPPO(env=self.env,
                   policy=self.policy,
                   sampler=self.sampler,
                   task_sampler=self.task_sampler,
                   value_function=self.value_function,
                   meta_batch_size=5,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1)

    trainer.setup(algo, self.env)
    last_avg_ret = trainer.train(n_epochs=10,
                                 batch_size=episodes_per_task *
                                 max_episode_length)

    assert last_avg_ret > -5

def vpg_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch VPG model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.
    """
    deterministic.set_seed(seed)

    trainer = Trainer(ctxt)

    env = normalize(GymEnv(env_id))

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=hyper_parameters['hidden_sizes'],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(
        env_spec=env.spec, hidden_sizes=(32, 32),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        policy,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)
    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                    value_function,
                                    max_optimization_epochs=10,
                                    minibatch_size=64)

    # Sampler construction mirrors the PPO/TRPO benchmark scripts in this
    # collection, which pass the sampler to the algorithm directly.
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length)

    algo = PyTorch_VPG(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       sampler=sampler,
                       policy_optimizer=policy_optimizer,
                       vf_optimizer=vf_optimizer,
                       discount=hyper_parameters['discount'],
                       center_adv=hyper_parameters['center_adv'])

    trainer.setup(algo, env)
    trainer.train(n_epochs=hyper_parameters['n_epochs'],
                  batch_size=hyper_parameters['batch_size'])

def test_fixed_alpha():
    """Test if using fixed_alpha ensures that alpha is non differentiable."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None,
        min_std=np.exp(-20.), max_std=np.exp(2.))

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=100,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=1,
              fixed_alpha=np.exp(0.5))
    trainer.setup(sac, env)
    sac.to()
    trainer.train(n_epochs=1, batch_size=100, plot=False)
    assert torch.allclose(torch.Tensor([0.5]), sac._log_alpha.cpu())
    assert not sac._use_automatic_entropy_tuning

def test_td3_inverted_double_pendulum(self):
    deterministic.set_seed(0)
    n_epochs = 10
    steps_per_epoch = 20
    sampler_batch_size = 100
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
    trainer = Trainer(snapshot_config=snapshot_config)
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=None)
    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              sampler=sampler,
              exploration_policy=exploration_policy,
              steps_per_epoch=steps_per_epoch,
              grad_steps_per_env_step=1,
              num_evaluation_episodes=1,
              discount=0.99)

    prefer_gpu()
    td3.to()
    trainer.setup(td3, env)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)

def train_dqn(ctxt=None):
    # NOTE: `seed`, `gym_env`, `n_eps`, `MyGymEnv`, `LoggedDQN` and the
    # `self` reference are assumed to come from the enclosing class/scope
    # this snippet was extracted from; they are not defined here.
    set_seed(seed)
    trainer = Trainer(ctxt)
    env = MyGymEnv(gym_env, max_episode_length=100)

    steps_per_epoch = 10
    sampler_batch_size = 4000
    num_timesteps = n_eps * steps_per_epoch * sampler_batch_size

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
    exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                             policy=policy,
                                             total_timesteps=num_timesteps,
                                             max_epsilon=1.0,
                                             min_epsilon=0.01,
                                             decay_ratio=0.4)
    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    self.algo = LoggedDQN(env=env,
                          env_spec=env.spec,
                          policy=policy,
                          qf=qf,
                          exploration_policy=exploration_policy,
                          replay_buffer=replay_buffer,
                          sampler=sampler,
                          steps_per_epoch=steps_per_epoch,
                          qf_lr=5e-5,
                          discount=0.99,
                          min_buffer_size=int(1e4),
                          n_train_steps=500,
                          target_update_freq=30,
                          buffer_batch_size=64)

    trainer.setup(self.algo, env)
    trainer.train(n_epochs=n_eps, batch_size=sampler_batch_size)

    return self.algo.rew_chkpts

def test_sac_inverted_double_pendulum():
    """Test Sac performance on inverted pendulum."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec, hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None,
        min_std=np.exp(-20.), max_std=np.exp(2.))

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    trainer.setup(sac, env)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    ret = trainer.train(n_epochs=12, batch_size=200, plot=False)
    # check that automatic entropy tuning is used
    assert sac._use_automatic_entropy_tuning
    # assert that there was a gradient properly connected to alpha
    # this doesn't verify that the path from the temperature objective is
    # correct.
    assert not torch.allclose(torch.Tensor([1.]), sac._log_alpha.to('cpu'))
    # check that policy is learning beyond predecided threshold
    assert ret > 80

def trpo_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch TRPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.
    """
    deterministic.set_seed(seed)

    trainer = Trainer(ctxt)

    env = normalize(GymEnv(env_id))

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=hyper_parameters['hidden_sizes'],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(
        env_spec=env.spec, hidden_sizes=(32, 32),
        hidden_nonlinearity=torch.tanh, output_nonlinearity=None)

    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length)

    algo = PyTorch_TRPO(env_spec=env.spec,
                        policy=policy,
                        value_function=value_function,
                        sampler=sampler,
                        discount=hyper_parameters['discount'],
                        gae_lambda=hyper_parameters['gae_lambda'])

    trainer.setup(algo, env)
    trainer.train(n_epochs=hyper_parameters['n_epochs'],
                  batch_size=hyper_parameters['batch_size'])

def bc_point(ctxt=None):
    """Run Behavioral Cloning on garage.envs.PointEnv.

    Args:
        ctxt (ExperimentContext): Provided by wrap_experiment.
    """
    trainer = Trainer(ctxt)
    goal = np.array([1., 1.])
    env = PointEnv(goal=goal, max_episode_length=200)
    expert = OptimalPolicy(env.spec, goal=goal)
    policy = DeterministicMLPPolicy(env.spec, hidden_sizes=[8, 8])
    batch_size = 1000
    algo = BC(env.spec,
              policy,
              batch_size=batch_size,
              source=expert,
              policy_lr=1e-2,
              loss='mse')
    trainer.setup(algo, env)
    trainer.train(100, batch_size=batch_size)

def trpo_pendulum_ray_sampler(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    # Since this is an example, we are running ray in a reduced state.
    # One can comment this line out in order to run ray at full capacity.
    ray.init(memory=52428800,
             object_store_memory=78643200,
             ignore_reinit_error=True,
             log_to_driver=False,
             include_dashboard=False)
    deterministic.set_seed(seed)
    env = GymEnv('InvertedDoublePendulum-v2')

    trainer = Trainer(ctxt)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[32, 32],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                discount=0.99,
                center_adv=False)

    trainer.setup(algo, env, sampler_cls=RaySampler)
    trainer.train(n_epochs=100, batch_size=1024)

def test_trpo_pendulum(self):
    """Test TRPO with Pendulum environment."""
    deterministic.set_seed(0)
    trainer = Trainer(snapshot_config)
    algo = TRPO(env_spec=self.env.spec,
                policy=self.policy,
                value_function=self.value_function,
                discount=0.99,
                gae_lambda=0.98)
    trainer.setup(algo, self.env, sampler_cls=LocalSampler)
    last_avg_ret = trainer.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 0