def test_make_sampler_ray_sampler(self, ray_session_fixture):
    del ray_session_fixture
    assert ray.is_initialized()
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        trainer.setup(algo, env)
        assert isinstance(trainer._sampler, RaySampler)
        trainer.train(n_epochs=1, batch_size=10)
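# The `ray_session_fixture` argument above is assumed to be a pytest fixture
# that brings a Ray session up and down around each test; a minimal sketch
# under that assumption (the fixture name and Ray options are illustrative):
import pytest
import ray


@pytest.fixture
def ray_session_fixture():
    """Provide an initialized Ray session for the duration of one test."""
    ray.init(ignore_reinit_error=True)
    yield
    ray.shutdown()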
def reps_gym_cartpole(ctxt=None, seed=1):
    """Train REPS with CartPole-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v0')
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)
        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99)
        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000, plot=False)
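# Launchers such as reps_gym_cartpole receive `ctxt` from garage's experiment
# machinery rather than from the caller. A minimal invocation sketch, assuming
# the launcher is wrapped with garage.wrap_experiment (the decorator is not
# shown in the snippet above):
from garage import wrap_experiment

# wrap_experiment builds the ExperimentContext (log and snapshot directories)
# and passes it in as `ctxt` before the launcher body runs.
run_reps = wrap_experiment(reps_gym_cartpole)
run_reps(seed=1)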
def trpo_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow TRPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])
        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['batch_size'])
def trpo_cartpole_recurrent(ctxt, seed, n_epochs, batch_size, plot):
    """Train TRPO with a recurrent policy on CartPole.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Number of epochs for training.
        batch_size (int): Batch size used for training.
        plot (bool): Whether to plot or not.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v1', max_episode_length=100)
        policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer=ConjugateGradientOptimizer,
                    optimizer_args=dict(
                        hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)))
        trainer.setup(algo, env)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=plot)
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env1 = normalize(PointEnv(goal=(-1., 0.), max_episode_length=100))
        env2 = normalize(PointEnv(goal=(1., 0.), max_episode_length=100))
        env = MultiEnvWrapper([env1, env2])
        policy = GaussianMLPPolicy(env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)
        trainer.setup(algo, env)
        trainer.train(n_epochs=40, batch_size=2048, plot=False)
def test_ppo_pendulum_gru(self):
    """Test PPO with InvertedDoublePendulum-v2 environment and recurrent policy."""
    with TFTrainer(snapshot_config) as trainer:
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        gru_policy = GaussianGRUPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        trainer.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
def test_erwr_cartpole(self):
    """Test ERWR with CartPole-v1 environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        deterministic.set_seed(1)
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = ERWR(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99)
        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 60
        env.close()
def train_gru_trpo(self, ctxt=None, *, seed, gym_env, n_eps):
    """Train TRPO with a categorical GRU policy on the given gym environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator.
        gym_env (gym.Env): The gym environment to wrap and train on.
        n_eps (int): Number of training epochs.

    Returns:
        list: Reward checkpoints recorded by LoggedTRPO.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = MyGymEnv(gym_env, max_episode_length=100)
        policy = CategoricalGRUPolicy(name='policy',
                                      env_spec=env.spec,
                                      state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            worker_class=FragmentWorker,
            # Assumed necessary so the sampler wraps workers for the
            # TensorFlow session, as in the companion train_ppo below.
            is_tf_worker=True,
        )
        self.algo = LoggedTRPO(env=env,
                               env_spec=env.spec,
                               policy=policy,
                               baseline=baseline,
                               sampler=sampler,
                               discount=0.99,
                               max_kl_step=0.01,
                               optimizer_args=dict(
                                   hvp_approach=FiniteDifferenceHVP(
                                       base_eps=1e-5)))
        trainer.setup(self.algo, env)
        trainer.train(n_epochs=n_eps, batch_size=4000)
        return self.algo.rew_chkpts
def test_trpo_lstm_cartpole(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('CartPole-v1', max_episode_length=100))
        policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer_args=dict(
                        hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)))
        # `snapshotter` is assumed to be a module-level Snapshotter instance.
        snapshotter.snapshot_dir = './'
        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 60
        env.close()
def fixture_exp(snapshot_config, sess):
    """Dummy fixture experiment function.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by Trainer to create the snapshotter.
            If None, it will create one with default settings.
        sess (tf.Session): An optional TensorFlow session.
            A new session will be created immediately if not provided.

    Returns:
        np.ndarray: Values of the parameters evaluated in the current
            session.

    """
    with TFTrainer(snapshot_config=snapshot_config, sess=sess) as trainer:
        env = GymEnv('CartPole-v1', max_episode_length=100)
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        trainer.setup(algo, env, sampler_cls=LocalSampler)
        trainer.train(n_epochs=5, batch_size=100)
        return policy.get_param_values()
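# A minimal sketch of driving fixture_exp directly; the temporary snapshot
# directory and session handling below are assumptions for illustration,
# not part of the fixture itself:
import tempfile

import tensorflow as tf

from garage.experiment import SnapshotConfig

snapshot_config = SnapshotConfig(snapshot_dir=tempfile.mkdtemp(),
                                 snapshot_mode='last',
                                 snapshot_gap=1)
with tf.compat.v1.Session() as sess:
    # Returns the trained policy's parameter values as an np.ndarray.
    params = fixture_exp(snapshot_config, sess)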
def test_te_ppo(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        sampler = LocalSampler(
            agents=self.policy,
            envs=self.env,
            max_episode_length=self.env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=TaskEmbeddingWorker)
        algo = TEPPO(env_spec=self.env.spec,
                     policy=self.policy,
                     baseline=self.baseline,
                     inference=self.inference,
                     sampler=sampler,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=self.policy_ent_coeff,
                     encoder_ent_coeff=self.encoder_ent_coeff,
                     inference_ce_coeff=self.inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)
        trainer.setup(algo, self.env)
        trainer.train(n_epochs=1, batch_size=self.batch_size, plot=False)
def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum-v2 environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = GymEnv('InvertedDoublePendulum-v2')
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e5))
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(5e3),
            exploration_policy=exploration_policy,
        )
        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 60
        env.close()
def test_set_plot(self):
    deterministic.set_seed(1)
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=100, plot=True)

        assert isinstance(trainer._plotter, Plotter), (
            'trainer._plotter should be set to a Plotter instance.')
def test_train(self):
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=100)
def train_ppo(self, ctxt=None, *, seed, gym_env, n_eps):
    """Train PPO with a categorical GRU policy on the given gym environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator.
        gym_env (gym.Env): The gym environment to wrap and train on.
        n_eps (int): Number of training epochs.

    Returns:
        list: Reward checkpoints recorded by LoggedPPO.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = MyGymEnv(gym_env, max_episode_length=100)
        policy = CategoricalGRUPolicy(name='policy',
                                      env_spec=env.spec,
                                      state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            worker_class=FragmentWorker,
            is_tf_worker=True,
        )
        self.algo = LoggedPPO(env=env,
                              env_spec=env.spec,
                              policy=policy,
                              baseline=baseline,
                              sampler=sampler,
                              discount=0.99,
                              center_adv=False,
                              optimizer_args=dict(max_optimization_epochs=8))
        trainer.setup(self.algo, env)
        trainer.train(n_epochs=n_eps, batch_size=4000)
        return self.algo.rew_chkpts
def test_rl2_ppo_pendulum_meta_test(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        meta_evaluator = MetaEvaluator(test_task_sampler=self.tasks,
                                       n_exploration_eps=10,
                                       n_test_episodes=10,
                                       n_test_tasks=1)
        algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      sampler=self.sampler,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=self.episode_per_task,
                      meta_evaluator=meta_evaluator,
                      n_epochs_per_eval=10)
        trainer.setup(algo, self.tasks.sample(self.meta_batch_size))
        last_avg_ret = trainer.train(n_epochs=1,
                                     batch_size=self.episode_per_task *
                                     self.max_episode_length *
                                     self.meta_batch_size)
        assert last_avg_ret > -40
def test_rl2_trpo_pendulum(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        algo = RL2TRPO(
            meta_batch_size=self.meta_batch_size,
            task_sampler=self.tasks,
            env_spec=self.env_spec,
            policy=self.policy,
            baseline=self.baseline,
            episodes_per_trial=self.episode_per_task,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)))
        trainer.setup(algo,
                      self.tasks.sample(self.meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=self.meta_batch_size,
                      worker_class=RL2Worker)
        last_avg_ret = trainer.train(n_epochs=1,
                                     batch_size=self.episode_per_task *
                                     self.max_episode_length *
                                     self.meta_batch_size)
        assert last_avg_ret > -40
def rl2_ppo_halfcheetah(ctxt, seed, max_episode_length, meta_batch_size,
                        n_epochs, episode_per_task):
    """Train RL2-PPO with HalfCheetah environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        tasks = task_sampler.SetTaskSampler(
            HalfCheetahVelEnv,
            wrapper=lambda env, _: RL2Env(
                GymEnv(env, max_episode_length=max_episode_length)))
        env_spec = RL2Env(
            GymEnv(HalfCheetahVelEnv(),
                   max_episode_length=max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env_spec)
        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      episodes_per_trial=episode_per_task,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False)
        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))
        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task * max_episode_length *
                      meta_batch_size)
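# In the garage examples, launchers like this are usually exposed as
# command-line entry points. A hedged sketch of the decorator stack: the
# option names mirror the signature above, but the defaults and the `_cli`
# name are illustrative assumptions, and the body is elided.
import click

from garage import wrap_experiment


@click.command()
@click.option('--seed', default=1)
@click.option('--max_episode_length', default=150)
@click.option('--meta_batch_size', default=10)
@click.option('--n_epochs', default=10)
@click.option('--episode_per_task', default=4)
@wrap_experiment
def rl2_ppo_halfcheetah_cli(ctxt, seed, max_episode_length, meta_batch_size,
                            n_epochs, episode_per_task):
    # wrap_experiment injects `ctxt`; click parses the remaining options.
    ...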
def trpo_cartpole_bullet(ctxt=None, seed=1):
    """Train TRPO with PyBullet's CartPoleBulletEnv environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = BulletEnv(
            gym.make('CartPoleBulletEnv-v1',
                     renders=False,
                     discrete_actions=True))
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    max_kl_step=0.01)
        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000)
def test_cem_cartpole(self):
    """Test CEM with CartPole-v1 environment."""
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        n_samples = 10
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   sampler=sampler,
                   best_frac=0.1,
                   n_samples=n_samples)
        trainer.setup(algo, env)
        rtn = trainer.train(n_epochs=10, batch_size=2048)
        assert rtn > 40
        env.close()
def trpo_swimmer(ctxt=None, seed=1, batch_size=4000):
    """Train TRPO with Swimmer-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = GymEnv('Swimmer-v2')
        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01)
        trainer.setup(algo, env)
        trainer.train(n_epochs=40, batch_size=batch_size)
def cma_es_cartpole(ctxt=None, seed=1):
    """Train CMA-ES with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        n_samples = 20
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     sampler=sampler,
                     n_samples=n_samples)
        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=1000)
def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = DMControlEnv.from_suite(*task)
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            discount=0.99,
            max_kl_step=0.01,
        )
        trainer.setup(algo, env, sampler_cls=LocalSampler)
        trainer.train(n_epochs=1, batch_size=10)
        env.close()
def test_rl2_ppo_pendulum_adapted_policy(self):
    with TFTrainer(snapshot_config, sess=self.sess):
        algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=self.episode_per_task)
        exploration_policy = algo.get_exploration_policy()
        adapted_policy = algo.adapt_policy(exploration_policy, [])
        (params, hidden) = adapted_policy.get_param_values()
        expected_new_params = np.zeros_like(params)
        expected_hidden = np.zeros_like(hidden)
        adapted_policy.set_param_values(
            (expected_new_params, expected_hidden))
        (new_params, new_hidden) = adapted_policy.get_param_values()
        assert np.array_equal(expected_new_params, new_params)
        assert np.array_equal(expected_hidden, new_hidden)
def cem_cartpole(ctxt=None, seed=1):
    """Train CEM with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        n_samples = 20
        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   best_frac=0.05,
                   n_samples=n_samples)
        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=1000)
def test_rl2_ppo_pendulum_wrong_worker(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        with pytest.raises(ValueError):
            algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                          task_sampler=self.tasks,
                          env_spec=self.env_spec,
                          policy=self.policy,
                          baseline=self.baseline,
                          discount=0.99,
                          gae_lambda=0.95,
                          lr_clip_range=0.2,
                          optimizer_args=dict(
                              batch_size=32,
                              max_optimization_epochs=10,
                          ),
                          stop_entropy_gradient=True,
                          entropy_method='max',
                          policy_ent_coeff=0.02,
                          center_adv=False,
                          episodes_per_trial=self.episode_per_task)
            trainer.setup(algo,
                          self.tasks.sample(self.meta_batch_size),
                          sampler_cls=LocalSampler,
                          n_workers=self.meta_batch_size)
            trainer.train(n_epochs=10,
                          batch_size=self.episode_per_task *
                          self.max_episode_length * self.meta_batch_size)
def trpo_gym_tf_cartpole(ctxt=None, seed=1):
    """Train TRPO with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            discount=0.99,
            max_kl_step=0.01,
        )
        trainer.setup(algo, env)
        trainer.train(n_epochs=10, batch_size=10000, plot=False)
def test_rl2_ppo_pendulum(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=self.episode_per_task)
        trainer.setup(
            algo,
            self.tasks.sample(self.meta_batch_size),
            sampler_cls=LocalSampler,
            n_workers=self.meta_batch_size,
            worker_class=RL2Worker,
            worker_args=dict(n_episodes_per_trial=self.episode_per_task))
        last_avg_ret = trainer.train(n_epochs=1,
                                     batch_size=self.episode_per_task *
                                     self.max_episode_length *
                                     self.meta_batch_size)
        assert last_avg_ret > -40
def vpg_cartpole(ctxt=None, seed=1):
    """Train VPG with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)
        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))
        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=10000)
def test_gaussian_policies(self, policy_cls):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('Pendulum-v0'))
        policy = policy_cls(name='policy', env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHVP(base_eps=1e-5)),
        )
        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=4000)
        env.close()
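# The `policy_cls` argument above suggests a parametrized test. A minimal
# sketch of how the parameter is likely supplied; the decorated signature is
# repeated with an elided body purely for illustration, and the concrete
# policy list is an assumption:
import pytest

from garage.tf.policies import (GaussianGRUPolicy, GaussianLSTMPolicy,
                                GaussianMLPPolicy)


@pytest.mark.parametrize('policy_cls', [
    GaussianMLPPolicy,
    GaussianGRUPolicy,
    GaussianLSTMPolicy,
])
def test_gaussian_policies(self, policy_cls):
    ...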