def test_in_local_sampler(policy, envs):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, policy, [envs])
    n_samples = 100

    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)

    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    # Test start_rollout optimization
    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)

    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()

def test_in_local_sampler(policy, envs, other_envs, timesteps_per_call):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_EPS,
                                 max_episode_length=MAX_EPISODE_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    worker_args = dict(n_envs=N_EPS, timesteps_per_call=timesteps_per_call)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=FragmentWorker,
                                worker_args=worker_args,
                                max_episode_length=MAX_EPISODE_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, policy, [envs])
    n_samples = 400

    true_eps = true_sampler.obtain_samples(0, n_samples, None)
    sliced_true_eps = slice_episodes(true_eps, timesteps_per_call)
    vec_eps = vec_sampler.obtain_samples(0, 50, None)
    for test_eps in vec_eps.split():
        assert any(eps_eq(true_eps, test_eps) for true_eps in sliced_true_eps)

    true_eps = true_sampler.obtain_samples(0, n_samples, None,
                                           env_update=other_envs)
    sliced_true_eps = slice_episodes(true_eps, timesteps_per_call)
    vec_eps = vec_sampler.obtain_samples(0, 50, None, env_update=[other_envs])
    for test_eps in vec_eps.split():
        assert any(eps_eq(true_eps, test_eps) for true_eps in sliced_true_eps)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()

def test_onehots_consistent_with_task_sampler():
    # Import and construct environments here to avoid using up too many
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld
    mt10 = metaworld.MT10()
    env = MetaWorldSetTaskEnv(mt10, 'train', add_env_onehot=True)
    policy = RandomPolicy(env.action_space)
    workers = WorkerFactory(seed=100, max_episode_length=1, n_workers=10)
    sampler1 = LocalSampler.from_worker_factory(workers, policy, env)
    env_ups = [
        SetTaskUpdate(MetaWorldSetTaskEnv, task, None)
        for task in env.sample_tasks(10)
    ]
    samples1 = sampler1.obtain_exact_episodes(1, policy, env_ups)
    task_sampler = MetaWorldTaskSampler(mt10, 'train', add_env_onehot=True)
    env_ups = task_sampler.sample(10)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env_ups)
    samples2 = sampler2.obtain_exact_episodes(1, policy, env_ups)
    name_to_obs1 = {}
    for obs1, name1 in zip(samples1.observations,
                           samples1.env_infos['task_name']):
        name_to_obs1[name1] = obs1
    for obs2, name2 in zip(samples2.observations,
                           samples2.env_infos['task_name']):
        assert (name_to_obs1[name2][-10:] == obs2[-10:]).all()

def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]

    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = DMControlEnv.from_suite(*task)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            max_kl_step=0.01,
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=10)

        env.close()

def train_ppo(ctxt=None):
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = MyGymEnv(gym_env, max_episode_length=100)
        policy = CategoricalGRUPolicy(name='policy',
                                      env_spec=env.spec,
                                      state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            worker_class=FragmentWorker,
            is_tf_worker=True,
        )
        self.algo = LoggedPPO(env=env,
                              env_spec=env.spec,
                              policy=policy,
                              baseline=baseline,
                              sampler=sampler,
                              discount=0.99,
                              center_adv=False,
                              optimizer_args=dict(max_optimization_epochs=8))
        trainer.setup(self.algo, env)
        trainer.train(n_epochs=n_eps, batch_size=4000)
        return self.algo.rew_chkpts

def trpo_cartpole(ctxt=None, seed=1):
    """Train TRPO with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000)

def test_categorical_policies(self, policy_cls):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('CartPole-v0', max_episode_length=100))

        policy = policy_cls(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                base_eps=1e-5)),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=4000)

        env.close()

def test_obtain_exact_episodes():
    max_episode_length = 15
    n_workers = 8
    env = PointEnv()
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_episode_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_eps_per_worker = 3
    episodes = sampler.obtain_exact_episodes(n_eps_per_worker,
                                             agent_update=policies)
    # At least one action per episode.
    assert sum(episodes.lengths) >= n_workers * n_eps_per_worker
    # All of the episodes.
    assert len(episodes.lengths) == n_workers * n_eps_per_worker
    worker = -1
    for count, eps in enumerate(episodes.split()):
        if count % n_eps_per_worker == 0:
            worker += 1
        assert (eps.actions == per_worker_actions[worker]).all()

def sac_half_cheetah_batch(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(snapshot_config=ctxt)
    env = normalize(GymEnv('HalfCheetah-v2'))

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=1000,
              max_episode_length_eval=1000,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.,
              steps_per_epoch=1)

    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    trainer.setup(algo=sac, env=env)
    trainer.train(n_epochs=1000, batch_size=1000)

def test_set_plot(self):
    deterministic.set_seed(1)
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=100, plot=True)

        assert isinstance(trainer._plotter, Plotter), (
            'self.plotter in TFTrainer should be set to Plotter.')

def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = GarageEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                                 agent_update=policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # All of the trajectories.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()

def test_rl2_ppo_pendulum_wrong_worker(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        with pytest.raises(ValueError):
            sampler = LocalSampler(
                agents=self.policy,
                envs=self.tasks.sample(self.meta_batch_size),
                max_episode_length=self.env_spec.max_episode_length,
                is_tf_worker=True,
                n_workers=self.meta_batch_size)

            algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                          task_sampler=self.tasks,
                          env_spec=self.env_spec,
                          policy=self.policy,
                          baseline=self.baseline,
                          sampler=sampler,
                          discount=0.99,
                          gae_lambda=0.95,
                          lr_clip_range=0.2,
                          optimizer_args=dict(
                              batch_size=32,
                              max_optimization_epochs=10,
                          ),
                          stop_entropy_gradient=True,
                          entropy_method='max',
                          policy_ent_coeff=0.02,
                          center_adv=False,
                          episodes_per_trial=self.episode_per_task)

            trainer.setup(algo, self.tasks.sample(self.meta_batch_size))

            trainer.train(n_epochs=10,
                          batch_size=self.episode_per_task *
                          self.max_episode_length * self.meta_batch_size)

def test_trpo_lstm_cartpole(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('CartPole-v1', max_episode_length=100))

        policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                        base_eps=1e-5)))

        snapshotter.snapshot_dir = './'
        trainer.setup(algo, env)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 60

        env.close()

def train_gru_trpo(ctxt=None):
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = MyGymEnv(gym_env, max_episode_length=100)
        policy = CategoricalGRUPolicy(name='policy',
                                      env_spec=env.spec,
                                      state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            worker_class=FragmentWorker,
        )
        self.algo = LoggedTRPO(env=env,
                               env_spec=env.spec,
                               policy=policy,
                               baseline=baseline,
                               sampler=sampler,
                               discount=0.99,
                               max_kl_step=0.01,
                               optimizer_args=dict(
                                   hvp_approach=FiniteDifferenceHVP(
                                       base_eps=1e-5)))
        trainer.setup(self.algo, env)
        trainer.train(n_epochs=n_eps, batch_size=4000)
        return self.algo.rew_chkpts

def test_te_ppo(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        sampler = LocalSampler(
            agents=self.policy,
            envs=self.env,
            max_episode_length=self.env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=TaskEmbeddingWorker)

        algo = TEPPO(env_spec=self.env.spec,
                     policy=self.policy,
                     baseline=self.baseline,
                     inference=self.inference,
                     sampler=sampler,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=self.policy_ent_coeff,
                     encoder_ent_coeff=self.encoder_ent_coeff,
                     inference_ce_coeff=self.inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo, self.env)
        trainer.train(n_epochs=1, batch_size=self.batch_size, plot=False)

def train_sac(ctxt=None):
    trainer = Trainer(ctxt)
    env = MyGymEnv(gym_env, max_episode_length=100)
    policy = CategoricalGRUPolicy(name='policy',
                                  env_spec=env.spec,
                                  state_include_action=False).to(
                                      global_device())
    qf1 = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    qf2 = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    sampler = LocalSampler(
        agents=policy,
        envs=env,
        max_episode_length=env.spec.max_episode_length,
        worker_class=FragmentWorker)
    self.algo = LoggedSAC(env=env,
                          env_spec=env.spec,
                          policy=policy,
                          qf1=qf1,
                          qf2=qf2,
                          sampler=sampler,
                          gradient_steps_per_itr=1000,
                          max_episode_length_eval=100,
                          replay_buffer=replay_buffer,
                          min_buffer_size=1e4,
                          target_update_tau=5e-3,
                          discount=0.99,
                          buffer_batch_size=256,
                          reward_scale=1.,
                          steps_per_epoch=1)
    trainer.setup(self.algo, env)
    trainer.train(n_epochs=n_eps, batch_size=4000)
    return self.algo.rew_chkpts

def test_erwr_cartpole(self):
    """Test ERWR with Cartpole-v1 environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        deterministic.set_seed(1)
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = ERWR(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99)

        trainer.setup(algo, env)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 60

        env.close()

def test_reps_cartpole(self):
    """Test REPS with gym Cartpole environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = GymEnv('CartPole-v0')

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99)

        trainer.setup(algo, env)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5

        env.close()

def test_rl2_ppo_pendulum(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        sampler = LocalSampler(
            agents=self.policy,
            envs=self.tasks.sample(self.meta_batch_size),
            max_episode_length=self.env_spec.max_episode_length,
            is_tf_worker=True,
            n_workers=self.meta_batch_size,
            worker_class=RL2Worker,
            worker_args=dict(n_episodes_per_trial=self.episode_per_task))

        algo = RL2PPO(meta_batch_size=self.meta_batch_size,
                      task_sampler=self.tasks,
                      env_spec=self.env_spec,
                      policy=self.policy,
                      baseline=self.baseline,
                      sampler=sampler,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=self.episode_per_task)

        trainer.setup(algo, self.tasks.sample(self.meta_batch_size))

        last_avg_ret = trainer.train(n_epochs=1,
                                     batch_size=self.episode_per_task *
                                     self.max_episode_length *
                                     self.meta_batch_size)
        assert last_avg_ret > -40

def test_train(self):
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=1, batch_size=100)

def expert_source(env, goal, max_episode_length, n_eps):
    expert = OptimalPolicy(env.spec, goal=goal)
    workers = WorkerFactory(seed=100, max_episode_length=max_episode_length)
    expert_sampler = LocalSampler.from_worker_factory(workers, expert, env)
    for _ in range(n_eps):
        eps_batch = expert_sampler.obtain_samples(0, max_episode_length, None)
        yield TimeStepBatch.from_episode_batch(eps_batch)

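# Hedged usage sketch (not part of the source): the generator above can be
# consumed like any iterable of TimeStepBatch objects. `env`, `goal`, and the
# counts below are illustrative placeholders, not values from the original.
for batch in expert_source(env, goal, max_episode_length=200, n_eps=5):
    print(len(batch.rewards))  # number of expert transitions in this batch
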
def __init__(self, env, max_episode_length):
    self.env = env
    self.policy = RandomPolicy(self.env.spec.action_space)
    self.max_episode_length = max_episode_length
    self.sampler = LocalSampler(
        agents=self.policy,
        envs=self.env,
        max_episode_length=self.max_episode_length)

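# Hedged sketch (hypothetical helper, not in the original class): one way such
# an object could collect experience with its LocalSampler. Passing None as
# agent_update leaves the worker's existing RandomPolicy in place, since a
# random policy has no parameters to synchronize.
def sample_episodes(self, n_samples):
    return self.sampler.obtain_samples(0, n_samples, None)
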
def test_local_batch_sampler(self):
    workers = WorkerFactory(seed=100,
                            max_path_length=self.algo.max_path_length)
    sampler1 = LocalSampler.from_worker_factory(workers, self.policy,
                                                self.env)
    sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
    sampler2.start_worker()
    trajs1 = sampler1.obtain_samples(
        0, 1000, tuple(self.algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000)
    # pylint: disable=superfluous-parens
    assert trajs1.observations.shape[0] >= 1000
    assert trajs1.actions.shape[0] >= 1000
    assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
        trajs2[0]['rewards']) == 1)

    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    start = 0
    for length in trajs1.lengths:
        observations = trajs1.observations[start:start + length]
        actions = trajs1.actions[start:start + length]
        rewards = trajs1.rewards[start:start + length]
        assert np.array_equal(observations, true_obs)
        assert np.array_equal(actions, true_actions)
        assert np.array_equal(rewards, true_rewards)
        start += length
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()

def setup_method(self):
    super().setup_method()
    self.meta_batch_size = 10
    self.episode_per_task = 4
    self.max_episode_length = 100
    # Avoid pickling self
    max_episode_length = 100
    self.tasks = task_sampler.SetTaskSampler(
        HalfCheetahDirEnv,
        wrapper=lambda env, _: RL2Env(
            normalize(GymEnv(env, max_episode_length=max_episode_length))))
    self.env_spec = RL2Env(
        normalize(
            GymEnv(HalfCheetahDirEnv(),
                   max_episode_length=max_episode_length))).spec
    self.policy = GaussianGRUPolicy(env_spec=self.env_spec,
                                    hidden_dim=64,
                                    state_include_action=False)
    self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)
    self.sampler = LocalSampler(
        agents=self.policy,
        envs=self.tasks.sample(self.meta_batch_size),
        max_episode_length=self.env_spec.max_episode_length,
        is_tf_worker=True,
        n_workers=self.meta_batch_size,
        worker_class=RL2Worker)

def test_tnpg_inverted_pendulum(self):
    """Test TNPG with InvertedPendulum-v2 environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('InvertedPendulum-v2'))

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = TNPG(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    optimizer_args=dict(reg_coeff=5e-1))

        trainer.setup(algo, env)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 15

        env.close()

def test_ppo_pendulum_gru(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with TFTrainer(snapshot_config) as trainer:
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))

        gru_policy = GaussianGRUPolicy(env_spec=env.spec)

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        sampler = LocalSampler(
            agents=gru_policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        trainer.setup(algo, env)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80

def test_update_envs_env_update():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0,
                                      161,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert len(mean_rewards) == 11
    assert len(goals) == 11
    assert np.var(mean_rewards) > 1e-2
    assert np.var(goals) > 1e-2
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))

def test_cem_cartpole(self):
    """Test CEM with Cartpole-v1 environment."""
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        n_samples = 10

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   sampler=sampler,
                   best_frac=0.1,
                   n_samples=n_samples)

        trainer.setup(algo, env)
        rtn = trainer.train(n_epochs=10, batch_size=2048)
        assert rtn > 40

        env.close()

def cma_es_cartpole(ctxt=None, seed=1):
    """Train CMA_ES with Cartpole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        n_samples = 20

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     sampler=sampler,
                     n_samples=n_samples)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=1000)

def train_trpo(ctxt=None):
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = MyGymEnv(gym_env, max_episode_length=100)
        policy = CategoricalMLPPolicy(name='policy', env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            worker_class=FragmentWorker,
        )
        self.algo = LoggedTRPO(env=env,
                               env_spec=env.spec,
                               policy=policy,
                               baseline=baseline,
                               sampler=sampler,
                               discount=0.99,
                               max_kl_step=0.01)
        trainer.setup(self.algo, env)
        trainer.train(n_epochs=n_eps, batch_size=4000)
        return self.algo.rew_chkpts